# 텍스트 데이터 - 영문

In [128]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [129]:
# 파일에서 데이터 읽기

from sklearn.datasets import load_files
# load_files : 여러 개의 파일에서 데이터를 읽는 도구

reviews_train = load_files('../../Work-workspace-ml_basic-data_files/aclimdb/train')
reviews_test = load_files('../../Work-workspace-ml_basic-data_files/aclimdb/test')

In [130]:
# 데이터 기본 정보 탐색

print( reviews_train.keys() )
print( len(reviews_train['data']), len(reviews_test['data']) )
print( np.unique(reviews_train['target'], return_counts = True) )
print( reviews_train['target_names'] )
print( reviews_train['data'][0])    # b" > 바이너리 형식의 데이터 > 변환 필요

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])
25000 25000
(array([0, 1]), array([12500, 12500]))
['neg', 'pos']
b"Zero Day leads you to think, even re-think why two boys/young men would do what they did - commit mutual suicide via slaughtering their classmates. It captures what must be beyond a bizarre mode of being for two humans who have decided to withdraw from common civility in order to define their own/mutual world via coupled destruction.<br /><br />It is not a perfect movie but given what money/time the filmmaker and actors had - it is a remarkable product. In terms of explaining the motives and actions of the two young suicide/murderers it is better than 'Elephant' - in terms of being a film that gets under our 'rationalistic' skin it is a far, far better film than almost anything you are likely to see. <br /><br />Flawed but honest with a terrible honesty."


In [151]:
# 효과적인 전처리를 위해 DataFrame으로 변환
reviews_train_df = pd.DataFrame({ 'label': reviews_train['target'], 'review': reviews_train['data'] })
reviews_test_df = pd.DataFrame({ 'label': reviews_test['target'], 'review': reviews_test['data'] })

In [152]:
# 데이터 프레임 확인
reviews_train_df.head()

Unnamed: 0,label,review
0,1,"b""Zero Day leads you to think, even re-think w..."
1,0,b'Words can\'t describe how bad this movie is....
2,1,b'Everyone plays their part pretty well in thi...
3,0,b'There are a lot of highly talented filmmaker...
4,0,b'I\'ve just had the evidence that confirmed m...


In [153]:
# binary text -> char text
reviews_train_df['review'] = reviews_train_df['review'].str.decode(encoding='utf-8')
# str : 문자열을 제어하는 함수를 담고있는 객체
reviews_test_df['review'] = reviews_test_df['review'].str.decode(encoding='utf-8')


In [154]:
# decode 확인
reviews_train_df.head()

Unnamed: 0,label,review
0,1,"Zero Day leads you to think, even re-think why..."
1,0,Words can't describe how bad this movie is. I ...
2,1,Everyone plays their part pretty well in this ...
3,0,There are a lot of highly talented filmmakers/...
4,0,I've just had the evidence that confirmed my s...


In [155]:
# 처리된 데이터를 파일로 저장 ( 개별 파일에서 읽기 속도가 느려서 별도 파일로 저장)

reviews_train_df.to_csv('data-files/imdb_reviews_train.csv', index=False)
reviews_test_df.to_csv('data-files/imdb_reviews_test.csv', index=False)

In [156]:
# 저장된 파일 테스트
pd.read_csv('data-files/imdb_reviews_train.csv').head()

Unnamed: 0,label,review
0,1,"Zero Day leads you to think, even re-think why..."
1,0,Words can't describe how bad this movie is. I ...
2,1,Everyone plays their part pretty well in this ...
3,0,There are a lot of highly talented filmmakers/...
4,0,I've just had the evidence that confirmed my s...


In [157]:
# 데이터 정보 탐색
reviews_train_df.info()
reviews_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   25000 non-null  int64 
 1   review  25000 non-null  object
dtypes: int64(1), object(1)
memory usage: 390.8+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   25000 non-null  int64 
 1   review  25000 non-null  object
dtypes: int64(1), object(1)
memory usage: 390.8+ KB


In [158]:
# 데이터 전처리 : 대문자 -> 소문자

reviews_train_df['review'] = reviews_train_df['review'].str.lower()
reviews_test_df['review'] = reviews_test_df['review'].str.lower()

In [159]:
# review에 markup(tag)가 포함된 것 확인 : <br / > 등
reviews_train_df['review'][0]

"zero day leads you to think, even re-think why two boys/young men would do what they did - commit mutual suicide via slaughtering their classmates. it captures what must be beyond a bizarre mode of being for two humans who have decided to withdraw from common civility in order to define their own/mutual world via coupled destruction.<br /><br />it is not a perfect movie but given what money/time the filmmaker and actors had - it is a remarkable product. in terms of explaining the motives and actions of the two young suicide/murderers it is better than 'elephant' - in terms of being a film that gets under our 'rationalistic' skin it is a far, far better film than almost anything you are likely to see. <br /><br />flawed but honest with a terrible honesty."

In [160]:
# 데이터 전처리 2 : markup 제거
# reviews_train_df['review'] = reviews_train_df['review'].str.replace('<br />', '')
# <br />은 ''처리 > <br />만 작업 다른 마크업에 대해서는 못해 줌

from bs4 import BeautifulSoup   # 마크업을 제외한 나머지만 추출

# BeautifulSoup("<div>test</div>").get_text() # --> test만 추출
reviews_train_df['review'] = \
    reviews_train_df['review'].map(lambda v: BeautifulSoup(v, 'html.parser').get_text())
reviews_test_df['review'] = \
    reviews_test_df['review'].map(lambda v: BeautifulSoup(v, 'html.parser').get_text())

  reviews_train_df['review'].map(lambda v: BeautifulSoup(v, 'html.parser').get_text())
  reviews_test_df['review'].map(lambda v: BeautifulSoup(v, 'html.parser').get_text())


In [161]:
reviews_train_df['review'][0]

"zero day leads you to think, even re-think why two boys/young men would do what they did - commit mutual suicide via slaughtering their classmates. it captures what must be beyond a bizarre mode of being for two humans who have decided to withdraw from common civility in order to define their own/mutual world via coupled destruction.it is not a perfect movie but given what money/time the filmmaker and actors had - it is a remarkable product. in terms of explaining the motives and actions of the two young suicide/murderers it is better than 'elephant' - in terms of being a film that gets under our 'rationalistic' skin it is a far, far better film than almost anything you are likely to see. flawed but honest with a terrible honesty."

In [162]:
# 데이터 전처리 : 3 특수문자 제거 -> 영문자만 남겨두기 (정규표현식 사용)
import re

reviews_train_df['review'] = reviews_train_df['review'].map(lambda v: re.sub('[^A-Za-z]', ' ', v))
reviews_test_df['review'] = reviews_test_df['review'].map(lambda v: re.sub('[^A-Za-z]', ' ', v))

In [163]:
# 데이터 전처리 4 : 문장(single string) -> 단어(토큰) 리스트(string list) 잘라내기
import nltk # nltk는 자연어 처리를 위한 패키지

reviews_train_df['review'] = reviews_train_df['review'].map(nltk.word_tokenize)
reviews_test_df['review'] = reviews_test_df['review'].map(nltk.word_tokenize)

In [164]:
reviews_train_df

Unnamed: 0,label,review
0,1,"[zero, day, leads, you, to, think, even, re, t..."
1,0,"[words, can, t, describe, how, bad, this, movi..."
2,1,"[everyone, plays, their, part, pretty, well, i..."
3,0,"[there, are, a, lot, of, highly, talented, fil..."
4,0,"[i, ve, just, had, the, evidence, that, confir..."
...,...,...
24995,1,"[footlight, parade, released, viewed, the, ice..."
24996,1,"[deeply, humorous, yet, honest, comedy, about,..."
24997,0,"[st, watched, out, of, dir, sydney, pollack, d..."
24998,0,"[i, watch, lots, of, scary, movies, or, at, le..."


In [165]:
# 데이터 전처리 5 : 불용어(stopwords) 제거
# 분석에 사용하지 않아야할 문자나 단어들(관사 a, the 등)

# nltk.download('stopwords')
from nltk.corpus import stopwords

en_stopwords = stopwords.words('english')   # english 불용어 사전 준비
print(en_stopwords[:10], len(en_stopwords))
# en_stopwords += ['test', 'xyz']     # 기존 stopwords에 새 단어 추가

# reviews_train_df['review'].map(lambda v: [ token for token in v if token not in en_stopwords] )

reviews_train_df['review'] = \
    reviews_train_df['review'].map(lambda v: [ token for token in v if token not in en_stopwords] )
reviews_test_df['review'] = \
    reviews_test_df['review'].map(lambda v: [ token for token in v if token not in en_stopwords] )

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"] 179


In [166]:
# 데이터 전처리 6 : 어근 추출 (단어 활용에 의해 철자가 달라진 단어를 표준화 또는 통합)

from nltk import PorterStemmer

stemmer = PorterStemmer()
# print( stemmer.stem('happy'), stemmer.stem('apples'), stemmer.stem('apple') )

reviews_train_df['review'] = reviews_train_df['review'].map(lambda v: [stemmer.stem(w) for w in v])
reviews_test_df['review'] = reviews_test_df['review'].map(lambda v: [stemmer.stem(w) for w in v])

In [167]:
# 데이터 전처리 7 : 토큰 리스트(string list) -> 단일 문장 (single string)

# print('$'.join(['abc', 'def', 'xyz', '123']))
# $를 사용해서 리스트의 단어를 묶어라

reviews_train_df['review'] = \
    reviews_train_df['review'].map(lambda v: ' '.join(v))
reviews_test_df['review'] = \
    reviews_test_df['review'].map(lambda v: ' '.join(v))

In [168]:
reviews_train_df['review'][0]

'zero day lead think even think two boy young men would commit mutual suicid via slaughter classmat captur must beyond bizarr mode two human decid withdraw common civil order defin mutual world via coupl destruct perfect movi given money time filmmak actor remark product term explain motiv action two young suicid murder better eleph term film get rationalist skin far far better film almost anyth like see flaw honest terribl honesti'

In [172]:
# 데이터 전처리 8 : string token -> numeric value 수치형 변환

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

cv = CountVectorizer()
sparse_train_counts = cv.fit_transform(reviews_train_df['review'])
sparse_test_counts = cv.transform(reviews_test_df['review'])    # 테스트는 별도 훈련 없이 훈련 모델 사용


In [None]:
# token -> number 변환기 확인
# print( sparse_train_counts )
print( "단어 사전의 크기 : {0}".format( len( cv.vocabulary_.keys() ) ) )
idx_to_word = { idx: word for word, idx in cv.vocabulary_.items() } # 문자:숫자 아닌 숫자:문자 구조로 치환

단어 사전의 크기 : 50573


In [183]:
idx_to_word[1234]

'amatuerish'

In [184]:
sparse_test_counts

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 2265400 stored elements and shape (25000, 50573)>

In [185]:
# 데이터 준비
from sklearn.model_selection import train_test_split

X_train, y_train = sparse_train_counts, reviews_train_df['label']
X_test, y_test = sparse_test_counts, reviews_test_df['label']


In [192]:
# 모델 훈련 (학습)

from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(C=0.1, max_iter=10000, random_state=42)
logreg.fit(X_train, y_train)

logreg.score(X_train, y_train), logreg.score(X_test, y_test)

(0.96112, 0.87204)

In [193]:
print( reviews_train_df['review'][0] )
print( sparse_train_counts.toarray()[0] )

zero day lead think even think two boy young men would commit mutual suicid via slaughter classmat captur must beyond bizarr mode two human decid withdraw common civil order defin mutual world via coupl destruct perfect movi given money time filmmak actor remark product term explain motiv action two young suicid murder better eleph term film get rationalist skin far far better film almost anyth like see flaw honest terribl honesti
[0 0 0 ... 0 0 0]


##### 상대빈도 사용

In [204]:
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

# tv = TfidfVectorizer()    # raw string -> tfidf
# sparse_train_counts2 = tv.fit_transform(reviews_train_df['review'])
# sparse_test_counts2 = tv.transform(reviews_test_df['review'])

tt = TfidfTransformer() # int count data -> tfidf
sparse_train_counts2 = tt.fit_transform(sparse_test_counts)
sparse_test_counts2 = tt.transform(sparse_test_counts)

In [205]:
# 데이터 준비
X_train2, y_train2 = sparse_train_counts2, reviews_train_df['label']
X_test2, y_test2 = sparse_test_counts2, reviews_test_df['label']


In [206]:
# 모델 훈련 (학습)

from sklearn.linear_model import LogisticRegression

logreg2 = LogisticRegression(C=0.1, max_iter=10000, random_state=42)
logreg2.fit(X_train, y_train)

logreg2.score(X_train, y_train), logreg2.score(X_test, y_test)

(0.96112, 0.87204)

### 리뷰 적용해보기

In [None]:
# 전처리 해야하는데, 스킵
review = pd.Series(["""This movie reminds me of the value of family. 
I agree with my friend's comments.
This movie would be one of the best animated movies in my life, too."""])
# 반드시 2차원 데이터로 넣어야한다. [[]]

transformed_review = cv.transform(review)

logreg.predict(transformed_review)

array([1])