In [1]:
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer

## 데이터 불러오기

In [2]:
# Load the labeled IMDb training reviews; load_files treats each sub-folder of
# the given directory as one class. The stock aclImdb/train directory also
# ships an unlabeled "unsup" folder, which would otherwise be loaded as a third
# class, so restrict loading to the two sentiment folders. Folder order (and
# therefore the label encoding neg=0, pos=1) is unchanged when "unsup" is absent.
reviews_train = load_files("data/aclImdb/train/", categories=["neg", "pos"])

In [3]:
# Split the loaded bunch into its two parts: the raw review documents and the
# class label of each document.
text_train = reviews_train.data
y_train = reviews_train.target

In [4]:
# Show one raw review (index 6) to inspect the text before any cleaning;
# the documents are bytes objects and still contain HTML "<br />" markup.
print(text_train[6])

b"This movie has a special way of telling the story, at first i found it rather odd as it jumped through time and I had no idea whats happening.<br /><br />Anyway the story line was although simple, but still very real and touching. You met someone the first time, you fell in love completely, but broke up at last and promoted a deadly agony. Who hasn't go through this? but we will never forget this kind of pain in our life. <br /><br />I would say i am rather touched as two actor has shown great performance in showing the love between the characters. I just wish that the story could be a happy ending."


In [5]:
# Replace every HTML line-break tag with a single space. The documents are
# bytes, so both the pattern and the replacement must be bytes literals.
text_train = list(map(lambda doc: doc.replace(b"<br />", b" "), text_train))

In [6]:
# Print the same review (index 6) again to check that the "<br />" tags
# have been replaced by spaces.
print(text_train[6])

b"This movie has a special way of telling the story, at first i found it rather odd as it jumped through time and I had no idea whats happening.  Anyway the story line was although simple, but still very real and touching. You met someone the first time, you fell in love completely, but broke up at last and promoted a deadly agony. Who hasn't go through this? but we will never forget this kind of pain in our life.   I would say i am rather touched as two actor has shown great performance in showing the love between the characters. I just wish that the story could be a happy ending."


## 텍스트 임베딩

In [7]:
# Learn the bag-of-words vocabulary and encode the training reviews in a
# single pass. fit_transform is equivalent to fit(...) followed by
# transform(...) on the same data, but tokenizes the corpus only once.
vect = CountVectorizer()
X_train = vect.fit_transform(text_train)

In [8]:
# Vocabulary terms learned by the fitted vectorizer.
feature_names = vect.get_feature_names_out()
# Peek at the first ten terms only (bare last expression -> rich display).
feature_names[:10]

array(['00', '000', '0000000000001', '00001', '00015', '000s', '001',
       '003830', '006', '007'], dtype=object)

#### Baseline

In [9]:
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Baseline: cross-validated score of a logistic regression on the raw
# bag-of-words counts, using cross_val_score's default splitter and all
# available cores (n_jobs=-1).
baseline_model = LogisticRegression(max_iter=1000)
scores = cross_val_score(baseline_model, X_train, y_train, n_jobs=-1)

# Report the mean score across the folds.
print(scores.mean())

0.8811200000000001


In [10]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the regularization parameter C of LogisticRegression.
param_grid = dict(C=[0.001, 0.01, 0.1, 1, 10])

# Exhaustive search over C on the bag-of-words features; fit() returns the
# fitted search object, so construction and fitting can be chained.
grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=5000),
    param_grid=param_grid,
    n_jobs=-1,
).fit(X_train, y_train)

# Best mean cross-validated score found by the search.
print(grid.best_score_)

0.88796


## 불용어

In [11]:
# Rebuild the bag-of-words features with a pruned vocabulary:
#   min_df=5             -> ignore terms that appear in fewer than 5 documents
#   stop_words="english" -> ignore scikit-learn's built-in English stop words
# fit_transform learns the vocabulary and encodes the corpus in one pass.
vect = CountVectorizer(min_df=5, stop_words="english")
X_train = vect.fit_transform(text_train)

# Refresh feature_names so it matches the new vocabulary (the original cell
# assigned to the misspelled name `features_names`, leaving feature_names
# pointing at the previous vectorizer's terms). The misspelled name is kept
# as an alias in case other cells refer to it.
feature_names = features_names = vect.get_feature_names_out()

In [12]:
from sklearn.model_selection import GridSearchCV

# Same search over the regularization parameter C as for the baseline, now
# run on the features built with min_df=5 and English stop words removed.
param_grid = dict(C=[0.001, 0.01, 0.1, 1, 10])

# fit() returns the fitted search object, so it can be chained directly.
grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=5000),
    param_grid=param_grid,
    n_jobs=-1,
).fit(X_train, y_train)

# Best mean cross-validated score found by the search.
print(grid.best_score_)

0.88296


#### [BOW & n-Gram을 주로 사용]
#### 1. BOW (토큰화->어휘사전구축->인코딩) (V)
- 단어(토큰) 카운팅 -> 빈도 벡터 (확률 추정에 활용)
- 장점 : 계산이 쉬움
- 단점 : 데이터 편향
#### 2. tf-idf : 단어빈도-역문서빈도 (조건부확률이 아니라 단어 중요도 가중치)
- 특정 문서에서 자주, 다른 문서들에서 그렇지 않음 -> 높은 가중치
- 대부분의 문서에 흔히 나오는 단어(일반적인 명사 등)는 가중치가 낮아져 사실상 걸러지기도 함
#### 3. n-Gram (V)
> BOW, tf-idf에 매개변수로 들어가는 것
- 연속된 토큰 (연관검색어)
- 대부분 n=2로 잡는 것이 유용

## N-그램

In [13]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline, make_pipeline

In [14]:
# Chain tf-idf vectorization and logistic regression into one estimator so
# the vectorizer is refit inside every cross-validation split. make_pipeline
# names each step after its lowercased class name ("tfidfvectorizer",
# "logisticregression"), which is what grid-search parameter keys must use.
# NOTE(review): max_iter=100 is lower than the 1000/5000 used for the
# logistic regressions in the earlier cells -- confirm the solver converges.
pipe = make_pipeline(
    TfidfVectorizer(min_df=5),
    LogisticRegression(max_iter=100),
)

In [15]:
# Search space for the pipeline, keyed as "<step name>__<parameter>":
# two values of the classifier's C and three n-gram ranges
# (unigrams only, up to bigrams, up to trigrams).
param_grid = dict(
    logisticregression__C=[10, 100],
    tfidfvectorizer__ngram_range=[(1, 1), (1, 2), (1, 3)],
)

In [16]:
# Grid-search the whole pipeline on the raw (cleaned) review texts rather than
# on a precomputed matrix, so vectorization happens inside each split.
# fit() returns the fitted search object, so it can be chained directly.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    n_jobs=-1,
).fit(text_train, y_train)

# Best mean cross-validated score (bare last expression -> cell output).
grid.best_score_

0.9069200000000001