# IMDB 영화평 감성분석

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the labeled IMDB training set: a tab-separated file whose first row is
# the header. quoting=3 is csv.QUOTE_NONE, so double quotes inside the review
# text are kept as literal characters instead of being parsed as field quotes.
df = pd.read_csv(
    '../00.data/IMDB/labeledTrainData.tsv',
    sep='\t',
    header=0,
    quoting=3,
)
df.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


In [3]:
# Print the column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         25000 non-null  object
 1   sentiment  25000 non-null  int64 
 2   review     25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 586.1+ KB


In [4]:
# Peek at the first 1000 characters of the first review.
# The raw text still contains HTML line-break tags (<br /><br />).
df.loc[0, 'review'][:1000]

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally

In [5]:
# Replace every literal '<br />' tag with a single space.
# regex=False makes the literal match explicit; the pattern contains no regex
# metacharacters, so the result is the same under either setting.
df['review'] = df['review'].str.replace('<br />', ' ', regex=False)

In [6]:
# Replace every character that is not an ASCII letter with a space
# (digits, punctuation and all other symbols are blanked out).
import re

df['review'] = [re.sub('[^a-zA-Z]', ' ', text) for text in df['review']]

In [7]:
from sklearn.model_selection import train_test_split

# Keep only the text column as the feature frame. Because a DataFrame (not a
# Series) is handed to the splitter, later cells must select the `review`
# column explicitly. The target is the `sentiment` Series.
feature_df = df.drop(columns=['id', 'sentiment'])
X_train, X_test, y_train, y_test = train_test_split(
    feature_df,
    df['sentiment'],
    test_size=0.3,
    random_state=156,
)
X_train.shape, X_test.shape

((17500, 1), (7500, 1))

In [8]:
# Show the first rows of the training features (a single `review` column).
X_train.head()

Unnamed: 0,review
3724,This version moved a little slow for my taste...
23599,I really enjoyed this film because I have a t...
11331,Saw this in the theater in and fell out o...
15745,Recently I was looking for the newly issued W...
845,Escaping the life of being pimped by her fath...


In [9]:
# Show the first training labels; the index matches the rows of X_train.
y_train.head()

3724     0
23599    1
11331    1
15745    1
845      1
Name: sentiment, dtype: int64

In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

- CountVectorizer의 경우,

In [11]:
# Bag-of-words counts over unigrams and bigrams with English stop words removed.
# The vocabulary is learned from the training reviews only; the test reviews
# are then encoded with that same fitted vectorizer so both matrices share the
# same columns. `.review` is selected because X_train/X_test are DataFrames.
count_vect = CountVectorizer(stop_words='english', ngram_range=(1, 2))
X_train_count = count_vect.fit_transform(X_train['review'])
X_test_count = count_vect.transform(X_test['review'])

In [12]:
# Logistic regression (C=10) on the count features, scored on the held-out split.
lr_clf = LogisticRegression(C=10).fit(X_train_count, y_train)
pred = lr_clf.predict(X_test_count)
accuracy_score(y_test, pred)  # recorded accuracy: 0.886 (about 88%)

0.886

- TfidfVectorizer

In [13]:
# TF-IDF weights over unigrams and bigrams with English stop words removed.
# Vocabulary and IDF statistics come from the training reviews only; the test
# reviews are encoded with the same fitted vectorizer so the columns line up.
tfidf_vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
X_train_tfidf = tfidf_vect.fit_transform(X_train['review'])
X_test_tfidf = tfidf_vect.transform(X_test['review'])

In [14]:
# Same classifier settings as before, now trained on the TF-IDF features.
lr_clf = LogisticRegression(C=10).fit(X_train_tfidf, y_train)
pred = lr_clf.predict(X_test_tfidf)
# Recorded accuracy: 0.8936 (about 89%) -- slightly higher than with raw
# counts, though not a decisive improvement.
accuracy_score(y_test, pred)

0.8936

학습에 시간이 오래 걸린 tfidf_vect와 lr_clf는 다시 학습하지 않아도 되도록 파일로 저장해 둔다.
### 모델 저장하고 불러오기

In [15]:
import os

import joblib

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing (otherwise a fresh checkout fails with
# FileNotFoundError).
os.makedirs('model', exist_ok=True)

# Persist the fitted TF-IDF vectorizer and the classifier trained on it.
# The files are pickles, hence the conventional .pkl suffix.
joblib.dump(tfidf_vect, 'model/imdb_vect.pkl')
joblib.dump(lr_clf, 'model/imdb_lr.pkl')

['model/imdb_lr.pkl']

In [16]:
# Do not delete these objects carelessly.
#del tfidf_vect
#del lr_clf

In [17]:
# Restore the fitted vectorizer and classifier from disk.
# NOTE: joblib files are pickles; only load files that come from a trusted source.
new_vect, new_lr = (
    joblib.load(path) for path in ('model/imdb_vect.pkl', 'model/imdb_lr.pkl')
)

In [18]:
# The restored vectorizer is already fitted, so only the test reviews need to
# be transformed; re-transforming the training set is unnecessary here.
# This step takes a little while to run.
new_X_test = new_vect.transform(X_test['review'])

In [19]:
# Score the restored model on the same held-out split.
# Recorded accuracy: 0.8936 -- identical to the in-memory model, which is the
# point of saving it: no retraining time is spent.
pred = new_lr.predict(new_X_test)
accuracy_score(y_true=y_test, y_pred=pred)

0.8936

### Pipeline을 써서 학습/예측/평가

In [None]:
import os

import joblib
from sklearn.pipeline import Pipeline

# Chain the vectorizer and the classifier into one estimator so that raw review
# text can be passed straight to fit/predict.
pipeline = Pipeline([
    ('count_vect', CountVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('lr_clf', LogisticRegression(C=10)),
])

pipeline.fit(X_train.review, y_train)
pred = pipeline.predict(X_test.review)
acc = accuracy_score(y_test, pred)
print(f'Count Vectorizer + Logistic Regression 정확도: {acc:.4f}')
# This takes a while to run; recorded accuracy: 0.8860

# Save the fitted pipeline so that the following cell, which loads
# 'model/pipline.pkl', works after a kernel restart. The path is kept exactly
# as that cell spells it. The trailing ';' suppresses the returned file list.
os.makedirs('model', exist_ok=True)
joblib.dump(pipeline, 'model/pipline.pkl');

In [None]:
# Load the previously saved fitted pipeline. The file name is spelled
# 'pipline.pkl' here, so it must match the name the pipeline was saved under.
# NOTE: joblib files are pickles; only load files that come from a trusted source.
new_pipe = joblib.load('model/pipline.pkl')

In [23]:
# Score the reloaded pipeline on the held-out reviews.
# Recorded accuracy: 0.8860, the same as the pipeline trained in this session.
pred = new_pipe.predict(X_test['review'])
acc = accuracy_score(y_true=y_test, y_pred=pred)
print(f'Count Vectorizer + Logistic Regression 정확도: {acc:.4f}')

Count Vectorizer + Logistic Regression 정확도: 0.8860


In [29]:
# List the parameter names exposed by the restored vectorizer.
new_vect.get_params().keys()

dict_keys(['analyzer', 'binary', 'decode_error', 'dtype', 'encoding', 'input', 'lowercase', 'max_df', 'max_features', 'min_df', 'ngram_range', 'norm', 'preprocessor', 'smooth_idf', 'stop_words', 'strip_accents', 'sublinear_tf', 'token_pattern', 'tokenizer', 'use_idf', 'vocabulary'])

In [31]:
# GridSearchCV is imported here because no other cell in the notebook imports
# it; without this the cell raises NameError on a fresh kernel.
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('new_vect', CountVectorizer(stop_words='english', ngram_range=(1,2))),
    ('new_lr', LogisticRegression(C=10))
])
# Grid keys use the '<step name>__<parameter>' convention of Pipeline.
# An integer max_df is an absolute document count: terms that appear in more
# than that many documents are dropped from the vocabulary.
params = {
    'new_vect__max_df': [100, 300, 500],
    'new_lr__C': [1, 5, 10]
}

# 3-fold cross-validated search over the 9 combinations, using all CPU cores.
grid_pipe = GridSearchCV(pipeline, param_grid=params, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_pipe.fit(X_train.review, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)
# Recorded run: 27 fits (3 folds x 9 candidates), about 6.0 min on 4 workers.
# Best: {'new_lr__C': 5, 'new_vect__max_df': 500}, mean CV accuracy 0.8664572301433043

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  6.0min finished
{'new_lr__C': 5, 'new_vect__max_df': 500} 0.8664572301433043


In [32]:
# Predict with the best estimator found by the grid search and score it on the
# held-out reviews. Recorded accuracy: 0.8788
pred = grid_pipe.predict(X_test['review'])
acc = accuracy_score(y_true=y_test, y_pred=pred)
print(f'Count Vectorizer + Logistic Regression 정확도: {acc:.4f}')

Count Vectorizer + Logistic Regression 정확도: 0.8788


In [33]:
# Second search: an attempt to beat a colleague's 0.8949 score, which fell short.
# GridSearchCV is imported here because no other cell in the notebook imports
# it; without this the cell raises NameError on a fresh kernel.
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('new_vect', CountVectorizer(stop_words='english', ngram_range=(1,2))),
    ('new_lr', LogisticRegression(C=10))
])
# Wider max_df values, a minimum document frequency of 2, and larger C values.
params = {
    'new_vect__max_df': [800, 1000],
    'new_vect__min_df': [2],
    'new_lr__C': [8, 20, 30]
}

grid_pipe = GridSearchCV(pipeline, param_grid=params, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_pipe.fit(X_train.review, y_train)
# Recorded best score: 0.8732. best_score_ is the mean cross-validated accuracy
# on the training folds, so it is not directly comparable with the held-out
# test accuracies printed by other cells.
print(grid_pipe.best_params_, grid_pipe.best_score_)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:  1.3min finished
{'new_lr__C': 8, 'new_vect__max_df': 1000, 'new_vect__min_df': 2} 0.8732002767071286
