## IMDB 영화평 감성 분석 (모델 학습에 상당한 시간이 소요됨)

In [1]:
import re
import joblib
import numpy as np 
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [4]:
# Load the labeled IMDB training set: tab-separated with a header row.
# quoting=3 is csv.QUOTE_NONE, so the double quotes wrapped around each
# field stay in the data as literal characters.
imdb_train_path = '../../../Machine-Learning/00.data/IMDB/labeledTrainData.tsv'
df = pd.read_csv(imdb_train_path, sep='\t', header=0, quoting=3)
df.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Replace the HTML line-break tag "<br />" with a single space.
# The tag is meant literally, so pass regex=False explicitly instead of
# relying on the default of `regex`, which differs between pandas versions.
df['review'] = df['review'].str.replace('<br />', ' ', regex=False)

In [None]:
# Replace every character that is not an ASCII letter with a space
# (digits, punctuation and quote characters all become blanks).
non_letter = re.compile('[^a-zA-Z]')
df['review'] = df['review'].map(lambda text: non_letter.sub(' ', text))

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Count reviews that carry no text at all.
# The cleaning steps earlier in this notebook substitute spaces for the
# characters they remove, so a review with nothing left is whitespace-only
# rather than the empty string; strip before comparing so such rows are
# actually detected.
df[df.review.str.strip() == ''].count()

In [None]:
# Feature frame: everything except the id and the target column.
feature_df = df.drop(columns=['id', 'sentiment'])
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    feature_df,
    df['sentiment'],
    test_size=0.25,
    random_state=2021,
)
X_train.shape, X_test.shape  # Professor's result: ((18750, 1), (6250, 1))

In [None]:
# Show the first three training rows.
X_train[:3]

In [None]:
# Show the first three held-out rows.
X_test[:3]

In [None]:
# Bundle the held-out reviews with their labels (aligned on the row index)
# and write them to CSV without the index column.
df_test = X_test[['review']].assign(sentiment=y_test)
df_test.to_csv('../static/data/IMDB_test.csv', index=False)

In [None]:
# Reload the saved test set from disk; the CSV was written without an index,
# so read_csv assigns a fresh default 0..n-1 index.
df_test = pd.read_csv('../static/data/IMDB_test.csv')
df_test.head(3)

## Case 1. CountVectorizer + LogisticRegression

In [None]:
# Uni-/bigram count features (English stop words dropped) feeding a
# logistic-regression classifier with default settings.
pipeline = Pipeline(steps=[
    ('count_vect', CountVectorizer(ngram_range=(1, 2), stop_words='english')),
    ('lr_clf', LogisticRegression()),
])
# Grid for the search; integer max_df values are absolute document counts.
params = {
    'count_vect__max_df': [100, 300, 500],
    'lr_clf__C': [1, 5, 10],
}

In [None]:
# 3-fold cross-validated grid search over `params`, scored by accuracy and
# run with n_jobs=-1; prints the best parameter combination and its CV score.
grid_pipe = GridSearchCV(pipeline, param_grid=params, cv=3,
                         scoring='accuracy', verbose=1, n_jobs=-1)
grid_pipe.fit(X_train.review, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)
# Professor's result: Fitting 3 folds for each of 9 candidates, totalling 27 fits
#[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
#[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  5.2min finished
#{'count_vect__max_df': 500, 'lr_clf__C': 1} 0.8667199999999999

In [None]:
# Take the best pipeline found by the grid search and measure its accuracy
# on the reloaded hold-out set.
best_count_lr = grid_pipe.best_estimator_
pred_count_lr = best_count_lr.predict(df_test.review.values)
accuracy_score(df_test.sentiment.values, pred_count_lr)
# Professor's result: 0.87344

In [None]:
# Persist the CountVectorizer + LogisticRegression pipeline to disk.
joblib.dump(best_count_lr, '../static/model/imdb_count_lr.pkl')

## Case 2. CountVectorizer + Support Vector Machine (SVC)

In [None]:
# Uni-/bigram count features (English stop words dropped) feeding an SVC
# with default settings.
pipeline = Pipeline(steps=[
    ('count_vect', CountVectorizer(ngram_range=(1, 2), stop_words='english')),
    ('sv_clf', SVC()),
])
# Grid for the search; integer max_df values are absolute document counts.
params = {
    'count_vect__max_df': [100, 300, 500],
    'sv_clf__C': [0.1, 1, 10],
}

In [None]:
# Time one fit of the pipeline as constructed (no grid search); %time is an IPython magic.
%time pipeline.fit(X_train.review, y_train)
# Professor's result: Wall time: 20min 22s

In [None]:
# 3-fold cross-validated grid search over `params`, scored by accuracy.
# NOTE(review): the recorded log below shows n_jobs=1 although this call
# passes n_jobs=-1, and it contains no final result line — presumably it
# comes from an earlier, unfinished run; confirm.
grid_pipe = GridSearchCV(pipeline, param_grid=params, cv=3,
                         scoring='accuracy', verbose=1, n_jobs=-1)
grid_pipe.fit(X_train.review, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)
#Fitting 3 folds for each of 9 candidates, totalling 27 fits
#[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.

In [None]:
# Choose the model to evaluate and save.
# Use the grid search's best estimator when the search has completed for
# this very pipeline (same step names), as the LogisticRegression cases do.
# If the search was skipped or interrupted, `grid_pipe` is either unfitted or
# left over from another case, so fall back to the pipeline fitted in the
# timing cell.
tuned = getattr(grid_pipe, 'best_estimator_', None)
if tuned is not None and (
    [name for name, _ in tuned.steps] == [name for name, _ in pipeline.steps]
):
    best_count_sv = tuned
else:
    best_count_sv = pipeline
pred_count_sv = best_count_sv.predict(df_test.review.values)
accuracy_score(df_test.sentiment.values, pred_count_sv)
# Professor's result (recorded with the untuned pipeline): 0.87312

In [None]:
# Persist the CountVectorizer + SVC pipeline to disk.
joblib.dump(best_count_sv, '../static/model/imdb_count_sv.pkl')

## Case 3. TfidfVectorizer + LogisticRegression

In [None]:
# Uni-/bigram TF-IDF features (English stop words dropped) feeding a
# logistic-regression classifier with default settings.
pipeline = Pipeline(steps=[
    ('tfidf_vect', TfidfVectorizer(ngram_range=(1, 2), stop_words='english')),
    ('lr_clf', LogisticRegression()),
])
# Grid for the search; integer max_df values are absolute document counts.
params = {
    'tfidf_vect__max_df': [100, 300, 500],
    'lr_clf__C': [0.1, 1, 10],
}

In [None]:
# Time one fit of the pipeline as constructed (no grid search); %time is an IPython magic.
%time pipeline.fit(X_train.review, y_train)
# Professor's result: 40.4s

In [None]:
# 3-fold cross-validated grid search over `params`, scored by accuracy;
# prints the best parameter combination and its CV score.
# Recorded output of a previous run follows the call.
grid_pipe = GridSearchCV(pipeline, param_grid=params, cv=3,
                         scoring='accuracy', verbose=1, n_jobs=-1)
grid_pipe.fit(X_train.review, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)
#Fitting 3 folds for each of 9 candidates, totalling 27 fits
#[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
#[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:  9.5min finished
#{'lr_clf__C': 10, 'tfidf_vect__max_df': 500} 0.8776533333333333

In [None]:
# Take the best pipeline found by the grid search and measure its accuracy
# on the reloaded hold-out set.
best_tfidf_lr = grid_pipe.best_estimator_
pred_tfidf_lr = best_tfidf_lr.predict(df_test.review.values)
accuracy_score(df_test.sentiment.values, pred_tfidf_lr)
# Professor's result: 0.88144

In [None]:
# Persist the TfidfVectorizer + LogisticRegression pipeline to disk.
joblib.dump(best_tfidf_lr, '../static/model/imdb_tfidf_lr.pkl')

## Case 4. TfidfVectorizer + Support Vector Machine (SVC)

In [None]:
# Uni-/bigram TF-IDF features (English stop words dropped) feeding an SVC
# with default settings.
pipeline = Pipeline(steps=[
    ('tfidf_vect', TfidfVectorizer(ngram_range=(1, 2), stop_words='english')),
    ('sv_clf', SVC()),
])
# Grid for the search; integer max_df values are absolute document counts.
params = {
    'tfidf_vect__max_df': [100, 300, 500],
    'sv_clf__C': [0.1, 1, 10],
}

In [None]:
# Time one fit of the pipeline as constructed (no grid search); %time is an IPython magic.
%time pipeline.fit(X_train.review, y_train)
# Professor's result: 25min 8s

In [None]:
# 3-fold cross-validated grid search over `params`, scored by accuracy;
# prints the best parameter combination and its CV score.
grid_pipe = GridSearchCV(pipeline, param_grid=params, cv=3,
                         scoring='accuracy', verbose=1, n_jobs=-1)
grid_pipe.fit(X_train.review, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)

In [None]:
# Choose the model to evaluate and save.
# Use the grid search's best estimator when the search has completed for
# this very pipeline (same step names), as the LogisticRegression cases do.
# If the search was skipped or interrupted, `grid_pipe` is either unfitted or
# left over from another case, so fall back to the pipeline fitted in the
# timing cell.
tuned = getattr(grid_pipe, 'best_estimator_', None)
if tuned is not None and (
    [name for name, _ in tuned.steps] == [name for name, _ in pipeline.steps]
):
    best_tfidf_sv = tuned
else:
    best_tfidf_sv = pipeline
pred_tfidf_sv = best_tfidf_sv.predict(df_test.review.values)
accuracy_score(df_test.sentiment.values, pred_tfidf_sv)
# Professor's result (recorded with the untuned pipeline): 0.8832

In [None]:
# Persist the TfidfVectorizer + SVC pipeline to disk.
joblib.dump(best_tfidf_sv, '../static/model/imdb_tfidf_sv.pkl')

# TEST

In [None]:
# Row position in df_test of the sample used for the spot check below.
index = 1000

In [None]:
# Build test_data from a row position: wrap the review text (first column)
# in a one-element list so the vectorizer receives a 1-D collection of documents.
test_data = [df_test.iloc[index, 0]]

In [None]:
# Free-text review used as an alternative, hand-entered input for the spot check.
review_string = '''Really enjoyed this series. One reviewer gave a low rating mentioning how the first episode showed her using pills in the orphanage....yes they had to as it shapes the rest of her future. The characters are great and the acting on is superb. Kept me hooked!'''

In [None]:
# Build test_data from a typed-in review: wrap the string in a one-element
# list so the vectorizer receives a 1-D collection of documents.
test_data = [review_string]

In [None]:
# Display the input that will be passed to the models.
test_data

In [None]:
# Ground-truth sentiment for the row picked by `index`.
# Positional lookup (.iloc) matches the positional lookup used to fetch the
# review text (df_test.iloc[index, 0]); label-based `sentiment[index]` only
# agrees with it while df_test carries a default 0..n-1 index.
# NOTE: this label is only meaningful when test_data was built from `index`,
# not from the free-text review_string.
label = df_test.sentiment.iloc[index]
label  # check the correct answer

In [None]:
# Load the four saved pipelines (loading takes surprisingly long).
# NOTE(review): joblib.load unpickles arbitrary objects — only load model
# files from a trusted source.
model_cl = joblib.load('../static/model/imdb_count_lr.pkl')
model_cs = joblib.load('../static/model/imdb_count_sv.pkl')
model_tl = joblib.load('../static/model/imdb_tfidf_lr.pkl')
model_ts = joblib.load('../static/model/imdb_tfidf_sv.pkl')

In [None]:
# Predict with the CountVectorizer + LogisticRegression model only.
pred_cl = model_cl.predict(test_data)

In [None]:
pred_cl[0] # check the predicted value

In [None]:
# Run the same input through all four loaded pipelines.
pred_cl = model_cl.predict(test_data)
pred_cs = model_cs.predict(test_data)
pred_tl = model_tl.predict(test_data)
pred_ts = model_ts.predict(test_data)

In [None]:
# Predicted labels in the order: count+LR, count+SVC, tfidf+LR, tfidf+SVC.
pred_cl[0], pred_cs[0], pred_tl[0], pred_ts[0]