In [2]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [1]:
import pandas as pd
import numpy as np

# Read the IMDB review CSV into a DataFrame and print it as a quick sanity check.
csv_path = './data/IMDB Dataset.csv'
imdb_data = pd.read_csv(csv_path)
print(imdb_data)

                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]


In [6]:
# Convert the 'positive' / 'negative' labels into the integers 1 / 0.
#
# An explicit mapping followed by astype('int64') is used instead of two chained
# .replace() calls: .replace() only yields an integer column through pandas'
# silent object -> int downcasting, which is deprecated in recent pandas releases
# and would otherwise leave an object-dtype label column.
label_to_int = {'positive': 1, 'negative': 0}

# Skip the conversion when the column is already integer-encoded, so that
# re-running this cell does not map the existing 0/1 values to NaN.
if not pd.api.types.is_integer_dtype(imdb_data['sentiment']):
    # Any label outside the mapping becomes NaN and makes astype() raise,
    # surfacing unexpected values instead of silently keeping them.
    imdb_data['sentiment'] = imdb_data['sentiment'].map(label_to_int).astype('int64')
print(imdb_data)

                                                  review  sentiment
0      One of the other reviewers has mentioned that ...          1
1      A wonderful little production. <br /><br />The...          1
2      I thought this was a wonderful way to spend ti...          1
3      Basically there's a family where a little boy ...          0
4      Petter Mattei's "Love in the Time of Money" is...          1
...                                                  ...        ...
49995  I thought this movie did a down right good job...          1
49996  Bad plot, bad dialogue, bad acting, idiotic di...          0
49997  I am a Catholic taught in parochial elementary...          0
49998  I'm going to have to disagree with the previou...          0
49999  No one expects the Star Trek movies to be high...          0

[50000 rows x 2 columns]


In [7]:
# Use regular expressions to strip everything that is not part of a word.
#
# Step 1 removes the HTML line-break tags (<br>, <br/>, <br />, any case).
# They are matched as whole tags: a bare "br" alternative would also delete
# the letters "br" from ordinary words such as "brilliant" or "library".
# Step 2 replaces every remaining non-word character with a space.
#
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as
# a literal string by default, which would leave the reviews unchanged.
imdb_data['review'] = (
    imdb_data['review']
    .str.replace(r'(?i)<br\s*/?>', ' ', regex=True)
    .str.replace(r'\W', ' ', regex=True)
)
print(imdb_data)

                                                  review  sentiment
0      One of the other reviewers has mentioned that ...          1
1      A wonderful little production            The f...          1
2      I thought this was a wonderful way to spend ti...          1
3      Basically there s a family where a little boy ...          0
4      Petter Mattei s  Love in the Time of Money  is...          1
...                                                  ...        ...
49995  I thought this movie did a down right good job...          1
49996  Bad plot  bad dialogue  bad acting  idiotic di...          0
49997  I am a Catholic taught in parochial elementary...          0
49998  I m going to have to disagree with the previou...          0
49999  No one expects the Star Trek movies to be high...          0

[50000 rows x 2 columns]


In [11]:
# Optional step, depending on the data:
# after preprocessing, rows whose review consists only of whitespace should be removed.
#
# The cleaning step replaces characters with spaces, so a blank review is a run
# of spaces rather than ''. The anchored pattern ^\s*$ matches both the empty
# string and whitespace-only strings; a plain replace('') would miss the latter.
imdb_data['review'] = imdb_data['review'].replace(r'^\s*$', np.nan, regex=True)
# Kept from the original cell: turns an empty-string label into NaN.
# It has no effect once the labels are integers.
imdb_data['sentiment'] = imdb_data['sentiment'].replace('', np.nan)
print(imdb_data)

# Drop every row that has a missing review or a missing label.
imdb_data = imdb_data.dropna(how='any')
print(imdb_data)

                                                  review  sentiment
0      One of the other reviewers has mentioned that ...          1
1      A wonderful little production            The f...          1
2      I thought this was a wonderful way to spend ti...          1
3      Basically there s a family where a little boy ...          0
4      Petter Mattei s  Love in the Time of Money  is...          1
...                                                  ...        ...
49995  I thought this movie did a down right good job...          1
49996  Bad plot  bad dialogue  bad acting  idiotic di...          0
49997  I am a Catholic taught in parochial elementary...          0
49998  I m going to have to disagree with the previou...          0
49999  No one expects the Star Trek movies to be high...          0

[50000 rows x 2 columns]
                                                  review  sentiment
0      One of the other reviewers has mentioned that ...          1
1      A wonderful lit

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as the test split; shuffling uses a fixed seed (3).
review_train, review_test, y_train, y_test = train_test_split(
    imdb_data['review'],
    imdb_data['sentiment'],
    test_size=0.25,
    shuffle=True,
    random_state=3,
)
# Display the shape of each of the four pieces.
tuple(part.shape for part in (review_train, review_test, y_train, y_test))

((37500,), (12500,), (37500,), (12500,))

In [17]:
# Example stop-word list (words to be discarded).
stopwords = ['a', 'an', 'the', 'very']


def remove_stopwords(reviews, stopwords):
    """Tokenise reviews on whitespace and drop stop words.

    Args:
        reviews: iterable of review strings.
        stopwords: iterable of lower-case words to discard.

    Returns:
        A list with one list of tokens per review. Tokens keep their original
        casing; only the stop-word comparison is case-insensitive.
    """
    # A set gives constant-time membership checks inside the nested loop.
    blocked = set(stopwords)
    # Compare in lower case so that capitalised forms ("The", "A") are removed
    # too; a case-sensitive check lets them through.
    return [
        [word for word in review.split() if word.lower() not in blocked]
        for review in reviews
    ]


# Tokenise both splits with the same helper so they are processed identically.
X_train = remove_stopwords(review_train, stopwords)
X_test = remove_stopwords(review_test, stopwords)

In [21]:
# Display the token list built for the first training review.
X_train[0]

['This',
 'was',
 'really',
 'nightmare',
 'of',
 'film',
 'i',
 'saw',
 'it',
 'about',
 'nine',
 'years',
 'ago',
 'on',
 'cable',
 'TV',
 'and',
 'haven',
 't',
 'forgotten',
 'it',
 'since',
 'Pixote',
 'is',
 '10',
 'year',
 'old',
 'boy',
 'who',
 'lives',
 'in',
 'the',
 'streets',
 'of',
 'Sao',
 'Paulo',
 'Brazil',
 'and',
 'leads',
 'criminal',
 'life',
 'in',
 'the',
 'company',
 'of',
 'his',
 'teenage',
 'friends',
 'Lilica',
 'Dito',
 'and',
 'Chico',
 'they',
 'steal',
 'pimp',
 'sell',
 'drugs',
 'and',
 'murder',
 'in',
 'order',
 'to',
 'survive',
 'each',
 'day',
 'In',
 'the',
 'first',
 'half',
 'of',
 'the',
 'film',
 'Pixote',
 'is',
 'caught',
 'by',
 'the',
 'police',
 'and',
 'sent',
 'to',
 'sadistic',
 'foster',
 'home',
 'where',
 'he',
 'witnesses',
 'every',
 'kind',
 'of',
 'abuse',
 'from',
 'the',
 'older',
 'inmates',
 'and',
 'guards',
 'to',
 'the',
 'rest',
 'of',
 'the',
 'kids',
 'one',
 'night',
 'Lilica',
 's',
 'boyfriend',
 'is',
 'killed',
 

In [24]:
from tensorflow.keras.preprocessing.text import Tokenizer

# fit_on_texts is called once, on the training split only.
# Words that do not occur in the training split are dropped when encoding.
# If losing them is not acceptable, remove stop words from the whole dataset
# first and fit on all of it.
# The number passed to Tokenizer limits how many words receive an index.
# The idea is not to pick words arbitrarily, but to exclude rare, unneeded
# words and keep only the 5000 words that matter.
# Analyse the data to find a meaningful vocabulary size before choosing this
# value; picking it blindly throws away meaningful data.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# Identical words always map to the same index, in both splits.
X_train, X_test = (
    tokenizer.texts_to_sequences(token_lists)
    for token_lists in (X_train, X_test)
)

In [26]:
# Print the integer-encoded second training review, followed by the training labels.
print(X_train[1])
print(y_train)

[8, 1589, 9, 14, 98, 2377, 1323, 582, 636, 137, 4, 1, 924, 3, 691, 396, 21, 10, 1389, 3, 1, 115, 115, 69, 49, 8, 646, 183, 1, 262, 2, 6, 1191, 1, 327, 5660, 3, 587, 69, 8, 65, 4, 791, 6, 39, 4, 62, 1, 6100, 3, 401, 4550, 30, 40, 480, 4, 2336, 40, 70, 480, 81, 14069, 5182, 8, 288, 20, 1000, 29541, 38, 45366, 16, 517, 2, 12573, 226, 17, 8, 12, 1000, 30, 218, 14070, 317, 3, 45, 473, 4, 28, 1, 281, 2609, 3, 9, 14, 69, 1059, 37, 288, 20, 233, 36, 10, 1, 327, 5660, 3, 325, 174, 540, 203, 35, 1, 456, 2, 91, 180, 72, 4441, 35, 37, 163, 61, 540, 7, 9, 634, 6, 12, 180, 2810, 3439, 2, 15454, 22, 73, 146, 17, 37, 5, 3787, 7, 9, 27, 19, 222, 521, 86, 4, 26, 3927, 21, 1, 806, 3, 9, 634, 17, 23, 150, 20, 59, 58, 3, 75, 334, 4, 26, 761, 38, 291, 16, 9, 3885, 37, 12, 54, 144, 291, 4, 26, 3787, 37, 143, 3, 75, 39, 2967, 4023, 46, 3, 1, 340, 586, 133, 6, 150, 20, 61, 1251, 1, 462, 3, 132, 14071, 344, 6, 277, 172, 382, 3, 46, 937, 133, 59, 439, 221, 17, 1, 602, 210, 20, 13, 78, 13, 6, 891, 16, 1971, 1511,

In [27]:
# Bring every encoded review to the same length.
# The length should not be chosen arbitrarily: inspect the dataset to see how
# long the longest review is and set the value accordingly.
# Usually either the average length or the maximum length is used.
max_len = 500
X_train, X_test = (
    pad_sequences(encoded, maxlen=max_len)
    for encoded in (X_train, X_test)
)

In [28]:
# Size the embedding table from the fitted tokenizer instead of hard-coding it.
# With num_words set, the tokenizer only emits indices below num_words; without
# it, indices run from 1 to len(word_index), so one extra row is needed for the
# padding index 0. A hard-coded size can disagree with the tokenizer and either
# waste parameters or be too small for the indices actually produced.
vocab_size = tokenizer.num_words or len(tokenizer.word_index) + 1

# Embedding -> LSTM -> single sigmoid unit for the binary sentiment label.
model = Sequential()
model.add(Embedding(vocab_size, 120))
model.add(LSTM(120))
model.add(Dense(1, activation='sigmoid'))

In [29]:
# Stop training once the validation loss starts rising; wait 5 epochs before stopping.
early_stop = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=5,
    verbose=1,
)
# Track the validation accuracy and write the model to disk only when it is the best so far.
model_check = ModelCheckpoint(
    'the_best.h5',
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)

In [None]:
# Binary cross-entropy with Adam; the metric is named 'acc' so that the
# checkpoint callback can monitor 'val_acc'.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# NOTE(review): the test split doubles as the validation set here, so early
# stopping and checkpoint selection are tuned on test data - confirm this is intended.
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_test, y_test),
    callbacks=[early_stop, model_check],
)