# Twitter 감정 분석 model

## module

In [None]:
# %pip install nltk  # uncomment and run once if NLTK is not installed in this kernel

In [None]:
import pandas as pd
import numpy as np
import os

# tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# pickle 
import pickle

# # Natural Language toolkit
import nltk
from nltk.corpus import stopwords

In [None]:
# Download the NLTK 'stopwords' corpus; it is read later via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 감정 분석 model

In [None]:
# Path to the training CSV. The default is the original Google Drive location,
# which only exists on Colab with Drive mounted; set the TWEET_TRAIN_CSV
# environment variable to run the notebook against a copy stored elsewhere.
filepath = os.environ.get(
    'TWEET_TRAIN_CSV',
    '/content/drive/MyDrive/AI_bootcamp/tweet-sentiment-extraction/train.csv',
)

# Load the raw data set into a DataFrame.
df = pd.read_csv(filepath)

In [None]:
# Remove rows whose sentiment is 'neutral' and renumber the index from 0.
is_neutral = df['sentiment'] == 'neutral'
con = df.loc[is_neutral].index
df = df.drop(index=con).reset_index(drop=True)

In [None]:
# Encode the sentiment as a binary label: negative = 0, positive = 1.
# Series.map assigns each label on its own row. The previous list-building loop
# silently skipped any value that was neither 'negative' nor 'positive', which
# would shift every following label onto the wrong row.
label_map = {'negative': 0, 'positive': 1}
df['label'] = df['sentiment'].map(label_map)

# Fail loudly instead of training on missing labels if an unexpected sentiment
# value is present in the data.
unknown = df.loc[df['label'].isna(), 'sentiment'].unique()
if len(unknown) > 0:
    raise ValueError(f'Unexpected sentiment values: {list(unknown)}')

df.head()

Unnamed: 0,textID,text,selected_text,sentiment,label
0,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,0
1,088c60f138,my boss is bullying me...,bullying me,negative,0
2,9642c003ef,what interview! leave me alone,leave me alone,negative,0
3,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,0
4,6e0c6d75b1,2am feedings for the baby are fun when he is a...,fun,positive,1


In [None]:
# English stop words from NLTK, kept in a set for membership tests.
stop_words = set(stopwords.words('english'))

# Split every tweet into tokens with text_to_word_sequence (the cell value is
# passed through str() first) and keep only the tokens that are not stop words.
words = [
    [token for token in text_to_word_sequence(str(tweet)) if token not in stop_words]
    for tweet in df['text']
]

df['tokens'] = words
df.head()

Unnamed: 0,textID,text,selected_text,sentiment,label,tokens
0,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,0,"[sooo, sad, miss, san, diego]"
1,088c60f138,my boss is bullying me...,bullying me,negative,0,"[boss, bullying]"
2,9642c003ef,what interview! leave me alone,leave me alone,negative,0,"[interview, leave, alone]"
3,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,0,"[sons, put, releases, already, bought]"
4,6e0c6d75b1,2am feedings for the baby are fun when he is a...,fun,positive,1,"[2am, feedings, baby, fun, smiles, coos]"


In [None]:
# Keep only the columns needed for training.
keep_columns = ['text', 'label', 'tokens']
train_df = df[keep_columns]
train_df.head()

Unnamed: 0,text,label,tokens
0,Sooo SAD I will miss you here in San Diego!!!,0,"[sooo, sad, miss, san, diego]"
1,my boss is bullying me...,0,"[boss, bullying]"
2,what interview! leave me alone,0,"[interview, leave, alone]"
3,"Sons of ****, why couldn`t they put them on t...",0,"[sons, put, releases, already, bought]"
4,2am feedings for the baby are fun when he is a...,1,"[2am, feedings, baby, fun, smiles, coos]"


In [None]:
# Keep only tweets with more than two tokens (i.e. at least three) left after
# stop-word removal. The explicit .copy() makes train_df an independent frame,
# so later column assignments on it do not trigger SettingWithCopyWarning.
train_df = train_df[train_df['tokens'].str.len() > 2].copy()

In [None]:
# Tokenizer: build the vocabulary and turn each token list into integer ids.
# NOTE(review): the file name says "naver_review" although this notebook works on
# tweets; it is left unchanged because other code may load the pickle by this name.
tokenizer_name = 'keras_naver_review_tokenizer.pickle'
save_path = os.path.join(os.getcwd(), tokenizer_name)

max_words = 35000
# oov_token must be a string: it is stored as a vocabulary entry, and a boolean
# there breaks string-based helpers such as sequences_to_texts.
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
# NOTE(review): the vocabulary is fitted on all rows, before the train/test split.
tokenizer.fit_on_texts(train_df['tokens'])
train_df['tokens'] = tokenizer.texts_to_sequences(train_df['tokens'])

# Persist the fitted tokenizer with pickle so it can be reloaded later.
with open(save_path, 'wb') as f:
    pickle.dump(tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)

train_df.head()

Unnamed: 0,text,label,tokens
0,Sooo SAD I will miss you here in San Diego!!!,0,"[245, 41, 30, 1309, 2273]"
2,what interview! leave me alone,0,"[871, 270, 333]"
3,"Sons of ****, why couldn`t they put them on t...",0,"[4834, 249, 4835, 108, 455]"
4,2am feedings for the baby are fun when he is a...,1,"[2587, 7220, 192, 34, 1815, 7221]"
5,Journey!? Wow... u just became cooler. hehe....,1,"[1816, 123, 16, 2274, 2588, 480, 1113]"


In [None]:
# Build the train / test split: 20% test, shuffled, stratified on the label,
# with a fixed random_state.
target = train_df['label']

x_train, x_test, y_train, y_test = train_test_split(
    train_df['tokens'],
    target,
    test_size=0.2,
    shuffle=True,
    stratify=target,
    random_state=77,
)
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((12090,), (3023,), (12090,), (3023,))

In [None]:
# Pad every sequence to a common length of max_len.
max_len = 40
x_train, x_test = (
    pad_sequences(sequences, maxlen=max_len) for sequences in (x_train, x_test)
)

print('X_train shape: ', x_train.shape)
print('X_test shape: ', x_test.shape)

X_train shape:  (12090, 40)
X_test shape:  (3023, 40)


In [None]:
# Label encoding: convert the labels to integer ids and reshape them into
# column vectors of shape (number of samples, 1).
encoder = LabelEncoder()
input_dim = 1

# Train split: the encoder is fitted here.
batch_size = y_train.shape[0]
y_train = encoder.fit_transform(y_train).reshape(batch_size, input_dim)

# Test split: reuse the fitted encoder (transform only).
batch_size = y_test.shape[0]
y_test = encoder.transform(y_test).reshape(batch_size, input_dim)

print(y_train.shape, y_test.shape, sep='\n')

(12090, 1)
(3023, 1)


In [None]:
# Build the model: Embedding -> LSTM -> single sigmoid unit, compiled with
# binary cross-entropy loss, the Adam optimizer and accuracy as the metric.
model = Sequential([
    Embedding(max_words, 128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the model for 5 epochs with a batch size of 32; keep the object returned by fit.
hist = model.fit(
    x=x_train,
    y=y_train,
    epochs=5,
    batch_size=32,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Evaluate on the test split; the two returned values are unpacked as loss and accuracy.
loss, acc = model.evaluate(
    x=x_test,
    y=y_test,
    batch_size=32,
)

print('Test loss:', loss)
print('Test accuracy:', acc)

Test loss: 0.541661262512207
Test accuracy: 0.8316242098808289


In [None]:
# Save the trained model into the current working directory.
model_name = 'keras_tweet_timeline_trained_model.h5'
save_dir = os.getcwd()
model_path = os.path.join(save_dir, model_name)

model.save(model_path)
print('Saved trained model at %s ' % model_path)

Saved trained model at /content/keras_tweet_timeline_trained_model.h5 
