In [1]:
from ko_text import *

# data는 e9t(Lucy Park)님께서 github에 공유해주신 네이버 영화평점 데이터를 사용하였습니다.
# https://github.com/e9t/nsmc

# Data load

In [2]:
# Read the tab-separated NSMC training split, then discard rows with missing values.
train_df = pd.read_csv('nsmc/ratings_train.txt', sep='\t')
train_df = train_df.dropna()

In [3]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [4]:
# (rows, columns) of the training split after dropna().
train_df.shape

(149995, 3)

In [5]:
# Non-null row count per label value, to check how balanced the classes are.
train_df.groupby('label').count()

Unnamed: 0_level_0,id,document
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,75170,75170
1,74825,74825


In [6]:
# Read the tab-separated NSMC test split, then discard rows with missing values.
test_df = pd.read_csv('nsmc/ratings_test.txt', sep='\t')
test_df = test_df.dropna()

In [7]:
# Preview the first rows of the test split.
test_df.head()

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,GDNTOPCLASSINTHECLUB,0
2,8544678,뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0
3,6825595,지루하지는 않은데 완전 막장임... 돈주고 보기에는....,0
4,6723715,3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??,0


In [8]:
# (rows, columns) of the test split after dropna().
test_df.shape

(49997, 3)

In [9]:
# Non-null row count per label value in the test split.
test_df.groupby('label').count()

Unnamed: 0_level_0,id,document
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,24826,24826
1,25171,25171


# Preprocess

In [10]:
# Helper object used below for tokenization and for the Doc2Vec model.
# NOTE(review): NLP is presumably provided by the `from ko_text import *` star import — confirm.
nlp = NLP()

In [11]:
# Tokenize the raw review text of both splits with the ko_text helper.
train_token_ls = nlp.extract_tokens_for_all_document_FAST_VERSION(train_df['document'])
test_token_ls = nlp.extract_tokens_for_all_document_FAST_VERSION(test_df['document'])

# Labels as plain Python lists, one entry per row of the corresponding frame.
train_label_ls = train_df['label'].tolist()
test_label_ls = test_df['label'].tolist()

In [12]:
# Inspect the tokens extracted for the training document at index 1.
train_token_ls[1]

['포스터', '보고', '초딩', '영화', '오버', '연기', '조차', '가볍지', '않구나']

# **TF-IDF**

In [13]:
# TF-IDF settings: drop terms that occur in fewer than 3 documents or in more than
# half of all documents, and cap the vocabulary at 50,000 terms.
# NOTE(review): the vectorizer's default token_pattern keeps only tokens of two or
# more word characters, so single-character tokens are discarded even though the
# text is already tokenized — confirm this is intended for Korean input.
tfidf = TfidfVectorizer(min_df=3, max_df=0.5, max_features=50000)

In [14]:
# Re-join each token list into a space-separated string, then learn the vocabulary
# and IDF weights on the training split and vectorize it in one step.
X_train = tfidf.fit_transform(list(map(' '.join, train_token_ls)))

In [23]:
# (n_documents, n_terms) of the TF-IDF training matrix.
X_train.shape

(149995, 30532)

In [24]:
# Vectorize the test split with the vocabulary/IDF weights fitted on the training split.
X_test = tfidf.transform(list(map(' '.join, test_token_ls)))

In [27]:
# (n_documents, n_terms) of the TF-IDF test matrix; the term count matches X_train.
X_test.shape

(49997, 30532)

## **Logistic Regression**

In [18]:
# Baseline classifier: logistic regression (default settings) on the TF-IDF features.
clf = LogisticRegression()
clf.fit(X_train, train_label_ls)

y_pred = clf.predict(X_test)

# accuracy_score expects (y_true, y_pred). Accuracy itself is symmetric, but keeping
# the documented order avoids silently wrong numbers if another metric is swapped in.
print('Accuracy : ', accuracy_score(test_label_ls, y_pred))

Accuracy :  0.8353101186071165


## **SVM**

In [19]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

In [20]:
# Linear SVM (default settings) on the TF-IDF features.
clf = LinearSVC()
clf.fit(X_train, train_label_ls)

y_pred = clf.predict(X_test)

# accuracy_score expects (y_true, y_pred). Accuracy itself is symmetric, but keeping
# the documented order avoids silently wrong numbers if another metric is swapped in.
print('Accuracy : ', accuracy_score(test_label_ls, y_pred))

Accuracy :  0.8294697681860912


# **Doc2Vec**

In [28]:
# Configure the Doc2Vec model held by the ko_text helper: 30-dimensional vectors,
# context window of 3, ignoring tokens seen fewer than 3 times.
nlp.make_Doc2Vec_model(vector_size=30, window=3, min_count=3)

# Train on the training tokens only.
# NOTE(review): the labels are passed along as well — presumably used as document
# tags inside ko_text; confirm against the helper's implementation.
nlp.build_and_train_Doc2Vec_model(train_token_ls, train_label_ls)



In [29]:
# Infer a Doc2Vec vector for every document in both splits.
# X_train / X_test are rebound here and no longer refer to the TF-IDF matrices.
X_train, X_test = (
    nlp.infer_vectors_with_Doc2Vec(token_ls)
    for token_ls in (train_token_ls, test_token_ls)
)

## **Logistic Regression**

In [30]:
# Logistic regression (default settings) on the Doc2Vec document vectors.
clf = LogisticRegression()
clf.fit(X_train, train_label_ls)

y_pred = clf.predict(X_test)

# accuracy_score expects (y_true, y_pred). Accuracy itself is symmetric, but keeping
# the documented order avoids silently wrong numbers if another metric is swapped in.
print('Accuracy : ', accuracy_score(test_label_ls, y_pred))

Accuracy :  0.7483849030941857


## **SVM**

In [31]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

In [32]:
# Linear SVM (default settings) on the Doc2Vec document vectors.
clf = LinearSVC()
clf.fit(X_train, train_label_ls)

y_pred = clf.predict(X_test)

# accuracy_score expects (y_true, y_pred). Accuracy itself is symmetric, but keeping
# the documented order avoids silently wrong numbers if another metric is swapped in.
print('Accuracy : ', accuracy_score(test_label_ls, y_pred))

Accuracy :  0.7486049162949777


# **MLP**

In [33]:
from sklearn.metrics import confusion_matrix
import tensorflow as tf

  from ._conv import register_converters as _register_converters


## **Data Preprocessing**

In [41]:
# Dense float32 feature matrices for TensorFlow, built from the Doc2Vec vectors
# currently bound to X_train / X_test.
X_train_nn = np.array(X_train, dtype=np.float32)
X_test_nn = np.array(X_test, dtype=np.float32)

# One-hot encode the labels: one float32 column per distinct label value.
y_train_nn = pd.get_dummies(train_label_ls).values.astype(np.float32)
y_test_nn = pd.get_dummies(test_label_ls).values.astype(np.float32)

In [42]:
# Sanity check: feature/label shapes for the train and test splits, in that order.
print(*(arr.shape for arr in (X_train_nn, y_train_nn, X_test_nn, y_test_nn)))

(149995, 30) (149995, 2) (49997, 30) (49997, 2)


## **Build Layers**

In [43]:
# Clear the default graph before (re)building the network.
tf.reset_default_graph()

n_input = nlp.Doc2Vec_model.vector_size  # width of each Doc2Vec feature vector
n_class = y_train_nn.shape[1]            # number of one-hot label columns

# Mini-batches: each batch holds a tenth of the training rows (integer division),
# so leftover rows form one extra, smaller batch at the end of every pass.
batch_size = X_train_nn.shape[0] // 10
dataset = tf.data.Dataset.from_tensor_slices((X_train_nn, y_train_nn))
dataset = dataset.batch(batch_size)

# Placeholders: features, one-hot labels, and the dropout keep probability
# (fed as 0.7 while training and 1 while evaluating).
X = tf.placeholder(tf.float32, shape = [None, n_input])
Y = tf.placeholder(tf.float32, shape = [None, n_class])
keep_prob = tf.placeholder(tf.float32)

# Network: four hidden ReLU layers of n_neuron units with dropout, then a linear
# output layer. Weights use Xavier initialization, biases a standard normal.
n_neuron = 100

W1 = tf.get_variable('W1', shape = [n_input, n_neuron], initializer = tf.contrib.layers.xavier_initializer())
W2 = tf.get_variable('W2', shape = [n_neuron, n_neuron], initializer = tf.contrib.layers.xavier_initializer())
W3 = tf.get_variable('W3', shape = [n_neuron, n_neuron], initializer = tf.contrib.layers.xavier_initializer())
W4 = tf.get_variable('W4', shape = [n_neuron, n_neuron], initializer = tf.contrib.layers.xavier_initializer())
W5 = tf.get_variable('W5', shape = [n_neuron, n_class], initializer = tf.contrib.layers.xavier_initializer())

b1 = tf.Variable(tf.random_normal([n_neuron]))
b2 = tf.Variable(tf.random_normal([n_neuron]))
b3 = tf.Variable(tf.random_normal([n_neuron]))
b4 = tf.Variable(tf.random_normal([n_neuron]))
b5 = tf.Variable(tf.random_normal([n_class]))

L1 = tf.nn.relu(tf.matmul(X,W1) + b1)
L1 = tf.nn.dropout(L1, keep_prob = keep_prob)

L2 = tf.nn.relu(tf.matmul(L1,W2) + b2)
L2 = tf.nn.dropout(L2, keep_prob = keep_prob)

L3 = tf.nn.relu(tf.matmul(L2,W3) + b3)
L3 = tf.nn.dropout(L3, keep_prob = keep_prob)

L4 = tf.nn.relu(tf.matmul(L3,W4) + b4)
L4 = tf.nn.dropout(L4, keep_prob = keep_prob)

# Output layer: compute the logits once and derive the class probabilities from
# them, instead of building the same matmul + bias a second time.
logit = tf.matmul(L4,W5) + b5
hypothesis = tf.nn.softmax(logit)


# Cost: mean softmax cross-entropy plus an L2 penalty on the output weights W5 only.
lamb = 0.0001
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits = logit, labels = Y)) + lamb * tf.reduce_sum(tf.square(W5))

# Optimizer
learning_rate = 0.0001
train = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost)

# Predicted class index and accuracy against the one-hot labels.
prediction = tf.argmax(hypothesis,1)
accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(Y,1), prediction), dtype= tf.float32))



# Containers in which the training loop stores its results.
train_cost_list = []
train_acc_list = []

test_cost_list = []
test_acc_list = []


# **Run**

### **mini-batch**

In [None]:
# Evaluation feed: the whole test split with dropout disabled (keep_prob = 1).
test_dict = {X: X_test_nn, Y: y_test_nn, keep_prob : 1}

training_epochs = 1500

# Initializable iterator so the batched dataset can be rewound every epoch.
iterator = dataset.make_initializable_iterator()
f, l = iterator.get_next()

# launch graph
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for epoch in range(training_epochs):
        # Rewind the dataset iterator at the start of each epoch.
        sess.run(iterator.initializer)
        total_cost = 0.0
        n_rows = 0

        while True:
            # Fetch the next mini-batch; OutOfRangeError marks the end of the epoch.
            try:
                batch_x, batch_y = sess.run([f, l])
            except tf.errors.OutOfRangeError:
                break

            feed_dict = {X : batch_x, Y: batch_y, keep_prob : 0.7}
            c, _ = sess.run([cost, train], feed_dict = feed_dict)

            # Weight each batch cost by its row count: the last batch is smaller,
            # and a plain sum over batches is not comparable with the mean test cost.
            total_cost += c * batch_x.shape[0]
            n_rows += batch_x.shape[0]

        # Per-row mean training cost for this epoch (guard against an empty dataset).
        avg_cost = total_cost / max(n_rows, 1)

        # Evaluate on the test split; predictions are only needed after training.
        acc, test_cost = sess.run([accuracy, cost], feed_dict = test_dict)

        train_cost_list.append(avg_cost)
        test_cost_list.append(test_cost)

        # Report progress (and record test accuracy) every 100 epochs.
        if (epoch+1) % (100) == 0 :

            test_acc_list.append(acc)

            print('Epoch : %s'%(epoch+1), 'cost :',test_cost)
            print('Accuracy :', acc)

    # Final evaluation; y_pred is rebound to the network's predicted class indices.
    acc, y_pred, test_cost = sess.run([accuracy, prediction, cost], feed_dict = test_dict)
    

In [None]:
# Train vs. test cost, one point per epoch, with labelled axes so the figure
# can be read on its own.
fig, ax = plt.subplots(figsize = (12,6))
ax.plot(train_cost_list, label = 'train_cost')
ax.plot(test_cost_list, label = 'test_cost')
ax.set_xlabel('epoch')
ax.set_ylabel('cost')
ax.set_title('Train vs. test cost per epoch')
# Trailing semicolon keeps the Legend repr out of the cell output.
ax.legend(loc = 'best');