In [1]:
import tensorflow as tf
from konlpy.tag import Twitter
import numpy as np
import gensim

In [2]:
def read_data(filename):    
    with open(filename, 'r',encoding='utf-8') as f:
        data = [line.split('\t') for line in f.read().splitlines()]        
        data = data[1:]   # header 제외 #    
    return data 
    
train_data = read_data('data/ratings_train3.txt')
test_data = read_data('data/ratings_test.txt')

In [3]:
twitter = Twitter() 

def tokenize(doc):

    return ['/'.join(t) for t in twitter.pos(doc, norm=True, stem=True)]

tokens = [tokenize(row[1]) for row in train_data]

In [4]:
model = gensim.models.Word2Vec(size=300, sg = 1, alpha=0.025, min_alpha=0.025, seed=410)
model.build_vocab(tokens)
model.train(tokens, model.corpus_count, epochs=model.epochs)

(933610, 1291235)

In [5]:
# '외계인/Noun' in model.wv.vocab

In [6]:
model.save('word2vec.model')

In [7]:
model = gensim.models.word2vec.Word2Vec.load('word2vec.model')

In [8]:
len(model.wv.vocab)

def convert2Vec(model, doc):  ## Convert corpus into vectors
    word_vec = []
    word_num = 0
    unk_num = 0
#     print('doc : ', doc)
    for sent in doc:
#         print('sent : ', sent)
        sub = []
        for word in sent:
            word_num += 1
            if(word in model.wv.vocab):
#                 print('word in sent')
                sub.append(model.wv[word])
            else:
                unk_num += 1
#                 print('word not in sent')
                sub.append(np.zeros(300)) ## used for OOV words
        word_vec.append(sub)
    print('word_num :', word_num, 'unk_num :', unk_num)
    return word_vec


In [9]:
tokens = [[tokenize(row[1]),int(row[2])] for row in train_data if tokenize(row[1]) != []]
tokens = np.array(tokens)
train_X = tokens[:,0]
train_Y = tokens[:,[-1]]
train_X = convert2Vec(model, train_X)
# train_Y = np.array([[y] for y in train_Y])
seq_length = [len(x) for x in train_X]
maxseq_length = max(seq_length)
maxseq_length


word_num : 258247 unk_num : 20510


76

In [10]:
def zero_padding(maxseq_length, seq):
#     print(len(seq))
    zero_pad = np.zeros((maxseq_length) * 300)
    seq_flat = np.reshape(seq, [-1])
#     print(len(seq_flat))
    zero_pad[0:len(seq) * 300] = seq_flat
#     print(len(zero_pad))
    return zero_pad
train_X = [zero_padding(maxseq_length, seq) for seq in train_X]
len(train_X[0])

22800

In [11]:
X = tf.placeholder(tf.float32, shape=[None, maxseq_length * 300])
Y = tf.placeholder(tf.float32, shape=[None, 1])

In [12]:
W = tf.Variable(tf.random_normal([maxseq_length * 300, 1]), name='weight1')
b = tf.Variable(tf.random_normal([1]), name='bias')

In [13]:
hypothesis = tf.sigmoid(tf.matmul(X, W) + b)

In [14]:
# cost = -tf.reduce_mean(Y * tf.log(hypothesis) + (1 - Y) *
#                        tf.log(1 - hypothesis))
cost = -tf.reduce_mean(Y * tf.log(hypothesis + 1e-7) + (1 - Y) * tf.log(1 - hypothesis + 1e-7))

In [15]:
train = tf.train.AdamOptimizer(learning_rate=0.001).minimize(cost)

In [16]:
predicted = tf.cast(hypothesis > 0.5, dtype=tf.float32)
accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, Y), dtype=tf.float32))

In [17]:
training_epochs = 2
Batch_size = 32
total_batch = int(len(train_X) / Batch_size)
save_path = './saved/model'
saver = tf.train.Saver()

with tf.Session() as sess:
    # Initialize TensorFlow variables
    sess.run(tf.global_variables_initializer())

    for epoch in range(training_epochs):

        avg_loss = 0
        for step in range(total_batch):

            train_batch_X = train_X[step*Batch_size : step*Batch_size+Batch_size]
            train_batch_Y = train_Y[step*Batch_size : step*Batch_size+Batch_size]
            
            cost_val, _ = sess.run([cost, train], feed_dict={X: train_batch_X, Y: train_batch_Y})
            avg_loss += cost_val
            acc = sess.run(accuracy , feed_dict={X: train_batch_X, Y: train_batch_Y})
            print('Batch : ', step + 1, '/', total_batch,
                  ', BCE in this minibatch: ', cost_val, 'accuracy: ', float(acc))

            # print("step:", '%04d' %(step+1), "loss = {:.6f} accuracy= {:.6f}".format(loss_, acc))
        acc = sess.run(accuracy , feed_dict={X: train_batch_X, Y: train_batch_Y})
        print('epoch:', epoch, ' train_loss:', float(avg_loss/total_batch))
        saver.save(sess, save_path, epoch)


Batch :  1 / 547 , BCE in this minibatch:  3.8150682 accuracy:  0.5
Batch :  2 / 547 , BCE in this minibatch:  1.916998 accuracy:  0.625
Batch :  3 / 547 , BCE in this minibatch:  3.0094695 accuracy:  0.53125
Batch :  4 / 547 , BCE in this minibatch:  2.846408 accuracy:  0.5625
Batch :  5 / 547 , BCE in this minibatch:  2.1061368 accuracy:  0.625
Batch :  6 / 547 , BCE in this minibatch:  3.1970906 accuracy:  0.59375
Batch :  7 / 547 , BCE in this minibatch:  3.4192293 accuracy:  0.53125
Batch :  8 / 547 , BCE in this minibatch:  2.1101043 accuracy:  0.59375
Batch :  9 / 547 , BCE in this minibatch:  2.6893737 accuracy:  0.5
Batch :  10 / 547 , BCE in this minibatch:  5.491139 accuracy:  0.375
Batch :  11 / 547 , BCE in this minibatch:  3.3090222 accuracy:  0.59375
Batch :  12 / 547 , BCE in this minibatch:  2.8181481 accuracy:  0.53125
Batch :  13 / 547 , BCE in this minibatch:  3.096433 accuracy:  0.375
Batch :  14 / 547 , BCE in this minibatch:  5.285328 accuracy:  0.3125
Batch :  1

Batch :  124 / 547 , BCE in this minibatch:  1.6427948 accuracy:  0.53125
Batch :  125 / 547 , BCE in this minibatch:  2.7571898 accuracy:  0.375
Batch :  126 / 547 , BCE in this minibatch:  2.1638477 accuracy:  0.5625
Batch :  127 / 547 , BCE in this minibatch:  2.6534967 accuracy:  0.59375
Batch :  128 / 547 , BCE in this minibatch:  1.5504336 accuracy:  0.6875
Batch :  129 / 547 , BCE in this minibatch:  2.0994406 accuracy:  0.46875
Batch :  130 / 547 , BCE in this minibatch:  1.6022122 accuracy:  0.59375
Batch :  131 / 547 , BCE in this minibatch:  2.0599794 accuracy:  0.65625
Batch :  132 / 547 , BCE in this minibatch:  1.8044555 accuracy:  0.6875
Batch :  133 / 547 , BCE in this minibatch:  2.678706 accuracy:  0.53125
Batch :  134 / 547 , BCE in this minibatch:  1.653291 accuracy:  0.53125
Batch :  135 / 547 , BCE in this minibatch:  2.0278342 accuracy:  0.5625
Batch :  136 / 547 , BCE in this minibatch:  2.1203585 accuracy:  0.625
Batch :  137 / 547 , BCE in this minibatch:  2.2

Batch :  255 / 547 , BCE in this minibatch:  1.21929 accuracy:  0.65625
Batch :  256 / 547 , BCE in this minibatch:  2.6528137 accuracy:  0.59375
Batch :  257 / 547 , BCE in this minibatch:  1.3504732 accuracy:  0.6875
Batch :  258 / 547 , BCE in this minibatch:  2.6135185 accuracy:  0.53125
Batch :  259 / 547 , BCE in this minibatch:  1.2740926 accuracy:  0.6875
Batch :  260 / 547 , BCE in this minibatch:  2.7351022 accuracy:  0.5
Batch :  261 / 547 , BCE in this minibatch:  1.5932068 accuracy:  0.65625
Batch :  262 / 547 , BCE in this minibatch:  1.5020453 accuracy:  0.6875
Batch :  263 / 547 , BCE in this minibatch:  1.8987094 accuracy:  0.65625
Batch :  264 / 547 , BCE in this minibatch:  1.2794857 accuracy:  0.71875
Batch :  265 / 547 , BCE in this minibatch:  2.3368104 accuracy:  0.5625
Batch :  266 / 547 , BCE in this minibatch:  4.288334 accuracy:  0.375
Batch :  267 / 547 , BCE in this minibatch:  1.9370562 accuracy:  0.53125
Batch :  268 / 547 , BCE in this minibatch:  1.9406

Batch :  382 / 547 , BCE in this minibatch:  1.9630158 accuracy:  0.5625
Batch :  383 / 547 , BCE in this minibatch:  1.6154516 accuracy:  0.53125
Batch :  384 / 547 , BCE in this minibatch:  2.554329 accuracy:  0.46875
Batch :  385 / 547 , BCE in this minibatch:  3.2316122 accuracy:  0.4375
Batch :  386 / 547 , BCE in this minibatch:  2.1989243 accuracy:  0.5
Batch :  387 / 547 , BCE in this minibatch:  1.7614831 accuracy:  0.59375
Batch :  388 / 547 , BCE in this minibatch:  1.6335137 accuracy:  0.5625
Batch :  389 / 547 , BCE in this minibatch:  1.8252844 accuracy:  0.65625
Batch :  390 / 547 , BCE in this minibatch:  1.7839243 accuracy:  0.59375
Batch :  391 / 547 , BCE in this minibatch:  2.0027409 accuracy:  0.625
Batch :  392 / 547 , BCE in this minibatch:  1.418097 accuracy:  0.6875
Batch :  393 / 547 , BCE in this minibatch:  1.5885834 accuracy:  0.6875
Batch :  394 / 547 , BCE in this minibatch:  1.8582612 accuracy:  0.65625
Batch :  395 / 547 , BCE in this minibatch:  1.6361

Batch :  507 / 547 , BCE in this minibatch:  2.3555007 accuracy:  0.53125
Batch :  508 / 547 , BCE in this minibatch:  1.9280005 accuracy:  0.65625
Batch :  509 / 547 , BCE in this minibatch:  1.5178778 accuracy:  0.65625
Batch :  510 / 547 , BCE in this minibatch:  1.3256528 accuracy:  0.6875
Batch :  511 / 547 , BCE in this minibatch:  1.4475266 accuracy:  0.6875
Batch :  512 / 547 , BCE in this minibatch:  2.1945148 accuracy:  0.59375
Batch :  513 / 547 , BCE in this minibatch:  2.403811 accuracy:  0.5
Batch :  514 / 547 , BCE in this minibatch:  1.8743888 accuracy:  0.5625
Batch :  515 / 547 , BCE in this minibatch:  1.9523164 accuracy:  0.5625
Batch :  516 / 547 , BCE in this minibatch:  1.8057905 accuracy:  0.71875
Batch :  517 / 547 , BCE in this minibatch:  1.8980514 accuracy:  0.5625
Batch :  518 / 547 , BCE in this minibatch:  1.9261918 accuracy:  0.6875
Batch :  519 / 547 , BCE in this minibatch:  2.3149638 accuracy:  0.59375
Batch :  520 / 547 , BCE in this minibatch:  2.90

Batch :  89 / 547 , BCE in this minibatch:  2.3036737 accuracy:  0.6875
Batch :  90 / 547 , BCE in this minibatch:  2.367723 accuracy:  0.625
Batch :  91 / 547 , BCE in this minibatch:  0.85992193 accuracy:  0.65625
Batch :  92 / 547 , BCE in this minibatch:  1.734122 accuracy:  0.65625
Batch :  93 / 547 , BCE in this minibatch:  2.5604339 accuracy:  0.53125
Batch :  94 / 547 , BCE in this minibatch:  1.7469177 accuracy:  0.65625
Batch :  95 / 547 , BCE in this minibatch:  2.2860813 accuracy:  0.6875
Batch :  96 / 547 , BCE in this minibatch:  1.4182512 accuracy:  0.625
Batch :  97 / 547 , BCE in this minibatch:  1.6920792 accuracy:  0.6875
Batch :  98 / 547 , BCE in this minibatch:  2.6766608 accuracy:  0.5
Batch :  99 / 547 , BCE in this minibatch:  1.613502 accuracy:  0.6875
Batch :  100 / 547 , BCE in this minibatch:  1.6173221 accuracy:  0.5625
Batch :  101 / 547 , BCE in this minibatch:  1.352586 accuracy:  0.65625
Batch :  102 / 547 , BCE in this minibatch:  0.7963024 accuracy: 

Batch :  206 / 547 , BCE in this minibatch:  1.1157347 accuracy:  0.71875
Batch :  207 / 547 , BCE in this minibatch:  1.281855 accuracy:  0.78125
Batch :  208 / 547 , BCE in this minibatch:  2.7809525 accuracy:  0.46875
Batch :  209 / 547 , BCE in this minibatch:  0.94019413 accuracy:  0.8125
Batch :  210 / 547 , BCE in this minibatch:  2.5615954 accuracy:  0.53125
Batch :  211 / 547 , BCE in this minibatch:  1.2717577 accuracy:  0.71875
Batch :  212 / 547 , BCE in this minibatch:  1.437867 accuracy:  0.53125
Batch :  213 / 547 , BCE in this minibatch:  0.6189003 accuracy:  0.78125
Batch :  214 / 547 , BCE in this minibatch:  1.6026788 accuracy:  0.59375
Batch :  215 / 547 , BCE in this minibatch:  1.2750576 accuracy:  0.59375
Batch :  216 / 547 , BCE in this minibatch:  1.4920273 accuracy:  0.65625
Batch :  217 / 547 , BCE in this minibatch:  1.7441157 accuracy:  0.625
Batch :  218 / 547 , BCE in this minibatch:  2.1491537 accuracy:  0.53125
Batch :  219 / 547 , BCE in this minibatch

Batch :  334 / 547 , BCE in this minibatch:  1.0202911 accuracy:  0.6875
Batch :  335 / 547 , BCE in this minibatch:  1.3659376 accuracy:  0.6875
Batch :  336 / 547 , BCE in this minibatch:  1.806206 accuracy:  0.59375
Batch :  337 / 547 , BCE in this minibatch:  2.7615092 accuracy:  0.5
Batch :  338 / 547 , BCE in this minibatch:  1.9115509 accuracy:  0.53125
Batch :  339 / 547 , BCE in this minibatch:  2.7875924 accuracy:  0.59375
Batch :  340 / 547 , BCE in this minibatch:  0.85243857 accuracy:  0.8125
Batch :  341 / 547 , BCE in this minibatch:  1.6249338 accuracy:  0.6875
Batch :  342 / 547 , BCE in this minibatch:  1.1868324 accuracy:  0.625
Batch :  343 / 547 , BCE in this minibatch:  3.1720862 accuracy:  0.5625
Batch :  344 / 547 , BCE in this minibatch:  1.2654145 accuracy:  0.6875
Batch :  345 / 547 , BCE in this minibatch:  1.3570037 accuracy:  0.65625
Batch :  346 / 547 , BCE in this minibatch:  1.9931029 accuracy:  0.6875
Batch :  347 / 547 , BCE in this minibatch:  2.3985

Batch :  462 / 547 , BCE in this minibatch:  1.4675262 accuracy:  0.78125
Batch :  463 / 547 , BCE in this minibatch:  2.4081597 accuracy:  0.5625
Batch :  464 / 547 , BCE in this minibatch:  1.7786875 accuracy:  0.65625
Batch :  465 / 547 , BCE in this minibatch:  1.0247794 accuracy:  0.75
Batch :  466 / 547 , BCE in this minibatch:  2.6086252 accuracy:  0.53125
Batch :  467 / 547 , BCE in this minibatch:  0.99556684 accuracy:  0.75
Batch :  468 / 547 , BCE in this minibatch:  0.616496 accuracy:  0.78125
Batch :  469 / 547 , BCE in this minibatch:  1.389793 accuracy:  0.78125
Batch :  470 / 547 , BCE in this minibatch:  2.1905513 accuracy:  0.6875
Batch :  471 / 547 , BCE in this minibatch:  0.9452964 accuracy:  0.8125
Batch :  472 / 547 , BCE in this minibatch:  1.3872273 accuracy:  0.6875
Batch :  473 / 547 , BCE in this minibatch:  0.72407556 accuracy:  0.75
Batch :  474 / 547 , BCE in this minibatch:  2.3208532 accuracy:  0.625
Batch :  475 / 547 , BCE in this minibatch:  1.853311