In [1]:
import tensorflow as tf
import numpy as np
import os
import pandas as pd
import json
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.activations import relu, sigmoid
from tensorflow.keras.constraints import MaxNorm
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import BinaryAccuracy
from tensorflow.keras import layers

# Sets the KMP_DUPLICATE_LIB_OK environment variable before any training runs.
# NOTE(review): presumably a workaround for the duplicate OpenMP runtime abort
# seen on macOS -- confirm it is still required.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
path = '/Users/daeyeop/Work/KBS Drama project/Data'

# `path` has no trailing separator, so the file locations are built with
# os.path.join; plain concatenation would produce '.../Datatrain_inputs.npy'.
TRAIN_INPUTS = os.path.join(path, 'train_inputs.npy')
TRAIN_LABELS = os.path.join(path, 'train_labels.npy')
TEST_INPUTS  = os.path.join(path, 'test_inputs.npy')
TEST_LABELS  = os.path.join(path, 'test_labels.npy')
DATA_CONFIGS = os.path.join(path, 'data_configs.json')


# np.load is given the file name directly so that it opens and closes the
# file itself; passing a bare open(...) handle left the descriptor open.
train_inputs = np.load(TRAIN_INPUTS)
train_labels = np.load(TRAIN_LABELS)
test_inputs = np.load(TEST_INPUTS)
test_labels = np.load(TEST_LABELS)

# The JSON config is read inside a context manager for the same reason.
with open(DATA_CONFIGS, 'r') as config_file:
    data_configs = json.load(config_file)

In [38]:
# Training hyperparameters shared by the cells below.
model_name = 'cnn_classifer_kr'
BATCH_SIZE = 200
EPOCHS = 10
VALID_SPLIT = 0.1
# Second axis of the training input array.
MAX_LEN = train_inputs.shape[1]

# Constructor arguments for CNNClassifier (this is the dict passed to it below).
kargs1 = dict(
    model_name=model_name,
    vocab_size=data_configs['vocab_size'],
    embedding_size=128,
    num_filters=100,
    dropout_rate=0.5,
    hidden_dimension=250,
    output_dimension=1,
)

# Alternative argument set carrying 'lstm_dimension' / 'dense_dimension'
# instead of 'hidden_dimension'. CNNClassifier reads 'hidden_dimension', so
# this dict cannot be passed to it as-is.
kargs = dict(
    model_name=model_name,
    vocab_size=data_configs['vocab_size'],
    embedding_size=128,
    num_filters=100,
    dropout_rate=0.5,
    lstm_dimension=150,
    dense_dimension=150,
    output_dimension=1,
)


class CNNClassifier(tf.keras.Model):
    """Convolutional text classifier built from Keras layers.

    Pipeline: embedding -> dropout -> three parallel Conv1D branches
    (kernel sizes 3, 4 and 5) -> global average pooling per branch ->
    concatenation -> ReLU dense layer -> sigmoid dense layer.

    Keyword arguments read by the constructor:
        model_name: name given to the Keras model.
        vocab_size: ``input_dim`` of the embedding layer.
        embedding_size: ``output_dim`` of the embedding layer.
        num_filters: number of filters in each Conv1D branch.
        dropout_rate: rate of the dropout layer.
        hidden_dimension: units of the first dense layer.
        output_dimension: units of the final (sigmoid) dense layer.
    """

    def __init__(self, **kargs):
        super().__init__(name=kargs['model_name'])

        self.embedding = layers.Embedding(
            input_dim=kargs['vocab_size'],
            output_dim=kargs['embedding_size'],
        )

        # One convolution per kernel width; every kernel is max-norm constrained.
        conv_branches = []
        for width in (3, 4, 5):
            conv_branches.append(
                layers.Conv1D(
                    filters=kargs['num_filters'],
                    kernel_size=width,
                    padding='valid',
                    activation=relu,
                    kernel_constraint=MaxNorm(max_value=3.),
                )
            )
        self.conv_list = conv_branches

        self.pooling = layers.GlobalAveragePooling1D()
        self.dropout = layers.Dropout(kargs['dropout_rate'])

        self.fc1 = layers.Dense(
            units=kargs['hidden_dimension'],
            activation=relu,
            kernel_constraint=MaxNorm(max_value=3.),
        )

        self.fc2 = layers.Dense(
            units=kargs['output_dimension'],
            activation=sigmoid,
            kernel_constraint=MaxNorm(max_value=3.),
        )

    def call(self, x):
        """Run the forward pass on ``x`` and return the output of the sigmoid layer."""
        # Dropout is applied to the embedded input only, before the convolutions.
        embedded = self.dropout(self.embedding(x))
        # Pool each branch separately, then join the branches on the last axis.
        pooled = [self.pooling(conv(embedded)) for conv in self.conv_list]
        features = tf.concat(pooled, axis=-1)
        return self.fc2(self.fc1(features))

In [39]:
# Build the classifier from the CNN argument set and configure it for
# binary classification (Adam with default settings, binary cross-entropy).
model = CNNClassifier(**kargs1)
model.compile(
    optimizer=Adam(),
    loss=BinaryCrossentropy(),
    # Named 'accuracy' so the callbacks can monitor 'val_accuracy'.
    metrics=[BinaryAccuracy(name='accuracy')],
)

In [40]:
# Weights are checkpointed to '<model_name>/weights.h5'; make sure the folder exists.
checkpoint_path = model_name + '/weights.h5'
checkpoint_dir = os.path.dirname(checkpoint_path)

if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir, exist_ok=True)
    print('{} is created'.format(checkpoint_dir))
else:
    print('{} exists'.format(checkpoint_dir))

# Stop once validation accuracy has failed to improve by at least 1e-4 for 2 epochs.
earlystop_callback = EarlyStopping(
    monitor='val_accuracy',
    min_delta=0.0001,
    patience=2,
)

# Write the weights to disk only when validation accuracy improves.
cp_callback = ModelCheckpoint(
    checkpoint_path,
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
)

# NOTE(review): nothing here reloads the checkpoint or asks EarlyStopping to
# restore the best weights, so `model` appears to keep the weights of the last
# epoch that ran -- confirm whether the best checkpoint should be loaded
# before predicting.
history = model.fit(
    train_inputs,
    train_labels,
    batch_size=BATCH_SIZE,
    callbacks=[earlystop_callback, cp_callback],
    validation_split=VALID_SPLIT,
    epochs=EPOCHS,
)

cnn_classifer_kr exists
Train on 135000 samples, validate on 15000 samples
Epoch 1/10
Epoch 00001: val_accuracy improved from -inf to 0.82273, saving model to cnn_classifer_kr/weights.h5
Epoch 2/10
Epoch 00002: val_accuracy improved from 0.82273 to 0.82780, saving model to cnn_classifer_kr/weights.h5
Epoch 3/10
Epoch 00003: val_accuracy did not improve from 0.82780
Epoch 4/10
Epoch 00004: val_accuracy did not improve from 0.82780


In [41]:
# Load the blog posts (CSV) and the matching array that is fed to the model.
# `path` has no trailing separator, so the file names are attached with
# os.path.join instead of string concatenation.
blog_text = pd.read_csv(os.path.join(path, 'blog_text.csv'))
# Passing the file name lets np.load open and close the file itself.
BLOG_TEXT = np.load(os.path.join(path, 'BLOG_TEXT.npy'))
pred = model.predict(BLOG_TEXT)

In [42]:
# Attach the predictions to the blog rows side by side; the unnamed
# prediction column ends up with the integer label 0.
a = pd.concat([blog_text, pd.DataFrame(pred)], axis=1)

In [45]:
# Print one score per drama, in order of first appearance.
# NOTE(review): the printed value is the *square* of the mean prediction,
# not the mean itself -- confirm this is intended.
for title in a['drama'].unique():
    scores = a.loc[a['drama'] == title, 0]
    print(title, (sum(scores) / len(scores)) ** 2)

(아는 건 별로 없지만) 가족입니다 0.3052945017712853
외출 0.34181589366972653
반의 반 0.3367907864573493
방법 0.3595469598163189


In [87]:
# File locations for the YouTube comments. The original referenced an
# undefined name (`paht`) and reused YOUTUBE_TEXT for both a path and an
# array; the spreadsheet path now has its own name.
YOUTUBE_NPY = os.path.join(path, 'YOUTUBE_TEXT.npy')
YOUTUBE_XLSX = os.path.join(path, 'drama_comment.xlsx')

youtube_text = pd.read_excel(YOUTUBE_XLSX)
# Passing the file name lets np.load open and close the file itself.
YOUTUBE_TEXT = np.load(YOUTUBE_NPY)
pred = model.predict(YOUTUBE_TEXT)

In [88]:
# Put the model output next to the comments as a 'sentiment' column and
# display the combined frame.
youtube_data = pd.concat(
    [youtube_text, pd.DataFrame(pred, columns=['sentiment'])],
    axis=1,
)
youtube_data
# Disabled filter, kept for reference:
#youtube_data = youtube_data[youtube_data['sentiment'] > 0.56]

Unnamed: 0,drama,text,sentiment
0,(아는 건 별로 없지만) 가족입니다,얼굴 늙어감을 역행하려 애써지 않고 자연스럽게 받아들이는 마인드가 참 좋다 ...,0.991688
1,(아는 건 별로 없지만) 가족입니다,편 봤는데 각각의 캐릭터들 대사들이 몰입되네요\n 둘째처럼 자기중심적이고 일단 화부...,0.795158
2,(아는 건 별로 없지만) 가족입니다,원미경씨 요즘 중년 노년 배우와 다르게 자연스럽게 나이드셔서 너무 편안하고 보기 좋...,0.986239
3,(아는 건 별로 없지만) 가족입니다,개인적으로 저는 첫째스타일이었다가 쌓이다 쌓여서 둘째 스타일 됐지만 그래도 약간 첫...,0.281782
4,(아는 건 별로 없지만) 가족입니다,미경님 성형도 안한 얼굴이 아주 친근감이 드네요 자연 스럽고요,0.403363
...,...,...,...
11953,순정에 반하다,ㅋㅋㅋㅋㅋㅋㅋㅋㅋㄱ졸귀ㅠㅠ,0.987597
11954,순정에 반하다,꺄햐핫 귀염귀염,0.979577
11955,순정에 반하다,ㅋㅋㅋㅋ귀엽고 웃김ㅋㅋ,0.959151
11956,순정에 반하다,ㅜㅜㅜㅜㅜㅜㅜ뀌여워 최고야ㅜㅜㅜ,0.992761


In [91]:
# Show only the comments belonging to one drama.
youtube_data[youtube_data['drama'].eq('외출')]

Unnamed: 0,drama,text,sentiment
47,외출,회가 끝이였군 이래서 나는 국가가 육아기관을 만들어야 한다고 본다 어렵게 생각말고...,0.035216
48,외출,이 드라마가 꼭 부디 딸 엄마 여자들만이 아닌 남녀노소 공감 했으면 하네요\n드라마...,0.640648
49,외출,대한민국 사법 족구하라 그래,0.940425
50,외출,이제 개천에서 용 못나죠ㅋㅋㅋㅋ 이제 고졸이면 변호사 불가능 로스쿨가야함,0.07956
51,외출,오양촌,0.36358
52,외출,꼭 본방사수할 테니 배성우 씨 드라마에 자주 나와주세요,0.336993
53,외출,가족,0.875257


In [107]:
# Average sentiment per drama, collected in order of first appearance.
drama_list = youtube_data['drama'].unique()
youtube_sent = {}
for drama in drama_list:
    # Use every comment of the drama. The previous `[:1]` slice kept only the
    # first comment, so the stored "mean" was just a single prediction.
    sent = youtube_data.loc[youtube_data['drama'] == drama, 'sentiment']
    # Series.mean() yields NaN for an empty selection instead of raising
    # ZeroDivisionError.
    youtube_sent[drama] = sent.mean()

# One row per drama, indexed by drama title, with the mean in column 'sent'.
youtube = pd.DataFrame(list(youtube_sent.values()),
                       index=list(youtube_sent.keys()),
                       columns=['sent'])

In [108]:
# Save the per-drama scores inside the data folder. `path` has no trailing
# separator, so os.path.join is needed; concatenation would write
# '.../Datayoutube_review.csv' next to the folder instead.
youtube.to_csv(os.path.join(path, 'youtube_review.csv'))

In [119]:
# File locations for the Kairos reviews, built with os.path.join because
# `path` has no trailing separator.
KAIROS_NPY = os.path.join(path, 'KAIROS.npy')
KAIROS_TEXT = os.path.join(path, 'kairos_text.csv')

ka_text = pd.read_csv(KAIROS_TEXT)
# Passing the file name lets np.load open and close the file itself.
KA_TEXT = np.load(KAIROS_NPY)
pred = model.predict(KA_TEXT)
# Wrap the raw predictions in a single-column frame and display it.
pred = pd.DataFrame(pred, columns=['sent'])
pred

Unnamed: 0,sent
0,0.740108
1,0.992482
2,0.927362
3,0.976327
4,0.982102
5,0.96591
6,0.876518


In [117]:
# Save the Kairos predictions inside the data folder. `path` has no trailing
# separator, so os.path.join is needed; concatenation would write
# '.../Dataka_review.csv' next to the folder instead.
pred.to_csv(os.path.join(path, 'ka_review.csv'))

In [121]:
# Column-wise total divided by the number of rows, i.e. the average prediction.
pred.sum().div(len(pred))

sent    0.922973
dtype: float32