In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np

# Load the raw training / test CSV files.
train_data = pd.read_csv('scaling.csv')
test_data = pd.read_csv('test_file.csv')


def drop_non_korean(frame):
    """Strip everything except Hangul and spaces from ``frame['text']`` and drop unusable rows.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with a string ``'text'`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``'text'`` only holds Hangul jamo/syllables and
        spaces; rows whose text became empty, and rows with a missing value
        in any column, are removed. The input frame is not modified.
    """
    cleaned = frame.copy()
    # regex=True must be explicit: since pandas 2.0 the default is a literal
    # match, which would silently leave the text untouched.
    cleaned['text'] = cleaned['text'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
    # Empty strings become NaN so that dropna() below removes them.
    cleaned['text'] = cleaned['text'].replace('', np.nan)
    return cleaned.dropna(how='any')


train_data = drop_non_korean(train_data)
print('전처리 후 학습용 샘플의 개수 :',len(train_data))

test_data = drop_non_korean(test_data)
print('전처리 후 테스트용 샘플의 개수 :',len(test_data))

전처리 후 학습용 샘플의 개수 : 20846
전처리 후 테스트용 샘플의 개수 : 1629


In [2]:
from konlpy.tag import Okt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from pykospacing import spacing
from soynlp.normalizer import *

stopwords = ['뇨','하다','아','를','으로','은','는','의','을','과','함','해라'] # stopwords removed after tokenization

okt = Okt()


def tokenize_sentence(sentence):
    """Normalize, re-space and tokenize one sentence into a list of stemmed morphemes.

    Steps, in order: limit repeated character runs, normalize emoticons,
    fix spacing, split into stemmed morphemes with Okt, drop stopwords.
    """
    # Limit repeated runs such as 'ㅋㅋㅋㅋ' or '아하하하하' to 2 repetitions.
    sentence = repeat_normalize(sentence, num_repeats=2)
    # e.g. '옼ㅋㅋㅋㅋㅋ' -> '오ㅋㅋ'
    sentence = emoticon_normalize(sentence, num_repeats=2)
    spaced_sentence = spacing(sentence)  # spacing correction
    morphemes = okt.morphs(spaced_sentence, stem=True)  # tokenization
    return [word for word in morphemes if word not in stopwords]  # stopword removal


# Train and test text go through exactly the same pipeline.
X_train = [tokenize_sentence(sentence) for sentence in train_data['text']]
X_test = [tokenize_sentence(sentence) for sentence in test_data['text']]

In [3]:
from gensim.models import Word2Vec
# Train a Word2Vec model on the tokenized training sentences.
# sg=0 selects CBOW, chosen because the dataset is not large.
# NOTE(review): the original note mentioned 4 neighbouring words on each side,
# but the code passes window=5 - confirm which value was intended.
# NOTE(review): `size` is the gensim 3.x keyword; gensim >= 4.0 calls it `vector_size`.
train_model = Word2Vec(
    sentences=X_train,
    size=100,
    window=5,
    min_count=1,
    workers=4,
    sg=0,
)

In [5]:
# Display the shape of the learned Word2Vec vector table as the cell output.
train_model.wv.vectors.shape

(17544, 100)

In [6]:
# Pull the 'label' column of each frame out as a NumPy array.
y_train, y_test = (np.array(frame['label']) for frame in (train_data, test_data))

In [7]:
# Integer-encode the tokenized sentences.
# A single fit is enough: the tokenizer is created with the OOV token from the
# start instead of being fitted once just to count words and then re-created.
tokenizer = Tokenizer(oov_token = 'OOV')
tokenizer.fit_on_texts(X_train)

# word_index already contains the OOV token, so only index 0 (padding) has to
# be added. This yields the same value as "word count without OOV + 2".
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

X_train2 = tokenizer.texts_to_sequences(X_train)
X_test2 = tokenizer.texts_to_sequences(X_test)

17546


In [8]:
# Pad all sequences at the end ('post') to the length of the longest training sequence.
max_len = max(map(len, X_train2))

X_train2 = pad_sequences(X_train2, maxlen=max_len, padding='post')
X_test2 = pad_sequences(X_test2, maxlen=max_len, padding='post')

In [9]:
# One 100-dimensional row per vocabulary index, all zeros to begin with.
embedding_matrix = np.zeros(shape=(vocab_size, 100))

In [10]:
def get_train_vector(word, model=None):
    """Return the Word2Vec vector of ``word``, or ``None`` when the word is unknown.

    Parameters
    ----------
    word : str
        Token to look up.
    model : optional
        Object exposing the trained vectors as ``model.wv``. Defaults to the
        notebook-level ``train_model``.

    Returns
    -------
    The vector stored for ``word``, or ``None`` if ``word`` is not in the
    model's vocabulary.
    """
    if model is None:
        model = train_model
    # Go through `.wv`: membership tests and indexing directly on the model
    # object are deprecated in gensim 3.x and no longer available in 4.x.
    keyed_vectors = model.wv
    if word in keyed_vectors:
        return keyed_vectors[word]
    return None

In [11]:
# Copy the Word2Vec vector of every known token into its tokenizer row;
# tokens without a vector keep their all-zero row.
for token, row in tokenizer.word_index.items():
    vector = get_train_vector(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

  if word in train_model:
  return train_model[word]


In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Model: Embedding initialised from the Word2Vec matrix (still trainable)
# -> Flatten -> one sigmoid unit for binary classification.
e = Embedding(vocab_size, 100,trainable=True, weights=[embedding_matrix],input_length=max_len)
model = Sequential([
    e,
    Flatten(),
    # The stray input_dim=1 was dropped: this layer is not the model's first
    # layer and its input is the flattened embedding output, not a 1-d vector.
    Dense(1, activation='sigmoid'),
])

In [19]:
# Stop training once val_loss has not improved for 6 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', patience=6, verbose=1)
# Keep only the weights of the epoch with the best val_acc on disk.
mc = ModelCheckpoint(
    'batch128_win5_100_notdupli_model.h5',
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)

In [20]:
# Binary cross-entropy with Adam; 20% of the training data is held out for validation.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
fit_options = dict(
    callbacks=[es, mc],
    batch_size=128,
    epochs=30,
    verbose=1,
    validation_split=0.2,
)
history = model.fit(X_train2, y_train, **fit_options)

Epoch 1/30

Epoch 00001: val_acc improved from -inf to 0.68921, saving model to batch32_win5_300_notdupli_model.h5
Epoch 2/30

Epoch 00002: val_acc improved from 0.68921 to 0.74077, saving model to batch32_win5_300_notdupli_model.h5
Epoch 3/30

Epoch 00003: val_acc improved from 0.74077 to 0.74436, saving model to batch32_win5_300_notdupli_model.h5
Epoch 4/30

Epoch 00004: val_acc improved from 0.74436 to 0.80024, saving model to batch32_win5_300_notdupli_model.h5
Epoch 5/30

Epoch 00005: val_acc improved from 0.80024 to 0.82014, saving model to batch32_win5_300_notdupli_model.h5
Epoch 6/30

Epoch 00006: val_acc did not improve from 0.82014
Epoch 7/30

Epoch 00007: val_acc did not improve from 0.82014
Epoch 8/30

Epoch 00008: val_acc did not improve from 0.82014
Epoch 9/30

Epoch 00009: val_acc did not improve from 0.82014
Epoch 10/30

Epoch 00010: val_acc did not improve from 0.82014
Epoch 11/30

Epoch 00011: val_acc did not improve from 0.82014
Epoch 00011: early stopping


In [21]:
# Load the checkpoint written by the ModelCheckpoint callback `mc` during this
# run. The previously hard-coded name ('batch32_win5_300_notdupli_model.h5')
# did not match the path given to the callback, so a fresh run would load a
# stale file from an earlier experiment or fail.
loaded_model = load_model(mc.filepath)
# Returns [loss, accuracy] on the test set; shown as the cell output.
loaded_model.evaluate(X_test2, y_test, batch_size=32)



[0.6106384992599487, 0.7569060921669006]

In [22]:
# Model outputs for the padded test sequences (thresholded at 0.5 in the next cell).
y_predicted=loaded_model.predict(X_test2)

In [None]:
# Binarise the predictions in place: >= 0.5 becomes 1, < 0.5 becomes 0.
scored = y_predicted[:len(y_test)]  # view, so the writes below land in y_predicted
is_positive = scored >= 0.5
is_negative = scored < 0.5
scored[is_positive] = 1
scored[is_negative] = 0

In [None]:
# Number of test samples whose binarised prediction equals the true label.
count = sum(1 for i in range(len(y_test)) if y_predicted[i] == y_test[i])

In [None]:
from sklearn.metrics import classification_report



# Overall accuracy from the match count, followed by the per-class report.
accuracy = count / len(y_test)
print('accuracy: ', accuracy)

report = classification_report(y_test, y_predicted)
print("Precision, Recall and F1-Score:\n\n", report)

In [None]:
def sentiment_predict(new_sentence):
  """Classify one sentence with the loaded model and print the verdict.

  The sentence goes through the same steps, in the same order, as the
  training text (repeat normalization -> emoticon normalization -> spacing
  -> stemmed morphemes -> stopword removal) before it is integer-encoded,
  padded to ``max_len`` and scored.

  Parameters
  ----------
  new_sentence : str
      Raw sentence to classify.

  Returns
  -------
  None
      The result is printed: a score above 0.5 is reported as "not a
      malicious comment", anything else as "malicious comment".
  """
  # NOTE(review): the training text was additionally reduced to Hangul and
  # spaces before tokenization; that filter is not applied here - confirm
  # whether it should be.
  new_sentence = repeat_normalize(new_sentence, num_repeats=2)
  new_sentence = emoticon_normalize(new_sentence, num_repeats=2)
  # Spacing correction runs after normalization, as in the training pipeline.
  new_sentence = spacing(new_sentence)
  tokens = okt.morphs(new_sentence, stem=True) # tokenization
  tokens = [word for word in tokens if word not in stopwords] # stopword removal
  encoded = tokenizer.texts_to_sequences([tokens]) # integer encoding
  pad_new = pad_sequences(encoded, maxlen = max_len,padding='post') # padding
  # predict() returns a (1, 1) array; index the scalar explicitly instead of
  # calling float() on the array, which NumPy has deprecated.
  score = float(loaded_model.predict(pad_new)[0][0])
  if score > 0.5:
    print("{:.2f}% 확률로 악플이 아닙니다.\n".format(score * 100 ))
  else:
    print("{:.2f}% 확률로 악플입니다.\n".format((1 - score)*100))

In [None]:
from pytchat import LiveChat
import pafy
import pandas as pd

# API key and video id are blank placeholders - fill them in before running.
pafy.set_api_key(' ')

video_id = ' '

# Fetch the stream's metadata once; it is reused for every logged chat row.
v = pafy.new(video_id)
title, author, published = v.title, v.author, v.published

print(title, author, published, sep='\n')

# Create the output CSV with only the header row.
csv_columns = ['제목', '채널 명', '스트리밍 시작 시간', '댓글 작성자', '댓글 내용', '댓글 작성 시간']
empty_frame = pd.DataFrame(columns=csv_columns)
empty_frame.to_csv('./youtube.csv')

In [None]:
# Follow the live chat, print messages the model scores above 0.5 and log the
# remaining (flagged) messages to ./youtube.csv.
# topchat_only must be a real boolean: the string 'FALSE' is truthy and would
# have restricted the stream to "top chat" messages.
chat = LiveChat(video_id = video_id, topchat_only = False)

while chat.is_alive():
    try:
        data = chat.get()
        items = data.items
        for c in items:
            # Same preprocessing order as the training pipeline:
            # repeat normalization -> emoticon normalization -> spacing -> morphemes.
            new_sentence = repeat_normalize(c.message, num_repeats=2)
            new_sentence = emoticon_normalize(new_sentence, num_repeats=2)
            new_sentence = spacing(new_sentence)
            new_sentence = okt.morphs(new_sentence, stem=True)
            new_sentence = [word for word in new_sentence if word not in stopwords] # stopword removal
            encoded = tokenizer.texts_to_sequences([new_sentence]) # integer encoding
            pad_new = pad_sequences(encoded, maxlen = max_len,padding='post') # padding
            # predict() returns a (1, 1) array; take the scalar explicitly.
            score = float(loaded_model.predict(pad_new)[0][0])
            if(score > 0.5):
                print(f"{c.datetime} [{c.author.name}]- {c.message}")
            else:
                data.tick()
                # '댓글 내용' (content) holds the message and '댓글 작성 시간'
                # (time) holds the timestamp; the two values were swapped before.
                flagged_row = {'제목' : [title], '채널 명' : [author], '스트리밍 시작 시간' : [published], '댓글 작성자' : [c.author.name], '댓글 내용' : [c.message], '댓글 작성 시간' : [c.datetime]}
                result = pd.DataFrame(flagged_row)
                # Append the row to the CSV; previously the frame was built
                # and then discarded. The default index column is kept so the
                # row lines up with a header written by DataFrame.to_csv
                # with its default index.
                result.to_csv('./youtube.csv', mode='a', header=False)
    except KeyboardInterrupt:
        chat.terminate()
        break