In [1]:
import re

import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import Dense, GRU, SimpleRNN, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Sequence length every encoded sentence is padded/truncated to before scoring.
max_len = 500
# Previously trained sentiment model, restored from disk.
loaded_model = load_model('GRU_model.h5')

# IMDB vocabulary: word -> index.
word_to_index = imdb.get_word_index()

# Reverse lookup. Word indices are shifted by 3 so that slots 0, 1 and 2
# can hold the reserved <pad>, <sos> and <unk> tokens.
index_to_word = {rank + 3: word for word, rank in word_to_index.items()}
index_to_word.update(enumerate(("<pad>", "<sos>", "<unk>")))

In [20]:
# Preprocessed sample whose 'Text' column is labelled in the cells below.
SAMPLE_CSV = 'sample_preprocessed.csv'
df = pd.read_csv(SAMPLE_CSV)
df

Unnamed: 0,Datetime,Text
0,2022-05-08,i was the leagueoflegends guy
1,2022-05-08,the most entertaining leagueoflegends content ...
2,2022-05-08,yes the tower is the safest place in lane but ...
3,2022-05-08,we need to collectively obliterate leagueofleg...
4,2022-05-08,potatoise arcane leagueoflegends
...,...,...
1413,2022-05-08,this has spiraled hard into irrelevant topics ...
1414,2022-05-08,the idea leagueoflegends has confirmed lore ar...
1415,2022-05-08,enemy von imagine dragons feat jid and leagueo...
1416,2022-05-08,goin to be playing arams in leagueoflegends to...


In [33]:
def sentiment_predict(new_sentence, vocab_size=10000):
    """Label one sentence as positive (1) or negative (-1) with ``loaded_model``.

    The sentence is split on whitespace, each token is mapped to its shifted
    IMDB index through the module-level ``word_to_index``, the sequence is
    padded to ``max_len`` and the model's single output score is thresholded
    at 0.5.

    Parameters
    ----------
    new_sentence : str
        Text to classify. No cleaning or lower-casing happens here, so the
        text is expected to be normalised already. ``None``/NaN (what pandas
        yields for an empty CSV cell) is scored as an empty sentence.
    vocab_size : int, default 10000
        Size of the vocabulary the model was trained with; words whose
        shifted index falls outside it are encoded as ``<unk>``.

    Returns
    -------
    int
        ``1`` if the predicted score is greater than 0.5, otherwise ``-1``.
    """
    index_offset = 3  # indices 0-2 are reserved for <pad>, <sos>, <unk>
    unk_index = 2

    # Missing values have no .split(); treat them as an empty sentence.
    if not isinstance(new_sentence, str) and pd.isna(new_sentence):
        new_sentence = ''

    # Whitespace tokenisation followed by integer encoding.
    encoded = []
    for word in new_sentence.split():
        rank = word_to_index.get(word)
        if rank is not None and rank < vocab_size - index_offset:
            encoded.append(rank + index_offset)
        else:
            # Out-of-vocabulary or beyond the vocabulary cap -> <unk>.
            encoded.append(unk_index)

    pad_sequence = pad_sequences([encoded], maxlen=max_len)
    # verbose=0: avoid one progress bar per sentence when applied row by row.
    prediction = loaded_model.predict(pad_sequence, verbose=0)
    # .item() extracts the single value; float() on an ndim > 0 array is
    # deprecated in recent NumPy releases.
    score = float(prediction.item())

    return 1 if score > 0.5 else -1

In [34]:
# GRU-model label (1 / -1) for every row of the 'Text' column.
df['label'] = df['Text'].apply(sentiment_predict)

In [35]:
# Inspect the frame now that the GRU-based `label` column has been added.
# NOTE(review): the saved output below also lists `label2`, which is only
# created by the VADER cell further down - the notebook was run out of order;
# re-run top to bottom to refresh this output.
df

Unnamed: 0,Datetime,Text,label,label2
0,2022-05-08,i was the leagueoflegends guy,-1,0
1,2022-05-08,the most entertaining leagueoflegends content ...,1,1
2,2022-05-08,yes the tower is the safest place in lane but ...,-1,-1
3,2022-05-08,we need to collectively obliterate leagueofleg...,-1,-1
4,2022-05-08,potatoise arcane leagueoflegends,-1,0
...,...,...,...,...
1413,2022-05-08,this has spiraled hard into irrelevant topics ...,-1,1
1414,2022-05-08,the idea leagueoflegends has confirmed lore ar...,1,0
1415,2022-05-08,enemy von imagine dragons feat jid and leagueo...,1,-1
1416,2022-05-08,goin to be playing arams in leagueoflegends to...,1,1


In [36]:
# How many rows received each GRU label.
df['label'].value_counts()

-1    876
 1    542
Name: label, dtype: int64

In [38]:
# Write the labelled frame to disk with 'Datetime' as the index column.
labeled_by_datetime = df.set_index('Datetime')
labeled_by_datetime.to_csv('sample_preprocessed_labeled.csv')

In [39]:
# Second labelling pass: VADER compound polarity bucketed into -1 / 0 / 1.
# SIA is imported in the notebook's top import cell.
# NOTE: SIA() needs NLTK's `vader_lexicon` resource; run
# nltk.download('vader_lexicon') once if it is not installed.
VADER_THRESHOLD = 0.2  # compound scores within +/- this value stay neutral (0)

sia = SIA()

# One record of polarity scores per row, keeping the source text alongside.
results = [{**sia.polarity_scores(line), 'articles': line} for line in df['Text']]

# Build the score table on df's own index so the boolean masks below line up
# with df row for row, even when df is not on a default 0..n-1 index.
temp = pd.DataFrame(results, index=df.index)

df['label2'] = 0
df.loc[temp['compound'] > VADER_THRESHOLD, 'label2'] = 1
df.loc[temp['compound'] < -VADER_THRESHOLD, 'label2'] = -1

In [51]:
# Rows where VADER took a side (label2 != 0) but the GRU label differs from it.
# Computed over the whole frame, so it neither hard-codes the row count nor
# depends on the index being exactly 0..n-1.
disagrees = (df['label'] != df['label2']) & (df['label2'] != 0)
count = int(disagrees.sum())
count

365