<a href="https://colab.research.google.com/github/uknowsj/Capstone_team2/blob/master/text_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Library

In [1]:
import pandas as pd
from pandas import DataFrame as df

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

# Word2vec
import gensim
from gensim.models import Word2Vec #@
from gensim.utils import simple_preprocess #@
from gensim.models.keyedvectors import KeyedVectors #@

#Keras
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model #모델 저장

#sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Utility
import re
import numpy as np
import time #수행시간 측정
from google.colab import files #colab에 모델 save,load

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [24]:
# =============== 셋팅 =============== #

# DATASET
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
TRAIN_SIZE = 0.8

# TEXT CLENAING
TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

#전처리
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")

# KERAS
SEQUENCE_LENGTH = 300
EPOCHS = 8
BATCH_SIZE = 1024

# SENTIMENT
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
SENTIMENT_THRESHOLDS = (0.4, 0.7)

In [3]:
#Colab에 연결해서 사용하기
from google.colab import drive
drive.mount('/content/gdrive') #,force_remount=True
my_path='/content/gdrive/My Drive/Colab Notebooks/'

Mounted at /content/gdrive


# Load Data

In [4]:
#학습데이터 로드
dataset=pd.read_csv(my_path+'train.csv',encoding = DATASET_ENCODING, names=DATASET_COLUMNS)
print(dataset.shape) #1600000,6

(1600000, 6)


In [5]:
dataset.head() # negative:0, positive:4

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


디코더 생성

In [6]:
decode_map = {0: NEGATIVE, 2: NEUTRAL, 4: POSITIVE} #숫자 => 분류 문장
def decode_sentiment(label):
    return decode_map[int(label)]

In [7]:
%%time
dataset.target = dataset.target.apply(lambda x: decode_sentiment(x))

CPU times: user 501 ms, sys: 1.2 ms, total: 502 ms
Wall time: 504 ms


# Pre-Processing 

클리닝 텍스트 ***이 부분 수민님 코드로 수정해주시면 됩니다.***

In [8]:
#학습 데이터 텍스트 전처리
def preprocess(text, stem=False):
    # Remove link,user and special characters
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)

In [9]:
dataset.text = dataset.text.apply(lambda x: preprocess(x)) #전처리 진행

학습 데이터 나누기 

In [10]:
train, test = train_test_split(dataset, test_size=1-TRAIN_SIZE, random_state=42)
print("TRAIN size:", len(train))
print("TEST size:", len(test))

TRAIN size: 1280000
TEST size: 320000


In [11]:
documents = [_text.split() for _text in train.text] #list, 1280000*50

In [12]:
vocab_size = 400000
tk = Tokenizer(num_words=vocab_size)
tk.fit_on_texts(train.text) 
x_train = tk.texts_to_sequences(train.text)
x_test = tk.texts_to_sequences(test.text)

In [13]:
labels = train.target.unique().tolist() #POSITIVE NEUTRAL NEGATIVE
labels.append(NEUTRAL)
print(labels)

encoder = LabelEncoder() #문장 -> 숫자 자동으로
encoder.fit(train.target.tolist())

y_train = encoder.transform(train.target.tolist())
y_test = encoder.transform(test.target.tolist())

y_train = y_train.reshape(-1,1) #1열로 자동으로 만들어줍니다.
y_test = y_test.reshape(-1,1)

['POSITIVE', 'NEGATIVE', 'NEUTRAL']


# Build Model

In [14]:
max_len=max(len(l) for l in x_train) #한 문장에서 최대 단어 개수를 반환 #max_len=50

50


In [15]:
X_train = np.array(pad_sequences(x_train, maxlen=max_len, padding='post')) #max_len만큼 padding 값 설정 
print(X_train.shape, y_train.shape)

(1280000, 50) (1280000, 1)


text-CNN 모델 빌드 ***3시간 정도 걸립니다. 결과만 보려면 아래에서 로드해서 보세요***

In [None]:
#Make Model
model = Sequential()

model.add(Embedding(vocab_size, 32, input_length=max_len))
model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))
model.add(Conv1D(filters=64, kernel_size=6, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))
model.add(Conv1D(filters=32, kernel_size=7, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))
model.add(Conv1D(filters=32, kernel_size=8, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(1, input_shape=(1,)))
model.compile('SGD','mse',metrics=['accuracy'])
model.summary()

model.fit(X, y, epochs=10, verbose=1)
model.save('model.h5')

text-CNN 모델 로드

In [16]:
model = load_model(my_path+'text-CNN.h5')

In [21]:
def decode_sentiment(score, include_neutral=True):
    if include_neutral:        
        label = NEUTRAL
        if score <= SENTIMENT_THRESHOLDS[0]:
            label = NEGATIVE
        elif score >= SENTIMENT_THRESHOLDS[1]:
            label = POSITIVE

        return label
    else:
        return NEGATIVE if score < 0.5 else POSITIVE

In [22]:
def predict(ex_text, include_neutral=True):
    start_at = time.time()
    x_encoded = tk.texts_to_sequences([ex_text])
    res_test=np.array(pad_sequences(x_encoded, maxlen=max_len, padding='post'))
    # Predict
    score = model.predict([res_test])
    # Decode sentiment
    label = decode_sentiment(score, include_neutral=include_neutral)
    
    return {"label": label, "score": float(score),
       "elapsed_time": time.time()-start_at}

In [25]:
predict("That's so sad")

{'elapsed_time': 0.042258501052856445,
 'label': 'NEGATIVE',
 'score': 0.05291527509689331}

# 트위터 데이터로 예측해보기

In [26]:
#트위터 샘플문장
samples=pd.read_csv(my_path+'/sample.txt', sep = "\n",encoding = "utf8",header=None)

In [103]:
df_res = pd.DataFrame({'text':[],
                   'label':[],
                   'score':[],
                  'elapsed_time':[]}) #결과 dataframe 생성

In [106]:
for col,item in samples.iterrows():
  res=predict(item[0])
  df_res.loc[col]=[ item, res['label'], res['score'],res['elapsed_time'] ]

df_res

Unnamed: 0,text,label,score,elapsed_time
0,"0 You are so great. Name: 0, dtype: object",POSITIVE,0.848741,0.041311
1,"0 I’m gonna stop tweeting coz Name: 1, dtyp...",NEUTRAL,0.489361,0.029971
2,0 If you’re a massive fan of someone and th...,POSITIVE,0.774252,0.03128
3,0 Imagine if I was actually like that hahah...,POSITIVE,0.849568,0.035429
4,0 I just can’t get over how good this song ...,NEUTRAL,0.685502,0.027289
5,0 NEXT YEARS CALENDAR ? https://t.co/eW8jPa...,POSITIVE,0.810429,0.027617
6,0 Let go of your story line. Try something ...,POSITIVE,0.761244,0.027318
7,0 WAHOOOOOOOOOOOOOO https://t.co/JvkXq5aYUF...,POSITIVE,0.738181,0.026969
8,0 OI OIIIIIIIIIIII @BBCR1 https://t.co/5gvI...,NEUTRAL,0.662776,0.03046
9,0 YESSSSSSSS HAHHAHAHA https://t.co/zqwDrCB...,POSITIVE,0.826386,0.027157
