In [1]:
# Mount Google Drive at /gdrive; force_remount=True is passed so a re-run of this
# cell requests a fresh mount.
from google.colab import drive
drive.mount('/gdrive', force_remount = True)

Mounted at /gdrive


# Import Libraries

In [2]:
import pandas as pd
from pandas import DataFrame as df

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

# Word2vec
import gensim
from gensim.models import Word2Vec #@
from gensim.utils import simple_preprocess #@
from gensim.models.keyedvectors import KeyedVectors #@

#Keras
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model #모델 저장

#sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Utility
import re
import numpy as np
import time #수행시간 측정
from google.colab import files #colab에 모델 save,load

# emoji패키지 설치
# 영구 설치 가능, https://sikaleo.tistory.com/m/78?category=932203
!pip install emoji
import emoji

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Collecting emoji
[?25l  Downloading https://files.pythonhosted.org/packages/ff/1c/1f1457fe52d0b30cbeebfd578483cedb3e3619108d2d5a21380dfecf8ffd/emoji-0.6.0.tar.gz (51kB)
[K     |████████████████████████████████| 51kB 1.7MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-0.6.0-cp36-none-any.whl size=49716 sha256=3ec1cf4d2f5f5e039ada820d66f975352c9cb15c72e49f4ee67396ce3f104d25
  Stored in directory: /root/.cache/pip/wheels/46/2c/8b/9dcf5216ca68e14e0320e283692dce8ae321cdc01e73e17796
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-0.6.0


In [3]:
# =============== Settings =============== #

# DATASET
# Column names applied to the header-less training CSV.
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
# Fraction of rows kept for training; the split uses test_size = 1 - TRAIN_SIZE.
TRAIN_SIZE = 0.8

# TEXT CLEANING
# Raw string so that "\S" is not treated as an (invalid) string escape sequence;
# the pattern itself is unchanged. It matches @mentions, http(s) links and any run
# of characters other than ASCII letters and digits.
# NOTE(review): the third alternative `http?:\S` looks like a typo, but it is kept
# as-is because the text fed to the tokenizer was cleaned with exactly this pattern.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# Preprocessing resources
# English stop-word list; preprocess() drops every token found in it.
stop_words = stopwords.words("english")
# Stemmer applied by preprocess() only when it is called with stem=True.
stemmer = SnowballStemmer("english")

# KERAS
# NOTE(review): these three constants are not referenced by any cell visible in
# this notebook; presumably they belong to the training run — confirm.
SEQUENCE_LENGTH = 300
EPOCHS = 8
BATCH_SIZE = 1024

# SENTIMENT
# Label strings used for both the decoded dataset targets and the predictions.
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
# (negative upper bound, positive lower bound) used when decoding a score:
# score <= 0.4 -> NEGATIVE, score >= 0.7 -> POSITIVE, anything in between -> NEUTRAL.
SENTIMENT_THRESHOLDS = (0.4, 0.7)

# Load Data

In [4]:
# Base folder on the mounted Drive; train.csv and text-CNN.h5 are loaded from here.
my_path = '/gdrive/My Drive/colab/'

In [5]:
# Load the training data (the CSV has no header row, so column names are supplied)
train_csv = my_path + 'train.csv'
dataset = pd.read_csv(train_csv, names=DATASET_COLUMNS, encoding=DATASET_ENCODING)
print(dataset.shape)  # 1600000,6

(1600000, 6)


In [6]:
dataset.head() # target codes: 0 = negative, 4 = positive

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


디코더 생성

In [7]:
# Numeric target code -> sentiment name
decode_map = {
    0: NEGATIVE,
    2: NEUTRAL,
    4: POSITIVE,
}


def decode_sentiment(label):
    """Translate a numeric target code (0, 2 or 4) into its sentiment name.

    The value is converted with int() first, so numeric strings are accepted.
    Raises KeyError for a code that is not in decode_map.

    NOTE: a later cell defines another function with this same name (taking a
    score instead of a code); once that cell has run, this version is shadowed.
    """
    code = int(label)
    return decode_map[code]

In [8]:
%%time
dataset.target = dataset.target.apply(lambda x: decode_sentiment(x))

CPU times: user 503 ms, sys: 1.54 ms, total: 505 ms
Wall time: 506 ms


# Pre-Processing 

클리닝 텍스트

In [9]:
# Text preprocessing for the training data
def preprocess(text, stem=False):
    """Clean one piece of text and return it as a single space-joined string.

    The input is converted to str and lower-cased, every match of
    TEXT_CLEANING_RE (mentions, links, non-alphanumeric runs) is replaced by a
    space, and tokens found in stop_words are dropped. When `stem` is true the
    remaining tokens are stemmed with the module-level stemmer.
    """
    cleaned = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    kept = (word for word in cleaned.split() if word not in stop_words)
    if stem:
        kept = (stemmer.stem(word) for word in kept)
    return " ".join(kept)

In [10]:
# Run the text preprocessing over the whole dataset (stemming left off)
dataset.text = dataset.text.apply(preprocess)

학습 데이터 나누기 

In [11]:
# Hold out (1 - TRAIN_SIZE) of the rows for testing; the seed is fixed so the split
# is the same on every run.
train, test = train_test_split(dataset, test_size=1-TRAIN_SIZE, random_state=42)
for split_name, split in (("TRAIN", train), ("TEST", test)):
    print(f"{split_name} size: {len(split)}")

TRAIN size: 1280000
TEST size: 320000


In [12]:
# NOTE(review): `documents` is not used by any later cell visible here; presumably
# it is left over from Word2Vec training — confirm before removing.
documents = [_text.split() for _text in train.text] # list of per-tweet token lists (original note: 1280000*50)

In [13]:
# Fit the tokenizer on the training split only, then convert both splits to
# sequences of word indices with that same vocabulary.
vocab_size = 400000
tk = Tokenizer(num_words=vocab_size)
tk.fit_on_texts(train.text)
x_train, x_test = (tk.texts_to_sequences(split.text) for split in (train, test))

In [14]:
# Sentiment names found in the training targets, with NEUTRAL added by hand
# (the printed result shows the data itself only yields POSITIVE and NEGATIVE).
labels = [*train.target.unique().tolist(), NEUTRAL]
print(labels)

# Maps each sentiment string to an integer class id
encoder = LabelEncoder().fit(train.target.tolist())

# Encode both splits and reshape each result into a single column, shape (n, 1)
y_train, y_test = (
    encoder.transform(split.target.tolist()).reshape(-1, 1)
    for split in (train, test)
)

['POSITIVE', 'NEGATIVE', 'NEUTRAL']


# Build Model

In [15]:
# Longest training sequence, in tokens (50 on the recorded run); used as the padding length
max_len = max(map(len, x_train))

In [16]:
# Pad every training sequence at the end ('post') up to max_len
padded_train = pad_sequences(x_train, maxlen=max_len, padding='post')
X_train = np.array(padded_train)
print(X_train.shape, y_train.shape)

(1280000, 50) (1280000, 1)


text-CNN 모델 로드

In [17]:
# Load the saved text-CNN model file from the Drive folder.
model = load_model(my_path+'text-CNN.h5')

In [18]:
def decode_sentiment(score, include_neutral=True):
    """Turn a model score into a sentiment label.

    With include_neutral (the default) the two SENTIMENT_THRESHOLDS split the
    score range into three bands: at or below the first is NEGATIVE, at or
    above the second is POSITIVE, and everything in between is NEUTRAL.
    Without it, the score is simply cut at 0.5 into NEGATIVE / POSITIVE.

    NOTE: this redefines the earlier decode_sentiment(label) helper of the same
    name, which is no longer reachable after this cell runs.
    """
    if not include_neutral:
        return NEGATIVE if score < 0.5 else POSITIVE

    if score <= SENTIMENT_THRESHOLDS[0]:
        return NEGATIVE
    if score >= SENTIMENT_THRESHOLDS[1]:
        return POSITIVE
    return NEUTRAL

In [19]:
def predict(ex_text, include_neutral=True):
    """Score a single text with the loaded model and label its sentiment.

    Args:
        ex_text: the text to classify.
        include_neutral: forwarded to decode_sentiment; when true the result
            may be NEUTRAL, otherwise only NEGATIVE or POSITIVE.

    Returns:
        dict with keys "label" (sentiment name), "score" (Python float) and
        "elapsed_time" (seconds spent inside this call).

    NOTE(review): the training text went through preprocess() before the
    tokenizer was fitted, but ex_text is tokenized exactly as given — confirm
    whether callers are expected to clean it first.
    """
    start_at = time.time()
    # Encode with the tokenizer fitted on the training split and pad to the
    # same length the training matrix was padded to.
    x_encoded = tk.texts_to_sequences([ex_text])
    res_test = pad_sequences(x_encoded, maxlen=max_len, padding='post')
    # model.predict returns an array, not a scalar. Reduce it to one Python float
    # up front: calling float() on a multi-dimensional array is deprecated in
    # NumPy, and the thresholds below should be compared against a plain number.
    score = float(np.asarray(model.predict(res_test)).ravel()[0])
    # Decode sentiment
    label = decode_sentiment(score, include_neutral=include_neutral)

    return {"label": label, "score": score,
       "elapsed_time": time.time()-start_at}

In [20]:
# Quick check of predict() on a single sentence
predict("That's so sad")

{'elapsed_time': 0.46875429153442383,
 'label': 'NEGATIVE',
 'score': 0.05291527509689331}

# 트위터 데이터로 예측해보기

In [21]:
# Tweet text preprocessing
def _demojize_alias(text):
    """Replace emoji in `text` with their :alias: shortcodes, on old or new emoji releases."""
    try:
        # API of the emoji release this notebook was originally run with
        return emoji.demojize(text, use_aliases=True)
    except TypeError:
        # Newer emoji releases no longer accept use_aliases; aliases are selected
        # through the language argument instead.
        return emoji.demojize(text, language='alias')


def preproc_tweets(request_id, data_path='/gdrive/My Drive/colab/data/'):
    """Load one account's tweet file and add a cleaned copy of every tweet.

    Args:
        request_id: account ID requested for analysis, e.g. "@tim_cook". The
            file read is data_path + 'tweets_' + request_id + '.txt'.
        data_path: folder prefix (including the trailing slash) that holds the
            tweet files.

    Returns:
        DataFrame with one row per non-blank line of the file and two columns:
        'data' (the line as read) and 'preprocess' (lower-cased, '@' and '#'
        replaced by a space, emoji replaced by their text aliases).
    """
    tweet_file = data_path + 'tweets_' + request_id + '.txt'

    # The file holds one tweet per line. Read the lines directly rather than via
    # pd.read_csv(sep="\n"): current pandas rejects a newline separator, and CSV
    # parsing would also apply quote handling and NA conversion to free text.
    # The with-block closes the file, so no explicit close() is needed.
    with open(tweet_file, 'r', encoding="utf-8") as f:
        lines = [line.rstrip('\r\n') for line in f]
    # Blank lines are skipped; dtype=object keeps .str usable on an empty file.
    tweets = pd.DataFrame(
        {'data': pd.Series([line for line in lines if line.strip()], dtype=object)}
    )

    # Lower-case
    tweets['preprocess'] = tweets['data'].str.lower()

    # Replace '@' and '#' with a space
    tweets['preprocess'] = tweets['preprocess'].replace(to_replace = "(@|#)", value = " ", regex = True)

    # Convert emoji to text aliases in one pass instead of writing element by
    # element through an alias of the column.
    tweets['preprocess'] = tweets['preprocess'].map(_demojize_alias)

    return tweets

@AdinaPorter
@aliciakeys
@AnneMarie
@BillGates
@birdy
@charlieputh
@ChrisEvans
@DanReynolds
@DojaCat
@DwyaneWade
@elliegoulding
@elonmusk
@IGGYAZALEA
@JaredDudley619
@jason_mraz
@jelani9
@Kehlani
@liamgallagher
@LukasGraham
@MariahCarey
@marshmellomusic
@NiallOfficial
@ParisHilton
@Pink
@rihanna
@RobertDowneyJr
@robreiner
@TheEllenShow
@tim_cook
@Zedd

In [22]:
# Fetch the Twitter text file to analyse from the DB
# An account ID is required as input
request_id = "@tim_cook"

# Reads the file for this account and returns the raw and cleaned text side by side
tweet_data = preproc_tweets(request_id)

tweet_data


Unnamed: 0,data,preprocess
0,The fires across the West Coast and storms hit...,the fires across the west coast and storms hit...
1,Grateful to the employees who joined Apple in ...,grateful to the employees who joined apple in ...
2,We’re proud to celebrate the power of unity an...,we’re proud to celebrate the power of unity an...
3,"Together we’re creating a brighter, more inclu...","together we’re creating a brighter, more inclu..."
4,Thank you @StevieWonder and @ACBnational!,thank you steviewonder and acbnational!
5,On the 10th anniversary of the passage of the ...,on the 10th anniversary of the passage of the ...
6,Humanity thrives when everyone has the ability...,humanity thrives when everyone has the ability...
7,We created the Health app because we believe e...,we created the health app because we believe e...
8,Proud to work alongside institutions in the UK...,proud to work alongside institutions in the uk...
9,“A great soul never dies. It brings us togethe...,“a great soul never dies. it brings us togethe...


In [23]:
# Create the (empty) result dataframe: one column per field reported for each tweet
result_columns = ('text', 'label', 'score', 'elapsed_time')
df_res = pd.DataFrame({column: [] for column in result_columns})

In [24]:
# Score every tweet: the cleaned text goes to the model, the original text is
# kept for display. Columns are addressed by name, because positional item[0] /
# item[1] lookups on a labelled row are deprecated in pandas. Rows are collected
# first and the frame is built once instead of being grown with .loc per row.
results = []
for _, row in tweet_data.iterrows():
    res = predict(row['preprocess'])
    results.append([row['data'], res['label'], res['score'], res['elapsed_time']])

df_res = pd.DataFrame(results, index=tweet_data.index,
                      columns=['text', 'label', 'score', 'elapsed_time'])

df_res

Unnamed: 0,text,label,score,elapsed_time
0,The fires across the West Coast and storms hit...,NEGATIVE,0.171368,0.04778
1,Grateful to the employees who joined Apple in ...,NEUTRAL,0.437043,0.040954
2,We’re proud to celebrate the power of unity an...,POSITIVE,0.939416,0.043305
3,"Together we’re creating a brighter, more inclu...",POSITIVE,0.744879,0.042855
4,Thank you @StevieWonder and @ACBnational!,POSITIVE,0.920831,0.043294
5,On the 10th anniversary of the passage of the ...,POSITIVE,0.917318,0.037186
6,Humanity thrives when everyone has the ability...,NEGATIVE,0.349453,0.035052
7,We created the Health app because we believe e...,NEUTRAL,0.639461,0.034859
8,Proud to work alongside institutions in the UK...,POSITIVE,0.793581,0.039837
9,“A great soul never dies. It brings us togethe...,NEUTRAL,0.647015,0.035186


In [27]:
# Number of tweets per predicted label
df_res['label'].value_counts()

POSITIVE    9
NEUTRAL     5
NEGATIVE    2
Name: label, dtype: int64

In [29]:
# Share of tweets per predicted label, formatted as a percentage string
label_share = df_res['label'].value_counts(normalize=True)
(label_share * 100).round(2).astype(str) + '%'

POSITIVE    56.25%
NEUTRAL     31.25%
NEGATIVE     12.5%
Name: label, dtype: object