In [1]:
from google.colab import drive
drive.mount('/gdrive', force_remount = True)

Mounted at /gdrive


# Import Library

In [2]:
# tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# nltk
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Word2vec
import gensim
from gensim.models import Word2Vec #@
from gensim.utils import simple_preprocess #@
from gensim.models.keyedvectors import KeyedVectors #@

# Keras
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model # 모델 저장

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Utility
import re
import numpy as np
import pandas as pd
from pandas import DataFrame as df
from google.colab import data_table
%unload_ext google.colab.data_table
import time # 수행시간 측정
from google.colab import files # colab에 모델 save,load
from collections import Counter


# emoji
!pip install emoji
import emoji

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
The google.colab.data_table extension is not loaded.


In [3]:
# =============== Settings =============== #

# DATASET
# Column names supplied to pd.read_csv and the encoding used to read the file
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
# Fraction of the data kept for training; the remainder becomes the test split
TRAIN_SIZE = 0.8

# TEXT CLEANING
# Every match of this pattern is replaced by a space in preprocess().
# Written as a raw string so that "\S" reaches the regex engine without
# relying on Python passing an unrecognised escape sequence through.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# Preprocessing
# Held as a set: preprocess() tests every token for membership, and a set
# lookup avoids scanning the whole stop-word list once per token.
stop_words = set(stopwords.words("english"))
stemmer = SnowballStemmer("english")

# KERAS
# NOTE(review): these three values are not referenced by the cells visible
# in this notebook - presumably left over from the training run; confirm.
SEQUENCE_LENGTH = 300
EPOCHS = 8
BATCH_SIZE = 1024

# SENTIMENT
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
# (upper bound for NEGATIVE, lower bound for POSITIVE) as used by the
# score-based decode_sentiment(); scores in between are labelled NEUTRAL
SENTIMENT_THRESHOLDS = (0.4, 0.7)

# Load Data

In [4]:
my_path = '/gdrive/My Drive/Colab Notebooks/'

In [5]:
# Load the training data: train.csv under my_path, read with DATASET_ENCODING.
# Column names come from DATASET_COLUMNS (passed through names=).
dataset = pd.read_csv(my_path + 'train.csv', encoding = DATASET_ENCODING, names = DATASET_COLUMNS)
print(dataset.shape) # the recorded run printed (1600000, 6)

(1600000, 6)


In [6]:
dataset.head() # negative:0, positive:4

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


디코더 생성

In [7]:
# Lookup from the dataset's numeric target code to its sentiment name
decode_map = {
    0: NEGATIVE,
    2: NEUTRAL,
    4: POSITIVE,
}


def decode_sentiment(label):
    """Return the sentiment name stored in `decode_map` for a target code.

    `label` is converted with int() before the lookup. A code that is not a
    key of `decode_map` raises KeyError.
    """
    code = int(label)
    return decode_map[code]

In [8]:
%%time
# Replace each numeric target code with its sentiment name
dataset["target"] = dataset["target"].apply(decode_sentiment)

CPU times: user 512 ms, sys: 3.78 ms, total: 516 ms
Wall time: 517 ms


# Pre-Processing 

클리닝 텍스트

In [9]:
# Text cleaning applied to the training tweets
def preprocess(text, stem=False):
    """Clean one tweet and return its remaining tokens joined by spaces.

    The input is converted with str() and lower-cased, every match of
    TEXT_CLEANING_RE (links, @users, non-alphanumeric runs) is replaced by
    a single space, and the result is split on whitespace. Tokens found in
    `stop_words` are dropped; the survivors go through `stemmer.stem` only
    when `stem` is true.
    """
    cleaned = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    kept = (word for word in cleaned.split() if word not in stop_words)
    if stem:
        kept = (stemmer.stem(word) for word in kept)
    return " ".join(kept)

In [10]:
# Run the cleaning step over every training tweet
dataset["text"] = dataset["text"].apply(preprocess)

학습 데이터 나누기 

In [11]:
# Split the cleaned dataset into train and test parts. The test share is
# 1 - TRAIN_SIZE, and random_state is fixed at 42.
train, test = train_test_split(dataset, test_size=1-TRAIN_SIZE, random_state=42)
print("TRAIN size:", len(train))
print("TEST size:", len(test))

TRAIN size: 1280000
TEST size: 320000


In [12]:
documents = [_text.split() for _text in train.text] #list, 1280000*50

In [13]:
# Fit a Keras Tokenizer on the training text only, then turn both splits
# into integer sequences with that same fitted tokenizer.
vocab_size = 400000
tk = Tokenizer(num_words=vocab_size)
tk.fit_on_texts(train.text)
x_train = tk.texts_to_sequences(train.text)
x_test = tk.texts_to_sequences(test.text)

In [14]:
# Sentiment names that occur in the training targets, with NEUTRAL appended
# explicitly afterwards. Within this cell `labels` is only printed.
labels = train.target.unique().tolist()
labels.append(NEUTRAL)
print(labels)

# Map the string targets to integers; the encoder is fitted on the
# training targets only and then applied to both splits.
encoder = LabelEncoder()
encoder.fit(train.target.tolist())

y_train = encoder.transform(train.target.tolist())
y_test = encoder.transform(test.target.tolist())

# Reshape both label arrays into a single column
y_train = y_train.reshape(-1,1)
y_test = y_test.reshape(-1,1)

['POSITIVE', 'NEGATIVE', 'NEUTRAL']


# Build Model

In [15]:
# Length of the longest tokenised training sequence (50 in the recorded run)
max_len=max(len(l) for l in x_train)

In [16]:
# Pad every training sequence at the end ('post') up to max_len
X_train = np.array(pad_sequences(x_train, maxlen=max_len, padding='post'))
print(X_train.shape, y_train.shape)

(1280000, 50) (1280000, 1)


text-CNN 모델 로드

In [17]:
model = load_model(my_path+'text-CNN.h5')

In [18]:
# NOTE: this reuses the name of the label-decoding helper defined earlier in
# the notebook, so that earlier function is replaced once this cell has run.
def decode_sentiment(score, include_neutral=True):
    """Turn a model score into a sentiment name.

    With `include_neutral` true, a score at or below SENTIMENT_THRESHOLDS[0]
    is NEGATIVE, a score at or above SENTIMENT_THRESHOLDS[1] is POSITIVE,
    and anything else is NEUTRAL. With `include_neutral` false, the result
    is NEGATIVE when the score is below 0.5 and POSITIVE otherwise.
    """
    if not include_neutral:
        # Two-way decision with a fixed 0.5 cut-off
        if score < 0.5:
            return NEGATIVE
        return POSITIVE

    # Three-way decision driven by the configured thresholds
    if score <= SENTIMENT_THRESHOLDS[0]:
        return NEGATIVE
    if score >= SENTIMENT_THRESHOLDS[1]:
        return POSITIVE
    return NEUTRAL

In [19]:
def predict(ex_text, include_neutral=True):
    """Score one piece of text with the loaded model and label its sentiment.

    Args:
        ex_text: Text to classify. It is converted to an integer sequence
            with the fitted tokenizer `tk` and padded at the end to
            `max_len`.
        include_neutral: Forwarded to `decode_sentiment`.

    Returns:
        dict with the keys "label" (sentiment name), "score" (the model
        output as a Python float) and "elapsed_time" (seconds spent on
        tokenising, predicting and decoding).

    NOTE(review): `ex_text` is not passed through `preprocess`, although the
    tokenizer was fitted on preprocessed text - confirm this is intended.
    """
    start_at = time.time()
    x_encoded = tk.texts_to_sequences([ex_text])
    # pad_sequences already returns a NumPy array; no extra np.array() copy
    res_test = pad_sequences(x_encoded, maxlen=max_len, padding='post')
    # Predict
    raw_score = model.predict([res_test])
    # The model returns an array (presumably shape (1, 1) for one input).
    # Take the single element explicitly: calling float() on an array with
    # ndim > 0 is deprecated in NumPy.
    score = float(np.asarray(raw_score).reshape(-1)[0])
    # Decode sentiment
    label = decode_sentiment(score, include_neutral=include_neutral)

    return {"label": label, "score": score,
       "elapsed_time": time.time()-start_at}

In [20]:
predict("That's so sad")

{'elapsed_time': 0.2158803939819336,
 'label': 'NEGATIVE',
 'score': 0.05291527509689331}

# 트위터 데이터로 예측해보기

트위터 문장 감정 분석 - 전처리

In [21]:
# Sentence-level preprocessing of tweets
class preproc_Sentence:
    """Load a user's tweet file and normalise each tweet for sentiment scoring.

    The methods are static: they can be called on the class
    (`preproc_Sentence.readTweets(...)`) or on an instance.
    """

    def __init__(self):
        pass

    @staticmethod
    def readTweets(request_id):
        """Read 'twitter_<request_id>.txt' into a one-column DataFrame.

        Args:
            request_id: Account identifier used to build the file name.

        Returns:
            DataFrame with a single column 'data', one row per non-blank
            line of the file. Lines are kept verbatim (no CSV quote
            handling).

        Raises:
            OSError: if the file cannot be opened.
        """
        data_path = '/gdrive/My Drive/Colab Notebooks/data/'
        filename = 'twitter_' + request_id + '.txt'

        # Open the tweet file of the requested account. The lines are read
        # directly instead of via pd.read_csv(sep="\n"), which current
        # pandas versions reject with a ValueError.
        with open(data_path + filename, 'r', encoding="utf-8") as f:
            lines = [line.rstrip('\n') for line in f]

        return pd.DataFrame({'data': [line for line in lines if line.strip()]})

    @staticmethod
    def _demojize(text):
        """Replace emoji in `text` by their alias names (e.g. ':smile:')."""
        try:
            return emoji.demojize(text, use_aliases=True)
        except TypeError:
            # emoji releases that no longer accept `use_aliases` take
            # language='alias' instead.
            return emoji.demojize(text, language='alias')

    @staticmethod
    def preprocTweets(tweets):
        """Add a normalised 'preprocess' column to `tweets` and return it.

        The frame is modified in place: URLs in 'data' are replaced by
        "URL ", and 'preprocess' holds the lower-cased text with e-mail
        addresses, @mentions and hashtags replaced by the placeholders
        EMAIL, USERID and HASHTAG, and emoji replaced by alias names.

        Args:
            tweets: DataFrame with a string column 'data'.

        Returns:
            The same DataFrame object.
        """
        # Replace URLs (done on the raw column, before lower-casing, so the
        # placeholder ends up as "url " in the 'preprocess' column)
        tweets['data'] = tweets['data'].replace(
            to_replace=r"((www\.[^\s]+)|(https?://[^\s]+))", value="URL ", regex=True)
        # Lower-case
        text = tweets['data'].str.lower()
        # Replace e-mail addresses. This must run before the @mention step,
        # otherwise the mention pattern would consume the "@domain" part.
        text = text.replace(
            to_replace=r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+", value="EMAIL", regex=True)
        # Replace @mentions
        text = text.replace(to_replace=r"@[^\s]+", value="USERID", regex=True)
        # Replace hashtags
        text = text.replace(to_replace=r"#([^\s]+)", value="HASHTAG", regex=True)
        # Replace emoji; apply() avoids writing into the Series by position
        tweets['preprocess'] = text.apply(preproc_Sentence._demojize)

        return tweets

트위터 단어 카운트 - 전처리

In [22]:
class preproc_Word:
    """Word-level preprocessing of a user's tweets for frequency counting.

    The methods are static: they can be called on the class or on an
    instance.
    """

    def __init__(self):
        pass

    @staticmethod
    def readTweet(request_id):
        """Return the whole content of 'twitter_<request_id>.txt' as one string.

        Raises:
            OSError: if the file cannot be opened.
        """
        data_path = '/gdrive/My Drive/Colab Notebooks/data/'
        filename = 'twitter_' + request_id + '.txt'

        # Open the tweet file of the requested account
        with open(data_path + filename, 'r', encoding="utf-8") as file:
            tweet = file.read()

        return tweet

    @staticmethod
    def preprocWordTweet(tweet):
        """Lower-case `tweet` and strip URLs, e-mails, @ids, punctuation and numbers.

        URLs, e-mail addresses and @ids are removed while their punctuation
        is still present; stripping punctuation first would leave nothing
        for those patterns to match.

        Args:
            tweet: Text to clean.

        Returns:
            The cleaned text. Removed items leave their surrounding
            whitespace behind.
        """
        # Lower-case
        tweet = tweet.lower()
        # Remove URLs
        tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '', tweet)
        # Remove e-mail addresses (before @ids, which would otherwise eat
        # the "@domain" part)
        tweet = re.sub(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+', '', tweet)
        # Remove @ids (\w also covers the underscore)
        tweet = re.sub(r'@\w+', '', tweet)
        # Remove punctuation
        tweet = re.sub(r'[^\w\s]', '', tweet)
        # Remove stand-alone numbers only, so that "a 2nd" is not glued
        # together into "and"
        tweet = re.sub(r'\b[0-9]+\b', '', tweet)

        return tweet

    @staticmethod
    def tokenizeWord(tweet):
        """Tokenise `tweet` with word_tokenize and drop English stop words.

        Returns:
            list of the remaining tokens, in their original order.
        """
        # Note: rebuilds the same English stop-word list that the settings
        # cell stores in the module-level `stop_words`
        stop_words = set(stopwords.words('english'))
        return [w for w in word_tokenize(tweet) if w not in stop_words]

    @staticmethod
    def stemmerWord(res):
        """Return the Snowball (English) stem of every token in `res`."""
        # Note: builds the same stemmer as the module-level `stemmer`
        stemmer = SnowballStemmer('english')
        words = [stemmer.stem(w) for w in res]
        return words

단어 카운트

In [23]:
class word_COUNT:
    """Print the most frequent word stems in a user's tweets."""

    def __init__(self):
        pass

    @staticmethod
    def countWord(request_id):
        """Print the five most common stemmed words for `request_id`.

        The tweet file is read, cleaned, tokenised and stemmed through the
        `preproc_Word` helpers, then counted with collections.Counter.
        Nothing is returned. Declared static so the call works both on the
        class and on an instance.
        """
        tweet = preproc_Word.readTweet(request_id)
        tweet = preproc_Word.preprocWordTweet(tweet)
        res = preproc_Word.tokenizeWord(tweet)
        words = preproc_Word.stemmerWord(res)
        print('자주 사용하는 단어 TOP5')
        print(Counter(words).most_common(n=5))
        print()

트윗 감정 분석하기

In [24]:
class tweet_SentimentAnalyse:
    """Score every tweet of a user and summarise the sentiment labels.

    The methods are static: they can be called on the class or on an
    instance.
    """

    def __init__(self):
        pass

    @staticmethod
    def sentimentAnalyse(request_id):
        """Run `predict` on every preprocessed tweet of `request_id`.

        Returns:
            DataFrame with the columns 'text' (the 'data' column as left by
            preprocTweets), 'label', 'score' and 'elapsed_time', indexed
            like the tweet frame.
        """
        tweets = preproc_Sentence.readTweets(request_id)
        tweets_data = preproc_Sentence.preprocTweets(tweets)

        # Collect one record per tweet and build the frame once, instead of
        # growing it row by row. Columns are addressed by name rather than
        # by position within the row.
        records = []
        for shown_text, model_text in zip(tweets_data['data'], tweets_data['preprocess']):
            res = predict(model_text)
            records.append({'text': shown_text,
                            'label': res['label'],
                            'score': res['score'],
                            'elapsed_time': res['elapsed_time']})

        return pd.DataFrame(records,
                            columns=['text', 'label', 'score', 'elapsed_time'],
                            index=tweets_data.index)

    @staticmethod
    def countTypes(df_res):
        """Print the percentage share of each value in df_res['label'].

        Nothing is returned.
        """
        # Share per label, as a percentage rounded to two decimals
        print('트윗 문장 감정 비율')
        print(df_res['label'].value_counts(normalize=True).mul(100).round(2).astype(str)+'%')
        print()

@AdinaPorter
@aliciakeys
@AnneMarie
@BillGates
@birdy
@charlieputh
@ChrisEvans
@DanReynolds
@DojaCat
@DwyaneWade
@elliegoulding
@elonmusk
@IGGYAZALEA
@JaredDudley619
@jason_mraz
@jelani9
@Kehlani
@liamgallagher
@LukasGraham
@MariahCarey
@marshmellomusic
@NiallOfficial
@ParisHilton
@Pink
@rihanna
@RobertDowneyJr
@robreiner
@TheEllenShow
@tim_cook
@Zedd

# MAIN 함수

In [25]:
if __name__ == "__main__":
    # Account whose tweet file is analysed
    request_id = '@aliciakeys'
    # Sentiment analysis of the account's tweets. Sentence preprocessing
    # happens inside sentimentAnalyse, so no preprocessing object is created
    # here. df_res stays a notebook-level variable.
    df_res = tweet_SentimentAnalyse.sentimentAnalyse(request_id)
    tweet_SentimentAnalyse.countTypes(df_res)
    print()
    # Word count. Word preprocessing happens inside countWord.
    word_COUNT.countWord(request_id)

트윗 문장 감정 비율
POSITIVE    59.15%
NEUTRAL     23.24%
NEGATIVE    17.61%
Name: label, dtype: object


자주 사용하는 단어 TOP5
[('love', 18), ('alicia', 15), ('album', 12), ('im', 11), ('yall', 9)]



In [26]:
data_table.DataTable(df_res)

Unnamed: 0,text,label,score,elapsed_time
0,Sending you some of that GOOD Sunday love!!!,POSITIVE,0.890622,0.032356
1,Tell me 3 things you’re grateful for.... Me fi...,POSITIVE,0.879244,0.036116
2,My brother @inglewoodSiR buggin’!!!!,NEUTRAL,0.523129,0.031741
3,"Ask and you shall receive, our special version...",NEUTRAL,0.511510,0.030600
4,I feel that #ALICIA love so deeply ya’ll.,POSITIVE,0.751576,0.031121
...,...,...,...,...
137,What does soulcare mean to you?,NEGATIVE,0.252066,0.032799
138,💫💫💫,NEGATIVE,0.094941,0.032558
139,☕️☕️☕️😉😉😉,POSITIVE,0.888256,0.032392
140,💥💥💥💥💥💥,POSITIVE,0.915786,0.032298
