# Import Library

In [None]:
import pandas as pd
from pandas import DataFrame as df

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# nltk
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
stop_words = stopwords.words("english")
from  nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("english")
from nltk.tokenize import word_tokenize

#Keras
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model #모델 저장

#sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Utility
import re
import numpy as np
import time #수행시간 측정
from google.colab import files #colab에 모델 save,load
from collections import Counter
import json

# emoji
!pip install emoji
import emoji

#Colab
from google.colab import drive

In [None]:
# Mount Google Drive under /gdrive, forcing a remount of any existing mount.
drive.mount('/gdrive', force_remount = True) # alternative mount point: drive.mount('/content/gdrive')
# Base directory from which the word index, the model file and the tweet data are read.
my_path = '/gdrive/My Drive/Colab Notebooks/' # alternative: '/content/gdrive/My Drive/Colab Notebooks/'

# Model

In [None]:
# DATASET
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
TRAIN_SIZE = 0.8
MAX_LEN = 50  # length that predict() pads/truncates each encoded text to
VOCAB_SIZE = 400000  # passed to Tokenizer(num_words=...)

# TEXT CLEANING
# Raw string: "\S" is a regex escape, not a valid string escape, so a plain
# literal triggers an invalid-escape warning on recent Python versions.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# KERAS
SEQUENCE_LENGTH = 300
EPOCHS = 8
BATCH_SIZE = 1024

# SENTIMENT
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
# (lower, upper): decode_sentiment() labels scores below `lower` NEGATIVE and
# scores above `upper` POSITIVE; everything in between is NEUTRAL.
SENTIMENT_THRESHOLDS = (0.35, 0.7)

미리 학습된 토크나이저

In [None]:
# Build a tokenizer and attach a previously saved word index to it
# (presumably the index produced when the tokenizer was originally fitted).
tk = Tokenizer(num_words=VOCAB_SIZE)

# JSON text is UTF-8 by specification, so do not rely on the platform's
# default encoding when reading it.
with open(my_path + 'wordIndex.json', encoding='utf-8') as json_file:
    word_index = json.load(json_file)

tk.word_index = word_index

pruned80 모델 로드

In [None]:
# Load the saved model from the .h5 file under my_path; predict() scores text with it.
model = load_model(my_path+'pruned80_tCNN.h5')

분석 결과 라벨링

In [None]:
def decode_sentiment(score, include_neutral=True):
    """Translate a model score into a sentiment label.

    Args:
        score: Score produced by the model.
        include_neutral: When truthy, use the two cut-offs in
            ``SENTIMENT_THRESHOLDS`` and allow a NEUTRAL result; otherwise
            split the score at 0.5 into NEGATIVE / POSITIVE only.

    Returns:
        One of the ``NEGATIVE``, ``NEUTRAL`` or ``POSITIVE`` constants.
    """
    # Two-class mode: a single fixed cut-off.
    if not include_neutral:
        return POSITIVE if not score < 0.5 else NEGATIVE

    # Three-class mode: strictly below the lower bound is negative, strictly
    # above the upper bound is positive, the rest is neutral.
    if score < SENTIMENT_THRESHOLDS[0]:
        return NEGATIVE
    if score > SENTIMENT_THRESHOLDS[1]:
        return POSITIVE
    return NEUTRAL

In [None]:
def predict(ex_text, include_neutral=True):
    """Score a single text with the loaded model and label its sentiment.

    Args:
        ex_text: The text to classify.
        include_neutral: Forwarded to ``decode_sentiment``; controls whether
            a NEUTRAL label can be returned.

    Returns:
        dict with keys ``"label"`` (sentiment label), ``"score"`` (the model
        output as a Python float) and ``"elapsed_time"`` (wall-clock seconds
        spent in this call).
    """
    start_at = time.time()
    # Encode the one text and pad/truncate it (at the end) to MAX_LEN tokens.
    x_encoded = tk.texts_to_sequences([ex_text])
    res_test = pad_sequences(x_encoded, maxlen=MAX_LEN, padding='post')
    # Predict. The output is expected to hold exactly one value for the one
    # input row; .item() extracts it as a scalar. Calling float() directly on
    # an array with ndim > 0 is deprecated in NumPy.
    score = float(np.asarray(model.predict(res_test)).item())
    # Decode sentiment
    label = decode_sentiment(score, include_neutral=include_neutral)

    return {"label": label, "score": score,
            "elapsed_time": time.time() - start_at}

# 트윗 데이터로 예측

In [None]:
class preproc_Sentence:
    """Sentence-level loading and normalisation of one account's tweet file.

    The methods keep no instance state and are static, so they can be called
    on the class itself (``preproc_Sentence.readTweets(...)``) or on an
    instance.
    """

    def __init__(self):
        pass

    @staticmethod
    def readTweets(request_id):
        """Load the tweets stored for ``request_id``, one tweet per line.

        The file read is ``my_path + 'data/twitter_' + request_id + '.txt'``,
        decoded as UTF-8.

        Args:
            request_id: Account identifier used in the file name.

        Returns:
            DataFrame with a single ``data`` column containing one row per
            non-blank line of the file.
        """
        filename = 'twitter_' + request_id + '.txt'
        data_path = my_path + 'data/'

        # Read the lines directly instead of pd.read_csv(sep="\n"): current
        # pandas rejects "\n" as a separator, and CSV quote handling is not
        # wanted for free text such as tweets.
        with open(data_path + filename, 'r', encoding="utf-8") as f:
            lines = [line.rstrip('\r\n') for line in f if line.strip()]

        # Explicit object dtype keeps the .str accessor usable on an empty file.
        return pd.DataFrame({'data': pd.Series(lines, dtype=object)})

    @staticmethod
    def _demojize(text):
        """Replace emoji in ``text`` with their ``:alias:`` text form."""
        try:
            # Keyword accepted by emoji releases before 2.0.
            return emoji.demojize(text, use_aliases=True)
        except TypeError:
            # emoji >= 2.0 dropped ``use_aliases``; aliases are selected
            # through the ``language`` argument instead.
            return emoji.demojize(text, language='alias')

    @staticmethod
    def preprocTweets(tweets):
        """Normalise tweet text and store it in a ``preprocess`` column.

        ``tweets['data']`` is rewritten with URLs replaced by ``"URL "``. The
        new ``preprocess`` column holds the lower-cased text with e-mail
        addresses, @mentions, hashtags and emoji replaced by placeholder
        tokens / alias names.

        Args:
            tweets: DataFrame with a ``data`` column of tweet strings.

        Returns:
            The same DataFrame, modified in place.
        """
        # Replace URLs
        tweets['data'] = tweets['data'].replace(
            to_replace=r"www\.\S+|https?://\S+", value="URL ", regex=True)
        # Lower-case
        text = tweets['data'].str.lower()
        # Replace e-mail addresses. This runs before the mention step so that
        # the "@domain" part of an address is not mistaken for a mention, and
        # the local part is matched in full rather than its last character.
        text = text.replace(
            to_replace=r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+",
            value="EMAIL", regex=True)
        # Replace @mentions (the pattern previously began with a stray quote
        # character and therefore never matched a plain "@user").
        text = text.replace(to_replace=r"@\S+", value="USERID", regex=True)
        # Replace hashtags
        text = text.replace(to_replace=r"#\S+", value="HASHTAG", regex=True)
        # Replace emoji, element-wise, without relying on the index labels.
        tweets['preprocess'] = text.map(preproc_Sentence._demojize)

        return tweets

In [None]:
class preproc_Word:
    """Word-level loading, cleaning, tokenising and stemming of a tweet file.

    The methods keep no instance state and are static, so they can be called
    on the class itself (``preproc_Word.readTweet(...)``) or on an instance.
    """

    def __init__(self):
        pass

    @staticmethod
    def readTweet(request_id):
        """Return the whole tweet file for ``request_id`` as one string.

        The file read is ``my_path + 'data/twitter_' + request_id + '.txt'``,
        decoded as UTF-8.
        """
        filename = 'twitter_' + request_id + '.txt'
        data_path = my_path + 'data/'

        with open(data_path + filename, 'r', encoding="utf-8") as file:
            return file.read()

    @staticmethod
    def preprocWordTweet(tweet):
        """Lower-case ``tweet`` and strip URLs, e-mails, @handles, punctuation and numbers.

        Args:
            tweet: Raw tweet text.

        Returns:
            The cleaned text. Removed items leave their surrounding
            whitespace behind, so words are never merged.
        """
        # Lower-case
        tweet = tweet.lower()
        # Remove URLs
        tweet = re.sub(r'www\.\S+|https?://\S+', '', tweet)
        # Remove e-mail addresses and then @handles. Both must run while the
        # "@" and "." characters are still present, i.e. before punctuation
        # is stripped; otherwise the patterns can never match.
        tweet = re.sub(r'[a-z0-9_.+-]+@[a-z0-9-]+\.[a-z0-9.-]+', '', tweet)
        tweet = re.sub(r'@\w+', '', tweet)  # \w also covers "_" in handles
        # Remove punctuation
        tweet = re.sub(r'[^\w\s]', '', tweet)
        # Remove standalone numbers, leaving the neighbouring whitespace intact
        tweet = re.sub(r'\b[0-9]+\b', '', tweet)

        return tweet

    @staticmethod
    def tokenizeWord(tweet):
        """Tokenise ``tweet`` and drop every token found in ``stop_words``."""
        # Build the set once so each membership test is O(1).
        stop_set = set(stop_words)
        return [w for w in word_tokenize(tweet) if w not in stop_set]

    @staticmethod
    def stemmerWord(res):
        """Return the stem of every word in ``res``, in the same order."""
        return [stemmer.stem(w) for w in res]

In [None]:
class word_COUNT:
    """Report the most frequent stemmed words in one account's tweets."""

    def __init__(self):
        pass

    @staticmethod
    def countWord(request_id):
        """Print and return the five most common stemmed words for ``request_id``.

        Runs the ``preproc_Word`` steps in order: read the file, clean the
        text, tokenise with stop-word removal, then stem.

        Args:
            request_id: Account identifier passed to ``preproc_Word.readTweet``.

        Returns:
            List of up to five ``(word, count)`` tuples, most common first.
        """
        raw_text = preproc_Word.readTweet(request_id)
        cleaned = preproc_Word.preprocWordTweet(raw_text)
        tokens = preproc_Word.tokenizeWord(cleaned)
        stems = preproc_Word.stemmerWord(tokens)
        top_words = Counter(stems).most_common(n=5)
        print('자주 사용하는 단어 TOP5')
        print(top_words)
        print()
        return top_words

In [None]:
class tweet_SentimentAnalyse:
    """Score every tweet with ``predict`` and print a summary of the results.

    The methods keep no instance state and are static, so they can be called
    on the class itself or on an instance.
    """

    def __init__(self):
        pass

    @staticmethod
    def sentimentAnalyse(tweets_data):
        """Run ``predict`` on each row and collect the results.

        Args:
            tweets_data: DataFrame whose first column is the text to report
                and whose second column is the text passed to ``predict``.

        Returns:
            DataFrame with columns ``text``, ``label``, ``score`` and
            ``elapsed_time``, indexed like ``tweets_data``.
        """
        # TODO: switch to a predict class.
        rows = []
        for _, item in tweets_data.iterrows():
            # Explicit positional access: plain integer keys on a
            # label-indexed Series are deprecated in pandas.
            res = predict(item.iloc[1])
            rows.append([item.iloc[0], res['label'], res['score'],
                         res['elapsed_time']])
        # Build the frame once instead of growing it one row at a time.
        return pd.DataFrame(rows,
                            columns=['text', 'label', 'score', 'elapsed_time'],
                            index=tweets_data.index)

    @staticmethod
    def countTypes(df_res):
        """Print label percentages and two example rows per category.

        The examples are chosen by score only: the two highest scores under
        POSITIVE, the two lowest under NEGATIVE, and the two closest to 0.5
        under NEUTRAL.

        Args:
            df_res: Result frame with ``label`` and ``score`` columns. A
                ``cal`` column (distance of the score from 0.5) is added to
                it in place.
        """
        # Share of each label, as percentages
        print('트윗 문장 감정 비율')
        print(df_res['label'].value_counts(normalize=True).mul(100).round(2).astype(str)+'%')
        print('POSITIVE')
        print(df_res.sort_values(by="score", ascending=False).head(2))
        print('NEGATIVE')
        print(df_res.sort_values(by="score", ascending=True).head(2))
        print('NEUTRAL')
        df_res['cal'] = abs(df_res['score'] - 0.5)
        print(df_res.sort_values(by="cal", ascending=True).head(2))
        print()

트위터 계정 50

@AdinaPorter
@aliciakeys
@AllyBrooke
@altonbrown
@AnneMarie
@Ashton5SOS
@barbarastarrcnn
@BebeRexha
@iambeckyg
@BigSean
@BillGates
@chancetherapper
@charlieputh
@ChrisEvans
@ClintSmithIII
@DamonGupton
@DanRather
@DojaCat
@DUALIPA
@DwyaneWade
@TheEllenShow
@elliegoulding
@elonmusk
@GretchenCarlson
@IGGYAZALEA
@jameelajamil
@JaredDudley619
@jason_mraz
@jelani9
@Acosta
@jimcramer
@hitRECordJoe
@BBCkatyaadler
@Kehlani
@KimKardashian
@KingJames
@liamgallagher
@LukasGraham
@MariahCarey
@marshmellomusic
@megynkelly
@NiallOfficial
@Pink
@ParisHilton
@Rjeff24
@robreiner
@RobertDowneyJr
@StephenKing
@tim_cook
@Zedd

main()

In [None]:
if __name__ == "__main__":
    # Account whose tweets are analysed.
    request_id = '@BBCkatyaadler'
    # Measure the total analysis time.
    start_at = time.time()
    # Sentence-level preprocessing. The preprocessing classes are used through
    # the class itself, so no instances are created here.
    # NOTE: per the original notes, the file-loading step is not run in the web version.
    tweets = preproc_Sentence.readTweets(request_id)
    tweets_data = preproc_Sentence.preprocTweets(tweets)
    # Sentiment analysis, then label ratios and two example sentences per category.
    df_res = tweet_SentimentAnalyse.sentimentAnalyse(tweets_data)
    tweet_SentimentAnalyse.countTypes(df_res)
    print()
    # Word-level preprocessing and word count.
    word_COUNT.countWord(request_id)
    print("소요시간", time.time()-start_at)