# 2019.07.06 - Sentiment Analysis

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings


warnings.filterwarnings('ignore')

df = pd.read_csv('../data/Tweets.csv')
df.tail()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
14635,569587686496825344,positive,0.3487,,0.0,American,,KristenReenders,,0,@AmericanAir thank you we got on a different f...,,2015-02-22 12:01:01 -0800,,
14636,569587371693355008,negative,1.0,Customer Service Issue,1.0,American,,itsropes,,0,@AmericanAir leaving over 20 minutes Late Flig...,,2015-02-22 11:59:46 -0800,Texas,
14637,569587242672398336,neutral,1.0,,,American,,sanyabun,,0,@AmericanAir Please bring American Airlines to...,,2015-02-22 11:59:15 -0800,"Nigeria,lagos",
14638,569587188687634433,negative,1.0,Customer Service Issue,0.6659,American,,SraJackson,,0,"@AmericanAir you have my money, you change my ...",,2015-02-22 11:59:02 -0800,New Jersey,Eastern Time (US & Canada)
14639,569587140490866689,neutral,0.6771,,0.0,American,,daviddtwu,,0,@AmericanAir we have 8 ppl so we need 2 know h...,,2015-02-22 11:58:51 -0800,"dallas, TX",


In [2]:
comments = df[['airline_sentiment', 'text']]  # Colimn name 의 list 가 들어가야하므로 2중 리스트형태가 된다.
comments.tail()

Unnamed: 0,airline_sentiment,text
14635,positive,@AmericanAir thank you we got on a different f...
14636,negative,@AmericanAir leaving over 20 minutes Late Flig...
14637,neutral,@AmericanAir Please bring American Airlines to...
14638,negative,"@AmericanAir you have my money, you change my ..."
14639,neutral,@AmericanAir we have 8 ppl so we need 2 know h...


In [3]:
idx_not_neutral = comments.airline_sentiment != 'neutral'
comments = comments[idx_not_neutral]
comments = comments.sort_values("airline_sentiment")  # negative, positive 순서 정렬
comments.index = range(1, len(comments)+1)  # index 정렬 (1 ~ 11541)
comments.head()

Unnamed: 0,airline_sentiment,text
1,negative,@JetBlue Are there airport-wide delays at BOS ...
2,negative,"@USAirways ""We can't help you. We don't put pe..."
3,negative,@USAirways my friends at KPHL were told by you...
4,negative,@USAirways wasted a day of my vacation after p...
5,negative,@USAirways it's unacceptable the way your agen...


In [4]:
idx_negative = comments.airline_sentiment == 'negative'
comments[idx_negative].tail()  # idx 9178 까지 negative

Unnamed: 0,airline_sentiment,text
9174,negative,"@SouthwestAir my bag was lost, and according t..."
9175,negative,@united 3875 to Denver which we are supposed t...
9176,negative,@united If you have had any issues with Unite...
9177,negative,@united Classic
9178,negative,"@united I can’t go back to the airport, I’m wo..."


In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score  # metric: 평가방식
from sklearn.model_selection import cross_val_score, ShuffleSplit


corpus = comments.text.values
vectorizer = CountVectorizer(
    stop_words='english',  # 영어의 관사 등을 제거
    token_pattern=r'[a-z]{2,}',  # 토근패턴 설정, 정규표현식 사용
    min_df=3,  # 단어가 포함된 문서의 최소 빈도 수
    lowercase=True,
)


document_term_matrix = vectorizer.fit_transform(corpus)
document_term_matrix = document_term_matrix.toarray()  # 기존 sparse matrix 에서 array 형태로 변환
document_term_matrix.shape

(11541, 3470)

In [6]:
words = vectorizer.get_feature_names()  # 벡터화된 단어 가져오기
words_cnt = len(words)  # 총 단어갯수
dict_vocab = vectorizer.vocabulary_
dict_idx = {idx:key for key,idx in dict_vocab.items()}  # vocab_dict 의 key, idx 를 반대로 뒤집는다.

In [7]:
document_term_matrix[0]

array([0, 0, 0, ..., 0, 0, 0])

In [8]:
origin_doc = corpus[0]
d_idx = np.where(document_term_matrix[0])[0]
# np.where 는 argument 인 document_tetm_matrix[0] != 0 가 Default
# np.where(document_term_matrix[0]) --> tuple
# 따라서 np.where(document_term_matrix[0])[0] 으로 첫번째 document 를 가져온다.
doc = [dict_idx[i] for i in d_idx]
print(f'원문: {origin_doc}\n전처리문서: {doc}')

원문: @JetBlue Are there airport-wide delays at BOS or did we just get unlucky with 152?
전처리문서: ['airport', 'bos', 'delays', 'did', 'jetblue', 'just', 'wide']


In [9]:
labels = comments.airline_sentiment == 'positive'
labels = labels.astype(int).values
labels

array([0, 0, 0, ..., 1, 1, 1])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(document_term_matrix, labels, train_size=0.8)
print(len(X_train))
print(len(X_test))
print(len(y_train))
print(len(y_test))

9232
2309
9232
2309


- document_term_matrix : X_train, X_test 로 분리된다.

- labels : y_train, y_test 로 분리된다.

> 대문자는 행렬, 소문자는 벡터를 의미

In [11]:
logistic_model = LogisticRegression()
logistic_model.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
y_pred = logistic_model.predict(X_test)
accuracy_score(y_test, y_pred)  # y_test: True Value, y_red: Prediction Value

0.9142485924642703

In [13]:
# Cross Validation Score
cross_val_score(logistic_model, X_train, y_train, cv=4)

array([0.9177133 , 0.9177133 , 0.91504118, 0.90940616])

In [14]:
# Shuffle Split
shuffle = ShuffleSplit(n_splits=5, test_size=0.2)

models = []
scores = []
test_scores = []
for idx_train, idx_valid in shuffle.split(X_train):  # idx_train: 80%, idx_valid: 20%
    X_batch = X_train[idx_train]
    y_batch = y_train[idx_train]
    X_valid = X_train[idx_valid]
    y_valid = y_train[idx_valid]
    logistic_model.fit(X=X_batch, y=y_batch)

    y_pred = logistic_model.predict(X_valid)
    score = accuracy_score(y_valid, y_pred)
    scores.append(score)

    test_pred = logistic_model.predict(X_test)
    test_score = accuracy_score(y_test, test_pred)
    test_scores.append(test_score)
    models.append(logistic_model)

In [15]:
scores, test_scores

([0.916080129940444,
  0.9214943151055766,
  0.9149972929074174,
  0.9144558743909041,
  0.9041689225771521],
 [0.9073191857947164,
  0.9081853616284106,
  0.9112169770463404,
  0.9081853616284106,
  0.9081853616284106])

In [16]:
coefs = logistic_model.coef_
coefs = coefs.flatten()
coef_dict = {words[i]:coefs[i] for i in range(len(words))}
sorted_coef = sorted(coef_dict.items(), key=lambda x:x[1])

In [17]:
# Negative Words
sorted_coef[:10]

[('worst', -2.533347610964506),
 ('hold', -1.8267891770665676),
 ('delayed', -1.7726031653692627),
 ('website', -1.7018716936414864),
 ('hours', -1.6971392001591206),
 ('screwed', -1.6894200237706132),
 ('hour', -1.6402804635422756),
 ('paid', -1.6199977323549646),
 ('hotel', -1.5583888033084952),
 ('come', -1.5423993760421097)]

In [18]:
# Positive Words
sorted_coef[-10:][::-1]

[('thank', 3.7270384481814607),
 ('thanks', 2.8981310283722266),
 ('awesome', 2.660090861221954),
 ('kudos', 2.461290429478214),
 ('amazing', 2.4043480947315254),
 ('love', 2.3522251800224185),
 ('great', 2.283855533011556),
 ('excellent', 2.2473870256742177),
 ('best', 2.2387922408434338),
 ('impressed', 2.1741860332848995)]