In [1]:
%pylab inline

import pandas as pd
import numpy as np
import os
import pymysql
import seaborn as sns
import nltk

import sqlalchemy
import tempfile
import hashlib

from nltk import corpus
from nltk.tokenize import RegexpTokenizer
from imblearn.over_sampling import SMOTE
from langdetect import detect as langdetect

Populating the interactive namespace from numpy and matplotlib


Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def query(sql):
    base_dir = os.path.join(tempfile.gettempdir(), 'yelp')
    hash_key = hashlib.sha1(sql.encode('utf-8')).hexdigest()
    file_path = os.path.join(base_dir, hash_key + '.csv')
    if not os.path.exists(base_dir):
        os.mkdir(base_dir)
        
    if os.path.exists(file_path):
        data = pd.read_csv(file_path, index_col=0)
    else:
        engine = sqlalchemy.create_engine('mysql+pymysql://anderson@localhost:3306/yelp')
        data = pd.read_sql(sql, con=engine)
        data.to_csv(file_path)
    return data

# Initialize Stopwords
nltk.download('stopwords')
stopwords = set(corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/anderson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Sentiment Analysis Model
 - Ranking에서 가장 영향력 있었던 businiess_id = WYw3Uf56DT5IwpaLNnCH5Q 를 사용한다

## Load Review Data

In [31]:
sql = '''
select cast(stars as int) as star, useful, funny, cool, lower(text) as text
from review r
where business_id = 'na4Th5DrNauOv-c43QQFvA';
'''
data = query(sql)
data.loc[(data['star'] < 3), 'star'] = 0
data.loc[(data['star'] > 3), 'star'] = 1
data = data[data['star'] != 3]
print(data.shape)
display(data.groupby('star').count())
data.head()

(2883, 5)


Unnamed: 0_level_0,useful,funny,cool,text
star,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,751,751,751,751
1,2132,2132,2132,2132


Unnamed: 0,star,useful,funny,cool,text
0,1,1.0,0.0,1.0,the bellagio is one of the older hotels on the...
1,1,7.0,0.0,0.0,"bellagio is the epitome of luxury in vegas, bu..."
2,1,0.0,0.0,0.0,our stay at the bellagio has been nothing shor...
4,0,1.0,0.0,0.0,the lady (lexi) who checked us in was extremel...
5,1,0.0,0.0,0.0,the bellagio is by far my favorite choice of h...


## Determine Country by Text

In [32]:
def determine_country(x):
    try:
        r = langdetect(x)
    except:
        r = None
    return r

data['country'] = data['text'].apply(determine_country)
data = data[data['country'] == 'en']
data.head()

Unnamed: 0,star,useful,funny,cool,text,country
0,1,1.0,0.0,1.0,the bellagio is one of the older hotels on the...,en
1,1,7.0,0.0,0.0,"bellagio is the epitome of luxury in vegas, bu...",en
2,1,0.0,0.0,0.0,our stay at the bellagio has been nothing shor...,en
4,0,1.0,0.0,0.0,the lady (lexi) who checked us in was extremel...,en
5,1,0.0,0.0,0.0,the bellagio is by far my favorite choice of h...,en


## Tokenizing

In [33]:
# Regex Tokenizer
tokenizer = RegexpTokenizer('\w+')
no_space_regex = re.compile("[.;:!?,\"()\[\]\n\-\/\d_]")
space_regex = re.compile('\s+')
def preprocess_text(t):
    t = no_space_regex.sub(' ', t)
    t = space_regex.sub(' ', t)
    t = t.strip()
    # t = tokenizer.tokenize(t)
    t = t.split(' ')
    t = ' '.join(filter(lambda w: w not in stopwords, t))
    return t

data['text2'] = data['text'].apply(preprocess_text)
data.dropna(inplace=True)
data.drop('text', axis=1)

Unnamed: 0,star,useful,funny,cool,country,text2
0,1,1.0,0.0,1.0,en,bellagio one older hotels strip relatively spe...
1,1,7.0,0.0,0.0,en,bellagio epitome luxury vegas they've around s...
2,1,0.0,0.0,0.0,en,stay bellagio nothing short wonderful employee...
4,0,1.0,0.0,0.0,en,lady lexi checked us extremely rude find reser...
5,1,0.0,0.0,0.0,en,bellagio far favorite choice hotels vegas room...
...,...,...,...,...,...,...
3293,0,0.0,0.0,0.0,en,woken morning loud party going room next door ...
3294,1,0.0,0.0,0.0,en,what's love bellagio moment valet vehicle chec...
3297,0,0.0,0.0,0.0,en,spent nights bellagio march colleague watch fo...
3298,1,0.0,0.0,0.0,en,partner stayed first time recently three night...


## Split Data to Train and Test

In [91]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

np.random.seed(3)

data_x = np.array(data['text2']).reshape(-1, 1)
data_y = np.array(data['star']).reshape(-1, 1)

# Resampling
sampler = RandomOverSampler()
data_x, data_y = sampler.fit_resample(data_x, data_y)

# Split resampled data to train data and test data
train_x, test_x, train_y, test_y = train_test_split(data_x, data_y)
train_x = train_x.reshape(-1)
test_x = test_x.reshape(-1)

print('train_x:', len(train_x))
print('train_y:', len(train_y))
print('ratio :', round(sum(train_y)/len(train_y), 2))
print('test_x:', len(test_x))
print('test_y:', len(test_y))
print('ratio :', sum(test_y)/len(test_y))
print()

train_x: 3172
train_y: 3172
ratio : 0.5
test_x: 1058
test_y: 1058
ratio : 0.499054820415879



  y = column_or_1d(y, warn=True)


# Model

## Bernoulli Naive Bayes

In [92]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer


pipeline = Pipeline([
    ('vectorization', CountVectorizer()), 
    ('model', BernoulliNB())
])

pipeline.fit(train_x, train_y)
pred_y = pipeline.predict(test_x)
print(classification_report(test_y, pred_y))

              precision    recall  f1-score   support

           0       0.95      0.78      0.86       530
           1       0.81      0.95      0.88       528

    accuracy                           0.87      1058
   macro avg       0.88      0.87      0.87      1058
weighted avg       0.88      0.87      0.87      1058



## Bernoulli Naive Bayes with TF-IDF

In [80]:
from sklearn.feature_extraction.text import TfidfVectorizer

pipeline = Pipeline([
    ('vectorization', TfidfVectorizer()),
#     ('tfidf', TfidfTransformer()),
    ('model', BernoulliNB())
])

pipeline.fit(train_x, train_y)
pred_y = pipeline.predict(test_x)
print(classification_report(test_y, pred_y))

              precision    recall  f1-score   support

           0       0.93      0.74      0.82       530
           1       0.78      0.94      0.86       528

    accuracy                           0.84      1058
   macro avg       0.86      0.84      0.84      1058
weighted avg       0.86      0.84      0.84      1058



In [104]:
dir(pipeline)


BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [120]:
vectorizer = pipeline.get_params()['vectorization']
model = pipeline.get_params()['model']

df = pd.DataFrame(model.feature_log_prob_.T,
                  columns=['neg', 'pos'],
                  index=vectorizer.get_feature_names())

df['rank'] = df.apply(lambda x: np.exp(x['neg']) - np.exp(x['pos']), axis=1)
df.sort_values('rank', inplace=True)

print('[Positive]')
print(df.iloc[:50].index)

print('\n[Negative]')
print(df.iloc[-50:].index)

[Positive]
Index(['great', 'beautiful', 'amazing', 'show', 'fountain', 'fountains',
       'strip', 'love', 'vegas', 'best', 'restaurants', 'always', 'casino',
       'nice', 'favorite', 'view', 'conservatory', 'gorgeous', 'location',
       'garden', 'wonderful', 'clean', 'pool', 'glass', 'lobby', 'loved',
       'every', 'awesome', 'stayed', 'flowers', 'buffet', 'place', 'spacious',
       'spa', 'friendly', 'ceiling', 'pools', 'comfortable', 'worth', 'area',
       'perfect', 'everything', 'definitely', 'fun', 'shopping', 'watch',
       'pretty', 'enjoyed', 'music', 'huge'],
      dtype='object')

[Negative]
Index(['people', 'parking', 'since', 'experience', 'hours', 'day', 'nothing',
       'next', 'finally', 'took', 'horrible', 'give', 'first', 'call', 'pay',
       'way', 'booked', 'came', 'get', 'worst', 'disappointed', 'got',
       'checked', 'left', 'paid', 'bad', 'went', 'charge', 'rude', 'better',
       'could', 'front', 'customer', 'money', 'manager', 'another', 'check',