In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import string
import math
import pickle
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
# nltk.download('stopwords')

In [2]:
# Directory that holds every pickled artifact written by this notebook.
pickle_dir = 'pickles/sentiment_analysis'

# One pickle file per artifact, all located directly under `pickle_dir`.
(
    cleaned_reviews_file,
    df_classes_file,
    vocab_file,
    transformed_sentiment_file,
    classifier_file,
) = (
    os.path.join(pickle_dir, filename)
    for filename in (
        'cleaned_reviews_df.pkl',
        'df_classes.pkl',
        'cleaned_reviews_vocab.pkl',
        'cleaned_reviews_x_sentiment.pkl',
        'mnb_classifier.pkl',
    )
)

#### Install missing packages

In [3]:
# Optional one-off environment setup. The guard below is always false, so this
# cell does nothing on a normal run; change the condition to execute it.
if 0 == 1:
    import sys
    # Shell escape: asks conda to install the packages into the environment
    # located at sys.prefix.
    !conda install --yes --prefix {sys.prefix} s3fs seaborn scikit-learn

In [4]:
def read_s3_bucket(bucket, data_key, chunksize=1000000):
    """Load a CSV object from S3 into a single DataFrame.

    The file is read in pieces of ``chunksize`` rows and the pieces are
    concatenated once at the end.

    Parameters
    ----------
    bucket : str
        Name of the S3 bucket.
    data_key : str
        Key (path) of the CSV object inside the bucket.
    chunksize : int, optional
        Number of rows per chunk handed to ``pd.read_csv``. Defaults to
        1,000,000, the value that used to be hard-coded here.

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated in file order.
    """
    data_location = 's3://{}/{}'.format(bucket, data_key)

    # With `chunksize` set, read_csv returns an iterator of DataFrames;
    # materialise it and concatenate in one call.
    chunks = pd.read_csv(data_location, chunksize=chunksize)
    return pd.concat(list(chunks))

In [5]:
%%time
if 1 == 1:
    
    bucket = 'cs410-yelp'
    data_key = 'processed_data/cleaned_reviews.csv'

    df = read_s3_bucket(bucket, data_key)
    df = df.drop(labels='Unnamed: 0', axis=1)
    df['review_stars']   = df['review_stars'].astype(int)
    df['sentiment_text'] = df['sentiment_text'].astype(str)

CPU times: user 51.2 s, sys: 11.1 s, total: 1min 2s
Wall time: 9min 48s


In [6]:
# Preview the first rows, then the total number of rows.
df.head()
df.shape[0]

Unnamed: 0,business_id,name,business_stars,review_count,review_id,review_stars,useful,text,topic_text,sentiment_text
0,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,2.5,128,6W0MQHmasK0IsaoDo4bmkw,3,3,My girlfriend and I went for dinner at Emerald...,girlfriend dinner thursday night workout arriv...,girlfriend dinner chinese. thursday night work...
1,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,2.5,128,BeeBfUxvzD4qNX4HxrgA5g,3,0,We've always been there on a Sunday so we were...,sunday saturday dim_sum luck surprise dish col...,sunday saturday dim_sum. busy. no luck surpris...
2,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,2.5,128,A1D2kUnZ0HTroFreAheNSg,3,0,"***No automatic doors, not baby friendly!*** I...",door post_partum dim_sum dish dinner time door...,no_automatic door not_baby friendly frequent c...
3,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,2.5,128,2pf45Stf-pNew-xgTababQ,1,1,"Horrible service,\nI went there tonight with m...",tonight boyfriend pass couple time want try fr...,horrible_service tonight boyfriend because pas...
4,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,2.5,128,RHhlmL07evgAdPaXQV8Omg,4,2,One of the gauges of a good Chinese restaurant...,gauge patronize patron wife dim_sum brunch wee...,gauge good chinese_number chinese_people patro...


3527902

In [7]:
# Mean of the numeric columns for each star rating.
# numeric_only=True is passed explicitly because the frame also holds text
# columns (ids, names, review text) that cannot be averaged; pandas >= 2.0
# raises a TypeError for them instead of silently dropping them.
stval = df.groupby('review_stars').mean(numeric_only=True)
stval

Unnamed: 0_level_0,business_stars,review_count,useful
review_stars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3.374313,507.893546,1.431601
2,3.529216,569.256905,1.338946
3,3.633498,615.48056,1.18983
4,3.790773,633.882706,1.206635
5,3.983838,647.55598,0.896749


In [8]:
# Read the project stop word list, one entry per line. The `with` block
# guarantees the file handle is closed once the list has been built.
with open('config/stopwords.txt', 'r', encoding='utf-8') as stopwords_fh:
    stop_words = [line.rstrip('\n') for line in stopwords_fh]

In [9]:
# Immutable set of the stop words loaded above, for fast membership tests.
# NOTE(review): this rebinds the name `stopwords`, which the imports cell also
# binds to `nltk.corpus.stopwords`; from here on the name refers to this set.
stopwords = frozenset(stop_words)

In [10]:
%%time
# CLASSIFICATION
df_classes = df[(df['review_stars'] == 1) | (df['review_stars'] == 3) | (df['review_stars'] == 5)]
df_classes = df_classes[(df_classes['useful'] == 1)]
df_classes.head()
print(df_classes.shape)

# Seperate the data set into X and Y for prediction
x = df_classes['sentiment_text']
y = df_classes['review_stars']
print(x.head())
print(y.head())

(473700, 10)
3     horrible_service tonight boyfriend because pas...
14    big chinese_mississauga solid good_food. recen...
25    time year definitely_change quality food gone_...
39    family probably_close year. almost certain cha...
41    unfortunately not_choice dimsum mississauga di...
Name: sentiment_text, dtype: object
3     1
14    3
25    1
39    3
41    3
Name: review_stars, dtype: int64
CPU times: user 718 ms, sys: 99.2 ms, total: 818 ms
Wall time: 829 ms


In [11]:
def text_process(text):
    """Split a review into tokens, dropping punctuation and stop words.

    Every character found in ``string.punctuation`` is removed first. That set
    includes the underscore, so a token such as ``not_return`` becomes
    ``notreturn``. The remaining text is split on whitespace and each token
    whose lowercase form is in the module-level ``stopwords`` set is
    discarded. Surviving tokens keep their original casing.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        Tokens in their original order.
    """
    cleaned = ''.join(ch for ch in text if ch not in string.punctuation)

    tokens = []
    for token in cleaned.split():
        if token.lower() in stopwords:
            continue
        tokens.append(token)
    return tokens

In [12]:
%%time
r0 = x[3]
print(r0)
vocab = CountVectorizer(analyzer=text_process,stop_words=stopwords).fit(x)
print(len(vocab.vocabulary_))
vocab0 = vocab.transform([r0])
print(vocab0)

horrible_service tonight boyfriend because pass couple time want try. bok choy_chicken fry rice lemon chicken. food chicken hard_look old like refried. complain server not_acknowledge complaint. leave tell server tell leave offering provide new. finally_woman server offer new_agree old. minute rice bok_choy time bring not_hungry consider not_want eat chicken rice cold exception sweet old_lady serve rest server rude server table check bill. much_tip couple leave. hear mention tip look bill throw angry. disgusting_will not_return.
815553
  (0, 30547)	1
  (0, 65563)	1
  (0, 73009)	2
  (0, 79829)	1
  (0, 79835)	1
  (0, 83080)	1
  (0, 85776)	1
  (0, 108636)	1
  (0, 111728)	3
  (0, 116277)	1
  (0, 127157)	1
  (0, 132886)	1
  (0, 132899)	1
  (0, 138959)	1
  (0, 148199)	2
  (0, 191543)	1
  (0, 208031)	1
  (0, 228138)	1
  (0, 255877)	1
  (0, 264997)	1
  (0, 281277)	1
  (0, 323597)	1
  (0, 327572)	1
  (0, 341484)	1
  (0, 401546)	3
  :	:
  (0, 484855)	1
  (0, 498489)	1
  (0, 504806)	1
  (0, 50945

#### Vectorization of the whole review set and checking the sparse matrix:

In [13]:
%%time
x = vocab.transform(x)
#Shape of the matrix:
print("Shape of the sparse matrix: {}".format(x.shape))
#Non-zero occurences:
print("Non-Zero occurences: {}".format(x.nnz))

# DENSITY OF THE MATRIX
density = (x.nnz / (x.shape[0] * x.shape[1])) * 100
print("Density of the matrix: {}".format(density))

Shape of the sparse matrix: (473700, 815553)
Non-Zero occurences: 14728803
Density of the matrix: 0.003812517792208764
CPU times: user 25.7 s, sys: 140 ms, total: 25.8 s
Wall time: 25.8 s


#### Splitting data set into training and testing set:

In [14]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=101,
)

In [15]:
def print_results(y_true, y_pred, classifier_name,
                  target_names=('1-Star', '3-Star', '5-Star')):
    """Print the confusion matrix, classification report and accuracy.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by the classifier.
    classifier_name : str
        Name shown in the confusion-matrix heading.
    target_names : sequence of str, optional
        Display names for the classes, passed to ``classification_report``.
        Defaults to the three star classes used in this notebook, which were
        previously hard-coded.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("Confusion Matrix for {}:".format(classifier_name))
    print(confusion_matrix(y_true, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, target_names=list(target_names)))
    # Accuracy as a percentage rounded to two decimals.
    print("\nScore: {}".format(round(accuracy_score(y_true, y_pred)*100, 2)))

### Multinomial Naive Bayes

In [16]:
%%time
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(x_train, y_train)
predmnb = mnb.predict(x_test)
print_results(y_test, predmnb, "Multinomial Naive Bayes")

Confusion Matrix for Multinomial Naive Bayes:
[[13316  2932   497]
 [ 2287 13353  5010]
 [  624  2262 54459]]

Classification Report:
              precision    recall  f1-score   support

      1-Star       0.82      0.80      0.81     16745
      3-Star       0.72      0.65      0.68     20650
      5-Star       0.91      0.95      0.93     57345

    accuracy                           0.86     94740
   macro avg       0.82      0.80      0.81     94740
weighted avg       0.85      0.86      0.85     94740


Score: 85.63
CPU times: user 489 ms, sys: 91.8 ms, total: 581 ms
Wall time: 593 ms


### RandomForestClassifier

In [17]:
%%time
if 0 == 1:
    from sklearn.ensemble import RandomForestClassifier
    rmfr = RandomForestClassifier()
    rmfr.fit(x_train, y_train)
    p = rmfr.predict(x_test)
    print_results(y_test, p, "Random Forest Classifier")

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 4.77 µs


### Decision Tree

In [18]:
%%time
if 0 == 1:
    from sklearn.tree import DecisionTreeClassifier
    dt = DecisionTreeClassifier()
    dt.fit(x_train,y_train)
    p = dt.predict(x_test)
    print_results(y_test, p, "Decision Tree")

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 5.01 µs


### Support Vector Machines

In [19]:
%%time
if 0 == 1:
    from sklearn.svm import SVC
    svm = SVC(random_state=101)
    svm.fit(x_train,y_train)
    p = svm.predict(x_test)
    print_results(y_test, p, "SVM")

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.77 µs


### K - Nearest Neighbor Classifier

In [20]:
%%time
if 0 == 1:
    from sklearn.neighbors import KNeighborsClassifier
    knn = KNeighborsClassifier(n_neighbors=10)
    knn.fit(x_train,y_train)
    p = knn.predict(x_test)
    print_results(y_test, p, "kNN")

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs


### Multilayer Perceptron

In [21]:
%%time
if 0 == 1:
    from sklearn.neural_network import MLPClassifier
    mlp = MLPClassifier()
    mlp.fit(x_train,y_train)
    p = mlp.predict(x_test)
    print_results(y_test, p, "Multilayer Perceptron")

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.25 µs


In [22]:
# Spot-check the trained Naive Bayes model on one review from the full frame.
# NOTE(review): the classifier is fitted on 1/3/5-star reviews only, so a
# review whose actual rating is 2 or 4 can never be predicted exactly.
item = 11
pr = df.loc[item, 'sentiment_text']
print(pr)
print("\nActual Rating: {}".format(df.loc[item, 'review_stars']))

# Encode the review with the fitted vocabulary before predicting.
pr_t = vocab.transform([pr])
print("Predicted Rating: {}".format(mnb.predict(pr_t)[0]))

weekday_morning avoid weekend crowd definitely_enjoy experience food bunch_staple. lotus leaf_wrap sausage. sticky_rice preserve egg congee. egg_yolk bun. pretty_bomb. employee walk cart dim_sum. super cantonese_style item cart menu service super friendly funny_staff. willing_explain despite language_barrier environment bit outdated_lobby washroom particularly sketchy overall_will return tell weekend more_variety dim_sum.

Actual Rating: 4
Predicted Rating: 3


In [23]:
%%time
with open(cleaned_reviews_file, 'wb') as file:
    pickle.dump(df, file)

with open(df_classes_file, 'wb') as file:
    pickle.dump(df_classes, file)

with open(vocab_file, 'wb') as file:
    pickle.dump(vocab, file)

with open(transformed_sentiment_file, 'wb') as file:
    pickle.dump(x, file)

with open(classifier_file, 'wb') as file:
    pickle.dump(mnb, file)

CPU times: user 11.4 s, sys: 10.7 s, total: 22.1 s
Wall time: 25.5 s
