In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
import string
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
%matplotlib inline
# nltk.download('stopwords')

#### Install missing packages

In [2]:
if 0 == 1:
    import sys
    !conda install --yes --prefix {sys.prefix} s3fs seaborn scikit-learn

In [3]:
def read_s3_bucket(bucket, data_key):
    """Load a CSV object from S3 into a single DataFrame.

    The file is streamed in chunks of one million rows and the chunks are
    concatenated at the end.

    Parameters
    ----------
    bucket : str
        Name of the S3 bucket.
    data_key : str
        Key (path inside the bucket) of the CSV object.

    Returns
    -------
    pandas.DataFrame
        All rows of the CSV, in file order.
    """
    s3_uri = 's3://{}/{}'.format(bucket, data_key)
    rows_per_chunk = 1000000

    # read_csv with chunksize yields DataFrames lazily; materialise them all, then stitch once.
    reader = pd.read_csv(s3_uri, chunksize=rows_per_chunk)
    return pd.concat(list(reader))

In [25]:
%%time
if 1 == 1:
    
    bucket = 'cs410-yelp'
    data_key = 'processed_data/cleaned_reviews_il.csv'

    df = read_s3_bucket(bucket, data_key)
    df = df.drop(labels='Unnamed: 0', axis=1)
    df['review_stars'] = df['review_stars'].astype(int)
    df['sentiment_text'] = df['sentiment_text'].astype(str)

CPU times: user 363 ms, sys: 144 ms, total: 507 ms
Wall time: 9.33 s


In [5]:
# Peek at the first rows, then report how many reviews were loaded.
df.head(n=5)
df.shape[0]

Unnamed: 0,business_id,name,review_count,business_stars,review_id,review_stars,text,useful,topic_text,sentiment_text
0,mofOjB6flg-eAWOFbOkHfQ,ChinaTown Buffet,72,2.5,8MTptiOpUeuPUFZgtfk9Vw,1,I would have given this restaurant zero stars ...,1,give zero_star allow find roach lurch food buf...,give zero_star allow. disgusting. find roach l...
1,mofOjB6flg-eAWOFbOkHfQ,ChinaTown Buffet,72,2.5,ofpfzn8LV4nJ2pE6IlTVdg,1,So...no. Just...no. Before I picked up a plate...,1,pick plate start food notice roach buffet act ...,pick plate start food. notice roach buffet....
2,mofOjB6flg-eAWOFbOkHfQ,ChinaTown Buffet,72,2.5,O3yApCw95tbA0kodflQrpA,5,The best food I want to the a lot of the buffe...,0,want lot buffet,good_food want lot buffet best_eat great_service.
3,mofOjB6flg-eAWOFbOkHfQ,ChinaTown Buffet,72,2.5,fFTVZE7EENdl66SX9cgc0A,1,It's impressive how filthy this place is. Some...,0,puke bathroom mess sit smell condition floor a...,impressive filthy. puke. bathroom. mess sit sm...
4,mofOjB6flg-eAWOFbOkHfQ,ChinaTown Buffet,72,2.5,pDlIw6DLLjZ5G1vJnMzr9A,5,I was visiting the Champaign area with my fami...,0,visit champaign area family recently_move sugg...,visit champaign area family recently_move. sug...


19220

In [6]:
# Mean of the numeric columns for each star rating.
# numeric_only=True is explicit because df also carries text columns
# (business_id, name, text, ...); pandas >= 2.0 raises TypeError on those
# instead of silently dropping them as older versions did.
stval = df.groupby('review_stars').mean(numeric_only=True)
stval

Unnamed: 0_level_0,review_count,business_stars,useful
review_stars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,150.897895,3.413684,1.790526
2,166.664529,3.553527,1.516527
3,182.922291,3.634258,0.883619
4,204.313246,3.781063,0.832649
5,245.09302,3.934302,0.93465


In [7]:
# CLASSIFICATION
# Keep only the 1-, 3- and 5-star reviews as the three target classes.
df_classes = df[df['review_stars'].isin([1, 3, 5])]
df_classes.head()
print(df_classes.shape)

# Separate the data set into features (x) and labels (y) for prediction
x, y = df_classes['sentiment_text'], df_classes['review_stars']
print(x.head())
print(y.head())

Unnamed: 0,business_id,name,review_count,business_stars,review_id,review_stars,text,useful,topic_text,sentiment_text
0,mofOjB6flg-eAWOFbOkHfQ,ChinaTown Buffet,72,2.5,8MTptiOpUeuPUFZgtfk9Vw,1,I would have given this restaurant zero stars ...,1,give zero_star allow find roach lurch food buf...,give zero_star allow. disgusting. find roach l...
1,mofOjB6flg-eAWOFbOkHfQ,ChinaTown Buffet,72,2.5,ofpfzn8LV4nJ2pE6IlTVdg,1,So...no. Just...no. Before I picked up a plate...,1,pick plate start food notice roach buffet act ...,pick plate start food. notice roach buffet....
2,mofOjB6flg-eAWOFbOkHfQ,ChinaTown Buffet,72,2.5,O3yApCw95tbA0kodflQrpA,5,The best food I want to the a lot of the buffe...,0,want lot buffet,good_food want lot buffet best_eat great_service.
3,mofOjB6flg-eAWOFbOkHfQ,ChinaTown Buffet,72,2.5,fFTVZE7EENdl66SX9cgc0A,1,It's impressive how filthy this place is. Some...,0,puke bathroom mess sit smell condition floor a...,impressive filthy. puke. bathroom. mess sit sm...
4,mofOjB6flg-eAWOFbOkHfQ,ChinaTown Buffet,72,2.5,pDlIw6DLLjZ5G1vJnMzr9A,5,I was visiting the Champaign area with my fami...,0,visit champaign area family recently_move sugg...,visit champaign area family recently_move. sug...


(11833, 10)
0    give zero_star allow. disgusting. find roach l...
1       pick plate start food. notice roach buffet....
2    good_food want lot buffet best_eat great_service.
3    impressive filthy. puke. bathroom. mess sit sm...
4    visit champaign area family recently_move. sug...
Name: sentiment_text, dtype: object
0    1
1    1
2    5
3    1
4    5
Name: review_stars, dtype: int64


In [8]:
_PUNCTUATION = frozenset(string.punctuation)
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def text_process(text, stop_words=None):
    """Tokenise a review: strip punctuation, split on whitespace, drop stop words.

    Parameters
    ----------
    text : str
        Raw review text.
    stop_words : iterable of str, optional
        Lower-case words to discard. Defaults to NLTK's English stop-word
        list, which is loaded once and cached. The previous version called
        ``stopwords.words('english')`` again for every single word.

    Returns
    -------
    list of str
        Remaining tokens in original order and original casing.

    Notes
    -----
    Every character in ``string.punctuation`` is removed, and that includes
    the underscore, so a joined token such as ``zero_star`` becomes
    ``zerostar``.
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    elif not isinstance(stop_words, (set, frozenset)):
        # Membership is tested once per word; a set keeps that O(1).
        stop_words = frozenset(stop_words)

    nopunc = ''.join(char for char in text if char not in _PUNCTUATION)
    # Compare case-insensitively but return the word as written.
    return [word for word in nopunc.split() if word.lower() not in stop_words]

In [9]:
%%time
vocab = CountVectorizer(analyzer=text_process).fit(x)
print(len(vocab.vocabulary_))
r0 = x[0]
print(r0)
vocab0 = vocab.transform([r0])
print(vocab0)

56794
give zero_star allow. disgusting. find roach lurch food. clean_dish buffet item dirty. look plate inspect authority.
  (0, 1346)	1
  (0, 2985)	1
  (0, 5913)	1
  (0, 7983)	1
  (0, 12540)	1
  (0, 12774)	1
  (0, 17324)	1
  (0, 17873)	1
  (0, 19757)	1
  (0, 25528)	1
  (0, 26191)	1
  (0, 29235)	1
  (0, 29658)	1
  (0, 39861)	1
  (0, 44117)	1
  (0, 56764)	1
CPU times: user 38.7 s, sys: 7.32 s, total: 46 s
Wall time: 46.1 s


#### Vectorization of the whole review set and checking the sparse matrix:

In [10]:
%%time
x = vocab.transform(x)
#Shape of the matrix:
print("Shape of the sparse matrix: {}".format(x.shape))
#Non-zero occurences:
print("Non-Zero occurences: {}".format(x.nnz))

# DENSITY OF THE MATRIX
density = (x.nnz / (x.shape[0] * x.shape[1])) * 100
print("Density of the matrix: {}".format(density))

Shape of the sparse matrix: (11833, 56794)
Non-Zero occurences: 347383
Density of the matrix: 0.05169056030699637
CPU times: user 38.2 s, sys: 7.15 s, total: 45.4 s
Wall time: 45.5 s


#### Splitting data set into training and testing set:

In [11]:
# Hold out 20% of the reviews for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=101,
)

In [12]:
def print_results(y_true, y_pred, classifier_name, target_names=None):
    """Print the confusion matrix, classification report and accuracy for a classifier.

    Parameters
    ----------
    y_true : array-like
        Ground-truth star ratings.
    y_pred : array-like
        Ratings predicted by the classifier.
    classifier_name : str
        Name shown in the confusion-matrix heading.
    target_names : list of str, optional
        Display name for each class, in sorted label order. When omitted the
        names are built from the labels found in the data (``'1-Star'``,
        ``'3-Star'``, ...), so the report stays correct if the set of classes
        changes.
    """
    # Sorted union of the classes seen in either array; passed explicitly so
    # the matrix rows and the report names are guaranteed to line up.
    labels = sorted(set(y_true) | set(y_pred))
    if target_names is None:
        target_names = ['{}-Star'.format(label) for label in labels]

    print("Confusion Matrix for {}:".format(classifier_name))
    print(confusion_matrix(y_true, y_pred, labels=labels))
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, labels=labels, target_names=target_names))
    # Accuracy as a percentage, rounded to two decimals.
    print("\nScore: {}".format(round(accuracy_score(y_true, y_pred)*100, 2)))

### Multinomial Naive Bayes

In [13]:
%%time
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(x_train, y_train)
predmnb = mnb.predict(x_test)
print_results(y_test, predmnb, "Multinomial Naive Bayes")

Confusion Matrix for Multinomial Naive Bayes:
[[ 246   70   58]
 [  53  251  283]
 [   7   39 1360]]

Classification Report:
              precision    recall  f1-score   support

      1-Star       0.80      0.66      0.72       374
      3-Star       0.70      0.43      0.53       587
      5-Star       0.80      0.97      0.88      1406

    accuracy                           0.78      2367
   macro avg       0.77      0.68      0.71      2367
weighted avg       0.77      0.78      0.77      2367


Score: 78.45
CPU times: user 21.2 ms, sys: 3.84 ms, total: 25.1 ms
Wall time: 23.5 ms


### RandomForestClassifier

In [14]:
%%time
from sklearn.ensemble import RandomForestClassifier
rmfr = RandomForestClassifier()
rmfr.fit(x_train, y_train)
p = rmfr.predict(x_test)
print_results(y_test, p, "Random Forest Classifier")



Confusion Matrix for Random Forest Classifier:
[[ 188   65  121]
 [  78  155  354]
 [  45   81 1280]]

Classification Report:
              precision    recall  f1-score   support

      1-Star       0.60      0.50      0.55       374
      3-Star       0.51      0.26      0.35       587
      5-Star       0.73      0.91      0.81      1406

    accuracy                           0.69      2367
   macro avg       0.62      0.56      0.57      2367
weighted avg       0.66      0.69      0.65      2367


Score: 68.57
CPU times: user 4.01 s, sys: 25.9 ms, total: 4.03 s
Wall time: 4.04 s


### Decision Tree

In [15]:
%%time
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(x_train,y_train)
p = dt.predict(x_test)
print_results(y_test, p, "Decision Tree")

Confusion Matrix for Decision Tree:
[[ 183   89  102]
 [  89  211  287]
 [  91  158 1157]]

Classification Report:
              precision    recall  f1-score   support

      1-Star       0.50      0.49      0.50       374
      3-Star       0.46      0.36      0.40       587
      5-Star       0.75      0.82      0.78      1406

    accuracy                           0.66      2367
   macro avg       0.57      0.56      0.56      2367
weighted avg       0.64      0.66      0.64      2367


Score: 65.53
CPU times: user 4.41 s, sys: 14 ms, total: 4.43 s
Wall time: 4.44 s


### Support Vector Machines

In [16]:
%%time
from sklearn.svm import SVC
svm = SVC(random_state=101)
svm.fit(x_train,y_train)
p = svm.predict(x_test)
print_results(y_test, p, "SVM")



Confusion Matrix for SVM:
[[   0    0  374]
 [   0    0  587]
 [   0    0 1406]]

Classification Report:
              precision    recall  f1-score   support

      1-Star       0.00      0.00      0.00       374
      3-Star       0.00      0.00      0.00       587
      5-Star       0.59      1.00      0.75      1406

    accuracy                           0.59      2367
   macro avg       0.20      0.33      0.25      2367
weighted avg       0.35      0.59      0.44      2367


Score: 59.4
CPU times: user 22.5 s, sys: 200 ms, total: 22.7 s
Wall time: 22.8 s


  'precision', 'predicted', average, warn_for)


### K-Nearest Neighbors Classifier

In [17]:
%%time
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(x_train,y_train)
p = knn.predict(x_test)
print_results(y_test, p, "kNN")

Confusion Matrix for kNN:
[[238  17 119]
 [212  48 327]
 [382  26 998]]

Classification Report:
              precision    recall  f1-score   support

      1-Star       0.29      0.64      0.39       374
      3-Star       0.53      0.08      0.14       587
      5-Star       0.69      0.71      0.70      1406

    accuracy                           0.54      2367
   macro avg       0.50      0.48      0.41      2367
weighted avg       0.59      0.54      0.51      2367


Score: 54.25
CPU times: user 717 ms, sys: 169 ms, total: 886 ms
Wall time: 887 ms


### Multilayer Perceptron

In [18]:
%%time
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier()
mlp.fit(x_train,y_train)
p = mlp.predict(x_test)
print_results(y_test, p, "Multilayer Perceptron")

Confusion Matrix for Multilayer Perceptron:
[[ 279   54   41]
 [  67  329  191]
 [  16   95 1295]]

Classification Report:
              precision    recall  f1-score   support

      1-Star       0.77      0.75      0.76       374
      3-Star       0.69      0.56      0.62       587
      5-Star       0.85      0.92      0.88      1406

    accuracy                           0.80      2367
   macro avg       0.77      0.74      0.75      2367
weighted avg       0.80      0.80      0.80      2367


Score: 80.4
CPU times: user 10min 6s, sys: 1min 43s, total: 11min 50s
Wall time: 4min 1s


In [27]:
# Spot-check the MLP on a single review, looked up by its row label in df.
# NOTE(review): this row may have been part of the training split, so a match
# here is not evidence of generalisation - confirm before citing it.
item = 11
pr = df.at[item, 'sentiment_text']
actual_rating = df.at[item, 'review_stars']
print(pr)
print("\nActual Rating: {}".format(actual_rating))
pr_t = vocab.transform([pr])
predicted_rating = mlp.predict(pr_t)[0]
print("Predicted Rating: {}".format(predicted_rating))

 suppose care hunger. will probably not.

Actual Rating: 3
Predicted Rating: 3
