In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
import string
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
%matplotlib inline
# nltk.download('stopwords')

#### Install missing packages

In [2]:
# Disabled by default: the condition below is never true, so this cell is a
# no-op on "Run All". Flip the condition (or run the install line by hand) to
# install the listed packages into the environment located at `sys.prefix`.
if 0 == 1:
    import sys
    # Shell escape; `{sys.prefix}` is interpolated by IPython before the command runs.
    !conda install --yes --prefix {sys.prefix} s3fs seaborn scikit-learn

In [3]:
def read_s3_bucket(bucket, data_key, chunksize=1000000):
    """Load a CSV object from S3 into a single DataFrame.

    The file is read in chunks of ``chunksize`` rows and the chunks are
    concatenated in order.

    Parameters
    ----------
    bucket : str
        Name of the S3 bucket.
    data_key : str
        Key (path inside the bucket) of the CSV object.
    chunksize : int, optional
        Number of rows per chunk passed to ``pd.read_csv``. Defaults to
        1,000,000, the value that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        All rows of the CSV.
    """
    data_location = 's3://{}/{}'.format(bucket, data_key)

    # With `chunksize` set, read_csv returns an iterator of DataFrames;
    # pd.concat consumes it directly, so no intermediate list is needed.
    return pd.concat(pd.read_csv(data_location, chunksize=chunksize))

In [4]:
%%time
# Location of the cleaned review data. `bucket` is only needed when loading
# from S3 via read_s3_bucket(bucket, data_key); the active code path reads the
# same relative path from the local working directory.
bucket = 'cs410-yelp'
data_key = 'processed_data/cleaned_reviews.csv'

# Read the CSV, drop the index column written by a previous to_csv call, and
# pin the dtypes of the two columns used for modelling.
df = (
    pd.read_csv(data_key)
    .drop(columns='Unnamed: 0')
    .astype({'review_stars': int, 'sentiment_text': str})
)

Wall time: 52.1 s


In [5]:
# Only the last expression of a cell is rendered automatically, so the preview
# has to be displayed explicitly or its output is silently discarded.
# `display` is provided by the IPython kernel without an import.
display(df.head())
len(df.index)

3527902

In [6]:
# Mean of every numeric column per star rating. numeric_only=True keeps text
# columns such as `sentiment_text` out of the aggregation; without it, recent
# pandas versions raise instead of silently dropping them.
stval = df.groupby('review_stars').mean(numeric_only=True)
stval

Unnamed: 0_level_0,business_stars,review_count,useful
review_stars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3.374313,507.893546,1.431601
2,3.529216,569.256905,1.338946
3,3.633498,615.48056,1.18983
4,3.790773,633.882706,1.206635
5,3.983838,647.55598,0.896749


In [7]:
# One stop word per line. The context manager closes the file handle as soon as
# the list has been built (a bare open() inside the comprehension leaves it open).
with open('config/stopwords.txt', 'r', encoding='utf-8') as stopword_file:
    stop_words = [line.rstrip('\n') for line in stopword_file]

In [8]:
# Immutable set form of the stop-word list, used for membership tests in text_process.
# NOTE: this rebinds the name `stopwords`, shadowing the `nltk.corpus.stopwords`
# object imported in the first cell.
stopwords = frozenset(stop_words)

In [9]:
%%time
# CLASSIFICATION
# All reviews are used. To experiment on a subset, filter here instead, e.g.
#   df_classes = df[df['review_stars'].isin([1, 3, 5])]   # 1/3/5-star reviews only
#   df_classes = df_classes[df_classes['useful'] == 1]    # rows with useful == 1
df_classes = df
print(df_classes.shape)

# Separate the data set into X (review text) and Y (star rating) for prediction
x = df_classes['sentiment_text']
y = df_classes['review_stars']
print(x.head())
print(y.head())

(3527902, 10)
0    girlfriend dinner chinese. thursday night work...
1    sunday saturday dim-sum. busy. luck surprise d...
2    no-automatic door not-baby friendly frequent c...
3    horrible-service boyfriend because pass couple...
4    gauge good chinese-number chinese-people patro...
Name: sentiment_text, dtype: object
0    3
1    3
2    3
3    1
4    4
Name: review_stars, dtype: int32
Wall time: 2 ms


In [10]:
# Translation table that deletes every character in string.punctuation; built
# once so it is not recreated for each review.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def text_process(text, stop_words=None):
    """Tokenize a review: strip punctuation, split on whitespace, drop stop words.

    Parameters
    ----------
    text : str
        Raw review text.
    stop_words : collection of str, optional
        Lower-case words to discard. When omitted, the notebook-level
        ``stopwords`` set is used, which keeps the one-argument call made by
        ``CountVectorizer(analyzer=text_process)`` working unchanged.

    Returns
    -------
    list of str
        Remaining tokens, in their original order and original case.
    """
    if stop_words is None:
        stop_words = stopwords
    # Tokens are compared in lower case, but returned as written.
    return [
        word
        for word in text.translate(_PUNCTUATION_TABLE).split()
        if word.lower() not in stop_words
    ]

In [11]:
%%time
# `stop_words` is deliberately not passed: CountVectorizer only applies it with
# its built-in 'word' analyzer, and text_process already removes stop words.
vocab = CountVectorizer(analyzer=text_process).fit(x)
print(len(vocab.vocabulary_))
# Quick sanity check on a single document (uncomment to inspect):
#r0 = x[0]
#print(r0)
#vocab0 = vocab.transform([r0])
#print(vocab0)

3044647
Wall time: 4min 30s


#### Vectorization of the whole review set and checking the sparse matrix:

In [12]:
%%time
# Vectorize from the source column rather than from `x` itself, so re-running
# this cell does not feed the already-transformed matrix back into the vectorizer.
x = vocab.transform(df_classes['sentiment_text'])
# Shape of the matrix:
print("Shape of the sparse matrix: {}".format(x.shape))
# Non-zero occurrences:
print("Non-Zero occurences: {}".format(x.nnz))

# Density of the matrix: percentage of cells holding a non-zero count
density = (x.nnz / (x.shape[0] * x.shape[1])) * 100
print("Density of the matrix: {}".format(density))

Shape of the sparse matrix: (3527902, 3044647)
Non-Zero occurences: 107319863
Density of the matrix: 0.0009991406987451647
Wall time: 4min 9s


#### Splitting the data set into training and testing sets:

In [13]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
x_train,x_test,y_train,y_test = train_test_split(x, y, test_size=0.2, random_state=101)

In [14]:
def print_results(y_true, y_pred, classifier_name):
    """Print the confusion matrix, classification report and accuracy of a classifier.

    Parameters
    ----------
    y_true : array-like
        True star ratings.
    y_pred : array-like
        Predicted star ratings, aligned with ``y_true``.
    classifier_name : str
        Name shown in the confusion-matrix heading.
    """
    # Derive the class list from the data instead of assuming all five star
    # values are present, so the report also works on a subset of ratings.
    labels = np.unique(np.concatenate([np.asarray(y_true), np.asarray(y_pred)]))
    target_names = ['{}-Star'.format(label) for label in labels]

    print("Confusion Matrix for {}:".format(classifier_name))
    print(confusion_matrix(y_true, y_pred, labels=labels))
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, labels=labels, target_names=target_names))
    # Accuracy as a percentage, rounded to two decimals.
    print("\nScore: {}".format(round(accuracy_score(y_true, y_pred)*100, 2)))

### Multinomial Naive Bayes

In [15]:
%%time
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training are chained.
mnb = MultinomialNB().fit(x_train, y_train)
predmnb = mnb.predict(x_test)
print_results(y_test, predmnb, "Multinomial Naive Bayes")

Confusion Matrix for Multinomial Naive Bayes:
[[ 54411  10963   6032   2295   1494]
 [ 18949  14185  20463   7414   2254]
 [  8547   5803  29274  39000   9407]
 [  3305   1107   9442  92304  79450]
 [  2525    392   1872  43777 240916]]

Classification Report:
              precision    recall  f1-score   support

      1-Star       0.62      0.72      0.67     75195
      2-Star       0.44      0.22      0.30     63265
      3-Star       0.44      0.32      0.37     92031
      4-Star       0.50      0.50      0.50    185608
      5-Star       0.72      0.83      0.77    289482

    accuracy                           0.61    705581
   macro avg       0.54      0.52      0.52    705581
weighted avg       0.59      0.61      0.59    705581


Score: 61.1
Wall time: 6.21 s


### RandomForestClassifier

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier
# random_state (same seed as the train/test split) makes the forest reproducible;
# n_jobs=-1 builds the 1000 trees on all available cores without changing the result.
rmfr = RandomForestClassifier(n_estimators=1000, random_state=101, n_jobs=-1)
rmfr.fit(x_train, y_train)


In [None]:
# Score the fitted random forest on the held-out test split.
p = rmfr.predict(x_test)
print_results(y_test, p, "Random Forest Classifier")

### Decision Tree

In [None]:
%%time
from sklearn.tree import DecisionTreeClassifier
# Fixed seed (same as the train/test split): feature order is permuted randomly
# at each split, so an unseeded tree can differ between runs.
dt = DecisionTreeClassifier(random_state=101)
dt.fit(x_train,y_train)
p = dt.predict(x_test)
print_results(y_test, p, "Decision Tree")

### Support Vector Machines

In [None]:
%%time
from sklearn.svm import SVC

# NOTE(review): x_train holds roughly 2.8M rows (80% of 3,527,902); kernel SVC
# training may be impractically slow at this size — confirm feasibility or
# consider a linear model.
svm = SVC(random_state=101).fit(x_train, y_train)
p = svm.predict(x_test)
print_results(y_test, p, "SVM")

### K - Nearest Neighbor Classifier

In [None]:
%%time
from sklearn.neighbors import KNeighborsClassifier

# Ten-neighbour classifier; fit() returns the estimator, so it is chained.
knn = KNeighborsClassifier(n_neighbors=10).fit(x_train, y_train)
p = knn.predict(x_test)
print_results(y_test, p, "kNN")

### Multilayer Perceptron

In [None]:
%%time
from sklearn.neural_network import MLPClassifier
# Fixed seed (same as the train/test split) so weight initialisation and batch
# sampling, and therefore the reported scores, are reproducible.
mlp = MLPClassifier(random_state=101)
mlp.fit(x_train,y_train)
p = mlp.predict(x_test)
print_results(y_test, p, "Multilayer Perceptron")

In [None]:
# Spot-check one review: show its text, its true rating and the MLP's prediction.
item = 11
pr = df.loc[item, 'sentiment_text']
print(pr)
actual_rating = df.loc[item, 'review_stars']
print("\nActual Rating: {}".format(actual_rating))
# The vectorizer expects an iterable of documents, hence the one-element list.
pr_t = vocab.transform([pr])
predicted_rating = mlp.predict(pr_t)[0]
print("Predicted Rating: {}".format(predicted_rating))