In [None]:
import math
import os
import pickle
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
# nltk.download('stopwords')

In [None]:
# Directory that holds every pickled artifact written by this notebook.
pickle_dir = os.path.join('pickles/sentiment_analysis')

# One path per artifact, all rooted at pickle_dir.
(
    cleaned_reviews_file,
    df_classes_file,
    vocab_file,
    transformed_sentiment_file,
    classifier_file,
) = (
    os.path.join(pickle_dir, filename)
    for filename in (
        'cleaned_reviews_df.pkl',
        'df_classes.pkl',
        'cleaned_reviews_vocab.pkl',
        'cleaned_reviews_x_sentiment.pkl',
        'mnb_classifier.pkl',
    )
)

#### Install missing packages

In [None]:
# Disabled by default: the condition is always False. Change it to `1 == 1`
# to run the install once.
# When enabled, shells out to conda and installs s3fs, seaborn and scikit-learn
# into the prefix of the interpreter running this kernel (sys.prefix).
if 0 == 1:
    import sys
    !conda install --yes --prefix {sys.prefix} s3fs seaborn scikit-learn

In [None]:
def read_s3_bucket(bucket, data_key, chunksize=1000000):
    """Read a CSV object from S3 into a single DataFrame.

    The object at ``s3://<bucket>/<data_key>`` is parsed in pieces of
    ``chunksize`` rows and the pieces are concatenated before returning.

    Parameters
    ----------
    bucket : str
        Name of the S3 bucket.
    data_key : str
        Key (path inside the bucket) of the CSV file.
    chunksize : int, optional
        Number of rows parsed per chunk. Must be a positive integer; the
        default matches the value that used to be hard-coded here.

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated in file order.
    """
    data_location = 's3://{}/{}'.format(bucket, data_key)

    # With `chunksize` set, read_csv returns an iterator of DataFrames.
    reader = pd.read_csv(data_location, chunksize=chunksize)
    return pd.concat(list(reader))

In [None]:
%%time
# Flip to False to skip the S3 download (for example when `df` is already loaded).
LOAD_FROM_S3 = True

if LOAD_FROM_S3:
    bucket = 'cs410-yelp'
    data_key = 'processed_data/cleaned_reviews.csv'

    df = read_s3_bucket(bucket, data_key)
    # 'Unnamed: 0' is presumably a saved index column (TODO confirm); dropping it
    # is skipped silently if the export no longer contains it.
    df = df.drop(columns='Unnamed: 0', errors='ignore')
    df['review_stars'] = df['review_stars'].astype(int)
    # Fill missing reviews with '' before casting: astype(str) alone turns NaN
    # into the literal text 'nan', which would then be counted as a real word.
    df['sentiment_text'] = df['sentiment_text'].fillna('').astype(str)

In [None]:
# Only the last expression of a cell is rendered, so print the row count and
# leave the preview as the final expression; both are now visible.
print(len(df.index))
df.head()

In [None]:
# Mean of every numeric column per star rating.
# numeric_only=True leaves out text columns such as sentiment_text; without it
# mean() raises TypeError on pandas 2.x when it reaches a string column.
stval = df.groupby('review_stars').mean(numeric_only=True)
stval

In [None]:
# Load the project stop-word list, one entry per line. The `with` block closes
# the file handle as soon as the list has been built.
with open('config/stopwords.txt', 'r', encoding='utf-8') as stopwords_fh:
    stop_words = [line.rstrip('\n') for line in stopwords_fh]

In [None]:
# Immutable set of the loaded stop words, used for membership tests in text_process.
# NOTE(review): this rebinds `stopwords`, hiding the `nltk.corpus.stopwords`
# import from the first cell for the rest of the notebook.
stopwords = frozenset(stop_words)

In [None]:
%%time
# CLASSIFICATION
# Keep only 1-, 3- and 5-star reviews whose `useful` column equals 1.
is_target_rating = df['review_stars'].isin([1, 3, 5])
is_useful = df['useful'] == 1
df_classes = df[is_target_rating & is_useful]
print(df_classes.shape)

# Separate the data set into X and Y for prediction
x = df_classes['sentiment_text']
y = df_classes['review_stars']
print(x.head())
print(y.head())

In [None]:
def text_process(text):
    """Tokenise a review for the bag-of-words vectoriser.

    Removes every character listed in ``string.punctuation``, splits the
    result on whitespace and drops words whose lower-cased form is in the
    module-level ``stopwords`` set. Surviving words keep their original case.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        The remaining words, in their original order.
    """
    # str.translate deletes all punctuation in one pass instead of testing
    # each character in a Python-level loop.
    nopunc = text.translate(str.maketrans('', '', string.punctuation))
    return [word for word in nopunc.split() if word.lower() not in stopwords]

In [None]:
%%time
# Pick the sample review by position. `x` still carries the row labels of the
# unfiltered frame, so a label lookup such as x[3] raises KeyError whenever
# row 3 was removed by the rating/useful filter.
r0 = x.iloc[0]
print(r0)
# `stop_words` only applies to the built-in 'word' analyzer; with a callable
# analyzer it is not used, and text_process already drops stop words itself.
vocab = CountVectorizer(analyzer=text_process).fit(x)
print(len(vocab.vocabulary_))
vocab0 = vocab.transform([r0])
print(vocab0)

#### Vectorization of the whole review set and checking the sparse matrix:

In [None]:
%%time
# Vectorise straight from df_classes instead of overwriting `x` with a
# transform of itself, so re-running this cell gives the same matrix.
x = vocab.transform(df_classes['sentiment_text'])
# Shape of the matrix:
print("Shape of the sparse matrix: {}".format(x.shape))
# Non-zero occurrences:
print("Non-Zero occurences: {}".format(x.nnz))

# DENSITY OF THE MATRIX (share of non-zero cells, as a percentage)
density = (x.nnz / (x.shape[0] * x.shape[1])) * 100
print("Density of the matrix: {}".format(density))

#### Splitting the data set into training and testing sets:

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=101,
)

In [None]:
def print_results(y_true, y_pred, classifier_name,
                  labels=(1, 3, 5),
                  target_names=('1-Star', '3-Star', '5-Star')):
    """Print the confusion matrix, classification report and accuracy.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Labels predicted by the classifier.
    classifier_name : str
        Name shown in the confusion-matrix heading.
    labels : sequence, optional
        Class labels to report, in display order. Passing them explicitly
        keeps each entry of ``target_names`` attached to the right class even
        when a class is absent from ``y_true``/``y_pred``.
    target_names : sequence of str, optional
        Display name for each entry of ``labels``, in the same order.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    labels = list(labels)
    target_names = list(target_names)

    print("Confusion Matrix for {}:".format(classifier_name))
    print(confusion_matrix(y_true, y_pred, labels=labels))
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, labels=labels, target_names=target_names))
    # Accuracy as a percentage, rounded to two decimals.
    print("\nScore: {}".format(round(accuracy_score(y_true, y_pred)*100, 2)))

### Multinomial Naive Bayes

In [None]:
%%time
# Fit Multinomial Naive Bayes on the training split and score it on the
# held-out split. MultinomialNB is imported with the other imports in the
# notebook's first cell.
mnb = MultinomialNB()
mnb.fit(x_train, y_train)
predmnb = mnb.predict(x_test)
print_results(y_test, predmnb, "Multinomial Naive Bayes")

### RandomForestClassifier

In [None]:
%%time
# Disabled experiment: change the guard to True to train and score a random forest.
if False:
    from sklearn.ensemble import RandomForestClassifier
    rmfr = RandomForestClassifier().fit(x_train, y_train)
    p = rmfr.predict(x_test)
    print_results(y_test, p, "Random Forest Classifier")

### Decision Tree

In [None]:
%%time
# Disabled experiment: change the guard to True to train and score a decision tree.
if False:
    from sklearn.tree import DecisionTreeClassifier
    dt = DecisionTreeClassifier().fit(x_train, y_train)
    p = dt.predict(x_test)
    print_results(y_test, p, "Decision Tree")

### Support Vector Machines

In [None]:
%%time
# Disabled experiment: change the guard to True to train and score a support vector machine.
if False:
    from sklearn.svm import SVC
    svm = SVC(random_state=101).fit(x_train, y_train)
    p = svm.predict(x_test)
    print_results(y_test, p, "SVM")

### K - Nearest Neighbor Classifier

In [None]:
%%time
# Disabled experiment: change the guard to True to train and score a 10-nearest-neighbour classifier.
if False:
    from sklearn.neighbors import KNeighborsClassifier
    knn = KNeighborsClassifier(n_neighbors=10).fit(x_train, y_train)
    p = knn.predict(x_test)
    print_results(y_test, p, "kNN")

### Multilayer Perceptron

In [None]:
%%time
# Disabled experiment: change the guard to True to train and score a multilayer perceptron.
if False:
    from sklearn.neural_network import MLPClassifier
    mlp = MLPClassifier().fit(x_train, y_train)
    p = mlp.predict(x_test)
    print_results(y_test, p, "Multilayer Perceptron")

In [None]:
# Spot-check the classifier on one review, looked up by its row label in `df`.
item = 11
pr = df.loc[item, 'sentiment_text']
print(pr)

actual_rating = df.loc[item, 'review_stars']
print("\nActual Rating: {}".format(actual_rating))

# The vectoriser expects an iterable of documents, hence the one-element list.
pr_t = vocab.transform([pr])
predicted_rating = mnb.predict(pr_t)[0]
print("Predicted Rating: {}".format(predicted_rating))

In [None]:
%%time
# Create the output directory first; open(..., 'wb') does not create missing
# parent directories and would fail on a fresh checkout.
os.makedirs(pickle_dir, exist_ok=True)

# NOTE: `vocab` was built with analyzer=text_process and pickle stores
# functions by reference, so whoever loads vocab_file needs a `text_process`
# function available under the same name.
artifacts = [
    (cleaned_reviews_file, df),
    (df_classes_file, df_classes),
    (vocab_file, vocab),
    (transformed_sentiment_file, x),
    (classifier_file, mnb),
]

for artifact_path, artifact in artifacts:
    with open(artifact_path, 'wb') as file:
        pickle.dump(artifact, file)