In [1]:
import json
import re
from collections import Counter
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

  import pandas.util.testing as tm


In [2]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, mean_squared_error
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [3]:
import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix

In [4]:
# Load the review table from the CSV file in the working directory.
yelp = pd.read_csv(Path("yelp_clean.csv"))

In [6]:
# Minimal view of the data: the star rating and the cleaned review text.
yelp_basic = yelp.loc[:, ['stars', 'text_clean']]

In [7]:
# Sanity check: display (n_rows, n_columns) of the two-column frame.
yelp_basic.shape

(65724, 2)

In [8]:
# Rating and cleaned text plus the three vote-count columns.
yelp_features = yelp.loc[:, ['stars', 'text_clean', 'useful', 'funny', 'cool']]

In [10]:
# Two successive 80/20 splits with a fixed seed: first hold out the test set,
# then carve the validation (dev) set out of what remains.
_split_opts = dict(test_size=0.2, random_state=42, shuffle=True)
df_train_raw, df_test = train_test_split(yelp_features, **_split_opts)
df_train_raw, df_dev = train_test_split(df_train_raw, **_split_opts)

# The training frame is used as-is; df_train is just an alias for it.
df_train = df_train_raw

In [14]:
def extract_features(df_train, df_dev, df_test):
    """Turn the three data splits into TF-IDF matrices and label lists.

    The vectorizer (English stop words removed, word n-grams of length 1-2,
    ``min_df=5``) is fitted on the training split only and then applied
    unchanged to the dev and test splits.

    Parameters
    ----------
    df_train, df_dev, df_test : DataFrame
        Each must provide a ``'text_clean'`` column (documents) and a
        ``'stars'`` column (labels).

    Returns
    -------
    tuple
        ``(x_train, x_dev, x_test, y_train, y_dev, y_test)`` where the ``x_*``
        entries are the vectorizer outputs and the ``y_*`` entries are plain
        Python lists of the ``'stars'`` values.
    """
    def _documents(frame):
        # Cast every entry to a unicode string so that non-string cells
        # (e.g. missing values) are still accepted by the vectorizer.
        return frame.loc[:, 'text_clean'].values.astype('U')

    def _labels(frame):
        return frame.loc[:, 'stars'].tolist()

    tfidf = TfidfVectorizer(
        analyzer='word',
        stop_words='english',
        ngram_range=(1, 2),
        lowercase=True,
        min_df=5,
        binary=False,
    )

    # Fit on the training documents first; dev/test reuse that vocabulary.
    train_matrix = tfidf.fit_transform(_documents(df_train))
    dev_matrix, test_matrix = (
        tfidf.transform(_documents(frame)) for frame in (df_dev, df_test)
    )

    train_labels, dev_labels, test_labels = (
        _labels(frame) for frame in (df_train, df_dev, df_test)
    )
    return (train_matrix, dev_matrix, test_matrix,
            train_labels, dev_labels, test_labels)

In [15]:
# Vectorize all three splits (vocabulary learned from the training split).
(x_train, x_dev, x_test,
 y_train, y_dev, y_test) = extract_features(df_train, df_dev, df_test)

In [17]:
# Vote-count columns of the training split, each as a plain Python list.
useful, funny, cool = (
    df_train.loc[:, column].tolist() for column in ('useful', 'funny', 'cool')
)

#### Assess Model Performance

In [None]:
def evaluate_model_Xy(model, X, y, y_pred=None, label="Training", model_name="model"):
    """Print metrics and plot a row-normalized confusion matrix for one split.

    Parameters
    ----------
    model : fitted estimator
        Only used to compute predictions when ``y_pred`` is not supplied.
    X : feature matrix
        Passed to ``model.predict`` when ``y_pred`` is ``None``.
    y : sequence
        True labels for ``X``.
    y_pred : sequence, optional
        Precomputed predictions for ``X``. When given, they are used for the
        accuracy, the classification report *and* the confusion matrix, so
        the printed numbers and the figure always describe the same
        predictions and ``model.predict`` is not called at all.
    label : str
        Split name; printed as a header and lower-cased into the file name.
    model_name : str
        Prefix of the saved figure's file name.

    Side effects
    ------------
    Prints to stdout, saves ``"<model_name>_<label.lower()>.eps"`` in the
    current working directory and shows the figure. Returns ``None``.
    """
    if y_pred is None:
        y_pred = model.predict(X)

    print(label + ' Set')
    print("Accuracy:", accuracy_score(y, y_pred))
    print()

    print(classification_report(y, y_pred, digits=4))

    # Build the matrix from the y_pred used above instead of asking the
    # plotting helper to predict again. (The former plot_confusion_matrix
    # helper re-ran model.predict and has been removed from recent
    # scikit-learn releases.)
    class_labels = np.unique(np.concatenate([np.asarray(y), np.asarray(y_pred)]))
    matrix = confusion_matrix(y, y_pred, labels=class_labels, normalize='true')
    disp = ConfusionMatrixDisplay(confusion_matrix=matrix,
                                  display_labels=class_labels)
    disp.plot(cmap=plt.cm.Blues)
    plt.savefig(model_name + "_" + label.lower() + ".eps")
    plt.show()
    print()

In [None]:
def evaluate_model(model, model_name="model",
                   y_train_pred=None, y_dev_pred=None, y_test_pred=None):
    """Evaluate ``model`` on the training, validation and testing splits.

    Reads the notebook-level ``x_train``/``y_train``, ``x_dev``/``y_dev`` and
    ``x_test``/``y_test`` variables and delegates each split to
    ``evaluate_model_Xy``.

    Parameters
    ----------
    model : fitted estimator
        Used to predict on any split for which no predictions are supplied.
    model_name : str
        Prefix for the figure files written by ``evaluate_model_Xy``.
    y_train_pred, y_dev_pred, y_test_pred : sequence, optional
        Precomputed predictions for the corresponding split. Each one is
        forwarded as ``y_pred``; ``None`` (the default) lets
        ``evaluate_model_Xy`` call ``model.predict`` itself.
    """
    splits = (
        ("Training", x_train, y_train, y_train_pred),
        ("Validation", x_dev, y_dev, y_dev_pred),
        ("Testing", x_test, y_test, y_test_pred),
    )
    for split_label, X, y, y_pred in splits:
        # Forward the caller's predictions; previously they were dropped.
        evaluate_model_Xy(model, X, y, y_pred=y_pred,
                          label=split_label, model_name=model_name)

#### Naive Bayes

In [None]:
# Multinomial Naive Bayes on the TF-IDF training matrix; fit() returns the
# estimator itself, so it can be chained onto the constructor.
clf_nb = MultinomialNB(alpha=0.5, fit_prior=True).fit(x_train, y_train)
clf_nb


In [None]:
# Report metrics and save confusion-matrix figures with the "nb" prefix.
evaluate_model(model=clf_nb, model_name="nb")

#### SVM

In [None]:
# Hinge-loss SGD classifier (the "SVM" of this section) behind a scaler.
# The scaler is created with with_mean=False, i.e. scaling without centering
# -- presumably because the TF-IDF features are sparse.
_sgd_options = dict(
    loss='hinge',
    penalty='l2',
    alpha=30,
    max_iter=1000,
    tol=1e-3,
    shuffle=True,
    verbose=1,
    n_jobs=-1,
    random_state=0,
    learning_rate='optimal',
    early_stopping=True,
    class_weight='balanced',
)
clf_sgd = make_pipeline(
    StandardScaler(with_mean=False),
    SGDClassifier(**_sgd_options),
)

In [None]:
# Fit the scaler + SGD pipeline on the training split.
clf_sgd.fit(X=x_train, y=y_train)

In [None]:
# Report metrics and save confusion-matrix figures with the "sgd" prefix.
evaluate_model(model=clf_sgd, model_name="sgd")