In [None]:
import pandas as pd
import re
import nltk

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from collections import Counter

from sklearn.feature_selection import chi2, SelectPercentile
from sklearn.pipeline import Pipeline

from preproc import Preproc

In [2]:
# Read the scraped-comments CSV into `reviews`, the DataFrame the sentiment pipeline is fitted on below.
reviews = pd.read_csv("Data/scraped_comments_with_professor.csv")

In [3]:
# Clean the raw scrape. Every step assigns its result back to `reviews` instead of
# mutating in place, so the cell never writes into a filtered slice.

# One row per comment: keep the first occurrence of each comment_id.
reviews = reviews.drop_duplicates(subset="comment_id", keep="first")

# Discard rows with a missing comment or the "No Comments" placeholder text.
reviews = reviews.dropna(subset=["comment"])
reviews = reviews[reviews["comment"] != "No Comments"]

# Blank out missing professor names. The previous form,
# `reviews['firstName'].fillna('', inplace=True)`, is a chained in-place call on a
# column pulled out of a filtered frame: pandas warns about it and, with
# Copy-on-Write enabled, it leaves `reviews` unchanged. Filling at the frame
# level and re-assigning always takes effect.
reviews = reviews.fillna({"firstName": "", "lastName": ""})

# Keep only comments with more than five whitespace-separated tokens.
# NOTE: a non-string comment yields NaN here and is dropped rather than raising.
comment_word_counts = reviews["comment"].str.split().str.len()
reviews = reviews[comment_word_counts > 5]

In [4]:
# Star rating is the mean of the clarity and helpfulness ratings.
reviews["starRating"] = reviews["clarityRating"].add(reviews["helpfulRating"]).div(2)

# Exclude reviews rated exactly 3.0, then renumber the remaining rows from 0.
reviews = reviews.loc[reviews["starRating"].ne(3.0)].reset_index(drop=True)

# Show how many reviews remain at each rating value.
reviews["starRating"].value_counts()

 5.0    176812
 4.5     58705
 4.0     52643
 1.0     45128
 2.0     26945
 3.5     22896
 1.5     19230
 2.5     17510
-1.0         1
Name: starRating, dtype: int64

In [6]:
# Sentiment pipeline: preprocessing -> bag of n-grams -> feature selection -> Naive Bayes.
sent_pipe = Pipeline(
    steps=[
        # all preprocessing is condensed into the project's Preproc class
        ("preprocessor", Preproc()),
        # unigram and bigram counts; max_df=0.5 caps the allowed document frequency
        ("vectorizer", CountVectorizer(ngram_range=(1, 2), max_df=0.5)),
        # keep the 50th percentile of features as scored by chi-squared
        ("selector", SelectPercentile(score_func=chi2, percentile=50)),
        ("classifer", MultinomialNB()),
    ]
)

Fit the pipeline on the entire `reviews` DataFrame.

The preprocessor (first step) does not need the sentiment labels, but later steps (the chi-squared selector and the classifier) do.

In [7]:
# No cell shown in this notebook creates `reviews["sentiment"]`, so a fresh
# Restart & Run All would fail here with a KeyError. Derive the label when it is
# missing, using the same rule the evaluation cell applies to `old_reviews`
# (1 if starRating > 2.5 else 0). An existing column is left untouched.
# NOTE(review): confirm this matches how `sentiment` was originally produced.
if "sentiment" not in reviews.columns:
    reviews["sentiment"] = (reviews["starRating"] > 2.5).astype("int64")

# The whole DataFrame is passed as X; the label column is passed as y.
sent_pipe_fit = sent_pipe.fit(reviews, reviews["sentiment"])


>>>>>>>>>> fit() called


>>>>>>>>>> transform() called

Entering preprocessing


In [8]:
from sklearn.metrics import confusion_matrix

def evalPerformance(y_pred, y_test, mode="weighted"):
    """Print accuracy, F1, precision, recall and the confusion matrix.

    Note the argument order: predictions come first, ground truth second.

    Args:
        y_pred: predicted labels.
        y_test: true labels.
        mode: value passed as ``average`` to the F1, precision and recall scorers.

    Returns:
        None. All results are printed, with scores scaled to percentages.
    """
    print("Accuracy: " + str(accuracy_score(y_test, y_pred) * 100))

    f1_pct = f1_score(y_test, y_pred, average=mode) * 100
    print("F1 Score: {0}\n".format(f1_pct))

    precision_pct = precision_score(y_test, y_pred, average=mode) * 100
    print("Precision: {0}".format(precision_pct))
    recall_pct = recall_score(y_test, y_pred, average=mode) * 100
    print("Recall: {0}\n".format(recall_pct))

    print(confusion_matrix(y_test, y_pred))

In [10]:
# Hold-out data: a fixed-seed sample of 100000 rows from the other scrape,
# de-duplicated on comment_id and stripped of rows with a missing comment.
old_reviews = (
    pd.read_csv("Data/scraped_comments.csv")
    .sample(n=100000, random_state=1)
    .drop_duplicates(subset="comment_id", keep="first")
    .dropna(subset=["comment"])
)

# Remove the "No Comments" placeholder rows.
old_reviews = old_reviews[old_reviews["comment"] != "No Comments"]

# Keep only comments with more than five whitespace-separated tokens.
has_enough_words = (
    old_reviews["comment"].map(lambda text: len(text.split()) > 5).astype(bool)
)
old_reviews = old_reviews[has_enough_words].reset_index(drop=True)

In [11]:
# Star rating is the mean of the clarity and helpfulness ratings, as for `reviews`.
old_reviews["starRating"] = (old_reviews["clarityRating"] + old_reviews["helpfulRating"]) / 2

# Drop 3 star reviews, then reset the index. Without the reset, the filter leaves
# gaps in the row labels, so they no longer line up with the positional array
# returned by predict(). The cell that prepares `reviews` also resets after filtering.
old_reviews = old_reviews[old_reviews["starRating"] != 3.0].reset_index(drop=True)

In [12]:
# Predict with the fitted pipeline on the hold-out frame; the whole DataFrame is passed, as it was to fit().
preds = sent_pipe_fit.predict(old_reviews)


>>>>>>>>>> transform() called

Entering preprocessing


In [13]:
# Ground-truth labels: 1 for ratings above 2.5, 0 otherwise.
real_sentiment = old_reviews["starRating"].gt(2.5).astype("int64")

# evalPerformance takes predictions first, true labels second.
evalPerformance(preds, real_sentiment)

Accuracy: 91.74379682902715
F1 Score: 91.74704135466203

Precision: 91.75042840743701
Recall: 91.74379682902715

[[21670  3431]
 [ 3500 55348]]
