# Take proper star values by averaging clarity and helpful ratings
at some point, remove mixed sentiment reviews

In [1]:
import pandas as pd
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score

from sklearn.feature_selection import chi2, SelectPercentile

In [2]:
reviews = pd.read_csv("../Data/scraped_comments_with_professor.csv").sample(n=25000, random_state=1)

In [3]:
reviews["helpfulRating"].value_counts()

5    12853
4     4210
1     3473
3     2345
2     2119
Name: helpfulRating, dtype: int64

In [4]:
reviews["starRating"] = (reviews["clarityRating"] + reviews["helpfulRating"]) / 2
reviews["starRating"].value_counts()

5.0    10028
4.5     3287
4.0     2944
1.0     2460
3.0     1503
2.0     1480
3.5     1247
1.5     1092
2.5      959
Name: starRating, dtype: int64

### Barebones preprocessing

In [5]:
reviews.drop_duplicates(subset="comment_id", keep="first", inplace=True)

reviews.dropna(subset=["comment"], inplace=True)
reviews = reviews[reviews["comment"] != "No Comments"]

reviews["comment"] = reviews["comment"].apply(lambda x: x if len(x.split()) > 5 else None)
reviews.dropna(subset=["comment"], inplace=True)

reviews.reset_index(drop=True, inplace=True)

In [6]:
comments_proper = []

for i in range(reviews.shape[0]):
    review = reviews["comment"][i]
    review = re.sub('&([a-zA-z]+|#\d+);', "", review)           # remove HTML codes
    review = re.sub('&#63;?', '', review)                       # HTML code for question mark evades erasure on occasion, handle here
    review = re.sub(r'\s*https?://\S+(\s+|$)', ' ', review)                                     # remove links
    review = re.sub("^(\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$", ' ', review)         # remove phone numbers
    review = re.sub("[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", " ", review)              # remove email addresses

    review = re.sub(r'(.)\1\1+', '\g<1>', review)               # replace any three characters in a row with one

    review = re.sub('[^a-zA-Z]+', ' ', review)                  # remove non-alphabetic characters

    review = re.sub('\s+', ' ', review)
    review = review.lower()                                     # lowercase review for uniformity

    comments_proper.append(review)

In [7]:
reviews = reviews.loc[:, ["professor_id", "firstName", "lastName", "comment", "starRating"]]
reviews["cleanedComment"] = pd.Series(comments_proper)
reviews["sentiment"] = reviews["starRating"].apply(lambda x: 1 if x > 2.5 else 0)

reviews.head()

Unnamed: 0,professor_id,firstName,lastName,comment,starRating,cleanedComment,sentiment
0,VGVhY2hlci0xNTY2MjAx,Christina,Kelson,Overall this was the best laid out course I ha...,3.0,overall this was the best laid out course i ha...,1
1,VGVhY2hlci0xNTk4MTk5,Alex,Tourlakes,The class I was dreading the most turned out t...,5.0,the class i was dreading the most turned out t...,1
2,VGVhY2hlci0xMDQwNTQz,Gaby,Fahmy,Loved & benefitted a lot from pathophysiology ...,5.0,loved benefitted a lot from pathophysiology wi...,1
3,VGVhY2hlci05NDAzNzM=,Van,Peterson,Loved this class. awesome prof. just do the ho...,4.0,loved this class awesome prof just do the home...,1
4,VGVhY2hlci0xMzgwODgx,Tara,Thompson,she's great.. easygoing and always ready to help.,3.5,she s great easygoing and always ready to help,1


### Perform professor split

In [8]:
prof_counts = reviews["professor_id"].value_counts()
prof_train, prof_test, cnt_train, cnt_test = train_test_split(prof_counts.index, prof_counts.values, test_size=0.2, random_state=1)

In [9]:
comm_train = reviews[reviews["professor_id"].isin(prof_train)]["cleanedComment"]
comm_test = reviews[reviews["professor_id"].isin(prof_test)]["cleanedComment"]

sent_train = reviews[reviews["professor_id"].isin(prof_train)]["sentiment"]
sent_test = reviews[reviews["professor_id"].isin(prof_test)]["sentiment"]

In [10]:
def evalPerformance(y_pred, y_test, mode="weighted"):
    acc_score = accuracy_score(y_test, y_pred)
    print("Accuracy Score: " + str(acc_score * 100))
    
    f1 = f1_score(y_test, y_pred, average=mode)
    print("F1 Score: {0}".format(f1 * 100))

### Prepare vectorizer and feature selector

In [11]:
cv = CountVectorizer(ngram_range=(1,2))             # consider adding min_df term (start at 0.001)
X = cv.fit_transform(reviews["cleanedComment"]).toarray()

In [12]:
percbest = SelectPercentile(score_func=chi2, percentile=25)      # select 25% most important features using chi2
percbest.fit_transform(X, reviews["sentiment"])

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### Create naive bayes model and evaluate performance

In [13]:
X_train = percbest.transform(cv.transform(comm_train))      # get training vector from top 25% of features
model = MultinomialNB().fit(X_train, sent_train)            # train model

In [14]:
X_test = percbest.transform(cv.transform(comm_test))
sent_pred = model.predict(X_test)

In [15]:
evalPerformance(sent_pred, sent_test)

Accuracy Score: 93.76185458377239
F1 Score: 93.58414245126114
