# Introduce Pipeline module

In [79]:
import pandas as pd
import re
import nltk

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score

from sklearn.feature_selection import chi2, SelectPercentile
from sklearn.pipeline import Pipeline

Surprise! We are no longer practically limited with respect to space and/or time, so the full dataset is loaded below instead of a random sample.

In [80]:
# Load every scraped comment. The trailing commented-out `.sample(...)` would
# down-sample to 300,000 rows (seeded) if it were re-enabled.
reviews = pd.read_csv("../Data/scraped_comments_with_professor.csv")#.sample(n=300000, random_state=1)

In [81]:
# Row-level cleaning. Every step builds a new frame instead of mutating in
# place, so no column assignment ever targets a filtered slice (the pattern
# that triggers pandas' SettingWithCopyWarning) and the cell is safe to re-run.
reviews = reviews.drop_duplicates(subset="comment_id", keep="first")   # one row per comment_id

reviews = reviews.dropna(subset=["comment"])                            # missing comments
reviews = reviews[reviews["comment"] != "No Comments"]                  # placeholder text

# Keep only comments with more than five whitespace-separated words.
word_counts = reviews["comment"].str.split().str.len()
reviews = reviews[word_counts > 5].copy()

In [82]:
# Star rating = mean of the clarity and helpfulness ratings.
reviews = reviews.assign(
    starRating=(reviews["clarityRating"] + reviews["helpfulRating"]) / 2
)

# Exactly-neutral (3.0) reviews are discarded; rows are then renumbered from 0,
# which the later cells rely on when they index / align by position.
reviews = reviews.loc[reviews["starRating"] != 3.0].reset_index(drop=True)

# NOTE(review): the recorded output of this cell contains one row rated -1.0.
# That row is kept and later labelled as negative sentiment (rating <= 2.5);
# confirm whether -1 is a "missing" sentinel in the rating columns.
reviews["starRating"].value_counts()

 5.0    176812
 4.5     58705
 4.0     52643
 1.0     45128
 2.0     26945
 3.5     22896
 1.5     19230
 2.5     17510
-1.0         1
Name: starRating, dtype: int64

In [83]:
# Text normalisation applied to every comment before vectorisation.
# All patterns are raw strings (so "\d", "\s", "\g" are not invalid string
# escapes) and are compiled once, outside the loop.
html_entity_re = re.compile(r'&([a-zA-Z]+|#\d+);')       # HTML codes; letters only -- an `A-z` range would also admit [ \ ] ^ _ `
question_mark_re = re.compile(r'&#63;?')                  # HTML code for question mark evades erasure on occasion, handle here
link_re = re.compile(r'\s*https?://\S+(\s+|$)')           # links
# Anchored with ^...$: only matches a comment that consists solely of a phone
# number. Digits are stripped by the letters-only step below in any case.
phone_re = re.compile(r"^(\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$")
email_re = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")   # email addresses
repeated_char_re = re.compile(r'(.)\1\1+')                # three or more of the same character in a row
non_alpha_re = re.compile(r'[^a-zA-Z]+')                  # anything that is not an ASCII letter
whitespace_re = re.compile(r'\s+')

comments_proper = []

# Iterate over the values directly (positional order) instead of looking each
# row up by label with reviews["comment"][i].
for review in reviews["comment"]:
    review = html_entity_re.sub("", review)
    review = question_mark_re.sub('', review)
    review = link_re.sub(' ', review)
    review = phone_re.sub(' ', review)
    review = email_re.sub(" ", review)

    review = repeated_char_re.sub(r'\g<1>', review)       # collapse the run to a single character

    review = non_alpha_re.sub(' ', review)

    review = whitespace_re.sub(' ', review)
    review = review.lower()                               # lowercase review for uniformity

    comments_proper.append(review)

In [84]:
# Keep only the columns needed for modelling (explicit copy so the new columns
# below are added to an independent frame).
reviews = reviews.loc[:, ["professor_id", "firstName", "lastName", "comment", "starRating"]].copy()

# Assign the list itself rather than pd.Series(list): a list is matched to the
# rows by position and raises on a length mismatch, whereas a Series is aligned
# on index labels and would silently produce NaN if the index were not 0..n-1.
reviews["cleanedComment"] = comments_proper

# Binary label: 1 (positive) when the star rating is above 2.5, else 0.
reviews["sentiment"] = (reviews["starRating"] > 2.5).astype("int64")

reviews.head()

Unnamed: 0,professor_id,firstName,lastName,comment,starRating,cleanedComment,sentiment
0,VGVhY2hlci0xMjQzMzQ3,Kurt,Douglass,"Good teacher, good lectures. Obviously cares a...",5.0,good teacher good lectures obviously cares abo...,1
1,VGVhY2hlci0xMjQzMzQ3,Kurt,Douglass,"Good teacher, very lenient with grading and at...",5.0,good teacher very lenient with grading and att...,1
2,VGVhY2hlci0xMjQzMzQ3,Kurt,Douglass,Very difficult class. His grading is hard to ...,1.5,very difficult class his grading is hard to un...,0
3,VGVhY2hlci0xMDExMDU2,Paula,Zobisch,Excellent mentor. Created valuable foundations...,5.0,excellent mentor created valuable foundations ...,1
4,VGVhY2hlci0xMDExMDU2,Paula,Zobisch,Awesome...Had her for two classes at baker col...,5.0,awesome had her for two classes at baker colle...,1


In [85]:
# One entry per professor: index = professor_id, value = number of reviews.
prof_counts = reviews["professor_id"].value_counts()
# Split the professor ids (not the individual reviews) 80/20 with a fixed seed,
# so all reviews of a given professor land on the same side when the next cell
# selects rows by professor id. cnt_train / cnt_test (the matching review
# counts) are not referenced again in the visible cells.
prof_train, prof_test, cnt_train, cnt_test = train_test_split(prof_counts.index, prof_counts.values, test_size=0.2, random_state=1)

In [86]:
# Row masks: which reviews belong to a training / test professor.
in_train = reviews["professor_id"].isin(prof_train)
in_test = reviews["professor_id"].isin(prof_test)

# Features (cleaned text) and labels (binary sentiment) for each side.
comm_train = reviews.loc[in_train, "cleanedComment"]
comm_test = reviews.loc[in_test, "cleanedComment"]

sent_train = reviews.loc[in_train, "sentiment"]
sent_test = reviews.loc[in_test, "sentiment"]

In [87]:
def evalPerformance(y_pred, y_test, mode="weighted"):
    """Print the accuracy and F1 score of a set of predictions, as percentages.

    Note the argument order: predictions first, true labels second.

    Parameters
    ----------
    y_pred : predicted labels
    y_test : true labels
    mode   : forwarded to ``f1_score`` as its ``average`` argument

    Returns
    -------
    None; both scores are written to stdout.
    """
    accuracy_pct = accuracy_score(y_test, y_pred) * 100
    print("Accuracy Score: " + str(accuracy_pct))

    f1_pct = f1_score(y_test, y_pred, average=mode) * 100
    print("F1 Score: {0}".format(f1_pct))

In [88]:
# Three-stage text classifier: bag-of-ngrams -> feature selection -> naive Bayes.
pipeline = Pipeline([
    # unigram and bigram counts; max_df=0.5 ignores terms present in more than half of the documents
    ("vectorizer", CountVectorizer(ngram_range=(1,2), max_df=0.5)), 
    # keep the 50% of features with the highest chi-squared score against the label
    ("selector"  , SelectPercentile(score_func=chi2, percentile=50)),
    # NOTE(review): the step name "classifer" is misspelled. It is left as is because
    # the name is part of the pipeline's parameter keys (e.g. classifer__alpha).
    ("classifer" , MultinomialNB())
])

In [89]:
# Fit all pipeline stages on the training professors' comments.
# NOTE(review): Pipeline.fit is documented to return the pipeline itself, so
# sentiment_fit should be the same object as `pipeline` -- confirm before relying on it.
sentiment_fit = pipeline.fit(comm_train, sent_train)

In [90]:
# Predict sentiment for the comments of the held-out (test) professors.
sent_pred = pipeline.predict(comm_test)

In [91]:
# Accuracy and weighted-average F1 (the default mode) on the held-out professors.
evalPerformance(sent_pred, sent_test)

Accuracy Score: 93.42164187899216
F1 Score: 93.4152216040509


In [92]:
from joblib import dump

# Persist the fitted pipeline to NB-pipeline.joblib in the current working directory.
dump(pipeline, "NB-pipeline.joblib")

['NB-pipeline.joblib']

In [93]:
# Second evaluation set: a seeded 100,000-row sample of the older scrape.
# NOTE(review): nothing here checks whether these comments also appear in the
# training data above -- confirm there is no overlap before reading the score
# below as out-of-sample performance.
old_reviews = pd.read_csv("../Data/scraped_comments.csv").sample(n=100000, random_state=1)

# Same cleaning rules as for `reviews`, written without in-place mutation so no
# assignment targets a filtered slice (avoids SettingWithCopyWarning).
old_reviews = old_reviews.drop_duplicates(subset="comment_id", keep="first")

old_reviews = old_reviews.dropna(subset=["comment"])
old_reviews = old_reviews[old_reviews["comment"] != "No Comments"]
# keep only comments with more than five whitespace-separated words
old_reviews = old_reviews[old_reviews["comment"].str.split().str.len() > 5]

old_reviews = old_reviews.reset_index(drop=True)

In [94]:
# Star rating = mean of the clarity and helpfulness ratings (built with assign
# so the column is never written onto a filtered slice).
old_reviews = old_reviews.assign(
    starRating=(old_reviews["clarityRating"] + old_reviews["helpfulRating"]) / 2
)
# drop 3 star reviews, then renumber the rows so the index has no gaps
# (mirrors how `reviews` is handled earlier in the notebook)
old_reviews = old_reviews[old_reviews["starRating"] != 3.0].reset_index(drop=True)

In [95]:
def basicPreproc(comments: list):
    """Normalise raw review text into lowercase, letters-and-spaces-only strings.

    Steps applied to each comment, in order: strip HTML codes, strip links,
    strip email addresses, collapse any run of three or more identical
    characters to one, replace every run of non-letters with a single space,
    and lowercase the result.

    Parameters
    ----------
    comments : any iterable of str (a list or a pandas Series both work; the
        function only iterates over it).

    Returns
    -------
    list of str, one cleaned string per input comment, in input order.
    A cleaned string may start or end with a single space.
    """
    # Raw-string patterns (so "\d", "\s", "\g" are not invalid string escapes),
    # compiled once instead of once per comment.
    # Letters only in the entity name: an `A-z` range would also admit
    # [ \ ] ^ _ ` and delete ordinary text such as "&D_team;".
    html_entity = re.compile(r'&([a-zA-Z]+|#\d+);')
    # HTML code for question mark evades erasure on occasion, handle here
    question_mark = re.compile(r'&#63;?')
    link = re.compile(r'\s*https?://\S+(\s+|$)')
    # Anchored with ^...$: only matches a comment that consists solely of a
    # phone number. Digits are removed by the letters-only step in any case.
    phone_number = re.compile(r"^(\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$")
    email = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
    repeated_char = re.compile(r'(.)\1\1+')
    non_alpha = re.compile(r'[^a-zA-Z]+')
    whitespace = re.compile(r'\s+')

    comments_proper = []

    for review in comments:
        review = html_entity.sub("", review)            # remove HTML codes
        review = question_mark.sub('', review)
        review = link.sub(' ', review)                  # remove links
        review = phone_number.sub(' ', review)          # remove phone numbers
        review = email.sub(" ", review)                 # remove email addresses

        review = repeated_char.sub(r'\g<1>', review)    # replace three or more of a character in a row with one

        review = non_alpha.sub(' ', review)             # remove non-alphabetic characters

        review = whitespace.sub(' ', review)
        review = review.lower()                         # lowercase review for uniformity

        comments_proper.append(review)

    return comments_proper

In [96]:
# Clean the older comments with the same steps used for the training text.
# basicPreproc only iterates over its argument, so passing the Series is fine.
preproc_comments = basicPreproc(old_reviews["comment"])

In [97]:
# Score the cleaned older comments with the pipeline fitted earlier.
preds = pipeline.predict(preproc_comments)

In [98]:
# Ground-truth labels, using the same rule as the training labels:
# star rating above 2.5 -> 1 (positive), otherwise 0 (negative).
real_sentiment = old_reviews["starRating"].apply(lambda x: 1 if x > 2.5 else 0)

In [99]:
# Accuracy and weighted-average F1 on the older-comments sample.
evalPerformance(preds, real_sentiment)

Accuracy Score: 91.63063288425116
F1 Score: 91.63358906475055


In [100]:
# New, hand-written sentences to predict on (a mix of clearly positive,
# clearly negative and short/ambiguous comments).
docs = [
    "Prof Ritchey is a great guy and an even better teacher! Anyone who doesn't like his class is just an idiot lmao",
    "If you get this guy, just drop out. It's actually a better use of your time to stick forks in your eyes than trying to understand his lectures.",
    "I'm just happy to be here!",
    "This guy sucks and stinks and is bad and I hate him.",
    "Trying to use very negative words so the model picks up the bad terrible horrible hate sentiment"
]

# Note: this rebinds preproc_comments, which previously held the cleaned old_reviews comments.
preproc_comments = basicPreproc(docs)

In [101]:
# Predict sentiment for the hand-written sentences (rebinds preds, which
# previously held the old_reviews predictions).
preds = pipeline.predict(preproc_comments)

In [102]:
def printResults(docs: list, preds:list):
    """Print each comment with its predicted sentiment, coloured with ANSI codes.

    Negative predictions (0) are printed in red, everything else in blue.
    Every printed line ends with the ANSI reset sequence so the colours do not
    bleed into whatever is printed afterwards.

    Parameters
    ----------
    docs  : the original (uncleaned) comments to display
    preds : predicted labels, paired with `docs` by position; if the two
            differ in length, the extra items of the longer one are ignored
            (behaviour of zip)
    """
    reset = "\033[0m"   # restore the terminal's default colours / attributes

    # now with fancy text coloring (works best in dark mode)
    for comm, pred in zip(docs, preds):
        print("\033[2;32mComment: \033[0;37m{0}".format(comm) + reset)

        if pred == 0:
            print("\033[0;31mSentiment: \033[0;31m{0}".format(pred) + reset)
        else:
            print("\033[0;34mSentiment: \033[0;34m{0}".format(pred) + reset)

        print()

In [103]:
# Show the original (uncleaned) sentences next to their predicted labels.
printResults(docs, preds)

[2;32mComment: [0;37mProf Ritchey is a great guy and an even better teacher! Anyone who doesn't like his class is just an idiot lmao
[0;34mSentiment: [0;34m1

[2;32mComment: [0;37mIf you get this guy, just drop out. It's actually a better use of your time to stick forks in your eyes than trying to understand his lectures.
[0;31mSentiment: [0;31m0

[2;32mComment: [0;37mI'm just happy to be here!
[0;34mSentiment: [0;34m1

[2;32mComment: [0;37mThis guy sucks and stinks and is bad and I hate him.
[0;31mSentiment: [0;31m0

[2;32mComment: [0;37mTrying to use very negative words so the model picks up the bad terrible horrible hate sentiment
[0;31mSentiment: [0;31m0

