# Testing a trained model by predicting sentiment on new reviews

In [1]:
from joblib import dump, load
import re
import pandas as pd

from sklearn.metrics import accuracy_score, f1_score

Load most effective model, from `attempt-3.ipynb`
* vectorizer ( `NB-vectorizer` ) and feature selector ( `NB-chi2` ) needed as well to transform new comments
* important that all three have been properly fit to train and test datasets in advance

In [2]:
# Load the three saved artifacts used together at prediction time:
# the classifier, the text vectorizer, and the feature selector.
# NOTE: joblib.load unpickles Python objects, so only load .joblib files from a trusted source.
model = load("NB-model.joblib")
cv = load("NB-vectorizer.joblib")
percbest = load("NB-chi2.joblib")

In [3]:
# Hand-written example reviews (raw, un-preprocessed text) to run through the loaded model
docs = [
    "Prof Ritchey is a great guy and an even better teacher! Anyone who doesn't like his class is just an idiot lmao",
    "If you get this guy, just drop out. It's actually a better use of your time to stick forks in your eyes than trying to understand his lectures.",
    "I'm just happy to be here!",
    "This guy sucks and stinks and is bad and I hate him.",
    "Trying to use very negative words so the model picks up the bad terrible horrible hate sentiment"
]

### General functions

In [4]:
def basicPreproc(comments: list):
    """Normalize raw review text into lowercase, letters-only strings.

    Each review goes through the following steps, in order: strip HTML
    entities, links, (whole-string) phone numbers and e-mail addresses;
    collapse any run of three or more identical characters to one; replace
    every non-alphabetic run with a single space; lowercase.

    Parameters
    ----------
    comments : iterable of str
        Raw review texts. Any iterable of strings works (a list or a pandas
        Series, for example).

    Returns
    -------
    list of str
        One cleaned string per input review, in the same order.
    """
    comments_proper = []

    for review in comments:
        # Remove HTML entities such as "&amp;" or "&#39;".
        # (Letter class is a-z / A-Z only; the former "A-z" range also matched "[\]^_`".)
        review = re.sub(r'&([a-zA-Z]+|#\d+);', "", review)
        # The question-mark entity sometimes appears without its trailing ';', handle it here.
        review = re.sub(r'&#63;?', '', review)
        # Remove links together with the whitespace around them.
        review = re.sub(r'\s*https?://\S+(\s+|$)', ' ', review)
        # NOTE(review): anchored with ^...$, so this only fires when the WHOLE review is a
        # phone number. Embedded numbers are still dropped by the letters-only step below.
        review = re.sub(r"^(\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$", ' ', review)
        # Remove e-mail addresses.
        review = re.sub(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", " ", review)

        # Collapse any character repeated three or more times in a row ("sooooo" -> "so").
        review = re.sub(r'(.)\1\1+', r'\g<1>', review)

        # Replace every run of non-alphabetic characters with a single space.
        review = re.sub(r'[^a-zA-Z]+', ' ', review)

        review = re.sub(r'\s+', ' ', review)
        review = review.lower()                                     # lowercase review for uniformity

        comments_proper.append(review)

    return comments_proper

In [5]:
def getPreds(comments: list):
    """Predict a label for each pre-processed comment.

    Pushes the comments through the notebook-level pipeline loaded earlier:
    ``cv.transform`` -> ``percbest.transform`` -> ``model.predict``.

    The whole batch is transformed and predicted in a single call instead of
    one call per review, which avoids thousands of tiny pipeline invocations
    on large inputs.
    NOTE(review): this assumes every stage treats rows independently (true for
    the standard scikit-learn vectorizers / selectors / classifiers) - confirm
    against the objects saved by the training notebook.

    Parameters
    ----------
    comments : iterable of str
        Cleaned review texts (e.g. the output of ``basicPreproc``).

    Returns
    -------
    list
        One predicted label per comment, in input order.
    """
    comments = list(comments)

    # Nothing to predict; also avoids handing an empty batch to the pipeline.
    if not comments:
        return []

    features = percbest.transform(cv.transform(comments))

    # predict() returns an array-like of labels; hand back a plain list as before.
    return list(model.predict(features))

In [6]:
def printResults(docs: list, preds:list):
    """Print each comment followed by its predicted sentiment, ANSI-colored.

    A prediction equal to 0 is printed in red, anything else in blue.
    Comments and predictions are paired positionally with ``zip``, so extra
    items in the longer sequence are ignored. Colors read best in dark mode.
    """
    for text, label in zip(docs, preds):
        print("\033[2;32mComment: \033[0;37m{0}".format(text))

        # Pick the red template for label 0, the blue one otherwise.
        sentiment_line = (
            "\033[0;31mSentiment: \033[0;31m{0}"
            if label == 0
            else "\033[0;34mSentiment: \033[0;34m{0}"
        )
        print(sentiment_line.format(label))

        print()

In [7]:
def printResultsWithStars(docs: list, preds:list, ratings=[]):
    """Print each comment, its predicted sentiment and its star rating, ANSI-colored.

    A prediction equal to 0 is printed in red, anything else in blue; the
    rating is printed in yellow. The three sequences are paired positionally
    with ``zip``, so output stops at the shortest one - with the default empty
    ``ratings`` nothing is printed. Colors read best in dark mode.
    """
    for text, label, stars in zip(docs, preds, ratings):
        print("\033[2;32mComment: \033[0;37m{0}".format(text))

        # Pick the red template for label 0, the blue one otherwise.
        sentiment_line = (
            "\033[0;31mSentiment: \033[0;31m{0}"
            if label == 0
            else "\033[0;34mSentiment: \033[0;34m{0}"
        )
        print(sentiment_line.format(label))

        # Trailing "\n" in the template leaves a blank line between entries.
        print("\033[0;33mStars Given: \033[0;33m{0}\n".format(stars))

Run above functions on 5 example comments I made up

In [8]:
# Clean the hand-written example comments before they are vectorized
preproc_comments = basicPreproc(docs)

In [9]:
# One predicted label per cleaned example comment
preds = getPreds(preproc_comments)

In [10]:
# Show the original (un-preprocessed) text next to each prediction
printResults(docs, preds)

[2;32mComment: [0;37mProf Ritchey is a great guy and an even better teacher! Anyone who doesn't like his class is just an idiot lmao
[0;34mSentiment: [0;34m1

[2;32mComment: [0;37mIf you get this guy, just drop out. It's actually a better use of your time to stick forks in your eyes than trying to understand his lectures.
[0;31mSentiment: [0;31m0

[2;32mComment: [0;37mI'm just happy to be here!
[0;34mSentiment: [0;34m1

[2;32mComment: [0;37mThis guy sucks and stinks and is bad and I hate him.
[0;31mSentiment: [0;31m0

[2;32mComment: [0;37mTrying to use very negative words so the model picks up the bad terrible horrible hate sentiment
[0;31mSentiment: [0;31m0



Try model on a sample from original set of scraped comments ( `scraped_comments.csv` )

In [11]:
def evalPerformance(y_pred, y_test, mode="weighted"):
    """Print accuracy and F1 (both scaled to percentages) for a set of predictions.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels.
    y_test : array-like
        True labels.
    mode : str, default "weighted"
        Passed to ``f1_score`` as its ``average`` argument.
    """
    accuracy_pct = accuracy_score(y_test, y_pred) * 100
    print("Accuracy Score: " + str(accuracy_pct))

    f1_pct = f1_score(y_test, y_pred, average=mode) * 100
    print("F1 Score: {0}".format(f1_pct))

In [12]:
# Draw a reproducible 20,000-row sample of the scraped comments, then clean it:
#   1. keep the first row for each comment_id
#   2. drop rows with a missing comment
#   3. drop the "No Comments" placeholder
#   4. keep only comments longer than 5 whitespace-separated words
# and finally renumber the index from 0.
old_reviews = (
    pd.read_csv("../Data/scraped_comments.csv")
    .sample(n=20000, random_state=1)
    .drop_duplicates(subset="comment_id", keep="first")
    .dropna(subset=["comment"])
    .loc[lambda frame: frame["comment"] != "No Comments"]
    .loc[lambda frame: frame["comment"].apply(lambda text: len(text.split()) > 5).astype(bool)]
    .reset_index(drop=True)
)

In [13]:
# Apply the same cleaning to the scraped reviews (note: rebinds preproc_comments)
preproc_comments = basicPreproc(old_reviews["comment"])

In [14]:
# Predict a label for every cleaned scraped review (note: rebinds preds)
preds = getPreds(preproc_comments)

In [15]:
# Ground-truth label derived from the rating: 1 when clarityRating is above 2.5, otherwise 0
real_sentiment = old_reviews["clarityRating"].gt(2.5).astype("int64")

Printing results for ~20k comments is not feasible, so use the standard `evalPerformance()` function from the other attempts to summarize accuracy instead.

In [16]:
# Argument order is (predicted labels, true labels)
evalPerformance(preds, real_sentiment)

Accuracy Score: 86.29621430152756
F1 Score: 85.30637470427551


We can still show results for a few reviews, so let's do that for the first 5, also displaying their star ratings since these are real reviews :)

In [17]:
# First five reviews: raw comment text, predicted label, and the rating actually given
comm_print = old_reviews["comment"].head(5)
preds_print = preds[:5]
stars_print = old_reviews["clarityRating"].head(5)

In [18]:
# Show each raw comment with its predicted label and its real rating
printResultsWithStars(comm_print, preds_print, stars_print)

[2;32mComment: [0;37mgood teacher! Class gets boring but keep busy and stay half-listening and you'll be good for the lecture questions. The lectures do have the potential to be interesting :) and yes, attendance is NOT mandatory!
[0;34mSentiment: [0;34m1
[0;33mStars Given: [0;33m5

[2;32mComment: [0;37mHeis really helpful.He is themost helpful teacher I ve ever seen.He teaches well and I enjoy his class.His examples make it easier to understand.Just do what he tells you.You ll get an A
[0;34mSentiment: [0;34m1
[0;33mStars Given: [0;33m4

[2;32mComment: [0;37mVery patient non tolerant and understanding lovely lady very concern about all her students
[0;34mSentiment: [0;34m1
[0;33mStars Given: [0;33m5

[2;32mComment: [0;37mI hated the course and had to drop it but he is an alright guy. He even gives you his home number to call if you have problems. He makes you read a lot and gives tons of assignments. Not my fav, but ok.
[0;34mSentiment: [0;34m1
[0;33mStars Given