# Testing a trained model on new reviews

In [3]:
from joblib import dump, load
import re

Load the most effective model from `attempt-3.ipynb`, along with its saved `CountVectorizer` and `SelectPercentile` (feature selector).

In [6]:
# Restore the three saved artifacts in pipeline order: classifier,
# vectorizer, feature selector.
# NOTE(review): joblib.load unpickles its input and can execute arbitrary
# code -- only load artifact files from a trusted source.
model, cv, percbest = (
    load(artifact_path)
    for artifact_path in ("NB-model.joblib", "NB-vectorizer.joblib", "NB-chi2.joblib")
)

In [23]:
# New review sentences to run through the loaded model.
# Each entry is one raw, un-preprocessed review string; cleaning happens in a
# later cell.
docs = [
    "Prof Ritchey is a great guy and an even better teacher! Anyone who doesn't like his class is just an idiot lmao",
    "If you get this guy, just drop out. It's actually a better use of your time to stick forks in your eyes than trying to understand his lectures.",
    "I'm just happy to be here!",
    "This guy sucks and stinks and is bad and I hate him.",
    "Trying to use very negative words so the model picks up the bad terrible horrible hate sentiment"
]

Feed the new comments through the basic preprocessing pipeline.

In [24]:
def clean_review(review):
    """Normalize one raw review string for the vectorizer.

    Steps, in order: drop HTML entities, links and e-mail addresses; collapse
    any character repeated three or more times down to a single occurrence;
    replace every run of non-alphabetic characters with one space; lowercase.

    All patterns are raw strings so that regex escapes such as ``\\d`` and
    ``\\s`` are not parsed as (invalid) Python string escapes.

    Args:
        review: The raw review text.

    Returns:
        The cleaned, lowercased text. Leading/trailing spaces are not stripped.
    """
    # HTML entities such as "&amp;" or "&#39;". The letter class is [a-zA-Z];
    # the previous "A-z" range also matched the punctuation between 'Z' and 'a'.
    review = re.sub(r'&([a-zA-Z]+|#\d+);', "", review)
    # HTML code for question mark evades erasure on occasion (e.g. when the
    # trailing ';' is missing), handle here
    review = re.sub(r'&#63;?', '', review)
    # Links, together with the whitespace around them
    review = re.sub(r'\s*https?://\S+(\s+|$)', ' ', review)
    # Phone numbers.
    # NOTE(review): the ^...$ anchors mean this only fires when the entire
    # review is a phone number. Embedded numbers are still removed further down
    # by the non-alphabetic step, so the anchors are kept unchanged here.
    review = re.sub(r"^(\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$", ' ', review)
    # E-mail addresses
    review = re.sub(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", " ", review)

    # Collapse any character repeated three or more times into one ("sooo" -> "so")
    review = re.sub(r'(.)\1\1+', r'\g<1>', review)

    # Replace every run of non-alphabetic characters with a single space
    review = re.sub(r'[^a-zA-Z]+', ' ', review)

    review = re.sub(r'\s+', ' ', review)
    return review.lower()                                       # lowercase review for uniformity


comments_proper = [clean_review(review) for review in docs]

comments_proper

['prof ritchey is a great guy and an even better teacher anyone who doesn t like his class is just an idiot lmao',
 'if you get this guy just drop out it s actually a better use of your time to stick forks in your eyes than trying to understand his lectures ',
 'i m just happy to be here ',
 'this guy sucks and stinks and is bad and i hate him ',
 'trying to use very negative words so the model picks up the bad terrible horrible hate sentiment']

In [25]:
# Vectorize, feature-select and classify all cleaned reviews in one batch
# rather than one call chain per review; each transform works row by row, so
# the per-review results are the same.
# The guard keeps the original behaviour for an empty input ([] rather than an
# error from transforming zero samples).
if comments_proper:
    features = percbest.transform(cv.transform(comments_proper))
    # predict() yields one label per input row (an array for scikit-learn
    # estimators -- TODO confirm for this model); list() gives a plain list.
    preds = list(model.predict(features))
else:
    preds = []

In [26]:
# Display the predicted labels, one per cleaned review, in input order.
# NOTE(review): the meaning of each label value (0 vs 1) is defined by the
# training notebook and is not visible here -- confirm before interpreting.
preds

[1, 1, 1, 1, 0]