In [1]:
import pandas as pd
import re
import spacy

from datetime import datetime
dateparse = lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S %z %Z")

In [2]:
reviews = pd.read_csv("https://rmp-bucket.s3.amazonaws.com/scraped_comments.csv", parse_dates=['date'], date_parser=dateparse).sample(n = 50000, random_state=1)

In [3]:
reviews.drop_duplicates(subset="comment_id", keep="first", inplace=True)

reviews.dropna(subset=["comment"], inplace=True)
# drop rows containing only "No Comments" (default value assigned by RMP to a review that didn't enter a comment)
reviews = reviews[reviews["comment"] != "No Comments"]
# replace all comments with less than 5 words with a NaN
reviews["comment"] = reviews["comment"].apply(lambda x: x if len(x.split()) > 5 else None)
# drop rows containing NaN comment
reviews.dropna(subset=["comment"], inplace=True)

reviews.reset_index(drop=True, inplace=True)

In [9]:
comments_proper = []

for i in range(reviews.shape[0]):
    review = reviews["comment"][i]
    review = re.sub('&([a-zA-z]+|#\d+);', "", review)               # remove HTML codes
    review = re.sub('&#63;?', '', review)
    review = re.sub('[^a-zA-Z\s\d{1,3}]+', ' ', review)             # TODO: keep 3 digit numbers
    review = re.sub('\s+', ' ', review)                             # get rid of excess whitespace generated by spaCy's less-than-ideal tokenization
    review = re.sub(r'\s*https?://\S+(\s+|$)', ' ', review)                                     # remove links
    review = re.sub("^(\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$", ' ', review)         # remove phone numbers
    review = re.sub("[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", " ", review)              # remove email addresses
    review = review.lower()                                         # lowercase review for uniformity
    comments_proper.append(review)

In [5]:
nlp = spacy.load("en_core_web_md", disable=["parser", "ner"])
no_stopwords = []

for doc in nlp.pipe(comments_proper):
    token_list = []
    for token in doc:
        token_list.append(token.lemma_)

    filtered_sent = []
    for word in token_list:
        lexeme = nlp.vocab[word]
        if lexeme.is_stop == False:
            filtered_sent.append(word)
    
    no_stopwords.append(" ".join(filtered_sent))

In [6]:
reviews = reviews.loc[:, ["firstName", "lastName", "comment", "clarityRating"]]
reviews["cleaned comment"] = pd.Series(no_stopwords)
reviews.head()

Unnamed: 0,firstName,lastName,comment,clarityRating,cleaned comment
0,Keith,Halladay,Keith is the best. Not afraid to give his hone...,5,keith good afraid honest opinion grading paper...
1,Charles,Lankau,Lankau was great for 4000 and 5110. He makes c...,4,lankau great class fun hilarious prepare study...
2,James,McGivern,I hated every second of my life that I had to ...,2,hate second life sit class intelligent man ide...
3,Howard G.,Tucker,Crazy Man. No grading system. You dont know yo...,5,crazy man grading system know grade tell stude...
4,Paul,Cohen,This teacher is the worst I've ever taken in m...,1,teacher bad ve entire schooling career rude be...


In [19]:
#reviews["sentiment"] = reviews["clarityRating"].apply(lambda x: 1 if x > 3 else 0 if x == 3 else -1)       # accuracy: 0.7518830305715551
reviews["sentiment"] = reviews["clarityRating"].apply(lambda x: 1 if x > 2.5 else 0)                        # accuracy: 0.8635356668143553
reviews.head()

Unnamed: 0,firstName,lastName,comment,clarityRating,cleaned comment,sentiment
0,Keith,Halladay,Keith is the best. Not afraid to give his hone...,5,keith good afraid honest opinion grading paper...,1
1,Charles,Lankau,Lankau was great for 4000 and 5110. He makes c...,4,lankau great class fun hilarious prepare study...,1
2,James,McGivern,I hated every second of my life that I had to ...,2,hate second life sit class intelligent man ide...,0
3,Howard G.,Tucker,Crazy Man. No grading system. You dont know yo...,5,crazy man grading system know grade tell stude...,1
4,Paul,Cohen,This teacher is the worst I've ever taken in m...,1,teacher bad ve entire schooling career rude be...,0


In [20]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(comments_proper).toarray()
y = pd.get_dummies(reviews)     # isolate sentiments
y = y.loc[:, "sentiment"]

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [22]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [23]:
# Test accuracy of model
from sklearn.metrics import confusion_matrix, accuracy_score

conf_m = confusion_matrix(y_test, y_pred)

acc_score = accuracy_score(y_test, y_pred)

print("Accuracy Score: " + str(acc_score))
print(conf_m)

Accuracy Score: 0.8635356668143553
[[1853  536]
 [ 696 5943]]
