Testing RMP Analysis

Starting with simple sentiment analysis and hoping it goes somewhere :)

In [1]:
import pandas as pd
import re

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

lemm = WordNetLemmatizer()
reviews = pd.read_csv("RMP-data.csv")

In [2]:
# keep only necessary columns
reviews = reviews.loc[:, ["professor_name", "student_star", "comments"]]

reviews.comments.fillna("Empty", inplace=True)
reviews.student_star.fillna(-1, inplace=True)           # to denote it was empty without causing a runtime error

reviews["sentiment"] = 1            # create new 'sentiment' column and set it to all 1s (denoting positive sentiment)

# FIXME: need to find a way to remove backslashes from comments effectively
reviews.comments.replace("\\", "", inplace=True)        # this will search for comments whose value is "\\", but I want to search within a comment for a "\\" soooo...

reviews.head()

#empty_comments = reviews[pd.isnull(reviews.comments)]       # both frames have 0 rows, so all NaNs were successfully terminated
#empty_stars = reviews[pd.isnull(reviews.student_star)]

Unnamed: 0,professor_name,student_star,comments,sentiment
0,Leslie Looney,5.0,"This class is hard, but its a two-in-one gen-e...",1
1,Leslie Looney,5.0,Definitely going to choose Prof. Looney\'s cla...,1
2,Leslie Looney,4.0,I overall enjoyed this class because the assig...,1
3,Leslie Looney,5.0,"Yes, it\'s possible to get an A but you\'ll de...",1
4,Leslie Looney,5.0,Professor Looney has great knowledge in Astron...,1


In [3]:
# Appropriately fill sentiment column using student_star as a basis
reviews["sentiment"] = reviews["student_star"].apply(lambda x: 1 if x > 2.5 else 0)
# if student rated more than 2.5 stars, take their review as positive; otherwise, negative
reviews.head()

Unnamed: 0,professor_name,student_star,comments,sentiment
0,Leslie Looney,5.0,"This class is hard, but its a two-in-one gen-e...",1
1,Leslie Looney,5.0,Definitely going to choose Prof. Looney\'s cla...,1
2,Leslie Looney,4.0,I overall enjoyed this class because the assig...,1
3,Leslie Looney,5.0,"Yes, it\'s possible to get an A but you\'ll de...",1
4,Leslie Looney,5.0,Professor Looney has great knowledge in Astron...,1


In [4]:
# Ensure that negative stopwords are kept (important for sentiment analysis)
eng_stopwords = stopwords.words("english")
# matter of fact, may end up being better to not remove stopwords at all to ensure no loss of depth in model, but this'll do for now
eng_stopwords.remove("no")
eng_stopwords.remove("not")
eng_stopwords.remove("nor")
eng_stopwords.remove("couldn't")
eng_stopwords.remove("doesn't")
eng_stopwords.remove("haven't")
eng_stopwords.remove("hadn't")
eng_stopwords.remove("weren't")
eng_stopwords.remove("won't")
eng_stopwords.remove("wouldn't")

In [5]:
# Basic preprocessing
corpus = []

"""
for i in range(0, reviews.shape[0]):
    comm = reviews.iloc[i].comments         # isolate student comment from row
    sentences = nltk.sent_tokenize(comm)    # break into sentences
    sentences = [nltk.word_tokenize(sentence) for sentence in sentences]        # break into words
    for j in range(0, len(sentences)):
        sentences[j] = [lemm.lemmatize(word) for word in sentences[j] if word not in eng_stopwords]         # lemmatize each non-stopword
    corpus.append(sentences)
"""

for i in range(len(reviews.index)):
    review = re.sub('[^a-zA-Z]', ' ', reviews["comments"][i])       # keep only alphabet characters
    review = review.lower()         # make lowercase
    review = review.split()         # split into words
    review = [lemm.lemmatize(word) for word in review if word not in eng_stopwords]         # lemmatize and remove stopwords
    review = " ".join(review)       # combine back
    corpus.append(review)           # add to list of sentence

In [6]:
# Create Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(corpus).toarray()
y = pd.get_dummies(reviews)     # isolate sentiments
y = y.loc[:, "sentiment"]

In [7]:
# Train-Test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [8]:
# Train model using Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [9]:
# Test accuracy of model
from sklearn.metrics import confusion_matrix, accuracy_score

conf_m = confusion_matrix(y_test, y_pred)

acc_score = accuracy_score(y_test, y_pred)

print("Accuracy Score: " + str(acc_score))
print(conf_m)

Accuracy Score: 0.8295
[[ 853  267]
 [ 415 2465]]
