# Use negation tagging

In [2]:
import pandas as pd
import re
import nltk

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from collections import Counter

from sklearn.feature_selection import chi2, SelectPercentile
from sklearn.pipeline import Pipeline

import sys
sys.path.append("..")

from preproc import Preproc
import corpora

In [3]:
reviews = pd.read_csv("../Data/scraped_comments_with_professor.csv")

In [4]:
reviews.drop_duplicates(subset="comment_id", keep="first", inplace=True)

reviews.dropna(subset=["comment"], inplace=True)
reviews = reviews[reviews["comment"] != "No Comments"]

reviews['firstName'].fillna('', inplace=True)
reviews['lastName'].fillna('', inplace=True)

reviews["comment"] = reviews["comment"].apply(lambda x: x if len(x.split()) > 5 else None)
reviews.dropna(subset=["comment"], inplace=True)

In [5]:
reviews["starRating"] = (reviews["clarityRating"] + reviews["helpfulRating"]) / 2
reviews = reviews[reviews["starRating"] != 3.0]         # drop 3 star reviews
reviews.reset_index(drop=True, inplace=True)

reviews["starRating"].value_counts()

 5.0    176812
 4.5     58705
 4.0     52643
 1.0     45128
 2.0     26945
 3.5     22896
 1.5     19230
 2.5     17510
-1.0         1
Name: starRating, dtype: int64

In [6]:
negative_words = ["not", "no", "nor", "neither"]
puncts = [".", ",", "!", "?"]

def negationTag(comment):
    new_comment = []
    negation_flag = False
    for token in nltk.word_tokenize(comment):
        if token in negative_words:
            negation_flag = True
            continue
        if negation_flag == True:
            if token in puncts:
                negation_flag = False
            else:
                token = "neg_" + token
        
        new_comment.append(token)
    
    return " ".join(new_comment)

In [7]:
negationTag("This class is not very hard. According to a friend, neither are our other classes.")
negationTag("This class is not great. nor was it informational.")

'This class is neg_great . neg_was neg_it neg_informational .'

In [8]:
# helper functions to convert emoticons and expand contractions using large dictionaries (defined in corpora.py)
def convert_emoticons(text):
    return corpora.EMOTICONS.get(text, text)

def expand_contractions(text):
  return corpora.CONTRACTIONS.get(text, text)

# helper functions to perform simple regex substitutions
def remove_urls(text):
    return re.sub(r'https?://\S+|www\.\S+', ' ', text)

def remove_phones(text):
    return re.sub(r'\d{3}-\d{3}-\d{4}', ' ', text)

def remove_emails(text):
    return re.sub(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', ' ', text)

def remove_html_entities(text):
    text = re.sub('&[0-9a-zA-Z#]+;', ' ', text)
    return re.sub('&#63;?', '', text)

def remove_html_tags(text):
    return re.sub('<.{1,6}?>', ' ', text)

In [9]:
def preproc(reviews: pd.DataFrame):
    comments_proper = []
    unseen = Counter()

    for index, review in reviews.iterrows():
        comment = review['comment']
        fname = review['firstName'].lower().split(' ')
        lname = review['lastName'].lower().split(' ')
        names = set(fname + lname)

        comment = remove_urls(comment)
        comment = remove_phones(comment)
        comment = remove_emails(comment)
        comment = remove_html_entities(comment)
        comment = remove_html_tags(comment)

        comment_split = comment.split(' ')
        new_comment_split = []
        for i, word in enumerate(comment_split):
            word = convert_emoticons(word)
            word = word.lower()
            word = expand_contractions(word)
            word = re.sub("[^a-zA-Z\s]+", ' ', word)   # replace characters that are not alphabetic, space, or underscore
            word = re.sub(r'(.)\1\1+', '\g<1>', word)  # replace any three character+ sequence with one
            word = re.sub('\s+', ' ', word)
            word = word.strip() # trailing whitespace because punctuation replaced by space
            new_comment_split.extend(word.split(' '))

        # Remove names from the comment
        for i, word in enumerate(new_comment_split):
            if word in names:
                new_comment_split[i] = ''

        # putting negation tagging line here improves accuracy *a lot*
        # probably bc putting it here essentially leads to negation tagging being ignored lol

        comment = ' '.join(new_comment_split)
        comment = re.sub('\s+', ' ', comment)
        comment = comment.strip()

        comment = negationTag(comment)        # tag words following negation words as negated

        comment = ' '.join(word for word in comment.split() if len(word) > 1)

        comments_proper.append(comment)

    return comments_proper

In [10]:
comments_proper = preproc(reviews)

In [11]:
reviews = reviews.loc[:, ["professor_id", "firstName", "lastName", "comment", "starRating"]]
reviews["cleanedComment"] = pd.Series(comments_proper)
reviews["sentiment"] = reviews["starRating"].apply(lambda x: 1 if x > 2.5 else 0)

reviews.head()

Unnamed: 0,professor_id,firstName,lastName,comment,starRating,cleanedComment,sentiment
0,VGVhY2hlci0xMjQzMzQ3,Kurt,Douglass,"Good teacher, good lectures. Obviously cares a...",5.0,good teacher good lectures obviously cares abo...,1
1,VGVhY2hlci0xMjQzMzQ3,Kurt,Douglass,"Good teacher, very lenient with grading and at...",5.0,good teacher very lenient with grading and att...,1
2,VGVhY2hlci0xMjQzMzQ3,Kurt,Douglass,Very difficult class. His grading is hard to ...,1.5,very difficult class his grading is hard to un...,0
3,VGVhY2hlci0xMDExMDU2,Paula,Zobisch,Excellent mentor. Created valuable foundations...,5.0,excellent mentor created valuable foundations ...,1
4,VGVhY2hlci0xMDExMDU2,Paula,Zobisch,Awesome...Had her for two classes at baker col...,5.0,awesome had her for two classes at baker colle...,1


In [12]:
prof_counts = reviews["professor_id"].value_counts()
prof_train, prof_test, cnt_train, cnt_test = train_test_split(prof_counts.index, prof_counts.values, test_size=0.2, random_state=1)

In [13]:
comm_train = reviews[reviews["professor_id"].isin(prof_train)]["cleanedComment"]
comm_test = reviews[reviews["professor_id"].isin(prof_test)]["cleanedComment"]

sent_train = reviews[reviews["professor_id"].isin(prof_train)]["sentiment"]
sent_test = reviews[reviews["professor_id"].isin(prof_test)]["sentiment"]

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer


pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer(ngram_range=(1,3), min_df=6)), 
    ("selector"  , SelectPercentile(score_func=chi2, percentile=26)),
    ("classifer" , LogisticRegression(solver="liblinear", C=6))
])

In [15]:
from sklearn.metrics import confusion_matrix

def evalPerformance(y_pred, y_test, mode="weighted"):
    acc_score = accuracy_score(y_test, y_pred)
    print("Accuracy: " + str(acc_score * 100))
    
    f1 = f1_score(y_test, y_pred, average=mode)
    print("F1 Score: {0}\n".format(f1 * 100))

    prec = precision_score(y_test, y_pred, average=mode)
    print("Precision: {0}".format(prec * 100))
    rec = recall_score(y_test, y_pred, average=mode)
    print("Recall: {0}\n".format(rec * 100))

    print(confusion_matrix(y_test, y_pred))

In [16]:
sentiment_fit = pipeline.fit(comm_train, sent_train)

In [17]:
sent_pred = sentiment_fit.predict(comm_test)

In [18]:
evalPerformance(sent_pred, sent_test)

Accuracy: 93.59252362882127
F1 Score: 93.53080418969013

Precision: 93.52123049182329
Recall: 93.59252362882127

[[18711  3342]
 [ 2095 60706]]


In [19]:
old_reviews = pd.read_csv("../Data/scraped_comments.csv").sample(n=100000, random_state=1)

old_reviews.drop_duplicates(subset="comment_id", keep="first", inplace=True)

old_reviews.dropna(subset=["comment"], inplace=True)
old_reviews = old_reviews[old_reviews["comment"] != "No Comments"]
old_reviews["comment"] = old_reviews["comment"].apply(lambda x: x if len(x.split()) > 5 else None)
old_reviews.dropna(subset=["comment"], inplace=True)

old_reviews.reset_index(drop=True, inplace=True)

In [20]:
old_reviews["starRating"] = (old_reviews["clarityRating"] + old_reviews["helpfulRating"]) / 2
old_reviews = old_reviews[old_reviews["starRating"] != 3.0]         # drop 3 star reviews

In [21]:
old_reviews = old_reviews[old_reviews["clarityRating"] == old_reviews["helpfulRating"]]
old_reviews.reset_index(inplace=True, drop=True)

In [22]:
preproc_comments = preproc(old_reviews)

In [23]:
preds = sentiment_fit.predict(preproc_comments)

In [24]:
real_sentiment = old_reviews["starRating"].apply(lambda x: 1 if x > 2.5 else 0)
evalPerformance(preds, real_sentiment)

Accuracy: 95.09139716221398
F1 Score: 95.05241925825666

Precision: 95.05625205695857
Recall: 95.09139716221398

[[13294  1710]
 [  978 38779]]


In [25]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ethan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [26]:
text = nltk.word_tokenize("This class is pretty easy ngl.")
nltk.pos_tag(text)

[('This', 'DT'),
 ('class', 'NN'),
 ('is', 'VBZ'),
 ('pretty', 'JJ'),
 ('easy', 'JJ'),
 ('ngl', 'NN'),
 ('.', '.')]

In [27]:
tokenized = []
for comm in reviews["comment"]:
    tokenized.append(nltk.word_tokenize(comm))

KeyboardInterrupt: 

In [None]:
parts_of_speech = []
for comm in tokenized:
    parts_of_speech.append(nltk.pos_tag(comm))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from nltk import FreqDist

In [None]:
parts_of_speech

In [None]:
parts_of_speech = [item for sublist in parts_of_speech for item in sublist]
parts_of_speech

In [None]:
fdist = FreqDist(tag for (word, tag) in parts_of_speech)
words_df = pd.DataFrame({"POS":list(fdist.keys()), "count":list(fdist.values())})

d = words_df.nlargest(columns="count", n=45)
plt.figure(figsize=(25,5))
# plt.xticks(rotation=45)
ax = sns.barplot(data=d, x="POS", y="count")
ax.set(ylabel="Count")
plt.show()

In [None]:
words_df["word"].describe()