In [1]:
#This notebook is based on the tutorial at https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk
#which shows how to do sentiment analysis using the NLTK toolkit for Python. It is applied here to the NoSleep comment dataset.
#The tokenizer reference at https://www.nltk.org/api/nltk.tokenize.html was also used.
import pandas as pd
import re
import string
import random
import nltk
import numpy as np

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.tokenize import TweetTokenizer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import classify
from nltk import NaiveBayesClassifier

#Seed every random number generator the notebook uses so results are
#reproducible: NumPy's global generator and Python's built-in `random`
#module, which is the one random.shuffle draws from when the train/test
#split is made.
SEED = 2018
np.random.seed(SEED)
random.seed(SEED)

In [None]:
#One-time setup: this only has to be run once. It fetches the NLTK
#resources that are necessary to perform sentiment analysis.

#'punkt' is a pre-trained model that helps tokenize words and sentences;
#every name in the tuple is handed to nltk.download in the order listed.
for resource_name in ('punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(resource_name)

In [8]:
#Read every user comment into a pandas DataFrame. The CSV path is relative,
#so the file has to sit in the same directory as the notebook unless the
#path is changed. Blank sentiment scores are replaced with -1.
raw_comments = pd.read_csv('user_comments.csv')
comment_df = raw_comments.assign(sentiment=raw_comments['sentiment'].fillna(-1))

comment_df

Unnamed: 0,link_id,sortKey,score,permalink,author_fullname,id,storyId,author,parent_id,body,sentiment,prediction
0,t3_bs22s7,1558760400,10,/r/nosleep/comments/bs22s7/i_work_on_a_boat_ou...,t2_wcuxx,eok7qb9,bs22s7,Wolf_of_WV,t3_bs22s7,You are a dead man walking. The people who ar...,0.0,-1
1,t3_dukiqw,1573621200,1,/r/nosleep/comments/dukiqw/a_childs_method_for...,t2_86jlh,f77papf,dukiqw,Sporkazm,t1_f77p7a1,&amp;#x200B;\n\nSomehow I came to return the e...,0.0,-1
2,t3_e4lcwk,1575349200,4,/r/nosleep/comments/e4lcwk/everyone_knows_the_...,t2_j5mx0,f9erm02,e4lcwk,LiKenun,t3_e4lcwk,"&gt;My neck was sticky, and stank, stank like...",1.0,-1
3,t3_au1bdu,1551157200,1,/r/nosleep/comments/au1bdu/conditions_of_entry...,t2_nguj2,eh6cdqn,au1bdu,Reddit__Herring,t3_au1bdu,That was really awesome. Very interesting conc...,1.0,-1
4,t3_d64uh2,1568955600,1,/r/nosleep/comments/d64uh2/the_188minute_man/f...,t2_2vpmh2gy,f0rojyz,d64uh2,jcammarato,t1_f0ridzm,"Yes, but she also has no choice and will event...",-1.0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...
169583,t3_b8zif8,1554440400,7,/r/nosleep/comments/b8zif8/the_lynch_house/ek2...,t2_20igrc4i,ek27hib,b8zif8,BlondeRR1717,t3_b8zif8,Are you aware that you wrote natural causes fo...,-1.0,-1
169584,t3_bgj62b,1556168400,73,/r/nosleep/comments/bgj62b/my_first_breath_too...,t2_lg9ark7,ellzl0a,bgj62b,Bismuthie,t1_ellx4dp,Omg smart,-1.0,-1
169585,t3_btonzl,1559106000,2,/r/nosleep/comments/btonzl/her_eye_was_a_spira...,t2_gkau4,ep15bo3,btonzl,thejollyden,t1_ep12y0d,How do you delete comments?,-1.0,-1
169586,t3_cn3nbl,1565326800,1,/r/nosleep/comments/cn3nbl/straight_to_vhs_sun...,t2_17gt7w,ew7ctpd,cn3nbl,LadyGrey1174,t3_cn3nbl,"Holy hannah, time for a Disney movie...",-1.0,-1


In [3]:
#Split the rated comments by label and tokenize the body of each comment
#with NLTK's TweetTokenizer, producing one bag of words (a list of tokens)
#per comment.
#NB - Positive Comments = 1. 'Not' Positive Comments = 0
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)

is_positive = comment_df['sentiment'] == 1
is_negative = comment_df['sentiment'] == 0
positive_comments = comment_df[is_positive]
negative_comments = comment_df[is_negative]

#Iterate the "body" column directly; each element is one comment's text
positive_tokens = [tknzr.tokenize(body) for body in positive_comments["body"]]
negative_tokens = [tknzr.tokenize(body) for body in negative_comments["body"]]

In [4]:
#This section cleanses and normalizes the positive and negative tokens obtained above
#Note that the cleansing part still needs some work.
def remove_noise(comment_tokens, stop_words = ()):
    """Clean and normalize the tokens of a single comment.

    Each token is POS-tagged, stripped of URLs and @-handles, lemmatized
    with WordNet (as a noun, verb or adjective depending on its tag) and
    lower-cased. Empty tokens, punctuation and stop words are dropped.

    Parameters
    ----------
    comment_tokens : list of str
        Tokens of one comment.
    stop_words : iterable of str, optional
        Words to discard; they are compared against the lower-cased token,
        so entries should be lower-case. Defaults to an empty tuple, which
        keeps every word.

    Returns
    -------
    list of str
        The cleaned, lower-cased tokens in their original order.
    """
    #Raw strings keep the regex backslashes literal, so Python does not
    #warn about invalid escape sequences such as "\(".
    url_pattern = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                   r'(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    handle_pattern = r"(@[A-Za-z0-9_]+)"

    #Loop-invariant setup: one lemmatizer and one hashed stop-word lookup
    #per call instead of one per token.
    lemmatizer = WordNetLemmatizer()
    stop_word_set = frozenset(stop_words)

    cleaned_tokens = []

    for token, tag in pos_tag(comment_tokens):
        token = re.sub(url_pattern, '', token)
        token = re.sub(handle_pattern, '', token)

        #Nothing left once the URL/handle is removed: the token would be
        #discarded below anyway, so skip the lemmatizer call.
        if not token:
            continue

        #Map the Penn Treebank tag prefix to the WordNet POS code
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)

        #"in string.punctuation" is a substring test against the
        #punctuation string, same as in the original filter.
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_word_set:
            cleaned_tokens.append(token.lower())

    return cleaned_tokens

#Load NLTK's English stop word list
stop_words = stopwords.words('english')

#Clean every tokenized comment, dropping the stop words along the way
negative_tokens_c = [remove_noise(tokens, stop_words) for tokens in negative_tokens]
positive_tokens_c = [remove_noise(tokens, stop_words) for tokens in positive_tokens]

In [5]:
#This section takes the cleansed tokens and puts them into
#dictionary objects to be used later on in the models
def get_all_words(cleaned_tokens_list):
    """Lazily yield every token from every token list, in order.

    cleaned_tokens_list is an iterable of token lists; the result is a
    generator, so it can be iterated only once.
    """
    for comment_tokens in cleaned_tokens_list:
        yield from comment_tokens
            
def get_comments_for_model(cleaned_tokens_list):
    """Lazily yield one {token: True} feature dict per token list.

    Repeated tokens within a list collapse into a single key. The result
    is a generator, so it can be iterated only once.
    """
    for comment_tokens in cleaned_tokens_list:
        yield dict.fromkeys(comment_tokens, True)

#Single-use generators of per-comment feature dicts for the model
pos_tokens_mod = get_comments_for_model(positive_tokens_c)
neg_tokens_mod = get_comments_for_model(negative_tokens_c)

#Below shows the top 10 common words in the Positive and
#Negative Bag of words. This can be commented out once satisfied
all_pos_words = get_all_words(positive_tokens_c)
all_neg_words = get_all_words(negative_tokens_c)

freq_dist_pos = FreqDist(all_pos_words)
freq_dist_neg = FreqDist(all_neg_words)

#Positive counts are printed first, then negative
for freq_dist in (freq_dist_pos, freq_dist_neg):
    print(freq_dist.most_common(10))

[('’', 27), ('get', 17), ('good', 14), ('like', 12), ('read', 12), ('kill', 11), ('think', 11), ('go', 10), ('op', 10), ("i'm", 10)]
[('like', 15), ('r', 15), ('nosleep', 15), ('#x200b', 14), ('get', 14), ('question', 14), ('must', 14), ('message', 14), ('moderator', 14), ('submission', 13)]


In [6]:
#This cell builds the labelled data sets used to train and test the model.
#The feature dictionaries are generated from the cleaned tokens right here
#(rather than read from the single-use *_tokens_mod generators), so the cell
#can be re-run on its own without first re-running the code above.

#Each comment becomes a (feature_dict, label) pair. The positive and negative
#sets are then merged, shuffled and split into train/test versions
#using 70/30% of the total data set respectively
positive_dataset = [(comment_dict, "Positive")
                     for comment_dict in get_comments_for_model(positive_tokens_c)]

negative_dataset = [(comment_dict, "Negative")
                     for comment_dict in get_comments_for_model(negative_tokens_c)]

dataset = positive_dataset + negative_dataset
train_size = int(len(dataset)*0.7)

random.shuffle(dataset)

train_data = dataset[:train_size]
test_data = dataset[train_size:]

#This is a sanity check to ensure that the total # is equal to the number of comments that were rated
print(len(positive_dataset), len(negative_dataset), len(dataset))
print(len(train_data), len(test_data))

107 89 196
137 59


In [7]:
#This trains a NaiveBayesClassifier model on the train_data created above,
#then measures its accuracy on the test_data set. Ideally, we want the
#accuracy to be as high as possible. Right now it is about 55%
classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

#show_most_informative_features prints its table itself and returns None,
#so it is called directly; wrapping it in print() would emit a stray "None"
classifier.show_most_informative_features(10)

Accuracy is: 0.5423728813559322
Most Informative Features
                    post = True           Negati : Positi =      7.0 : 1.0
               subreddit = True           Negati : Positi =      7.0 : 1.0
                       r = True           Negati : Positi =      7.0 : 1.0
                    must = True           Negati : Positi =      7.0 : 1.0
                    look = True           Negati : Positi =      6.1 : 1.0
                 comment = True           Negati : Positi =      5.1 : 1.0
                   check = True           Negati : Positi =      4.2 : 1.0
                    need = True           Negati : Positi =      3.6 : 1.0
                    kind = True           Negati : Positi =      3.3 : 1.0
                    part = True           Negati : Positi =      3.3 : 1.0
None


In [12]:
#This section creates a new dataframe based on the un-rated comments
#(sentiment == -1) and applies the model to the first PREVIEW_SIZE of them.
#The predicted label is printed next to each comment. Once the model is more
#robust, this can be updated to run against the entire comment list with the
#predicted value stored in the prediction column
PREVIEW_SIZE = 10
random_comments = comment_df[comment_df['sentiment'] == -1]

tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)

#head() caps the preview at the rows that actually exist, so having fewer
#than PREVIEW_SIZE un-rated comments does not raise an IndexError
for position, body in enumerate(random_comments["body"].head(PREVIEW_SIZE), start=1):
    #Pass stop_words so the tokens are cleaned the same way as the training data
    custom_tokens = remove_noise(tknzr.tokenize(body), stop_words)
    predicted_sentiment = classifier.classify(dict([token, True] for token in custom_tokens))

    print(str(position) + '. ' + predicted_sentiment + ' - ' + body)

1. Negative - Yes, but she also has no choice and will eventually crave a companion as well.
2. Negative - I'm glad your husband decided to confront her. You would have never known otherwise.
3. Negative - Thank you. I don’t know what I believe anymore, but I appreciate your words.
4. Positive - Beautiful!
5. Negative - I did in a comment,but they've allbeen removed by nosleep.

I copy/pasted here for you:

&amp;#x200B;


[This](https://www.reddit.com/r/nosleep/comments/c1wxxu/i_work_at_nasa_we_made_alien_contact_yesterday/) was posted two days ago...not saying it's related, not saying it isn't.

But I'm scared.

&amp;#x200B;

EDIT. Oh, holy shit. All alert posts have been removed. WTF.
6. Negative - Happens with far more stories on this sub than it should. I don’t know if people get bored telling their true stories and rush the ending, or if the endings are just really hard to convey. Either way it’s disappointing every time it happens
7. Negative - r/hydrohomies must be in on it
8. N