In [1]:
#This notebook follows the tutorial at https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk
#which shows how to do sentiment analysis with the NLTK toolkit for Python. The approach is applied here to the NoSleep comment dataset.
#The NLTK tokenizer documentation at https://www.nltk.org/api/nltk.tokenize.html was also used.
import pandas as pd
import re
import string
import random
import nltk
import numpy as np
import json

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.tokenize import TweetTokenizer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import classify
from nltk import NaiveBayesClassifier

#Seed both NumPy and the stdlib `random` module: the train/test split further down
#is shuffled with random.shuffle, which np.random.seed alone does not control
np.random.seed(2018)
random.seed(2018)

In [None]:
#One-time setup: this only has to be run once. It downloads the NLTK resources
#that are necessary to perform the sentiment analysis
NLTK_RESOURCES = (
    'punkt',                        #pre-trained model that helps tokenize words and sentences
    'wordnet',
    'averaged_perceptron_tagger',
    'stopwords',
)

for resource_name in NLTK_RESOURCES:
    nltk.download(resource_name)

In [84]:
#Load every user comment into a pandas DataFrame. The CSV is read from the
#notebook's working directory, so change the path if the file lives elsewhere.
comment_full = pd.read_csv('user_comments.csv')
#Comments without a manual rating get -1 as their sentiment
comment_full['sentiment'] = comment_full['sentiment'].fillna(-1)
#comment_full['prediction'] = -1
comment_full['prediction_5s'] = -1

#Comments of the story whose id contains 'bhokru' that are already predicted
#positive; the 'prediction' column is expected to be present in the CSV itself
is_bhokru_story = comment_full['storyId'].str.contains('bhokru')
random_story = comment_full[is_bhokru_story]
random_story = random_story[random_story['prediction'] == 1]

#.copy() gives later cells an independent frame to add/overwrite columns on,
#rather than a filtered slice of the raw frame (avoids SettingWithCopyWarning)
comment_full = comment_full[~is_bhokru_story].copy()

#First 2000 rows feed the model; the following 100 rows are kept for validation
comment_df = comment_full[:2000]
#comment_df = comment_full
#NOTE(review): the slice starts at 2001, so row 2000 lands in neither set - confirm this is intended
random_df = comment_full[2001:2101]

random_df = random_df.reset_index(drop=True)
#random_df

#Number of comments per 5-point sentiment class. count() yields a column named
#'prediction', so that is the name that has to be renamed to 'count'
comment_type = (
    comment_full.groupby("sentiment_5s")["prediction"]
    .count()
    .reset_index()
    .rename(columns={'prediction': 'count'})
)
comment_type

Unnamed: 0,sentiment_5s,prediction
0,-1.0,2
1,0.0,73
2,1.0,251
3,2.0,1381
4,3.0,301
5,4.0,98


In [None]:
#This section cleans up the comment set. It drops comments whose body contains
#'remove', 'cake day' (any case), the 'It looks like there may be more to this story'
#notice or an 'https://red' link, as well as comments that provide little value
#(fewer than 5 characters). The exception is very short comments containing a key
#word like more/moar etc, which indicates the story was good; those are kept and
#rated positive (sentiment = 1).
#THIS ONLY NEEDS TO BE RUN ONCE AS IT SAVES THE INFO BACK IN THE ORIGINAL FILE

MIN_BODY_LENGTH = 5
SHORT_POSITIVE_KEYWORDS = ('more', 'moar', 'sick', 'holy')

#The strip is applied to comment_full itself, so later cells see the trimmed bodies too
comment_full['body'] = comment_full['body'].str.strip()
comment_df = comment_full

body = comment_df['body']
body_lower = body.str.lower()

#na=False makes a missing body count as "no match", so the masks stay purely
#boolean and can be negated/combined without failing on NaN
is_unwanted = (
    body.str.contains('remove', na=False)
    | body_lower.str.contains('cake day', na=False)
    | body.str.contains('It looks like there may be more to this story', na=False)
    | body.str.contains('https://red', na=False)
)
#Both comparisons are False for a missing body, so such rows are dropped entirely
is_long_enough = body.str.len() >= MIN_BODY_LENGTH
is_short = body.str.len() < MIN_BODY_LENGTH

comment_c = comment_df[~is_unwanted & is_long_enough]

#Short keyword comments, one frame per keyword in the listed order; assign()
#returns a new frame, so no filtered slice is written to
short_positive_frames = [
    comment_df[is_short & body_lower.str.contains(keyword, na=False)].assign(sentiment=1)
    for keyword in SHORT_POSITIVE_KEYWORDS
]

comment_c = pd.concat([comment_c, *short_positive_frames])

comment_df = comment_c

comment_c.to_csv(r'user_comments.csv', index=False, header=True)

In [92]:
#Split the rated comments into one data set per 5-point rating (sentiment_5s)
#and run every comment body through NLTK's TweetTokenizer, giving one list of
#tokens (bag of words) per comment.
#Rating scale: 4 = very positive, 3 = positive, 2 = neutral, 1 = negative, 0 = very negative
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)

#positive_comments = comment_df[comment_df['sentiment'] == 1]
#negative_comments = comment_df[comment_df['sentiment'] == 0]

rating = comment_df['sentiment_5s']

#No per-class cap is applied; slicing a class here (e.g. [:301]) would balance the sets
v_positive_comments = comment_df[rating == 4]
positive_comments = comment_df[rating == 3]
neutral_comments = comment_df[rating == 2]
negative_comments = comment_df[rating == 1]
v_negative_comments = comment_df[rating == 0]

#One token list per comment, in row order
positive_tokens = [tknzr.tokenize(body) for body in positive_comments["body"]]
negative_tokens = [tknzr.tokenize(body) for body in negative_comments["body"]]
v_positive_tokens = [tknzr.tokenize(body) for body in v_positive_comments["body"]]
v_negative_tokens = [tknzr.tokenize(body) for body in v_negative_comments["body"]]
neutral_tokens = [tknzr.tokenize(body) for body in neutral_comments["body"]]

In [86]:
#This section cleanses and normalizes the positive and negative tokens obtained above
#Note that the cleansing part still needs some work.
#Tokens that are dropped outright: ellipsis variants and curly single quotes
_NOISE_TOKENS = frozenset(('...', '…', '’', '‘', '..'))

def remove_noise(comment_tokens, stop_words = ()):
    """Clean and normalize the tokens of a single comment.

    Each token has URLs and @handles stripped out, is lemmatized according to
    its part-of-speech tag and is lower-cased. Tokens that end up empty, are
    punctuation/ellipsis/quote noise, equal "#x200b" or are stop words are dropped.

    Args:
        comment_tokens: list of string tokens belonging to one comment.
        stop_words: iterable of lower-case words to drop (default: none).

    Returns:
        A list with the cleaned, lower-cased tokens in their original order.
    """
    #Built once per call rather than once per token
    lemmatizer = WordNetLemmatizer()
    #Set membership instead of scanning the stop word list for every token
    stop_word_set = frozenset(stop_words)
    cleaned_tokens = []

    for token, tag in pos_tag(comment_tokens):
        #Raw strings keep the pattern byte-identical without invalid "\(" escapes
        token = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                       r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        token = re.sub(r"(@[A-Za-z0-9_]+)", "", token)

        #Map the tag to the lemmatizer's categories: noun, verb, otherwise adjective
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)
        lowered = token.lower()

        if len(token) == 0 or token in _NOISE_TOKENS:
            continue
        #Substring test on purpose: also rejects runs such as '()' found in string.punctuation
        if token in string.punctuation:
            continue
        if lowered in stop_word_set or lowered == "#x200b":
            continue

        cleaned_tokens.append(lowered)

    return cleaned_tokens

In [93]:
#Load nltk's English stop words, then clean every tokenized comment of each
#class with remove_noise, keeping one cleaned token list per comment
stop_words = stopwords.words('english')

negative_tokens_c = [remove_noise(tokens, stop_words) for tokens in negative_tokens]
positive_tokens_c = [remove_noise(tokens, stop_words) for tokens in positive_tokens]
v_negative_tokens_c = [remove_noise(tokens, stop_words) for tokens in v_negative_tokens]
v_positive_tokens_c = [remove_noise(tokens, stop_words) for tokens in v_positive_tokens]
neutral_tokens_c = [remove_noise(tokens, stop_words) for tokens in neutral_tokens]

#Number of cleaned comments per class, from very positive down to very negative
print(len(v_positive_tokens_c), len(positive_tokens_c), len(neutral_tokens_c), len(negative_tokens_c), len(v_negative_tokens_c))

94 287 1311 238 68


In [94]:
#This section takes the cleansed tokens and puts them into
#dictionary objects to be used later on in the models
def get_all_words(cleaned_tokens_list):
    """Lazily yield every token of every comment, in order, as one flat stream."""
    for comment_tokens in cleaned_tokens_list:
        yield from comment_tokens
            
def get_comments_for_model(cleaned_tokens_list):
    """Lazily yield one {token: True} dict per comment, in comment order."""
    for comment_tokens in cleaned_tokens_list:
        yield dict.fromkeys(comment_tokens, True)

#Wrap each class's cleaned comments in a lazy generator of {token: True} dicts.
#Each of these generators can be consumed only once.
pos_tokens_mod, neg_tokens_mod, v_pos_tokens_mod, v_neg_tokens_mod, neu_tokens_mod = map(
    get_comments_for_model,
    (positive_tokens_c, negative_tokens_c, v_positive_tokens_c, v_negative_tokens_c, neutral_tokens_c),
)

#Show the 10 most common words in the positive, negative and neutral
#bags of words. This can be commented out once satisfied
all_pos_words, all_neg_words, all_neu_words = map(
    get_all_words,
    (positive_tokens_c, negative_tokens_c, neutral_tokens_c),
)

freq_dist_pos = FreqDist(all_pos_words)
freq_dist_neg = FreqDist(all_neg_words)
freq_dist_neu = FreqDist(all_neu_words)

for freq_dist in (freq_dist_pos, freq_dist_neg, freq_dist_neu):
    print(freq_dist.most_common(10))

[('story', 69), ('read', 43), ('good', 37), ('part', 30), ('like', 27), ('love', 26), ('please', 23), ('get', 23), ('one', 22), ('wait', 21)]
[('think', 32), ('like', 27), ('story', 23), ('get', 20), ('know', 18), ("i'm", 18), ('r', 18), ('people', 15), ('read', 15), ('thing', 14)]
[('get', 183), ('like', 169), ('know', 159), ('go', 155), ('would', 127), ('think', 119), ('one', 113), ('say', 105), ("i'm", 97), ('good', 96)]


In [95]:
#This builds a list of (comment_dict, label) pairs for every class, merges the
#lists, shuffles them and creates train/test versions of the data set using
#80/20% of the total data set respectively.
#The comment dicts are generated afresh from the cleaned *_tokens_c lists on every
#run, so this cell can be run multiple times in a row. (The one-shot *_tokens_mod
#generators are empty after a single pass and would give empty data sets on a rerun.)
def _label_comments(cleaned_tokens_list, label):
    """Pair every comment's {token: True} dict with the given class label."""
    return [(comment_dict, label)
            for comment_dict in get_comments_for_model(cleaned_tokens_list)]

positive_dataset = _label_comments(positive_tokens_c, "Positive")
negative_dataset = _label_comments(negative_tokens_c, "Negative")
v_positive_dataset = _label_comments(v_positive_tokens_c, "Very Positive")
v_negative_dataset = _label_comments(v_negative_tokens_c, "Very Negative")
neutral_dataset = _label_comments(neutral_tokens_c, "Neutral")

dataset = v_positive_dataset + positive_dataset + neutral_dataset + negative_dataset + v_negative_dataset
train_size = int(len(dataset)*0.8)

#random.shuffle draws from the stdlib `random` generator, not from NumPy's
random.shuffle(dataset)

train_data = dataset[:train_size]
test_data = dataset[train_size:]

#This is a sanity check to ensure that the total # equal to the comments that were rated
#print(len(positive_dataset), len(negative_dataset), len(dataset))
print(len(v_positive_dataset), len(positive_dataset), len(neutral_dataset), len(negative_dataset), len(v_negative_dataset), len(dataset))
print(len(train_data), len(test_data))

94 287 1311 238 68 1998
1598 400


In [9]:
#Persist the train and test data sets as JSON files so they can later be
#loaded straight into the model without repeating the clean-up work
for json_path, split in (('model_train_nltk.json', train_data),
                         ('model_test_nltk.json', test_data)):
    with open(json_path, 'w') as json_file:
        json.dump(split, json_file)

In [10]:
#Read the train and test data back from the JSON files into the objects
#that are fed to the NaiveBayesClassifier model
def _read_json(json_path):
    """Return the parsed content of the JSON file at json_path."""
    with open(json_path) as json_file:
        return json.load(json_file)

train_data = _read_json('model_train_nltk.json')
test_data = _read_json('model_test_nltk.json')

In [96]:
#This trains a NaiveBayesClassifier model on train_data and then tests it
#using the test_data set. Ideally, we want the accuracy to be as high as
#possible; the current figure is the one printed below.
classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

#show_most_informative_features prints its table itself and returns None,
#so it is not wrapped in print() (that would emit a stray "None" line)
classifier.show_most_informative_features(10)

Accuracy is: 0.395
Most Informative Features
                    holy = True           Very P : Neutra =     77.1 : 1.0
               fantastic = True           Very P : Neutra =     52.4 : 1.0
                   twist = True           Very N : Neutra =     33.9 : 1.0
                     wtf = True           Very N : Neutra =     33.9 : 1.0
                    lmao = True           Very N : Neutra =     33.9 : 1.0
                terrible = True           Very N : Neutra =     33.9 : 1.0
                    fake = True           Very N : Neutra =     33.9 : 1.0
                 awesome = True           Very P : Neutra =     26.5 : 1.0
              incredible = True           Very P : Neutra =     23.8 : 1.0
                 terrify = True           Very P : Neutra =     23.8 : 1.0
None


In [97]:
#This section applies the model to every comment in random_df (the rows set
#aside for validation), converts each predicted label to its 0-4 score, stores
#the scores in the prediction_5s column and writes the result to a CSV file
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)

#Score stored for each label the classifier can return
LABEL_TO_SCORE = {
    "Very Positive": 4,
    "Positive": 3,
    "Neutral": 2,
    "Negative": 1,
    "Very Negative": 0,
}

predictions = []

#Iterating the column handles any number of rows, not a fixed count of 100
for body in random_df["body"]:
    custom_tokens = remove_noise(tknzr.tokenize(body))
    predicted_sentiment = classifier.classify(dict([token, True] for token in custom_tokens))

    #An unexpected label raises KeyError here instead of silently leaving
    #predictions shorter than random_df
    predictions.append(LABEL_TO_SCORE[predicted_sentiment])

random_df['prediction_5s'] = predictions

random_df.to_csv(r'user_comments_model_val.csv', index=False, header=True)

In [None]:
#This section fills the binary prediction column for the entire comment list.
#Comments that were rated manually (sentiment 1 or 0) keep that rating; every
#other comment is run through the model and gets 1 when the predicted label is
#positive, otherwise 0. The result is saved back to user_comments.csv
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)

#Labels counted as positive. "Very Positive" is included so that the 5-class
#model's strongest positive label is not recorded as 0
POSITIVE_LABELS = ("Positive", "Very Positive")

predictions = []
#Same object as comment_full, so the new column is visible through both names
comment_df = comment_full

for manual_sentiment, body in zip(comment_df["sentiment"], comment_df["body"]):
    if manual_sentiment == 1 or manual_sentiment == 0:
        predictions.append(manual_sentiment)
        continue

    custom_tokens = remove_noise(tknzr.tokenize(body))
    predicted_sentiment = classifier.classify(dict([token, True] for token in custom_tokens))

    predictions.append(1 if predicted_sentiment in POSITIVE_LABELS else 0)

comment_df['prediction'] = predictions

comment_df.to_csv(r'user_comments.csv', index=False, header=True)

In [44]:
#Join the positive comments of the reference story (random_story) with all
#comments written by the same authors. After the merge the *_y columns are the
#ones coming from comment_full; only positively predicted rows are kept.
merged_by_author = pd.merge(random_story, comment_full, on='author', how='inner')
pos_comments = merged_by_author[merged_by_author['prediction_y'] == 1].filter(like='_y')

#Per story: 4 raised to the number of such positive comments
story_power = (
    pos_comments.groupby("storyId_y")["prediction_y"]
    .count()
    .reset_index()
    .rename(columns={'prediction_y': 'power'})
)
story_power['power'] = 4 ** story_power['power']

predicted_positive = comment_full[comment_full['prediction'] == 1]

#Per story: number of positively predicted comments
story_comments = (
    predicted_positive.groupby("storyId")["prediction"]
    .count()
    .reset_index()
    .rename(columns={'prediction': 'comments'})
)

#Per author: twice the natural log of their number of positively predicted comments
user_comments = (
    predicted_positive.groupby("author")["prediction"]
    .count()
    .reset_index()
    .rename(columns={'prediction': 'u_comments'})
)
user_comments['u_comments'] = 2 * np.log(user_comments['u_comments'])

user_comments

#story_comments
#user_comments.to_csv (r'user_comments_c.csv', index = False, header=True)

#Formula - 5^(Commenters liking same story - 1) + log(Commenters Comments) + Story Comments
#NOTE(review): the code above uses 4**count (no "- 1") and 2*log(count), which
#differs from the formula as written - confirm which one is intended

Unnamed: 0,author,u_comments
0,#NAME?,4.795791
1,---REDACTED----,0.000000
2,--Paradigm--,3.218876
3,--Yama--,0.000000
4,-4-Z-N-,1.386294
...,...,...
14218,zorothex,0.000000
14219,zrednaxelaz1222,0.000000
14220,zschutte10,0.000000
14221,zxh01,0.000000
