In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import copy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import seaborn
import os

In [2]:
# function that reads in the separate data files
def read_data(file_path, data_dir=os.path.join('..', 'Cleaned Data')):
    """Load one JSON-lines data file into a DataFrame.

    Parameters
    ----------
    file_path : str
        Path of the file, relative to ``data_dir``.
    data_dir : str, optional
        Directory that holds the data files. Defaults to ``../Cleaned Data``
        (relative to the current working directory).

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file, one row per JSON line.
    """
    full_path = os.path.join(data_dir, file_path)

    # lines=True: the file is read as one JSON object per line
    return pd.read_json(full_path, lines=True)

In [3]:
# set seaborn for better graphs
seaborn.set()

# the reddit submission data is split over twelve part files, one per folder
part_files = [
    'one/part-00000-9076dc6d-fa59-4c36-a0cf-8808e309da7b-c000.json.gz',
    'two/part-00000-c7cf2076-eae1-4d0c-bce1-e7b0c43a3bf1-c000.json.gz',
    'three/part-00000-662e59e6-5ee7-48da-a85f-1bcf09724f97-c000.json.gz',
    'four/part-00000-3060ac52-6be0-4d42-a322-3e4a7954a4f4-c000.json.gz',
    'five/part-00000-9c42996a-80d4-4a96-b59b-228f5e241a65-c000.json.gz',
    'six/part-00000-5da94b81-55c0-42e7-b3dc-5a51a14e8589-c000.json.gz',
    'seven/part-00000-eb573e13-d85e-400c-b3db-ffa9bd2d5543-c000.json.gz',
    'eight/part-00000-61526e86-ba2d-4df5-a9d1-d043ca875b62-c000.json.gz',
    'nine/part-00000-dc7c0356-fae4-47b8-a93a-f9b401cf70f0-c000.json.gz',
    'ten/part-00000-f3ae3925-50c4-469e-8304-6007e9b4cdab-c000.json.gz',
    'eleven/part-00000-2aa1781a-723e-49c4-a488-47ca50409657-c000.json.gz',
    'twelve/part-00000-7ff282ff-fa5e-494c-ab5c-59f07d2a2f0d-c000.json.gz',
]

# read each part once, in the listed order, then stack them into one dataframe;
# ignore_index=True renumbers the rows 0..n-1 across all parts
frames = [read_data(part_file) for part_file in part_files]
df = pd.concat(frames, ignore_index=True)

df

Unnamed: 0,name,downs,ups,hide_score,subreddit,locked,num_comments,id,score,author,...,is_self,date,datetime,word_count_self,word_count_title,preview,author_flair_css_class,author_flair_text,link_flair_css_class,link_flair_text
0,t3_41sawn,0,15,False,movies,False,22,41sawn,15,MrYoloSwaggins1,...,True,2016-01-19,2016-01-19 18:52:37-08:00,47,38,,,,,
1,t3_3zdofz,0,1,False,rupaulsdragrace,False,2,3zdofz,1,valentineboy13,...,True,2016-01-03,2016-01-03 22:39:38-08:00,47,15,{'images': [{'id': 'zQm8BsqAjehABjQzsmr8aWNi-6...,,,,
2,t3_3zjgqe,0,1,False,HuhThatsOdd,False,1,3zjgqe,1,[deleted],...,True,2016-01-05,2016-01-05 01:35:23-08:00,2,12,,,,,
3,t3_437sfn,0,1,False,mangogigi,False,1,437sfn,1,[deleted],...,True,2016-01-28,2016-01-28 23:13:17-08:00,2,13,,,,,
4,t3_40pp1m,0,2,False,IWantThatOnAShirt,False,0,40pp1m,2,infiniterebellion,...,True,2016-01-12,2016-01-12 17:48:17-08:00,39,5,{'images': [{'id': 'bS0aj6Rjz5f3kGL5qMfAJVe_2E...,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,t3_5k639h,0,2,False,careerguidance,False,3,5k639h,2,greentea30,...,True,2016-12-24,2016-12-24 17:05:17-08:00,47,13,,,,,
299996,t3_5kszeg,0,0,False,FFXV,False,18,5kszeg,0,hammerblaze,...,True,2016-12-28,2016-12-28 14:11:29-08:00,19,18,,,,,
299997,t3_5i1wv6,0,6,False,migraine,False,11,5i1wv6,6,Nafetsg,...,True,2016-12-12,2016-12-12 21:40:14-08:00,112,5,,,,,
299998,t3_5h13d2,0,1,False,Toppiecers,False,0,5h13d2,1,polywaggot,...,True,2016-12-07,2016-12-07 09:10:29-08:00,12,4,,,,,


In [4]:
# get the columns we need from the dataset
columns = [
    'title',
    'score',
    'selftext'
]

# .copy() makes the subset an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning
df = df[columns].copy()

In [5]:
# create the VADER sentiment analysis tool; this single instance is reused to
# score every title and selftext
analyzer = SentimentIntensityAnalyzer()

In [None]:
# get the sentiment scores of each title and selftext.
# Each new cell holds the object returned by analyzer.polarity_scores for that
# text (its 'compound' entry is what the later cells read).
# DataFrame.assign returns a new frame instead of writing into the existing
# one, so no defensive deepcopy of the data is needed.
df = df.assign(
    title_sentiment_scores=df['title'].apply(analyzer.polarity_scores),
    selftext_sentiment_scores=df['selftext'].apply(analyzer.polarity_scores),
)

In [None]:
# this function returns either 'neutral', 'positive', or 'negative' depending on compound sentiment score
def final_sentiment(sentiment, threshold=0.05):
    """Classify a sentiment result by its compound score.

    Parameters
    ----------
    sentiment : mapping
        Sentiment scores; only the ``'compound'`` entry is read.
    threshold : float, optional
        Cutoff for a non-neutral label. Defaults to 0.05.

    Returns
    -------
    str or None
        ``"positive"`` if compound >= threshold, ``"negative"`` if
        compound <= -threshold, ``"neutral"`` if it lies strictly between.
        ``None`` when none of the comparisons hold (e.g. a NaN score).

    Raises
    ------
    KeyError
        If ``sentiment`` has no ``'compound'`` entry.
    """
    compound = sentiment['compound']

    if compound >= threshold:
        return "positive"
    if compound <= -threshold:
        return "negative"
    if -threshold < compound < threshold:
        return "neutral"

    # reached only when every comparison is False (NaN compares False to all)
    return None
    
# this function extracts the compound score from the sentiment analysis 
def compound_extractor(sentiment):
    """Return the ``'compound'`` entry of a sentiment-score mapping."""
    return sentiment['compound']

# derive, for both title and selftext, the numeric compound score and the
# final sentiment category: (new column, source column, function to apply)
derived_columns = [
    ('compound_title', 'title_sentiment_scores', compound_extractor),
    ('compound_text', 'selftext_sentiment_scores', compound_extractor),
    ('sentiment_final_title', 'title_sentiment_scores', final_sentiment),
    ('sentiment_final_selftext', 'selftext_sentiment_scores', final_sentiment),
]

for new_column, source_column, transform in derived_columns:
    df[new_column] = df[source_column].apply(transform)

df

In [None]:
# get the mean of all submission scores; the next cell uses it as the cutoff
# between "high" (score >= mean) and "low" (score < mean) submissions
mean = df['score'].mean()
mean

In [None]:
# split the submissions into three groups by the sentiment label of the selftext
selftext_label = df['sentiment_final_selftext']
positive_posts = df.loc[selftext_label == 'positive']
negative_posts = df.loc[selftext_label == 'negative']
neutral_posts = df.loc[selftext_label == 'neutral']

# within each group, "high" is score >= mean and "low" is score < mean
pos_post_high = positive_posts.loc[positive_posts['score'] >= mean]
pos_post_low = positive_posts.loc[positive_posts['score'] < mean]

neg_score_high = negative_posts.loc[negative_posts['score'] >= mean]
neg_score_low = negative_posts.loc[negative_posts['score'] < mean]

neu_score_high = neutral_posts.loc[neutral_posts['score'] >= mean]
neu_score_low = neutral_posts.loc[neutral_posts['score'] < mean]

# number of (non-missing) scores in each of the six groups
count_ph, count_pl = (group['score'].count() for group in (pos_post_high, pos_post_low))
count_nh, count_nl = (group['score'].count() for group in (neg_score_high, neg_score_low))
count_nuh, count_nul = (group['score'].count() for group in (neu_score_high, neu_score_low))

In [None]:
# chi-square test on the 2x3 contingency table:
# rows are low/high score, columns are positive/negative/neutral sentiment
low_score_row = [count_pl, count_nl, count_nul]
high_score_row = [count_ph, count_nh, count_nuh]
table = np.array([low_score_row, high_score_row])

res = stats.chi2_contingency(table)
res

In [None]:
# plot the counts behind the chi-square test as a grouped bar chart with a
# table of the same numbers underneath

# counts laid out as in the table: rows = score group, columns = sentiment
col_labels = ['Positive', 'Neutral', 'Negative']
row_labels = ['High Score', 'Low Score']
data = np.array([[count_ph, count_nuh, count_nh], [count_pl, count_nul, count_nl]])

# one row per sentiment, each holding its (high, low) counts for the bars
y = data.T

bar_width = 0.25
bar_colors = ['skyblue', 'palegreen', 'lightcoral']

X = np.arange(2)
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
ax.grid(axis='x')

# draw the three sentiment series side by side within each score group
for position, (counts, color, label) in enumerate(zip(y, bar_colors, col_labels)):
    ax.bar(X + position * bar_width, counts, color=color, width=bar_width, label=label)

ax.set_title('Sentiment on High vs Low Scores')
ax.set_ylabel('Number of Scores in Sentiment Range')
ax.set_xlabel('Reddit Scores')

# centre each tick under the middle bar of its group
ax.set_xticks(X + bar_width)
ax.set_xticklabels(['High Scores','Low Scores'])

ax.legend()

# plot the chi table below the axes
ax.table(cellText=data, rowLabels=row_labels, colLabels=col_labels, loc='bottom', bbox = [0.14, -0.4, 0.8, 0.25])

# save next to the notebook's parent folder; create the folder if it is missing
graph_dir = os.path.join('..', 'Graphs')
os.makedirs(graph_dir, exist_ok=True)
fig.savefig(os.path.join(graph_dir, 'sentiment_scores.png'), bbox_inches='tight', pad_inches=0.1)