In [1]:
# Configure IPython so that every top-level expression in a cell is echoed as
# output, not only the final one. Later cells rely on this to show several
# results (e.g. both .head() and .tail()) from a single cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import nltk
import pandas as pd
import numpy as np

This is the comment dataframe. It holds two years' worth of comments from NLSS episodes.

In [None]:
# Load the pickled comment table and add a 'Tokens' column holding the
# NLTK word tokenisation of each comment's text.
# NOTE(review): unpickling executes arbitrary code; only load this file if it
# was produced by a trusted step of the pipeline.
comment_df = pd.read_pickle('Pipeline/full_df.pkl')
comment_df['Tokens'] = comment_df['Comment'].apply(nltk.word_tokenize)

# Preview both ends of the table, then report how many comments it holds.
comment_df.head()
comment_df.tail()
print('{} comments'.format(len(comment_df)))

In [None]:
# Load the pickled per-video table, preview both ends and report its size.
# NOTE(review): unpickling executes arbitrary code; only load this file if it
# was produced by a trusted step of the pipeline.
video_df = pd.read_pickle('Pipeline/combined.pkl')

video_df.head()
video_df.tail()
print('{} videos'.format(len(video_df)))

The comments that I currently have available don't go as far back as my video stats do. Let's trim the video data to fit.

In [None]:
# Show the video row(s) dated February 29, 2016.
# NOTE(review): presumably this is the earliest date covered by the comment
# data and is used to choose the trim point in the next cell — confirm.
video_df.loc[video_df['Date'] == 'February 29, 2016']

In [None]:
# Keep the rows labelled 0 through 138; label-based .loc slicing includes
# both end points.
# NOTE(review): 138 is presumably the index label of the 'February 29, 2016'
# row looked up in the previous cell — confirm, and consider deriving it from
# that lookup instead of hard-coding it.
temp_vid = video_df.loc[0:138]
temp_vid.head()
temp_vid.tail()

We need a consistent format for the dates in these two data frames. The two forms currently in use are
- Month DD, YYYY
- YYYY-MM-DD

I find the first one more readable, so let's convert the dates.

In [None]:
# Month-number -> month-name lookup used to build 'Month DD, YYYY' strings.
months = {'01':'January', '02':'February', '03':'March', '04':'April', '05':'May', '06':'June', '07':'July', '08':'August', '09':'September', '10':'October', '11':'November', '12':'December'}

oldformat = comment_df['Date']


def to_long_date(date):
    """Convert a 'YYYY-MM-DD' string to 'Month DD, YYYY'.

    The day is copied through unchanged, so a zero-padded day stays
    zero-padded ('2016-03-01' -> 'March 01, 2016').

    A value whose first word is already a month name is returned untouched,
    which keeps this cell safe to re-run after the 'Date' column has been
    overwritten with the converted strings.

    Raises:
        ValueError: if the value has fewer than three '-'-separated parts or
            its month part is not a key of `months`. The message names the
            offending value.
    """
    # Already converted (e.g. 'February 29, 2016'): nothing to do.
    if date.split(' ', 1)[0] in months.values():
        return date

    ymd = date.split('-')
    if len(ymd) < 3:
        raise ValueError('Unrecognised date format: {!r}'.format(date))
    year, month, day = ymd[:3]

    if month not in months:
        raise ValueError('Unknown month {!r} in date {!r}'.format(month, date))

    return '{} {}, {}'.format(months[month], day, year)


formatted = [to_long_date(date) for date in oldformat]

Seems to match up now

In [None]:
# Spot-check the first and last converted date strings.
formatted[0]
formatted[-1]

In [None]:
# Replace the 'Date' column with the converted strings. This overwrites the
# original values in place, so the column no longer holds the source format
# once this cell has run.
comment_df['Date'] = formatted
comment_df.head()

# Analysis

Let's look at the rates of the words 'love' and 'hate' in chat.

In [None]:
import re

In [None]:
# Match 'hate' / 'love' as whole words only. A plain substring search also
# hits unrelated words ('whatever' contains 'hate', 'glove' contains 'love').
# na=False treats a missing comment as "no match" so the mask stays boolean.
# Matching is still case-sensitive, and whole-word matching skips inflections
# such as 'hates' or 'loved'; widen the pattern (e.g. r'\bhate') if those
# should count as well.
hate_words = comment_df[comment_df['Comment'].str.contains(r'\bhate\b', regex=True, na=False)]
love_words = comment_df[comment_df['Comment'].str.contains(r'\blove\b', regex=True, na=False)]

In [None]:
# Number of comments matched by each search (rows, not word occurrences).
len(love_words)
len(hate_words)

In [None]:
# Read the word list: one entry per line, with the trailing newline and any
# surrounding whitespace stripped from each entry.
with open('PublicData/bad_words.txt') as f:
    bad_words = [line.strip() for line in f]
len(bad_words)

Is profanity more likely to occur with the words 'love' or 'hate'?

Let's look at the most active users.

In [None]:
# Count rows per user; value_counts() lists the most frequent users first.
comment_df['User'].value_counts()

Some of the top commenters are moderators or tool bots. Ignoring these, what sorts of comments do the remaining top commenters produce?

In [None]:
# Show every row whose 'User' value is 'Dr_Roc'.
comment_df.loc[comment_df['User']=='Dr_Roc']

In [None]:
# VADER sentiment scorer from NLTK; `sid` is reused by the scoring loop in the
# next cell.
# NOTE(review): this presumably needs the NLTK 'vader_lexicon' resource to be
# downloaded beforehand — confirm for a fresh environment.
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

In [None]:
# Print each of Dr_Roc's comments followed by one line listing its non-zero
# sentiment scores, in alphabetical key order.
# A single .loc[rows, column] lookup replaces the chained .loc[...][...].
for post in comment_df.loc[comment_df['User'] == 'Dr_Roc', 'Comment']:
    print(post)
    sentiment = sid.polarity_scores(post)
    for k in sorted(sentiment):
        if sentiment[k] != 0.0:
            # end='' keeps all scores for this comment on the same line.
            print('{0}: {1}, '.format(k, sentiment[k]), end='')
    # Terminate the score line once per comment, after every score has been
    # written (previously this ran once per key, splitting the scores across
    # lines and emitting blank lines for zero-valued scores).
    print()