In [1]:
import html
import pandas as pd

from html.parser import HTMLParser
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
# Helper functions for HTML clean-up.

class MLStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ' '.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

In [3]:
df = pd.read_csv('first_week_oct_2015_comments_by_top_400.csv')

In [4]:
df.shape

(9970, 10)

In [5]:
df.head()

Unnamed: 0,id,by,author,time,time_ts,text,parent,deleted,dead,ranking
0,10331981,debacle,debacle,1444055082,2015-10-05 14:24:42+00:00,US is not really scared by BRICS at all. They&...,10331895,,,0
1,10343811,sarciszewski,sarciszewski,1444183995,2015-10-07 02:13:15+00:00,"I wasn&#x27;t really trying to argue, they sai...",10343761,,,0
2,10331538,debacle,debacle,1444050490,2015-10-05 13:08:10+00:00,The examples on the homepage kind of underscor...,10331008,,,12
3,10340097,debacle,debacle,1444149186,2015-10-06 16:33:06+00:00,No mention of a critical aspect of a service l...,10339965,,,22
4,10338552,debacle,debacle,1444136786,2015-10-06 13:06:26+00:00,I think some of these points are gross exagger...,10337763,,,38


In [6]:
df['text']  = df['text'].apply(html.unescape)

In [7]:
analyzer = SentimentIntensityAnalyzer()

In [8]:
def vaderize(sentence):
    return analyzer.polarity_scores(sentence)

In [9]:
df['scores'] = df['text'].apply(vaderize)

In [10]:
df.head()

Unnamed: 0,id,by,author,time,time_ts,text,parent,deleted,dead,ranking,scores
0,10331981,debacle,debacle,1444055082,2015-10-05 14:24:42+00:00,US is not really scared by BRICS at all. They'...,10331895,,,0,"{'neg': 0.0, 'neu': 0.744, 'pos': 0.256, 'comp..."
1,10343811,sarciszewski,sarciszewski,1444183995,2015-10-07 02:13:15+00:00,"I wasn't really trying to argue, they said the...",10343761,,,0,"{'neg': 0.195, 'neu': 0.805, 'pos': 0.0, 'comp..."
2,10331538,debacle,debacle,1444050490,2015-10-05 13:08:10+00:00,The examples on the homepage kind of underscor...,10331008,,,12,"{'neg': 0.0, 'neu': 0.864, 'pos': 0.136, 'comp..."
3,10340097,debacle,debacle,1444149186,2015-10-06 16:33:06+00:00,No mention of a critical aspect of a service l...,10339965,,,22,"{'neg': 0.214, 'neu': 0.667, 'pos': 0.119, 'co..."
4,10338552,debacle,debacle,1444136786,2015-10-06 13:06:26+00:00,I think some of these points are gross exagger...,10337763,,,38,"{'neg': 0.081, 'neu': 0.902, 'pos': 0.017, 'co..."


In [11]:
df[['neg', 'neu', 'pos', 'compound']] = df.scores.apply(pd.Series)

In [12]:
for text in df.sort_values(by='neg', ascending=False)['text'].head(20):
    print(text, '\n')

Architectural hell. 

Wrong. This is a lazy reverse-racism argument. 

Getting error. 

No, I believe you are wrong. 

No worries  It's natural to confuse cache misses with missed catches. 

This saddens me. 

Forced cooperation is always, indisputably, inferior. 

Coupled with Debt. A very nasty mix indeed. 

There is no hash collision happening. 

Make no mistake, prosecutors are politicians. 

Worse. Most murder cases aren't seriously contested. The stat looks worse if you consider the failure rate for cases involving serious doubt. 

Bit distasteful, no? 

Fear mongering. You're cooking the damn meat in a few minutes. 

Have nuclear weapons succeeded where the machine gun failed? 

I thought nuclear weapons were supposed to deter wars? 

Why is it disappointing? 

"Is it illegal to kill a person if she has no family or friends?" 

>> Ie. Fake example 

20mm AUD / year gross. 

The trap being the death at the end-of-level. 



In [13]:
df['text'] = df['text'].apply(strip_tags)

In [14]:
df.head()

Unnamed: 0,id,by,author,time,time_ts,text,parent,deleted,dead,ranking,scores,neg,neu,pos,compound
0,10331981,debacle,debacle,1444055082,2015-10-05 14:24:42+00:00,US is not really scared by BRICS at all. They'...,10331895,,,0,"{'neg': 0.0, 'neu': 0.744, 'pos': 0.256, 'comp...",0.0,0.744,0.256,0.7859
1,10343811,sarciszewski,sarciszewski,1444183995,2015-10-07 02:13:15+00:00,"I wasn't really trying to argue, they said the...",10343761,,,0,"{'neg': 0.195, 'neu': 0.805, 'pos': 0.0, 'comp...",0.195,0.805,0.0,-0.3947
2,10331538,debacle,debacle,1444050490,2015-10-05 13:08:10+00:00,The examples on the homepage kind of underscor...,10331008,,,12,"{'neg': 0.0, 'neu': 0.864, 'pos': 0.136, 'comp...",0.0,0.864,0.136,0.2975
3,10340097,debacle,debacle,1444149186,2015-10-06 16:33:06+00:00,No mention of a critical aspect of a service l...,10339965,,,22,"{'neg': 0.214, 'neu': 0.667, 'pos': 0.119, 'co...",0.214,0.667,0.119,-0.25
4,10338552,debacle,debacle,1444136786,2015-10-06 13:06:26+00:00,I think some of these points are gross exagger...,10337763,,,38,"{'neg': 0.081, 'neu': 0.902, 'pos': 0.017, 'co...",0.081,0.902,0.017,-0.8233


In [15]:
df = df.drop('scores', axis=1)

In [16]:
df.head()

Unnamed: 0,id,by,author,time,time_ts,text,parent,deleted,dead,ranking,neg,neu,pos,compound
0,10331981,debacle,debacle,1444055082,2015-10-05 14:24:42+00:00,US is not really scared by BRICS at all. They'...,10331895,,,0,0.0,0.744,0.256,0.7859
1,10343811,sarciszewski,sarciszewski,1444183995,2015-10-07 02:13:15+00:00,"I wasn't really trying to argue, they said the...",10343761,,,0,0.195,0.805,0.0,-0.3947
2,10331538,debacle,debacle,1444050490,2015-10-05 13:08:10+00:00,The examples on the homepage kind of underscor...,10331008,,,12,0.0,0.864,0.136,0.2975
3,10340097,debacle,debacle,1444149186,2015-10-06 16:33:06+00:00,No mention of a critical aspect of a service l...,10339965,,,22,0.214,0.667,0.119,-0.25
4,10338552,debacle,debacle,1444136786,2015-10-06 13:06:26+00:00,I think some of these points are gross exagger...,10337763,,,38,0.081,0.902,0.017,-0.8233


In [19]:
df.iloc[17]['text']

"> TL;DR: people accusing me of being sarcastic are probably missing out on one of the few bullshit-free sources of information on the Internet. My experience with Reddit has been that, like most other places on the Internet, it's a mixed bag. Specifically, some of the moderators are authoritarian dicks (e.g. /r/technology) who evaluate rules without nuance and ban people for sharing a link even when given explicit permission by another moderator to share it. Other subreddits are a bit better (e.g. /r/php), others are probably worse. My opinion of /r/netsec has changed frequently over the past few months."

In [20]:
df.to_csv('first_week_oct_2015_comments_by_top_400_with_scores.csv', index=False)