In [1]:
%%capture

# Import libraries.
import html
import pandas as pd

from html.parser import HTMLParser
from profanity_check import predict_prob

In [2]:
class tag_counter(HTMLParser):
    """
    Helper class for counting HTML tags.

    Every start tag reported by HTMLParser increments ``num_tags``;
    self-closing tags such as ``<br/>`` are reported through
    ``handle_starttag`` as well, so they are counted too. End tags are
    not counted.
    """
    def __init__(self):
        HTMLParser.__init__(self)
        # Number of start tags seen during the current count_tags() call.
        self.num_tags = 0

    def handle_starttag(self, tag, attrs):
        """HTMLParser callback: called once per start tag; bumps the counter."""
        self.num_tags += 1

    def count_tags(self, html_text):
        """
        Count the start tags in a single, self-contained piece of HTML.

        Parameters
        ----------
        html_text : str
            The HTML fragment to scan.

        Returns
        -------
        int
            Number of start tags found in ``html_text`` alone.
        """
        # HTMLParser is a streaming parser: feed() keeps any incomplete
        # trailing markup (e.g. a stray "<") in an internal buffer and also
        # remembers being inside <script>/<style>. Without a reset, that
        # state would be glued onto the next text and corrupt its count.
        self.reset()
        self.num_tags = 0
        self.feed(html_text)
        # Treat the text as complete so nothing stays buffered.
        self.close()
        return self.num_tags

# Shared counter instance used by the cells below.
parser = tag_counter()

In [3]:
# Read the scored-comments CSV into the main working dataframe.
scores_csv_path = 'first_week_oct_2015_comments_by_top_400_with_scores_v3.csv'
df = pd.read_csv(scores_csv_path)

In [4]:
# Read only the 'text' column of the raw comments file, which carries the
# comments in their original HTML form.
raw_comments_csv_path = 'first_week_oct_2015_comments_by_top_400.csv'
html_text = pd.read_csv(raw_comments_csv_path, usecols=['text'])

In [5]:
# Show the first five rows of the raw HTML comments.
html_text.head(5)

Unnamed: 0,text
0,US is not really scared by BRICS at all. They&...
1,"I wasn&#x27;t really trying to argue, they sai..."
2,The examples on the homepage kind of underscor...
3,No mention of a critical aspect of a service l...
4,I think some of these points are gross exagger...


In [6]:
# Remove excess whitespace from comment text: every run of whitespace
# (spaces, tabs, newlines) becomes a single space.
# The pattern is a raw string so that \s reaches the regex engine as written
# instead of being an invalid string escape sequence.
# Note: after this step df['text'] no longer contains any '\n' characters.
df['text'] = df['text'].replace(r'\s+', ' ', regex=True)

In [7]:
# Score every comment with profanity-check and store the scores as a column.
offensive_scores = predict_prob(df['text'])
df['pc_prob_offensive'] = offensive_scores

In [8]:
# Print the 20 comments with the highest profanity-check score, each one
# followed by a blank line.
ranked_by_offensiveness = df.sort_values(by='pc_prob_offensive', ascending=False)
most_offensive = ranked_by_offensiveness['text'].head(20)
for comment in most_offensive:
    print(comment, '\n')

commenting on this quote from the article: "‘I’m 29. This is what you do when you’re 29" I don't know what the fuck is it with this wall street culture. I find SV culture banal and disgusting but no where close to the shit show that is the wall street glamorized by movies and tv shows. I would've done the same and I have done the same when I was 25. Working like crazy. You realize one day, this is bullshit. Everyone in this shitty place is fucking bullshit. A check at the end of the fucking month that translates into a measely minimum wage if you count the long hours and the office male bravado bullshit. High school never ends. It's only after I was chewed and spit out, I realized, no amount of glamor or prestige is worth chasing if there's no chance at MY happiness and MY health. This make it till you are 30 mentality bullshit has got to go. The anxiety it gives a person in early to mid 20s to feel like you need to keep moving to feel like you are moving closer to success. Lot of youn

In [9]:
# Restore original html comments to primary dataset.
# Column assignment aligns on the index, and the two frames were read from
# separate CSV files, so rows are matched purely by position. Fail loudly if
# the indexes differ instead of silently inserting NaN or dropping rows.
# NOTE(review): this check cannot detect a different row *order* between the
# two files — confirm both were exported in the same order.
assert df.index.equals(html_text.index), (
    "df and html_text have different indexes; rows cannot be aligned by position"
)
df['html_text'] = html_text['text']

In [10]:
# Decode HTML character references (such as &#x27;) into plain characters.
# NOTE(review): after unescaping, any literal '&lt;' / '&gt;' become real
# angle brackets and may be read as markup by a later HTML parse — confirm
# that this is intended before counting tags on this column.
unescaped_html = df['html_text'].map(html.unescape)
df['html_text'] = unescaped_html

In [11]:
# Look at the first five unescaped HTML comments.
df['html_text'].head(5)

0    US is not really scared by BRICS at all. They'...
1    I wasn't really trying to argue, they said the...
2    The examples on the homepage kind of underscor...
3    No mention of a critical aspect of a service l...
4    I think some of these points are gross exagger...
Name: html_text, dtype: object

In [12]:
# Spot-check the tag counter on a single comment (index label 17).
sample_html = df['html_text'].loc[17]
parser.count_tags(sample_html)

3

In [13]:
# Run the tag counter over every HTML comment and keep the counts as a
# feature column.
tag_counts = df['html_text'].map(parser.count_tags)
df['num_tags'] = tag_counts

In [14]:
# Show how often each tag count occurs.
df.num_tags.value_counts()

0     6805
1     1298
2      748
3      395
4      264
5      156
6       99
7       45
8       43
9       28
11      27
10      19
12       7
14       5
17       5
16       4
29       2
20       2
15       2
13       2
18       2
33       1
24       1
22       1
52       1
45       1
44       1
21       1
19       1
27       1
68       1
23       1
47       1
Name: num_tags, dtype: int64

In [15]:
# Add new line count to dataframe.
# Count in the raw HTML column rather than df['text']: df['text'] has already
# had every whitespace run (newlines included) collapsed to a single space,
# so counting '\n' there can only ever give 0.
# NOTE(review): assumes line breaks are still present in 'html_text' —
# confirm against the raw data.
df['new_lines'] = df['html_text'].str.count('\n')

In [16]:
# Show how often each new line count occurs.
df.new_lines.value_counts()

0    9970
Name: new_lines, dtype: int64

In [17]:
# Count the uppercase characters in each comment and store the totals.
def _count_uppercase(text):
    """Return how many characters of `text` are uppercase."""
    return len([char for char in text if char.isupper()])

df['num_caps'] = df['text'].apply(_count_uppercase)

In [18]:
# Check row by row whether 'author' matches 'by' and tally the outcomes;
# if every row matches, one of the two columns is redundant.
author_matches_by = df['author'].eq(df['by'])
author_matches_by.value_counts()

True    9970
dtype: int64

In [19]:
# Store the character length of each comment as a feature.
comment_lengths = df['text'].str.len()
df['text_len'] = comment_lengths

In [20]:
# Express the capital letter count as a fraction of the comment length.
df['pct_caps'] = df['num_caps'].div(df['text_len'])

In [21]:
# Print the 20 comments with the largest share of capital letters, each one
# followed by a blank line.
ranked_by_caps = df.sort_values(by='pct_caps', ascending=False)
for comment in ranked_by_caps['text'].head(20):
    print(comment, '\n')

K 

I'LL ALLOW IT. 

See SOPA. 

 WE ARE CURRENTLY NOT SUPPORTING YOUR BROWSER (BUT WE WILL SOON) That's the latest Firefox running on Ubuntu 14 LTS. 

See also CISC vs RISC. 

No NLTK love? 

See US Army. 

Why no USB Type-C? 

AMP is a subset of HTML. 

He's the REAL Steve Jobs 

>don't be a lemming. Don't sacrifice rationality for idealism. There are severe consequences. this should be wrapped in and displayed on every page on YC. like: DISCLAIMER: MAY BE FATAL TO STARTUP FOUNDERS WHEN FORGETTING TO BE A HUMAN. 

I'd rather just use SAML, OAuth, SQRL, etc. 

Yes. 

Why? 

Yes. 

Gah! 

Do they have UV LEDs yet? 

The following bug got changed from RESOLVED INVALID to RESOLVED FIXED. https://sourceware.org/bugzilla/show_bug.cgi?id=10134 

https://i.imgur.com/UUEDKYA.jpg 

Born. Suffered. Died. (TBA) 



In [22]:
# Express the tag count relative to the comment length.
df['tags_per_char'] = df['num_tags'].div(df['text_len'])

In [23]:
# Express the new line count relative to the comment length.
df['newlines_per_char'] = df['new_lines'].div(df['text_len'])

In [24]:
# Parse 'time_ts' into pandas datetimes and store the result in 'time',
# replacing that column's previous contents (Unix time, which was redundant).
parsed_timestamps = pd.to_datetime(df['time_ts'])
df['time'] = parsed_timestamps

In [25]:
# Keep the hour component of each posting time as its own feature.
posting_hours = df['time'].dt.hour
df['hour_posted'] = posting_hours

In [26]:
# Tally the values found in the 'dead' column (missing values are not
# included in the tally).
df['dead'].value_counts()

True    62
Name: dead, dtype: int64

In [27]:
# Fill NA values - probably safe to assume comments not
# marked as 'dead' aren't.
# The explicit cast pins the column to a real boolean dtype instead of
# relying on fillna() to downcast an object column implicitly, which newer
# pandas versions deprecate.
df['dead'] = df['dead'].fillna(False).astype(bool)

In [28]:
# Tally the 'dead' values again to confirm the fill.
df['dead'].value_counts()

False    9908
True       62
Name: dead, dtype: int64

In [29]:
# Tally the values found in the 'deleted' column.
df.deleted.value_counts()

Series([], Name: deleted, dtype: int64)

In [30]:
# Count the missing entries in 'deleted'; if this equals the row count the
# column carries no information.
df['deleted'].isnull().sum()

9970

In [31]:
# Review updated column list: original columns plus the features added above.
df.columns

Index(['id', 'by', 'author', 'time', 'time_ts', 'text', 'parent', 'deleted',
       'dead', 'ranking', 'neg', 'neu', 'pos', 'compound', 'tb_polarity',
       'tb_subjectivity', 'toxicity', 'pc_prob_offensive', 'html_text',
       'num_tags', 'new_lines', 'num_caps', 'text_len', 'pct_caps',
       'tags_per_char', 'newlines_per_char', 'hour_posted'],
      dtype='object')

In [32]:
# Remove the columns that are no longer required.
unneeded_columns = ['author', 'time_ts', 'html_text', 'deleted',
                    'num_caps', 'num_tags', 'new_lines']
df = df.drop(columns=unneeded_columns)

In [33]:
# Re-check column list to confirm the dropped columns are gone.
df.columns

Index(['id', 'by', 'time', 'text', 'parent', 'dead', 'ranking', 'neg', 'neu',
       'pos', 'compound', 'tb_polarity', 'tb_subjectivity', 'toxicity',
       'pc_prob_offensive', 'text_len', 'pct_caps', 'tags_per_char',
       'newlines_per_char', 'hour_posted'],
      dtype='object')

In [34]:
# Put the columns in their final order: identifiers and metadata first,
# then the text features, then the scores.
column_order = [
    'id', 'parent', 'by', 'time', 'hour_posted', 'text',
    'dead', 'ranking', 'text_len', 'pct_caps',
    'tags_per_char', 'newlines_per_char',
    'toxicity', 'neg', 'neu', 'pos', 'compound',
    'tb_polarity', 'tb_subjectivity', 'pc_prob_offensive',
]
df = df.loc[:, column_order]

In [35]:
# Re-name columns to highlight score sources.
# Each new name is tied to the old name explicitly rather than to a position
# in the column list, so a change to the column order or selection cannot
# silently put a label on the wrong data. Columns not listed keep their names,
# and re-running the cell is a no-op.
df = df.rename(columns={
    'toxicity': 'papi_toxicity',
    'neg': 'v_neg',
    'neu': 'v_neu',
    'pos': 'v_pos',
    'compound': 'v_compound',
})

In [36]:
# Show the first five rows of the finished dataframe.
df.head(5)

Unnamed: 0,id,parent,by,time,hour_posted,text,dead,ranking,text_len,pct_caps,tags_per_char,newlines_per_char,papi_toxicity,v_neg,v_neu,v_pos,v_compound,tb_polarity,tb_subjectivity,pc_prob_offensive
0,10331981,10331895,debacle,2015-10-05 14:24:42+00:00,14,US is not really scared by BRICS at all. They'...,False,0,146,0.068493,0.0,0.0,0.100881,0.0,0.744,0.256,0.7859,0.15625,0.61875,0.238871
1,10343811,10343761,sarciszewski,2015-10-07 02:13:15+00:00,2,"I wasn't really trying to argue, they said the...",False,0,76,0.013158,0.0,0.0,0.048637,0.195,0.805,0.0,-0.3947,0.2,0.2,0.050161
2,10331538,10331008,debacle,2015-10-05 13:08:10+00:00,13,The examples on the homepage kind of underscor...,False,12,88,0.034091,0.0,0.0,0.044777,0.0,0.864,0.136,0.2975,0.1375,0.5,0.098511
3,10340097,10339965,debacle,2015-10-06 16:33:06+00:00,16,No mention of a critical aspect of a service l...,False,22,99,0.010101,0.0,0.0,0.035335,0.214,0.667,0.119,-0.25,0.0,0.8,0.056323
4,10338552,10337763,debacle,2015-10-06 13:06:26+00:00,13,I think some of these points are gross exagger...,False,38,868,0.013825,0.009217,0.0,0.232577,0.081,0.902,0.017,-0.8233,0.076667,0.26,0.001499


In [37]:
# Write the final table to disk, leaving out the index.
# NOTE(review): unlike the input files, this output name has no '.csv'
# extension — confirm that is intentional before relying on it downstream.
output_path = 'first_week_oct_2015_comments_by_top_400_with_scores_and_features'
df.to_csv(output_path, index=False)