### Becca's Notebook
#### Using BigQuery to read in the Hacker News data, then cleaning and preprocessing it for use with the vaderSentiment model.

In [1]:
import sys, os

# Make the parent of the working directory the first import location.
# os.path.dirname handles a top-level directory and non-'/' separators, where
# slicing at rfind('/') would yield '' or cut off the last character.
cwd = os.getcwd()
sys.path[0] = os.path.dirname(cwd)

In [2]:
#Imports
import os
import numpy as np
import pandas as pd

from google.cloud import bigquery

In [3]:
#Google cloud credentials
# Only fall back to the bundled key-file path when the environment does not
# already name a credentials file, so an existing configuration is respected.
# NOTE(review): '/content/...' looks like a Colab upload path — confirm where the key lives.
os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', '/content/Saltiest HN Trolls.json')

# Create a "Client" object
client = bigquery.Client()

# Bigquery dataset reference (project 'hn-dataset', dataset 'hacker_news').
# Built directly because Client.dataset() is deprecated.
hn_ref = bigquery.DatasetReference('hn-dataset', 'hacker_news')

In [4]:
# Ask BigQuery for the usernames of the most frequent posters
# (the top 100 by row count, minus the literal "None" author).
query_string = """
SELECT C.by
FROM (
  SELECT A.by, COUNT(*) AS count
  FROM `bigquery-public-data.hacker_news.full` AS A
  GROUP BY A.by
  ORDER BY count DESC
  LIMIT 100
) AS C
WHERE C.by != "None"
"""

# Run the query, wait for it to finish, and load the rows into a DataFrame
frequent_users = (
    client.query(query_string)
    .result()
    .to_dataframe()
)

In [5]:
# Fetch rows written only by the most frequent users, using a triple-nested query.
# Tuning knobs: the inner-most "LIMIT" sets how many users are considered,
# the outermost "LIMIT" sets how many rows are returned in total.
# NOTE(review): the outer LIMIT has no ORDER BY, so the returned rows are
# presumably not the same from run to run — confirm if reproducibility matters.
query_string = """
SELECT *
FROM `bigquery-public-data.hacker_news.full` AS A
WHERE A.by IN (
  SELECT C.by
  FROM (
    SELECT A.by, COUNT(*) AS count
    FROM `bigquery-public-data.hacker_news.full` AS A
    GROUP BY A.by
    ORDER BY count DESC
    LIMIT 500
  ) AS C
  WHERE C.by != "None"
)
LIMIT 5000
"""

# Run the query, wait for it to finish, and load the rows into a DataFrame
df = (
    client.query(query_string)
    .result()
    .to_dataframe()
)

In [6]:
# Keep only stories and comments. The result is assigned back so that the
# filter actually applies to the cells that follow, not just to this display.
df = df[df['type'].isin(['story', 'comment'])]
df

Unnamed: 0,title,url,text,dead,by,score,time,timestamp,type,id,parent,descendants,ranking,deleted
0,,,Thanks!,,AnimalMuppet,,1396550314,2014-04-03 18:38:34+00:00,comment,7525035,7519024.0,,,
1,,,Ah. To summarise those some previous companies...,,tim333,,1564825563,2019-08-03 09:46:03+00:00,comment,20599530,20597763.0,,,
2,,,&gt; which are not official standards<p>It dep...,,steveklabnik,,1432501026,2015-05-24 20:57:06+00:00,comment,9597558,9596683.0,,,
3,,,If you consider child rape and industrial empl...,,anigbrowl,,1504819459,2017-09-07 21:24:19+00:00,comment,15195872,15178662.0,,,
4,,,&gt;&gt; &quot;If someone wants to mess with y...,,k-mcgrady,,1465035042,2016-06-04 10:10:42+00:00,comment,11835749,11835283.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,,,Bypass the borked formatting (original is JS d...,,pasbesoin,,1327952938,2012-01-30 19:48:58+00:00,comment,3530344,3530104.0,,,
4996,,,You are referring to all government services a...,,yummyfajitas,,1335650993,2012-04-28 22:09:53+00:00,comment,3904275,3903923.0,,,
4997,Nuts Are a Nutritional Powerhouse,http://well.blogs.nytimes.com/2015/03/30/nuts-...,,,jseliger,1.0,1427845561,2015-03-31 23:46:01+00:00,story,9300646,,0.0,,
4998,,,"Yes, that was my experience on the nand2tetris...",,SilasX,,1528044451,2018-06-03 16:47:31+00:00,comment,17220403,17220233.0,,,


In [7]:
# Remove the columns that are not used downstream.
# title, ranking and score are dropped because they are almost entirely null
# (447,218 / 500,000 / 446,968 null values respectively, as counted earlier).
clean_df = df.drop(
    columns=['time', 'descendants', 'deleted', 'url', 'dead', 'title', 'ranking', 'score']
)

In [8]:
# Texts of five characters or fewer carry little context for the sentiment
# model (only 507 such comments were found on inspection), so they are dropped.

# Keep rows whose text is longer than five characters.
# Rows whose text is None go too: their length is NaN, which fails the comparison.
clean_df = clean_df.loc[clean_df['text'].str.len().gt(5)]

In [9]:
# Use BS on a single piece of text to remove strange chars and URLs and tags
from bs4 import BeautifulSoup

def clean_soup(text):
    """
    Strip HTML markup from a single comment/story body.

    The text is parsed with BeautifulSoup's "html.parser" (which also decodes
    HTML entities such as &gt;), every <a> element is removed together with
    its contents, and the remaining text nodes are joined with single spaces.

    Parameters:
        text: the raw HTML-ish string to clean.

    Returns:
        The cleaned text as a single string.
    """
    # make a soup - this takes care of strange characters
    soup = BeautifulSoup(text, "html.parser")

    # <a href="url_here"> tags contain URLs, so remove each one from the tree
    for anchor in soup.find_all('a'):
        anchor.extract()

    # Collect every remaining text node (this drops the <p> tags) and join with
    # spaces. `string=True` is the current spelling of the deprecated `text=True`.
    strings = soup.find_all(string=True)
    return ' '.join(strings)

In [10]:
# Run clean_soup over every comment/story text, element by element
clean_text = clean_df['text'].map(clean_soup)

In [11]:
# Replace the text column via assign(), which returns a new frame; writing the
# column into the filtered frame in place raises SettingWithCopyWarning.
clean_df = clean_df.assign(text=clean_text)

In [12]:
import pandas as pd
pd.options.display.max_colwidth = 100
# A fixed random_state shows the same ten rows on every run
clean_df.sample(10, random_state=42)

Unnamed: 0,text,by,timestamp,type,id,parent
1804,"This has always struck me as a weak justification. If readers value succinctness, the problem sh...",smacktoward,2013-08-07 19:58:07+00:00,comment,6175202,6174676.0
4404,"I think the core complaint, though, is that people's first step should be ""search using a genera...",saurik,2014-05-04 19:14:22+00:00,comment,7695367,7692310.0
419,> The UK seems to have a flexible attitude as to whether a false rape accusation should be consi...,DanBC,2019-04-29 16:15:29+00:00,comment,19779695,19779520.0
3968,"At St. Pancras, it's not just security. You're clearing EU immigration as well. They don't do an...",ghaff,2018-02-13 11:52:43+00:00,comment,16366430,16364786.0
4205,"If you do get seriously sick, your family members would not be able to visit you anyway so it do...",PeterisP,2020-03-27 19:43:30+00:00,comment,22706110,22704560.0
2294,I would joke about adding a hyperloop between Reno and the Bay Area :-) Actually rail is pretty ...,ChuckMcM,2014-09-03 23:59:19+00:00,comment,8266079,8265924.0
4555,It may well be. Not sure if i can produce a link off hand. But Poettering seems to be on record ...,digi_owl,2016-05-27 12:45:30+00:00,comment,11785625,11783896.0
1844,"For what it's worth, the initial EJB spec was written by IBM, so I guess the answer to your ques...",fauigerzigerk,2017-02-14 18:17:48+00:00,comment,13645706,13645270.0
2892,I guess I shouldn't mention the coffee shops I know that simply refrigerate their hour-old coffe...,ghshephard,2010-05-31 01:04:27+00:00,comment,1391481,1391026.0
4049,It’s the local tv news station.,empath75,2019-05-23 10:20:15+00:00,comment,19990294,19989988.0


In [13]:
# Count missing values per column. Nulls in `parent` are expected:
# an item that starts a thread has no parent.
clean_df.isna().sum()

text          0
by            0
timestamp     0
type          0
id            0
parent       12
dtype: int64

### Pyrom's Notebook
#### Using vaderSentiment to produce a 'Saltiness' score for each username. The higher the score, the more salty.

#### VADER (vaderSentiment) is a pretrained model that processes text and calculates its sentiment values.

In [14]:
pip install vaderSentiment



In [15]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [16]:
# Pandas display settings: show up to 200 rows and 200 columns
pd.options.display.max_rows = 200
pd.options.display.max_columns = 200

In [17]:
# Remove the columns the scoring step does not use, then cast everything
# that is left to strings, in one chained expression.
df = clean_df.drop(columns=['timestamp', 'type', 'parent']).astype(str)
df.head()

Unnamed: 0,text,by,id
0,Thanks!,AnimalMuppet,7525035
1,"Ah. To summarise those some previous companies, Timbercorp and Great Southern pushed planting si...",tim333,20599530
2,> which are not official standards It depends on what you mean by 'official standards.' We've re...,steveklabnik,9597558
3,If you consider child rape and industrial employment to be in the same category then you're not ...,anigbrowl,15195872
4,">> ""If someone wants to mess with you they can just keep calling you and your only choice is to ...",k-mcgrady,11835749


In [24]:
#Slim the dataset by per-user comment count to keep it manageable
def slim(df, max_comments=50):
    """
    Keep only the rows of users who have fewer than `max_comments` rows.

    NOTE(review): the previous description said this keeps users with 1000 or
    more comments, but the code keeps users *below* a threshold of 50. The
    existing behaviour is preserved here; confirm which one was intended.

    Parameters:
        df: DataFrame with a 'by' column holding the username of each row.
        max_comments: exclusive upper bound on a user's row count (default 50,
            the value that used to be hard-coded).

    Returns:
        A new DataFrame containing only the rows of users with fewer than
        `max_comments` rows. The input frame is not modified and the original
        row order and index are kept.
    """
    # Look up each row's per-user count in one vectorised pass instead of
    # calling a Python lambda once per group.
    comments_per_user = df['by'].map(df['by'].value_counts())

    return df[comments_per_user < max_comments].copy()

In [25]:
# Run the frame through slim() and check the resulting size
new_df = df.pipe(slim)
print(new_df.shape)
new_df.head()

(4391, 3)


Unnamed: 0,text,by,id
0,Thanks!,AnimalMuppet,7525035
1,"Ah. To summarise those some previous companies, Timbercorp and Great Southern pushed planting si...",tim333,20599530
2,> which are not official standards It depends on what you mean by 'official standards.' We've re...,steveklabnik,9597558
3,If you consider child rape and industrial employment to be in the same category then you're not ...,anigbrowl,15195872
4,">> ""If someone wants to mess with you they can just keep calling you and your only choice is to ...",k-mcgrady,11835749


In [26]:
#Machine Learning, using VADER Sentiment Analysis
def sentiment_score(comment, analyser=None):
    """
    Combine VADER polarity scores for one piece of text into a single number.

    The result is pos + compound - neg, so text that VADER rates as positive
    gets a higher value and text it rates as negative gets a lower one.

    Parameters:
        comment: the text to score.
        analyser: optional object with a polarity_scores(text) method returning
            a dict with 'pos', 'neg' and 'compound' keys. When omitted, a single
            SentimentIntensityAnalyzer is created on first use and reused.

    Returns:
        The combined score as a number.
    """
    if analyser is None:
        # Building the analyser is repeated work when done per comment, so one
        # instance is kept on the function and shared across calls.
        analyser = getattr(sentiment_score, '_shared_analyser', None)
        if analyser is None:
            analyser = SentimentIntensityAnalyzer()
            sentiment_score._shared_analyser = analyser

    score = analyser.polarity_scores(comment)
    return score['pos'] + score['compound'] - score['neg']

In [27]:
# Score every comment text and store the result in a new 'Saltiness' column
new_df['Saltiness'] = new_df['text'].map(sentiment_score)
new_df.head()

Unnamed: 0,text,by,id,Saltiness
0,Thanks!,AnimalMuppet,7525035,1.4926
1,"Ah. To summarise those some previous companies, Timbercorp and Great Southern pushed planting si...",tim333,20599530,0.841
2,> which are not official standards It depends on what you mean by 'official standards.' We've re...,steveklabnik,9597558,0.5879
3,If you consider child rape and industrial employment to be in the same category then you're not ...,anigbrowl,15195872,-0.9846
4,">> ""If someone wants to mess with you they can just keep calling you and your only choice is to ...",k-mcgrady,11835749,0.14


In [28]:
import copy
# Snapshot of the scored frame (DataFrame.copy() is what copy.copy() delegates to)
sentiment_df = new_df.copy()

In [29]:
# Give each username an integer id: users are numbered in order of first
# appearance (sort=False) and the numbering starts at 1000.
new_df['User_ID'] = new_df.groupby('by', sort=False).ngroup() + 1000
new_df.head()

Unnamed: 0,text,by,id,Saltiness,User_ID
0,Thanks!,AnimalMuppet,7525035,1.4926,1000
1,"Ah. To summarise those some previous companies, Timbercorp and Great Southern pushed planting si...",tim333,20599530,0.841,1001
2,> which are not official standards It depends on what you mean by 'official standards.' We've re...,steveklabnik,9597558,0.5879,1002
3,If you consider child rape and industrial employment to be in the same category then you're not ...,anigbrowl,15195872,-0.9846,1003
4,">> ""If someone wants to mess with you they can just keep calling you and your only choice is to ...",k-mcgrady,11835749,0.14,1004


In [30]:
# Display the rows ordered by username (the result is not stored)
new_df.sort_values('by')

Unnamed: 0,text,by,id,Saltiness,User_ID
3547,That is a cool feature. good use of instant torque. offsets the inertia during windup.,001sky,7543035,0.9229,1362
1209,"This is a good point, and also its worth another note (from the linked article): Palihapitiya pu...",001sky,6487404,0.7607,1362
4081,"Odd comment, in this context. Have you read the article?",001sky,10611359,-0.5222,1362
4243,Be wary of conflating corporate cash with the LP cash backing VC companies. Companies are holdin...,001sky,7642538,0.9849,1362
1126,"Its also shorter to write/speak, than the generic & generally non-sensical altenative: ""smartpho...",001sky,4740584,0.0000,1362
...,...,...,...,...,...
3858,"I didn't think disk io matters all that much if you fit your data in memory, which afaik you sho...",zokier,3947657,0.6044,1368
1652,On AWS can't you just use ACM which is supposed to be pretty painless Doesn't seem that bad,zokier,17601788,1.1597,1368
3706,Intel products are heavily segmented. Did you do your comparison within the same segment?,zokier,14539257,0.0000,1368
4134,"Just curious, has anyone on HN ever successfully claimed money from SSL certificate insurance?",zokier,11130508,0.9845,1368


In [31]:
# Largest id handed out above
print(f"Max User_ID: {new_df['User_ID'].max()}")

Max User_ID: 1481


In [32]:
# Keep only rows whose Saltiness lies in the closed interval [0, 1.1].
# NOTE(review): this discards every comment with a negative score — confirm
# that dropping them is intended.
new_df = new_df[new_df['Saltiness'].between(0, 1.1)]
new_df.head()

Unnamed: 0,text,by,id,Saltiness,User_ID
1,"Ah. To summarise those some previous companies, Timbercorp and Great Southern pushed planting si...",tim333,20599530,0.841,1001
2,> which are not official standards It depends on what you mean by 'official standards.' We've re...,steveklabnik,9597558,0.5879,1002
4,">> ""If someone wants to mess with you they can just keep calling you and your only choice is to ...",k-mcgrady,11835749,0.14,1004
5,Makes sense. I guess my central confusion is that I thought the entire reason for bundling mor...,ethbro,10352663,1.075,1005
8,Many programming languages use references to other objects liberally. Wouldn't it be hard to kee...,icebraining,14891742,0.4923,1007


In [33]:
# Summary statistics for the numeric columns (Saltiness and User_ID)
new_df.describe()

Unnamed: 0,Saltiness,User_ID
count,2811.0,2811.0
mean,0.513725,1203.707222
std,0.37058,131.84351
min,0.0,1000.0
25%,0.1399,1085.0
50%,0.5426,1191.0
75%,0.85165,1313.0
max,1.0995,1481.0


In [34]:
#Rename columns
# Renaming by name (rather than assigning a positional list) keeps working if
# the column order ever changes.
new_df = new_df.rename(columns={'text': 'Comment', 'by': 'Username', 'id': 'Comment_ID'})

#Rearrange columns, sort by username
order = ['Comment_ID', 'User_ID', 'Username', 'Comment', 'Saltiness']

#Housekeeping
# One chained, non-inplace pipeline: an in-place sort on the column slice
# raises SettingWithCopyWarning. reset_index(drop=True) gives a fresh 0..n-1
# index without keeping the old one as a column.
comments_score = (
    new_df[order]
    .sort_values(by=['Username'])
    .reset_index(drop=True)
)
comments_score

Unnamed: 0,Comment_ID,User_ID,Username,Comment,Saltiness
0,7642538,1362,001sky,Be wary of conflating corporate cash with the LP cash backing VC companies. Companies are holdin...,0.9849
1,8015406,1362,001sky,"Looks that way...? And not just p.1, for the first ~60 stories there is not much newer than the ...",0.0000
2,7543035,1362,001sky,That is a cool feature. good use of instant torque. offsets the inertia during windup.,0.9229
3,6487404,1362,001sky,"This is a good point, and also its worth another note (from the linked article): Palihapitiya pu...",0.7607
4,6447899,1362,001sky,This is also pretty easy to do today with an ear-piece and a Garmin. Heads up is only really hel...,0.9789
...,...,...,...,...,...
2806,11130508,1368,zokier,"Just curious, has anyone on HN ever successfully claimed money from SSL certificate insurance?",0.9845
2807,6192785,1368,zokier,Zimmermann advocating Hushmail. That's curious. Also imho deleting user data without warning is ...,0.4200
2808,4915255,1368,zokier,basically by replacing utilman you can execute arbitrary code with system(?) privileges? does th...,1.0092
2809,3947657,1368,zokier,"I didn't think disk io matters all that much if you fit your data in memory, which afaik you sho...",0.6044


In [35]:
#Create copy for clean table for salty_user
import copy
# Independent copy of the scored frame to build the per-user table from
users_score = new_df.copy()
users_score.head()

Unnamed: 0,Comment,Username,Comment_ID,Saltiness,User_ID
1,"Ah. To summarise those some previous companies, Timbercorp and Great Southern pushed planting si...",tim333,20599530,0.841,1001
2,> which are not official standards It depends on what you mean by 'official standards.' We've re...,steveklabnik,9597558,0.5879,1002
4,">> ""If someone wants to mess with you they can just keep calling you and your only choice is to ...",k-mcgrady,11835749,0.14,1004
5,Makes sense. I guess my central confusion is that I thought the entire reason for bundling mor...,ethbro,10352663,1.075,1005
8,Many programming languages use references to other objects liberally. Wouldn't it be hard to kee...,icebraining,14891742,0.4923,1007


In [36]:
# Turn each Saltiness value into a whole-number percent string (e.g. 0.98 -> "98%")
comments_score['Saltiness'] = comments_score['Saltiness'].map('{:.0%}'.format)

# Moment of truth!
comments_score.head()

Unnamed: 0,Comment_ID,User_ID,Username,Comment,Saltiness
0,7642538,1362,001sky,Be wary of conflating corporate cash with the LP cash backing VC companies. Companies are holdin...,98%
1,8015406,1362,001sky,"Looks that way...? And not just p.1, for the first ~60 stories there is not much newer than the ...",0%
2,7543035,1362,001sky,That is a cool feature. good use of instant torque. offsets the inertia during windup.,92%
3,6487404,1362,001sky,"This is a good point, and also its worth another note (from the linked article): Palihapitiya pu...",76%
4,6447899,1362,001sky,This is also pretty easy to do today with an ear-piece and a Garmin. Heads up is only really hel...,98%


In [37]:
# Example of a comment that is not too salty: show its text and its score
print(comments_score.at[100, 'Comment'])
print(comments_score.at[100, 'Saltiness'])

You're welcome : ) I voted you up just based on all the crowd-voting going on. Sorry about misrepresenting your post. It had been a while since you posted it, and it's gone now.
45%


In [38]:
# Example of a salty comment: show its text and its score
print(comments_score.at[50, 'Comment'])
print(comments_score.at[50, 'Saltiness'])

Read the docket (posted up thread), but generally shareholders are debtors-in-possesion meaning they are "owed" a fraction of the company if they sold their shares. Everyone who has a share of American Apparel is listed on that docket as a creditor. As for board seats, I can't say. I've watched a couple of large companies restructure fairly closely but don't recall board seats being offered as part of that. Certainly at the point of being in Chapter 11 bankruptcy you're no longer the company you once were, one option is always to liquidate and move to Chapter 7 to disperse the assets. We had a customer company do that (one of the baby telephone companies) in the dot com bust when I was at Tut/FreeGate. As I recall from that transaction we got a check for around $3,000 for what had been something like $1.2M worth of networking gear they had deployed and not yet paid for.
98%


In [39]:
# Saltiness is a formatted string by this point, so only User_ID is summarised
comments_score.describe()

Unnamed: 0,User_ID
count,2811.0
mean,1203.707222
std,131.84351
min,1000.0
25%,1085.0
50%,1191.0
75%,1313.0
max,1481.0


In [40]:
#Push salty_comment table to postgres
#Establish connection to database
from sqlalchemy import create_engine

# The connection string (which contains the database password) is read from
# the DATABASE_URL environment variable instead of being stored in the notebook.
# NOTE(review): the password that was previously committed here should be rotated.
database_url = os.environ['DATABASE_URL']

# SQLAlchemy 1.4+ rejects the legacy "postgres://" scheme, so normalise it
if database_url.startswith('postgres://'):
    database_url = 'postgresql://' + database_url[len('postgres://'):]

engine = create_engine(database_url)

# Replace the salty_comment table with the current comments_score frame
comments_score.to_sql("salty_comment", engine, if_exists='replace')

  """)


In [41]:
# Salty users: start from a fresh copy of the scored comments, from which the
# per-username Saltiness table is built
users_score = new_df.copy()
users_score.head()

Unnamed: 0,Comment,Username,Comment_ID,Saltiness,User_ID
1,"Ah. To summarise those some previous companies, Timbercorp and Great Southern pushed planting si...",tim333,20599530,0.841,1001
2,> which are not official standards It depends on what you mean by 'official standards.' We've re...,steveklabnik,9597558,0.5879,1002
4,">> ""If someone wants to mess with you they can just keep calling you and your only choice is to ...",k-mcgrady,11835749,0.14,1004
5,Makes sense. I guess my central confusion is that I thought the entire reason for bundling mor...,ethbro,10352663,1.075,1005
8,Many programming languages use references to other objects liberally. Wouldn't it be hard to kee...,icebraining,14891742,0.4923,1007


In [42]:
# The per-user table does not need the comment text or the comment id
users_score = users_score.drop(columns=['Comment_ID', 'Comment'])
users_score.head()

Unnamed: 0,Username,Saltiness,User_ID
1,tim333,0.841,1001
2,steveklabnik,0.5879,1002
4,k-mcgrady,0.14,1004
5,ethbro,1.075,1005
8,icebraining,0.4923,1007


In [43]:
# One row per username holding the mean of each remaining column.
# NOTE(review): User_ID is averaged along with Saltiness; each user has a
# single id so the value is unchanged, but confirm the resulting dtype is acceptable.
users_score = users_score.groupby('Username', as_index=False).agg('mean')
users_score.head(10)

Unnamed: 0,Username,Saltiness,User_ID
0,001sky,0.623125,1362
1,Alex3917,0.61164,1377
2,AndrewKemendo,0.5424,1465
3,AnimalMuppet,0.3286,1000
4,Animats,0.414171,1065
5,Anon84,0.4598,1421
6,AnthonyMouse,0.605333,1053
7,Apocryphon,0.0,1474
8,AstralStorm,0.2295,1251
9,BurningFrog,0.33745,1329


In [44]:
# Turn each user's mean Saltiness into a whole-number percent string
users_score['Saltiness'] = users_score['Saltiness'].map('{:.0%}'.format)

# Second moment of truth!
users_score.head()

Unnamed: 0,Username,Saltiness,User_ID
0,001sky,62%,1362
1,Alex3917,61%,1377
2,AndrewKemendo,54%,1465
3,AnimalMuppet,33%,1000
4,Animats,41%,1065


In [45]:
# Put the columns in their final order: id first, then name, then score
order = ['User_ID', 'Username', 'Saltiness']
users_score = users_score.loc[:, order]
users_score

Unnamed: 0,User_ID,Username,Saltiness
0,1362,001sky,62%
1,1377,Alex3917,61%
2,1465,AndrewKemendo,54%
3,1000,AnimalMuppet,33%
4,1065,Animats,41%
...,...,...,...
470,1360,yuhong,48%
471,1408,yummyfajitas,55%
472,1179,zanny,88%
473,1020,zaroth,55%


In [46]:
# Write the per-user table to postgres, replacing any existing salty_user table
users_score.to_sql(name="salty_user", con=engine, if_exists='replace')