###Becca's Notebook
####Using bigquery to read in HackerNews df. Cleaning and preprocessing for use with vaderSentiment model.

In [1]:
import sys, os
cwd = os.getcwd()
sys.path[0] = cwd[:cwd.rfind('/')]

In [2]:
#Imports
import os
import numpy as np
import pandas as pd

from google.cloud import bigquery

In [3]:
#Google cloud credentials
os.environ['GOOGLE_APPLICATION_CREDENTIALS']='/content/Saltiest HN Trolls.json'

# Create a "Client" object
client = bigquery.Client()

# Bigquery dataset reference
hn_ref = client.dataset('hacker_news', project='hn-dataset')

In [4]:
# Run SQL query to obtain filtered table
query_string = """
SELECT C.by
FROM (
  SELECT A.by, COUNT(*) AS count
  FROM `bigquery-public-data.hacker_news.full` AS A
  GROUP BY A.by
  ORDER BY count DESC
  LIMIT 100
) AS C
WHERE C.by != "None"
"""

frequent_users = client.query(query_string).result().to_dataframe()

In [5]:
# This is a triple-nested query that gets a number of comments made only by the most
# frequent users.  Change the "LIMIT XXX" part of the inner-most query to tune the
# number of users.  Change the outermost "LIMIT YYY" to change the total number of comments.
query_string = """
SELECT *
FROM `bigquery-public-data.hacker_news.full` AS A
WHERE A.by IN (
  SELECT C.by
  FROM (
    SELECT A.by, COUNT(*) AS count
    FROM `bigquery-public-data.hacker_news.full` AS A
    GROUP BY A.by
    ORDER BY count DESC
    LIMIT 500
  ) AS C
  WHERE C.by != "None"
)
LIMIT 50000
"""

df = client.query(query_string).result().to_dataframe()

In [6]:
# filter out by type (keep story and comment)
df[(df['type'] == 'story') | (df['type'] == 'comment')]

Unnamed: 0,title,url,text,dead,by,score,time,timestamp,type,id,parent,descendants,ranking,deleted
0,,,"Like I said, to be honest I do not know if it ...",,davidw,,1222716762,2008-09-29 19:32:42+00:00,comment,318709,318705.0,,,
1,Modern ClojureScript,https://github.com/magomimmo/modern-cljs,,,brudgers,83.0,1438621836,2015-08-03 17:10:36+00:00,story,9997830,,11.0,,
2,,,"Well, because heat dissipation is proportional...",,stcredzero,,1340822020,2012-06-27 18:33:40+00:00,comment,4168406,4165347.0,,,
3,,,"So?<p><a href=""https:&#x2F;&#x2F;streeteasy.co...",,perl4ever,,1570229208,2019-10-04 22:46:48+00:00,comment,21162521,21160429.0,,,
4,,,"Yes, because beating someone for doing or thin...",,TeMPOraL,,1570531967,2019-10-08 10:52:47+00:00,comment,21190821,21190776.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,,,I believe that you hold your views in good fai...,,dang,,1565715946,2019-08-13 17:05:46+00:00,comment,20687789,20685847.0,,,
49996,,,Another vote for Fastmail!,,toomuchtodo,,1580831136,2020-02-04 15:45:36+00:00,comment,22236828,22236621.0,,,
49997,,,You can get a pretty decent number for the car...,,ars,,1568924058,2019-09-19 20:14:18+00:00,comment,21020254,21018030.0,,,
49998,NASA wants more humans to walk on the moon – a...,https://www.usatoday.com/story/tech/nation-now...,,,evo_9,1.0,1532008721,2018-07-19 13:58:41+00:00,story,17566297,,1.0,,


In [7]:
# drop some columns
#drop title, has 447,218 null values 
#drop ranking, has 500000 null values
#drop score, has 446968 null values 
clean_df = df.drop(['time', 'descendants', 'deleted', 'url', 'dead', 'title', 'ranking', 'score'], axis=1)

In [8]:
#only 507 comments with 5 characters or less, upon analysis doesn't look like they provide much context, 
#not helpful for model analysis so drop comments with 5 words or less

# take only the rows whose text column's length is greater than five
# this also removes the texts that are None
clean_df = clean_df[clean_df['text'].str.len() > 5]

In [9]:
# Use BS on a single piece of text to remove strange chars and URLs and tags
from bs4 import BeautifulSoup

def clean_soup(text):
    # make a soup - this takes care of strange characters
    soup = BeautifulSoup(text, "html.parser")

    # delete <a> tags as they contain URLs -- remember <a href="url_here">
    # tags contain URLs
    for s in soup.select('a'):
       s.extract()

    # the soup contains <p> tags -- here we get rid of those and join them with spaces
    strings = soup.find_all(text=True, recursive=True)
    result = ' '.join(strings)

    return result

In [10]:
# Apply that to each comment/story text
clean_text = clean_df['text'].apply(clean_soup)

  ' that document to Beautiful Soup.' % decoded_markup


In [11]:
clean_df['text'] = clean_text

In [12]:
import pandas as pd
pd.options.display.max_colwidth = 100
clean_df.sample(10)

Unnamed: 0,text,by,timestamp,type,id,parent
49329,> The economy structured itself around near ZIRP rates. We were at .25 for a long time which gav...,toomuchtodo,2016-01-18 19:34:50+00:00,comment,10926357,10925119.0
16770,To be a good programer we need to insulate our mind from rest of the world.\nI believe women can...,known,2008-11-16 14:53:38+00:00,comment,366010,365772.0
39743,fishing expeditions are a form of overreach.,rhizome,2016-10-05 19:57:49+00:00,comment,12647239,12645742.0
46504,And there are at least two sensible ways to make lists a Monad.,eru,2010-03-10 21:12:33+00:00,comment,1182002,1181960.0
1440,"In most places I have worked, knowing the database well enough to manage it and at very least wr...",pjmlp,2020-04-25 08:18:27+00:00,comment,22976579,22949916.0
16247,I agree with you. Does it help if the question is reframed? This city has some severe problems....,DanBC,2014-01-16 18:02:48+00:00,comment,7071207,7070892.0
40591,"Yeah, maybe if he bought them a cake and rented a hooker, everything would be peachy!",pavel_lishin,2011-07-13 18:36:23+00:00,comment,2760049,2759874.0
1843,"This definitely depends on where you live. I don't know if it's still called ""swats"" like it was...",jessaustin,2016-12-04 20:38:09+00:00,comment,13101664,13101544.0
25159,"One of my professors had two PhDs: one in physics, and one in computer science. Considering that...",saagarjha,2020-04-10 08:14:02+00:00,comment,22831029,22830123.0
44817,"Utilization rate means percentage of time that is billed to a customer, I believe/presume. Wheth...",perl4ever,2018-08-07 03:33:25+00:00,comment,17703695,17701889.0


In [13]:
#makes sense the parent would have some null values, if they are the first comment
clean_df.isnull().sum()

text          0
by            0
timestamp     0
type          0
id            0
parent       79
dtype: int64

###Pyrom's Notebook
####Using vaderSentiment to produce 'Saltiness' score for each Username. The higher the score, the more salty. 

####VADERSentiment is a pretrained model that processes texts and calculates their sentiment values.

In [14]:
pip install vaderSentiment



In [15]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [16]:
#Panda Settings
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

In [17]:
df = clean_df.drop(columns=['timestamp', 'type', 'id', 'parent'])
df.head()

Unnamed: 0,text,by
0,"Like I said, to be honest I do not know if it is in any way an accurate analogy, but it seems ...",davidw
2,"Well, because heat dissipation is proportional to the square of the voltage, you end up giving u...",stcredzero
3,So?,perl4ever
4,"Yes, because beating someone for doing or thinking something is a proven way of making them stop...",TeMPOraL
5,I think downvotes are fundamentally broken. If upvotes are agree/approval and downvotes are disa...,Someone1234


In [18]:
#Convert to string values
df = df.astype(str)

In [19]:
#Slim out users with less than 1000 comments, more manageable, cuts data in half
def slim(df):
    """
    Slim out dataset to user with 1000 or more comments.

    Returns a filtered dataframe.
    """

    X = df.copy()
    X = X.groupby('by').filter(lambda x: x['by'].count()<1000)

    return X

In [20]:
new_df = slim(df)
print(new_df.shape)
new_df.head()

(44658, 2)


Unnamed: 0,text,by
0,"Like I said, to be honest I do not know if it is in any way an accurate analogy, but it seems ...",davidw
2,"Well, because heat dissipation is proportional to the square of the voltage, you end up giving u...",stcredzero
3,So?,perl4ever
4,"Yes, because beating someone for doing or thinking something is a proven way of making them stop...",TeMPOraL
5,I think downvotes are fundamentally broken. If upvotes are agree/approval and downvotes are disa...,Someone1234


In [21]:
#Machine Learning, using VADER Sentiment Analysis

def sentiment_score(comment):
    analyser = SentimentIntensityAnalyzer()
    
    x = 0
    score = analyser.polarity_scores(comment)
    x = x + score['pos']
    x = x + score['compound']
    x = x - score['neg'] 

    return x

In [22]:
#Apply function to each sample in 'text' column
new_df['Saltiness'] = new_df['text'].apply(sentiment_score)

In [23]:
#Copy of df if needed to be reset
import copy
copy_df = copy.copy(new_df)

In [27]:
new_df = copy.copy(copy_df)

In [28]:
#Multiply Saltiness by -1, the higher the score=the more salty
new_df['Saltiness'].mul(-1)

0        0.6304
2       -1.0085
3       -0.0000
4        0.8053
5       -0.2368
          ...  
49994   -0.9766
49995    1.0339
49996   -0.0000
49997   -0.6563
49999    0.8018
Name: Saltiness, Length: 44658, dtype: float64

In [29]:
#Convert Saltiness column to percent
new_df['Saltiness'] = new_df['Saltiness'].apply('{:.0%}'.format)

In [30]:
#Moment of truth!
new_df.head()

Unnamed: 0,text,by,Saltiness
0,"Like I said, to be honest I do not know if it is in any way an accurate analogy, but it seems ...",davidw,-63%
2,"Well, because heat dissipation is proportional to the square of the voltage, you end up giving u...",stcredzero,101%
3,So?,perl4ever,0%
4,"Yes, because beating someone for doing or thinking something is a proven way of making them stop...",TeMPOraL,-81%
5,I think downvotes are fundamentally broken. If upvotes are agree/approval and downvotes are disa...,Someone1234,24%


In [31]:
#Rename columns
new_df.columns = ['Comment', 'Username', 'Saltiness']

#Rearrange columns, sort by ranking
order = ['Username', 'Comment', 'Saltiness']
comments_score = new_df[order]

In [32]:
comments_score.sort_values(by=['Username'], inplace=True)

In [33]:
comments_score = comments_score.reset_index()

In [34]:
comments_score = comments_score.drop(columns=['index'])

In [35]:
comments_score

Unnamed: 0,Username,Comment,Saltiness
0,001sky,Stuff like this doesn't just pop out of nowhere. It's very clear this is the start of a campaign...,-75%
1,001sky,"How do they pay the cost of shut down, if they ""can't pay"" the cost of operation? Just curious. ...",38%
2,001sky,Seawater is not acidic. There is no such thing as 'acidification' in the literal sense. The wate...,-37%
3,001sky,isn't the temperature variability of li-ion a rookie mistake? have you ever taken a camera skiin...,-18%
4,001sky,I don't think this is relevant. The reason that people for the most are leaving is not politics ...,91%
...,...,...,...
44653,zokier,Spam for $99 shim for gdb.,-69%
44654,zokier,"Because the defaults come from 70's, and lots of nice stuff has added since, but nobody dared to...",31%
44655,zokier,I don't see the need to make that distinction here. It should be trivial to port from Moblin to ...,76%
44656,zokier,xsv for doing queries against CSV files probably belongs to the list too:,0%


In [36]:
#Random comment test
comments_score.loc[600,'Comment']

"> Do you really believe that? Not many do - most think it is a deliberately racist strategy, and most see that quote as an explanation of it. How is it racist to remove the racist component from a policy? It's literally the exact opposite of that. That's what optimizing to reduce the amount of racism looks like -- you find things that are partly desired and partly racist and replace them with something to achieve the desired objective without discriminating based on race."

In [37]:
#Comment is pretty salty
comments_score.loc[600,'Saltiness']

'-103%'

In [38]:
comments_score.loc[4600,'Comment']

'That actually sounds like a much better policy than what was implemented in the US.  Bailing out foreign creditors to private banks was  simply a huge waste of money.'

In [39]:
#Comment is overall positive
comments_score.loc[4600,'Saltiness']

'74%'

###Pushing to postgres: Comment's individual score.
####Convert pandas dataframe to SQL and load into postgres database through elephantsql.

In [40]:
def to_postgres(df, title, engine):
    """
    Move pandas dataframe to postgresql database.

    Determines if you can query using SQLAlchemy in python.
    """
    df.to_sql(title, engine, index=False)

In [41]:
from sqlalchemy import create_engine
#Establish connection to database
engine = create_engine('postgres://aecqvjqa:JTG9-DH2xUSvObwIc98hFfw3PpzQZHJo@ruby.db.elephantsql.com:5432/aecqvjqa')

#Convert dataframe to SQL
to_postgres(comments_score, 'comment_salt', engine)

  """)
