#Becca
-----------------------

In [None]:
import sys, os
# Put the parent of the current working directory first on the import path
# (presumably so modules one level above this notebook can be imported).
# os.path.dirname is separator-aware; slicing at the last '/' would chop the
# final character off the path whenever no '/' is present (e.g. Windows paths).
cwd = os.getcwd()
sys.path[0] = os.path.dirname(cwd)

In [None]:
#Imports
import os
import numpy as np
import pandas as pd

from google.cloud import bigquery
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
# Point GOOGLE_APPLICATION_CREDENTIALS at the service-account key file
credentials_path = '/content/Saltiest HN Trolls.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path

# BigQuery client used to run the queries in this notebook
client = bigquery.Client()

# Reference to the 'hacker_news' dataset in the 'hn-dataset' project
hn_ref = client.dataset('hacker_news', project='hn-dataset')

In [None]:
# SQL: take the 100 most frequent `by` values in the full Hacker News table,
# then discard the author literally named "None".
query_string = """
SELECT C.by
FROM (
  SELECT A.by, COUNT(*) AS count
  FROM `bigquery-public-data.hacker_news.full` AS A
  GROUP BY A.by
  ORDER BY count DESC
  LIMIT 100
) AS C
WHERE C.by != "None"
"""

# Submit the query, wait for it to finish, and load the rows into a DataFrame
frequent_users_job = client.query(query_string)
frequent_users = frequent_users_job.result().to_dataframe()

In [None]:
# Nested query that fetches rows written only by the most frequent users.
# Tuning knobs:
#   - the innermost "LIMIT 500" sets how many top users are considered
#   - the outermost "LIMIT 50000" caps the total number of rows returned
query_string = """
SELECT *
FROM `bigquery-public-data.hacker_news.full` AS A
WHERE A.by IN (
  SELECT C.by
  FROM (
    SELECT A.by, COUNT(*) AS count
    FROM `bigquery-public-data.hacker_news.full` AS A
    GROUP BY A.by
    ORDER BY count DESC
    LIMIT 500
  ) AS C
  WHERE C.by != "None"
)
LIMIT 50000
"""

# Submit the query, wait for it to finish, and load the rows into a DataFrame
comments_job = client.query(query_string)
df = comments_job.result().to_dataframe()

In [None]:
# Keep only stories and comments. The filtered frame is assigned back to `df`:
# evaluating the expression without assigning it only displays the result and
# leaves `df` itself unfiltered.
df = df[df['type'].isin(['story', 'comment'])]
df.head()

Unnamed: 0,title,url,text,dead,by,score,time,timestamp,type,id,parent,descendants,ranking,deleted
0,,,Cannabis has never been &#x27;completely legal...,,roel_v,,1551582184,2019-03-03 03:03:04+00:00,comment,19292438,19292418.0,,,
1,Starlight - History Channel,https://www.youtube.com/watch?v=U5F_ptPX7_8,,,bane,1.0,1379672467,2013-09-20 10:21:07+00:00,story,6416876,,0.0,,
2,"Nepal bans solo, disabled and underaged climbe...",https://www.upi.com/Top_News/World-News/2017/1...,,,DoreenMichele,9.0,1514657287,2017-12-30 18:08:07+00:00,story,16036448,,1.0,,
3,,,There are very few career mistakes that hurt y...,,nostrademons,,1306613116,2011-05-28 20:05:16+00:00,comment,2595097,2595030.0,,,
4,,,"I dunno, man. What do you enjoy?<p>Pick a proj...",,bitwize,,1548884795,2019-01-30 21:46:35+00:00,comment,19039896,19039016.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,,,In a recent job I ended up rewriting some of t...,,grandalf,,1434989168,2015-06-22 16:06:08+00:00,comment,9759256,9757892.0,,,
49996,,,I would argue that the American right and Brit...,,humanrebar,,1360328326,2013-02-08 12:58:46+00:00,comment,5187597,5187474.0,,,
49997,,,"Back in the late 1990s, my team's SourceSafe r...",,cpeterso,,1324150520,2011-12-17 19:35:20+00:00,comment,3364921,3364603.0,,,
49998,,,We all see how well &quot;feeling safe is the ...,,mschuster91,,1444137331,2015-10-06 13:15:31+00:00,comment,10338606,10338572.0,,,


In [None]:
# Remove the columns that are not used further on.
# Null counts recorded by the author: title 447,218; ranking 500000; score 446968.
unused_columns = ['time', 'descendants', 'deleted', 'url', 'dead', 'title', 'ranking', 'score']
clean_df = df.drop(columns=unused_columns)

In [None]:
# Only 507 comments are 5 characters or shorter; on inspection they carry little
# context and are not helpful for the model, so they are dropped.
# Rows whose text is None are removed as well, because their length never
# compares greater than five.
has_enough_text = clean_df['text'].str.len() > 5
clean_df = clean_df[has_enough_text]

In [None]:
# Use BS on a single piece of text to remove strange chars and URLs and tags
from bs4 import BeautifulSoup

def clean_soup(text):
    """Return the text content of an HTML fragment with its links removed.

    The fragment is parsed with BeautifulSoup's "html.parser", every <a>
    element is pulled out of the tree, and the remaining text nodes are joined
    with single spaces.
    """
    # Parsing also takes care of strange characters (HTML entities)
    parsed = BeautifulSoup(text, "html.parser")

    # <a href="url_here"> elements carry the URLs, so remove them first
    for anchor in parsed.select('a'):
        anchor.extract()

    # Collect every text node (this discards the <p> tags) and space-join them
    text_nodes = parsed.find_all(text=True, recursive=True)
    return ' '.join(text_nodes)

In [None]:
# Run the HTML clean-up over every comment/story text
clean_text = clean_df['text'].map(clean_soup)

  ' that document to Beautiful Soup.' % decoded_markup


In [None]:
# Replace the raw text with the cleaned text. `assign` builds a new frame, which
# avoids pandas' SettingWithCopyWarning: `clean_df` was produced by boolean
# filtering, so writing a column into it directly is flagged as a chained assignment.
clean_df = clean_df.assign(text=clean_text)

In [None]:
import pandas as pd
# Show up to 100 characters per cell, then preview ten random rows
pd.set_option('display.max_colwidth', 100)
clean_df.sample(n=10)

Unnamed: 0,text,by,timestamp,type,id,parent
31348,This assumes that people actually behave rationally. Indeed. Just the idea that TP is the topic...,dhimes,2020-04-14 16:01:51+00:00,comment,22867582,22863905.0
33718,"I don't think there's an unbiased opinion anywhere. I've tried to figure out what this ""classic ...",StavrosK,2016-02-28 17:17:44+00:00,comment,11191438,11191428.0
29121,"Your comment started with the phrase ""anecdotal evidence"". Anecdotal evidence for what? Note: I ...",skrebbel,2020-01-05 12:24:14+00:00,comment,21960975,21960953.0
42953,Not sure exactly how duped this is for HN... For those that don't know.. Lua is a scripting lang...,tracker1,2013-03-16 19:29:31+00:00,comment,5386444,5386204.0
15501,Citation needed. That doesn't match my experience at all. When Activity Monitor shows high CPU i...,gowld,2020-06-08 17:02:48+00:00,comment,23458361,23454944.0
28917,"He claims not doing any development himself for several years now, so I guess this means no.",ekianjo,2015-02-10 15:27:59+00:00,comment,9027652,9027554.0
3493,"Indeed. There were 5 classified types prior to this outbreak: With the Zaire strain, now just ...",hga,2014-08-31 12:33:40+00:00,comment,8249162,8248249.0
998,An interesting follow-up paper:,MaysonL,2009-11-04 07:23:56+00:00,comment,921255,920368.0
47535,Just adding another plug for Econtalk. They cover a lot of different topics and invite people t...,refurb,2017-09-29 00:36:30+00:00,comment,15361918,15361648.0
45370,I liked debugging more once I realized that the answer always comes as a surprise. Something you...,gruseom,2013-10-01 16:04:30+00:00,comment,6477010,6476811.0


In [None]:
# Count missing values per column. Nulls in `parent` are expected for rows
# that have no parent (e.g. the first item of a thread).
clean_df.isna().sum()

text          0
by            0
timestamp     0
type          0
id            0
parent       87
dtype: int64

#Pyrom
---------------------

In [None]:
pip install vaderSentiment



In [None]:
# pandas display settings: show up to 200 rows and 200 columns
pd.options.display.max_rows = 200
pd.options.display.max_columns = 200

In [None]:
# Drop the metadata columns, leaving the comment text and its author
metadata_columns = ['timestamp', 'type', 'id', 'parent']
df = clean_df.drop(metadata_columns, axis='columns')
df.head(5)

Unnamed: 0,text,by
0,Cannabis has never been 'completely legal' anywhere in the Netherlands. It depends a bit though ...,roel_v
3,"There are very few career mistakes that hurt you permanently, if you make an effort to recover f...",nostrademons
4,"I dunno, man. What do you enjoy? Pick a project to work on in your free time. A game, an audio s...",bitwize
5,> That's ignoring all the computing work done before C. And we can really ignore it software wis...,coldtea
8,"Look, if you really want to debate this, I could try to defend what he said, but I will have to ...",thaumaturgy


In [None]:
# Cast every column to str
df = df.astype({column: str for column in df.columns})

In [None]:
# Remove any leftover index/metadata columns. errors='ignore' skips labels that
# are not present, so this cell no longer raises KeyError on a fresh
# top-to-bottom run where these columns have already been dropped (or, like
# 'Unnamed: 0', never existed).
df = df.drop(columns=['Unnamed: 0', 'timestamp', 'type', 'id', 'parent'], errors='ignore')
df.head()

Unnamed: 0,text,by
0,No indication of how large these programs are ...,pjc50
1,"Well, if you want to obfuscate a constant stri...",thaumasiotes
2,"Yeah, it's like how we had to stop using GIF a...",coldtea
3,Well duh; would wouldn't do X if X was afforda...,jodrellblank
4,What are the restrictions? Are you going to fi...,gus_massa


In [None]:
#Slim the dataset down to users with at least 1000 comments, more manageable
def slim(df, min_comments=1000):
    """
    Keep only the rows written by users with `min_comments` or more comments.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'by' column holding the author of each row.
    min_comments : int, default 1000
        Minimum number of rows a user must have in `df` to be kept.

    Returns
    -------
    pandas.DataFrame
        A filtered copy of `df`; the input frame is not modified and the
        original row order and index labels are preserved.

    Notes
    -----
    The comparison is `>=`. The previous implementation used `< 1000`, which
    kept exactly the users this function is documented to remove.
    """
    # Number of rows per author, aligned back onto every row of the frame
    comments_per_user = df['by'].map(df['by'].value_counts())

    # .copy() so callers can add columns without SettingWithCopyWarning
    return df.loc[comments_per_user >= min_comments].copy()

In [None]:
new_df = slim(df)
print(new_df.shape)
new_df.head()

(44804, 2)


Unnamed: 0,text,by
0,Cannabis has never been 'completely legal' anywhere in the Netherlands. It depends a bit though ...,roel_v
3,"There are very few career mistakes that hurt you permanently, if you make an effort to recover f...",nostrademons
4,"I dunno, man. What do you enjoy? Pick a project to work on in your free time. A game, an audio s...",bitwize
5,> That's ignoring all the computing work done before C. And we can really ignore it software wis...,coldtea
8,"Look, if you really want to debate this, I could try to defend what he said, but I will have to ...",thaumaturgy


In [None]:
#Machine Learning, using VADER Sentiment Analysis

# Shared analyser, created on first use so it is not rebuilt for every comment
_VADER_ANALYSER = None


def sentiment_score(comment, analyser=None):
    """
    Score one comment with VADER.

    Parameters
    ----------
    comment : str
        Text to score.
    analyser : object, optional
        Any object exposing `polarity_scores(text)` that returns a mapping
        with 'pos', 'neg' and 'compound' keys. When omitted, a single shared
        SentimentIntensityAnalyzer is created on first use and reused on
        later calls.

    Returns
    -------
    float
        `pos + compound - neg`. Because three scores are combined, the value
        can fall outside the range [-1, 1].
    """
    global _VADER_ANALYSER

    if analyser is None:
        if _VADER_ANALYSER is None:
            _VADER_ANALYSER = SentimentIntensityAnalyzer()
        analyser = _VADER_ANALYSER

    score = analyser.polarity_scores(comment)
    return score['pos'] + score['compound'] - score['neg']

In [None]:
# Score every entry of the 'text' column and store the result as 'Saltiness'
new_df['Saltiness'] = new_df['text'].map(sentiment_score)

In [None]:
# Render each Saltiness value as a whole-number percentage string
percent_template = '{:.0%}'
new_df['Saltiness'] = new_df['Saltiness'].map(percent_template.format)

In [None]:
# Moment of truth: preview the first five scored rows
new_df.head(5)

Unnamed: 0,text,by,Saltiness
0,Cannabis has never been 'completely legal' anywhere in the Netherlands. It depends a bit though ...,roel_v,6%
3,"There are very few career mistakes that hurt you permanently, if you make an effort to recover f...",nostrademons,29%
4,"I dunno, man. What do you enjoy? Pick a project to work on in your free time. A game, an audio s...",bitwize,109%
5,> That's ignoring all the computing work done before C. And we can really ignore it software wis...,coldtea,-42%
8,"Look, if you really want to debate this, I could try to defend what he said, but I will have to ...",thaumaturgy,-105%


In [None]:
# Give the columns presentation names (assigned by position: text, by, Saltiness)
new_df.columns = ['Comment', 'Username', 'Saltiness']

# Put the username first
order = ['Username', 'Comment', 'Saltiness']
comments_score = new_df.loc[:, order]

In [None]:
# Sort by username. Reassign instead of sorting in place: `comments_score` was
# selected out of another frame, and in-place changes to such a selection
# trigger pandas' SettingWithCopyWarning.
comments_score = comments_score.sort_values(by=['Username'])

In [None]:
# First twenty rows of the sorted frame
comments_score.iloc[:20]

Unnamed: 0,Username,Comment,Saltiness
36228,001sky,"According to a recent NSA memo, 1.6% of all online activity is monitored and collected. That’s e...",0%
40512,001sky,"== Your missing the point, or at least part of it. It is akin to a protection racket. Those comp...",-60%
2880,001sky,"""@DEVOPS_BORAT got heavy duty @dev ops gig available in San Mateo,ca and Seattle area! $130-$140...",0%
427,001sky,"""May have"" is not news What is potentially more relevant here is if a discussion about legal pro...",29%
30998,001sky,-- Or your investor's friends Or business acquantances or political pet-project holders. pretty...,89%
39827,001sky,"As a point of math, that's the top 0.0001% (Or, as a percentage of the workforce: 0.0005%)",30%
15988,001sky,"If you ""interview"" people that are blatantly incompetent , that's on you not on them. Just sayin.",-64%
20808,001sky,"It seems much equipment is not designed to go above 150ish, and once you get above that nothing'...",0%
32596,001sky,relevant >,0%
15620,001sky,The point you're missing tho is that pandoras box is not a box of chocalates. You don't know wha...,-84%


In [None]:
# Spot check: text of the comment with index label 600
comments_score.at[600, 'Comment']

'What does this mean for copyrighting tweets?  All my tweets are micro-blog headlines.'

In [None]:
# Score for the same comment; the text reads as fairly neutral
comments_score.at[600, 'Saltiness']

'0%'

In [None]:
# Spot check: text of the comment with index label 4600
comments_score.at[4600, 'Comment']

"There are times when you want non-repudiable signatures, times when you want to be able to keep an archive, times when you want your messages to behave more like letters than like spoken conversations. PGP is still the best fit for email-like use cases and long-lived identities; Signal et al don't even try to address that use case."

In [None]:
# Score for the same comment; it comes out strongly positive
comments_score.at[4600, 'Saltiness']

'115%'