In [1]:
# set env variable to json credential file of google cloud

import os
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'my_key_file.json'

In [2]:
from google.cloud import bigquery
import time
import pandas as pd
import numpy as np

In [3]:
# setup client
bq_client = bigquery.Client() 

In [4]:
# GETS SQL QUERY FOR 100 USERS (14,000-14,000) FROM SORTED TABLE OF MOST COMMENTS
# WHERE TOTAL COMMENTS IS SLIGHTLY ABOVE 100

QUERY = '''
        SELECT E.*, C.comments_count
        FROM `bigquery-public-data.hacker_news.full_201510` as E
        JOIN(
            SELECT *
            FROM(
                SELECT *
                FROM(
                    SELECT  `bigquery-public-data.hacker_news.full_201510`.by, COUNT(*) as comments_count
                    FROM `bigquery-public-data.hacker_news.full_201510`
                    GROUP BY `bigquery-public-data.hacker_news.full_201510`.by
                )  
                ORDER BY comments_count DESC
                LIMIT 9500
            )
            ORDER BY comments_count 
            LIMIT 100            
        ) C 
        ON E.by = C.by
        
        '''

In [5]:
import time
start = time.time()
df = bq_client.query(QUERY).to_dataframe()
end = time.time()
print(end - start)

13.626693964004517


In [6]:
df.shape

(20089, 14)

In [7]:
users = df['by'].unique()
len(users)

100

In [8]:
df.head()

Unnamed: 0,by,score,time,title,type,url,text,parent,deleted,dead,descendants,id,ranking,comments_count
0,dimino,,1438628769,,comment,,I was going to write a little client library f...,9998227.0,,,,9998949,,202
1,jmngomes,,1390823014,,comment,,"Agreed, and I think it&#x27;s mainly because i...",7128883.0,,,,7129850,,202
2,cedsav,,1194217292,,comment,,I think you might be a bit too ambitious. <p>1...,76111.0,,,,76129,,200
3,archivator,,1398022736,,comment,,It&#x27;s either that or Twitter will be compl...,7618050.0,,,,7618200,,201
4,jherdman,,1258725132,,comment,,It seems to me that the goal of the App Store ...,952349.0,,,,952386,,200


In [9]:
# DATA WRANGLING

# Drop unecessary columns
df_drop = df.drop(columns= ['score','title','url','deleted','dead','descendants','ranking'])

# Keep only comments in the dataframe
df_drop = df_drop[df_drop['type']=='comment']

#drop empty comments 
df_drop = df_drop[(df_drop['text'] != '')&(df_drop['text'] != np.nan)]

# Fix weird text encodings
import html
df_drop['text']=df_drop['text'].apply(str)
df_drop['text'] = df_drop['text'].apply(lambda x: html.unescape(x))

# Remove html tags from string
import re
def remove_html_tags(text):
   clean = re.compile('<.*?>')
   return re.sub(clean, '', text)
df_drop['text'] = df_drop['text'].apply(lambda x: remove_html_tags(x))

# Convert unix time to datetime object with date
from datetime import datetime
df_drop['time']=pd.to_datetime(df_drop['time'],unit='s') # for accuracy secs
#df_drop['time']=pd.to_datetime(df_drop['time']).dt.date # for just date

In [10]:
# Check only analyzing comments
df_drop['type'].value_counts()

comment    16300
Name: type, dtype: int64

In [11]:
users = df_drop['by'].unique()
print(len(users))

98


In [12]:
df_drop.head()

Unnamed: 0,by,time,type,text,parent,id,comments_count
0,dimino,2015-08-03 19:06:09,comment,I was going to write a little client library f...,9998227.0,9998949,202
1,jmngomes,2014-01-27 11:43:34,comment,"Agreed, and I think it's mainly because it get...",7128883.0,7129850,202
2,cedsav,2007-11-04 23:01:32,comment,I think you might be a bit too ambitious. 1. Y...,76111.0,76129,200
3,archivator,2014-04-20 19:38:56,comment,It's either that or Twitter will be completely...,7618050.0,7618200,201
4,jherdman,2009-11-20 13:52:12,comment,It seems to me that the goal of the App Store ...,952349.0,952386,200


In [13]:
# Populate sentiment analysis columns in dataframe
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

In [14]:
df_drop['neg']=np.zeros(df_drop.shape[0])
df_drop['pos']=np.zeros(df_drop.shape[0])
df_drop['neu']=np.zeros(df_drop.shape[0])

In [15]:
# OLD METHOD

# Adding sentiments to dataframe is computationally intensive
# On Martin's local computer 12,811 comments took ~24 min 

#import time
#start = time.time()

# populate vader sentiments in additional cols
#for idx,x in df_drop['text'].iteritems():
#    #print (idx,x)
    
#    df_drop['neg'][idx] = sia.polarity_scores(x)['neg']
#    df_drop['pos'][idx] = sia.polarity_scores(x)['pos']
#    df_drop['neu'][idx] = sia.polarity_scores(x)['neu']    
#end = time.time()
#print(end - start)

In [16]:
# NEW METHOD

# Using apply function (which uses compiled code) took 12,811 comments done in ~30s!

now=time.time()
df_drop['neg'] = df_drop['text'].apply(lambda x: sia.polarity_scores(x)['neg'])
df_drop['pos'] = df_drop['text'].apply(lambda x: sia.polarity_scores(x)['pos'])
df_drop['neu'] = df_drop['text'].apply(lambda x: sia.polarity_scores(x)['neu'])
print('sentiment cal took:',round(time.time()-now,2),'s')

sentiment cal took: 36.72 s


In [17]:
df_drop.head()

Unnamed: 0,by,time,type,text,parent,id,comments_count,neg,pos,neu
0,dimino,2015-08-03 19:06:09,comment,I was going to write a little client library f...,9998227.0,9998949,202,0.069,0.038,0.892
1,jmngomes,2014-01-27 11:43:34,comment,"Agreed, and I think it's mainly because it get...",7128883.0,7129850,202,0.017,0.063,0.92
2,cedsav,2007-11-04 23:01:32,comment,I think you might be a bit too ambitious. 1. Y...,76111.0,76129,200,0.079,0.146,0.775
3,archivator,2014-04-20 19:38:56,comment,It's either that or Twitter will be completely...,7618050.0,7618200,201,0.058,0.089,0.852
4,jherdman,2009-11-20 13:52:12,comment,It seems to me that the goal of the App Store ...,952349.0,952386,200,0.0,0.175,0.825


In [18]:
# Shrink the dataframe to 100 comments per user, where comment is not a pure neg or pos so that
# comment displayed is somewhat interesting and not just one word.

In [19]:
df_final = pd.DataFrame(data=None, columns=df_drop.columns)
df_user_most_pos = pd.DataFrame(data=None, columns=df_drop.columns)
df_user_most_neg = pd.DataFrame(data=None, columns=df_drop.columns)

for user in users:
    # Create a dataframe of only one user
    df_user = df_drop[df_drop['by'] == user]
    
    # Remove pure sentiments (removes pure one-word comments)
    df_user_unpure = df_user[(df_user['neg'] !=1) & (df_user['pos'] !=1)]
    
    # Submit first 100 comments of the unpure sentiments to final dataframe
    df_final = df_final.append(df_user_unpure[:100])
    
    # Submit max unpure neg/pos comment per user into dataframe
    df_temp = df_final[df_final['by'] == user]
    ix_neg = df_temp['neg'].idxmax(axis=0)
    df_user_most_neg = df_user_most_neg.append(df_temp.loc[ix_neg])
    ix_pos = df_temp['pos'].idxmax(axis=0)
    df_user_most_pos = df_user_most_pos.append(df_temp.loc[ix_pos])
    
df_final.shape

(9133, 10)

In [20]:
users = df_drop['by'].unique()
len(users)

98

In [21]:
df_final.head(10)

Unnamed: 0,by,time,type,text,parent,id,comments_count,neg,pos,neu
0,dimino,2015-08-03 19:06:09,comment,I was going to write a little client library f...,9998227.0,9998949,202,0.069,0.038,0.892
28,dimino,2015-05-15 22:10:46,comment,"I would have sworn up and down, and bet all my...",9552927.0,9553993,202,0.088,0.069,0.844
233,dimino,2015-07-29 15:42:01,comment,"Yeah, also the trauma and panic it incites.",9965941.0,9969346,202,0.459,0.165,0.376
289,dimino,2015-06-15 18:08:22,comment,I agree with everything you've written! All I...,9720916.0,9720959,202,0.096,0.149,0.756
307,dimino,2015-07-11 19:07:04,comment,"There are reasons other than ""I disagree with ...",9867780.0,9871127,202,0.224,0.0,0.776
390,dimino,2015-06-26 17:05:05,comment,You say that like it'd take a weekend and a bu...,9786075.0,9786115,202,0.0,0.152,0.848
413,dimino,2015-06-02 23:18:16,comment,> Users can still have other perfectly reasona...,9649348.0,9649643,202,0.063,0.175,0.762
667,dimino,2015-07-10 21:57:28,comment,I came this close to not even knowing about th...,9866900.0,9867059,202,0.0,0.057,0.943
745,dimino,2015-05-28 20:45:24,comment,Price is a huge inhibitor for a mid-range Cano...,9621004.0,9621062,202,0.123,0.217,0.66
746,dimino,2015-05-29 02:06:40,comment,"Oh certainly, I just wanted to make the point ...",9621930.0,9622329,202,0.0,0.13,0.87


In [22]:
# Random comments from the 100 users
df_random100 = df_final.sample(100)
df_random100.head(10)

Unnamed: 0,by,time,type,text,parent,id,comments_count,neg,pos,neu
3389,ASneakyFox,2014-07-01 07:40:41,comment,The whole thing sounds like an episode of some...,7969301.0,7969558,201,0.0,0.16,0.84
3148,evanb,2014-08-20 14:49:35,comment,I think you mean unjustified. Unjust usually ...,8202413.0,8202485,201,0.163,0.292,0.545
618,jkurnia,2014-04-22 17:23:32,comment,I would agree with these statements: Zidisha i...,7628325.0,7628991,202,0.041,0.211,0.748
20,shadytrees,2009-08-30 00:38:03,comment,> [Writers] ... are busily grafting the sophis...,793052.0,793874,200,0.112,0.095,0.793
294,maneesh,2012-10-31 17:05:37,comment,Maybe the most powerful post I've ever read on...,4723342.0,4723878,201,0.0,0.205,0.795
9415,alexkearns,2010-08-23 19:40:17,comment,Are you always this annoying?,1624561.0,1627769,201,0.439,0.0,0.561
12445,acron0,2014-08-03 15:33:44,comment,Any article or discussion that involves C/C++ ...,8127499.0,8128179,200,0.0,0.0,1.0
4455,cmsmith,2015-06-05 18:10:02,comment,And what happens to the local farmers and roof...,9666406.0,9667003,201,0.0,0.05,0.95
2928,droidist2,2013-06-01 16:07:33,comment,Of course I wasn't saying that applies in ever...,5804497.0,5804528,200,0.0,0.133,0.867
3440,twerquie,2014-10-23 15:22:54,comment,I really enjoy that trend. Tech is a dry topic...,8498502.0,8498601,200,0.0,0.241,0.759


In [23]:
# Most negative comments
df_neg100 = df_final.sort_values(['neg'],ascending=False)[:100]
df_neg100.head(10)

Unnamed: 0,by,time,type,text,parent,id,comments_count,neg,pos,neu
7587,alexkearns,2011-04-24 07:44:02,comment,Stop spamming: http://news.ycombinator.com/sub...,2478465.0,2478472,201,0.841,0.0,0.159
8827,kang,2012-05-28 15:24:19,comment,thats racist,4032524.0,4033741,201,0.8,0.0,0.2
3249,twerquie,2014-06-23 18:48:58,comment,You are fighting a losing battle.,7933460.0,7933701,200,0.794,0.0,0.206
772,shadytrees,2014-08-31 19:38:17,comment,My bad!,8245728.0,8250164,200,0.791,0.0,0.209
9831,j2labs,2013-06-15 03:18:41,comment,painfully cheesy,5882834.0,5883794,201,0.773,0.0,0.227
7563,yuchi,2012-06-26 17:46:15,comment,"Sorry, but this is a dupe! :\",4163034.0,4163176,201,0.718,0.0,0.282
8300,kellysutton,2013-01-21 16:27:52,comment,Rails. No.,5092095.0,5092132,200,0.688,0.0,0.312
15560,reddotX,2015-08-04 13:53:17,comment,what github fiasco?,10003348.0,10003370,201,0.623,0.0,0.377
6846,yuchi,2012-12-12 21:49:18,comment,"I hate the abuse of the word ""meme""...",4912487.0,4912662,201,0.612,0.0,0.388
8642,okasaki,2014-06-09 17:49:41,comment,Discouraged by whom?,7869290.0,7869516,201,0.574,0.0,0.426


In [24]:
# Most positive comments
df_pos100 = df_final.sort_values(['pos'],ascending=False)[:100]
df_pos100.head(10)

Unnamed: 0,by,time,type,text,parent,id,comments_count,neg,pos,neu
1129,domp,2007-04-03 02:28:35,comment,Wow thats pretty amazing.,8396.0,8401,201,0.0,0.915,0.085
939,medianama,2010-03-07 19:15:11,comment,Great. Thank You.,1173801.0,1173817,200,0.0,0.868,0.132
3890,dlf,2013-01-27 00:37:19,comment,Happy to help.,5122079.0,5122473,202,0.0,0.865,0.135
10554,nav,2009-06-18 23:32:32,comment,Thanks. Was fun.,664341.0,664342,201,0.0,0.861,0.139
11066,daviday,2012-06-02 17:36:34,comment,Thanks for sharing!,4057945.0,4058591,202,0.0,0.857,0.143
8808,jdunck,2013-01-10 21:56:25,comment,"Thanks, Matt. :)",5039861.0,5040042,201,0.0,0.855,0.145
8975,arisAlexis,2015-07-21 08:54:08,comment,haha cool trolling!!,9921007.0,9921101,200,0.0,0.855,0.145
6600,mikeg8,2014-02-12 17:33:44,comment,Interesting perspective. Thanks.,7225297.0,7225758,200,0.0,0.848,0.152
4992,mikeg8,2012-03-24 22:05:40,comment,Interesting point. Thanks.,3750879.0,3750903,200,0.0,0.848,0.152
5887,shadytrees,2014-08-30 19:04:05,comment,Please cheer up.,8246606.0,8247275,200,0.0,0.848,0.152


In [25]:
# Most single negative comment for each user
df_user_most_neg.head(10)

Unnamed: 0,by,time,type,text,parent,id,comments_count,neg,pos,neu
233,dimino,2015-07-29 15:42:01,comment,"Yeah, also the trauma and panic it incites.",9965941.0,9969346,202,0.459,0.165,0.376
1462,jmngomes,2013-07-03 15:36:58,comment,You just reminded me of the closing words of K...,5985041.0,5985229,202,0.268,0.0,0.732
8005,cedsav,2008-01-18 22:56:20,comment,...says the developer snob.,100563.0,100571,200,0.5,0.0,0.5
8034,archivator,2013-04-19 07:09:10,comment,When fighting the institutions that write thes...,5575251.0,5575319,201,0.382,0.0,0.618
8969,jherdman,2012-08-03 17:23:59,comment,Yup! You're right. My bad.,4334441.0,4335028,200,0.487,0.0,0.513
1384,sreque,2010-05-27 17:48:05,comment,No one will argue with you that Perl isn't bet...,1384157.0,1384411,202,0.233,0.0,0.767
3293,droidist2,2015-06-10 21:39:32,comment,And that insecurity manifests itself as a faca...,9695580.0,9695679,200,0.408,0.0,0.592
3443,manaskarekar,2012-11-10 13:55:23,comment,This just sounds like desperate clawing of a d...,4766557.0,4766641,201,0.361,0.139,0.5
8839,gdb,2014-12-24 18:32:15,comment,The site should now be loading — sorry about t...,8793908.0,8793924,201,0.349,0.0,0.651
8300,kellysutton,2013-01-21 16:27:52,comment,Rails. No.,5092095.0,5092132,200,0.688,0.0,0.312


In [26]:
# Most single positive comment for each user
df_user_most_pos.head(10)

Unnamed: 0,by,time,type,text,parent,id,comments_count,neg,pos,neu
3916,dimino,2015-07-28 17:55:30,comment,"To the law? Yes, a huge one.",9962518.0,9963313,202,0.0,0.556,0.444
5075,jmngomes,2014-10-28 10:54:54,comment,Glad they now have a faster way to make inaccu...,8519820.0,8520320,202,0.0,0.4,0.6
6912,cedsav,2009-11-02 19:32:22,comment,"FWIW, I like the bookshelf metaphor, more huma...",917642.0,917665,200,0.0,0.416,0.584
4503,archivator,2012-05-14 21:31:49,comment,Definitely true on Linux with bash - .* will e...,3973375.0,3973430,201,0.0,0.432,0.568
4140,jherdman,2010-06-12 14:21:23,comment,Zed loves Fossil SCM: http://www.fossil-scm.or...,1426016.0,1426031,200,0.0,0.481,0.519
3283,sreque,2013-01-04 16:31:50,comment,That is certainly false hyperbole. There a lot...,5008266.0,5008318,202,0.019,0.231,0.75
4617,droidist2,2015-02-28 10:34:46,comment,"LOL, brogrammers?",9122631.0,9123354,200,0.0,0.779,0.221
7499,manaskarekar,2012-08-28 12:39:11,comment,"Thanks, I'll try that out. :)",4442634.0,4443339,201,0.0,0.596,0.404
3356,gdb,2012-05-29 20:25:35,comment,"Ah, interesting. Yeah, would love a patch!",4039492.0,4039622,201,0.0,0.758,0.242
5779,kellysutton,2012-09-13 20:07:26,comment,Yes. https://github.com/lyondhill/socket.io-ru...,4518154.0,4518286,200,0.0,0.73,0.27


In [27]:
# Clean up columns and columns order to match desired PostgreSQL output

In [28]:
cols = df_final.columns.tolist()
cols

['by',
 'time',
 'type',
 'text',
 'parent',
 'id',
 'comments_count',
 'neg',
 'pos',
 'neu']

In [29]:
cols_final = cols[5:6] + cols[0:1] + cols[3:4] + cols[1:2] + cols[-3:]
cols_final

['id', 'by', 'text', 'time', 'neg', 'pos', 'neu']

In [30]:
df_final_form = df_final[cols_final]
df_random100_form = df_random100[cols_final]
df_neg100_form = df_neg100[cols_final]
df_pos100_form = df_pos100[cols_final]
df_user_most_neg_form = df_user_most_neg[cols_final]
df_user_most_pos_form = df_user_most_pos[cols_final]


In [31]:
df_final_form.head(10)

Unnamed: 0,id,by,text,time,neg,pos,neu
0,9998949,dimino,I was going to write a little client library f...,2015-08-03 19:06:09,0.069,0.038,0.892
28,9553993,dimino,"I would have sworn up and down, and bet all my...",2015-05-15 22:10:46,0.088,0.069,0.844
233,9969346,dimino,"Yeah, also the trauma and panic it incites.",2015-07-29 15:42:01,0.459,0.165,0.376
289,9720959,dimino,I agree with everything you've written! All I...,2015-06-15 18:08:22,0.096,0.149,0.756
307,9871127,dimino,"There are reasons other than ""I disagree with ...",2015-07-11 19:07:04,0.224,0.0,0.776
390,9786115,dimino,You say that like it'd take a weekend and a bu...,2015-06-26 17:05:05,0.0,0.152,0.848
413,9649643,dimino,> Users can still have other perfectly reasona...,2015-06-02 23:18:16,0.063,0.175,0.762
667,9867059,dimino,I came this close to not even knowing about th...,2015-07-10 21:57:28,0.0,0.057,0.943
745,9621062,dimino,Price is a huge inhibitor for a mid-range Cano...,2015-05-28 20:45:24,0.123,0.217,0.66
746,9622329,dimino,"Oh certainly, I just wanted to make the point ...",2015-05-29 02:06:40,0.0,0.13,0.87


In [32]:
df_random100_form.head(10)

Unnamed: 0,id,by,text,time,neg,pos,neu
3389,7969558,ASneakyFox,The whole thing sounds like an episode of some...,2014-07-01 07:40:41,0.0,0.16,0.84
3148,8202485,evanb,I think you mean unjustified. Unjust usually ...,2014-08-20 14:49:35,0.163,0.292,0.545
618,7628991,jkurnia,I would agree with these statements: Zidisha i...,2014-04-22 17:23:32,0.041,0.211,0.748
20,793874,shadytrees,> [Writers] ... are busily grafting the sophis...,2009-08-30 00:38:03,0.112,0.095,0.793
294,4723878,maneesh,Maybe the most powerful post I've ever read on...,2012-10-31 17:05:37,0.0,0.205,0.795
9415,1627769,alexkearns,Are you always this annoying?,2010-08-23 19:40:17,0.439,0.0,0.561
12445,8128179,acron0,Any article or discussion that involves C/C++ ...,2014-08-03 15:33:44,0.0,0.0,1.0
4455,9667003,cmsmith,And what happens to the local farmers and roof...,2015-06-05 18:10:02,0.0,0.05,0.95
2928,5804528,droidist2,Of course I wasn't saying that applies in ever...,2013-06-01 16:07:33,0.0,0.133,0.867
3440,8498601,twerquie,I really enjoy that trend. Tech is a dry topic...,2014-10-23 15:22:54,0.0,0.241,0.759


In [33]:
df_neg100_form.head(10)

Unnamed: 0,id,by,text,time,neg,pos,neu
7587,2478472,alexkearns,Stop spamming: http://news.ycombinator.com/sub...,2011-04-24 07:44:02,0.841,0.0,0.159
8827,4033741,kang,thats racist,2012-05-28 15:24:19,0.8,0.0,0.2
3249,7933701,twerquie,You are fighting a losing battle.,2014-06-23 18:48:58,0.794,0.0,0.206
772,8250164,shadytrees,My bad!,2014-08-31 19:38:17,0.791,0.0,0.209
9831,5883794,j2labs,painfully cheesy,2013-06-15 03:18:41,0.773,0.0,0.227
7563,4163176,yuchi,"Sorry, but this is a dupe! :\",2012-06-26 17:46:15,0.718,0.0,0.282
8300,5092132,kellysutton,Rails. No.,2013-01-21 16:27:52,0.688,0.0,0.312
15560,10003370,reddotX,what github fiasco?,2015-08-04 13:53:17,0.623,0.0,0.377
6846,4912662,yuchi,"I hate the abuse of the word ""meme""...",2012-12-12 21:49:18,0.612,0.0,0.388
8642,7869516,okasaki,Discouraged by whom?,2014-06-09 17:49:41,0.574,0.0,0.426


In [34]:
df_pos100_form.head(10)

Unnamed: 0,id,by,text,time,neg,pos,neu
1129,8401,domp,Wow thats pretty amazing.,2007-04-03 02:28:35,0.0,0.915,0.085
939,1173817,medianama,Great. Thank You.,2010-03-07 19:15:11,0.0,0.868,0.132
3890,5122473,dlf,Happy to help.,2013-01-27 00:37:19,0.0,0.865,0.135
10554,664342,nav,Thanks. Was fun.,2009-06-18 23:32:32,0.0,0.861,0.139
11066,4058591,daviday,Thanks for sharing!,2012-06-02 17:36:34,0.0,0.857,0.143
8808,5040042,jdunck,"Thanks, Matt. :)",2013-01-10 21:56:25,0.0,0.855,0.145
8975,9921101,arisAlexis,haha cool trolling!!,2015-07-21 08:54:08,0.0,0.855,0.145
6600,7225758,mikeg8,Interesting perspective. Thanks.,2014-02-12 17:33:44,0.0,0.848,0.152
4992,3750903,mikeg8,Interesting point. Thanks.,2012-03-24 22:05:40,0.0,0.848,0.152
5887,8247275,shadytrees,Please cheer up.,2014-08-30 19:04:05,0.0,0.848,0.152


In [35]:
df_user_most_neg_form.head(10)

Unnamed: 0,id,by,text,time,neg,pos,neu
233,9969346,dimino,"Yeah, also the trauma and panic it incites.",2015-07-29 15:42:01,0.459,0.165,0.376
1462,5985229,jmngomes,You just reminded me of the closing words of K...,2013-07-03 15:36:58,0.268,0.0,0.732
8005,100571,cedsav,...says the developer snob.,2008-01-18 22:56:20,0.5,0.0,0.5
8034,5575319,archivator,When fighting the institutions that write thes...,2013-04-19 07:09:10,0.382,0.0,0.618
8969,4335028,jherdman,Yup! You're right. My bad.,2012-08-03 17:23:59,0.487,0.0,0.513
1384,1384411,sreque,No one will argue with you that Perl isn't bet...,2010-05-27 17:48:05,0.233,0.0,0.767
3293,9695679,droidist2,And that insecurity manifests itself as a faca...,2015-06-10 21:39:32,0.408,0.0,0.592
3443,4766641,manaskarekar,This just sounds like desperate clawing of a d...,2012-11-10 13:55:23,0.361,0.139,0.5
8839,8793924,gdb,The site should now be loading — sorry about t...,2014-12-24 18:32:15,0.349,0.0,0.651
8300,5092132,kellysutton,Rails. No.,2013-01-21 16:27:52,0.688,0.0,0.312


In [36]:
df_user_most_pos_form.head(10)

Unnamed: 0,id,by,text,time,neg,pos,neu
3916,9963313,dimino,"To the law? Yes, a huge one.",2015-07-28 17:55:30,0.0,0.556,0.444
5075,8520320,jmngomes,Glad they now have a faster way to make inaccu...,2014-10-28 10:54:54,0.0,0.4,0.6
6912,917665,cedsav,"FWIW, I like the bookshelf metaphor, more huma...",2009-11-02 19:32:22,0.0,0.416,0.584
4503,3973430,archivator,Definitely true on Linux with bash - .* will e...,2012-05-14 21:31:49,0.0,0.432,0.568
4140,1426031,jherdman,Zed loves Fossil SCM: http://www.fossil-scm.or...,2010-06-12 14:21:23,0.0,0.481,0.519
3283,5008318,sreque,That is certainly false hyperbole. There a lot...,2013-01-04 16:31:50,0.019,0.231,0.75
4617,9123354,droidist2,"LOL, brogrammers?",2015-02-28 10:34:46,0.0,0.779,0.221
7499,4443339,manaskarekar,"Thanks, I'll try that out. :)",2012-08-28 12:39:11,0.0,0.596,0.404
3356,4039622,gdb,"Ah, interesting. Yeah, would love a patch!",2012-05-29 20:25:35,0.0,0.758,0.242
5779,4518286,kellysutton,Yes. https://github.com/lyondhill/socket.io-ru...,2012-09-13 20:07:26,0.0,0.73,0.27


In [37]:
# Save to local json

df_final_form.to_json(r'output/tiny_data_100x100.json')
df_random100_form.to_json(r'output/tiny_random100.json')
df_neg100_form.to_json(r'output/tiny_neg100.json')
df_pos100_form.to_json(r'output/tiny_pos100.json') 
df_user_most_neg_form.to_json(r'output/tiny_neg100_per_user.json') 
df_user_most_pos_form.to_json(r'output/tiny_pos100_per_user.json') 

In [38]:
# Save to local csv

df_final_form.to_csv('output/tiny_data_100x100.csv', index=False)
df_random100_form.to_csv('output/tiny_random100.csv', index=False)
df_neg100_form.to_csv('output/tiny_neg100.csv', index=False)
df_pos100_form.to_csv('output/tiny_pos100.csv', index=False)
df_user_most_neg_form.to_csv('output/tiny_neg100_per_user.csv', index=False)
df_user_most_pos_form.to_csv('output/tiny_pos100_per_user.csv', index=False)

In [39]:
# Load csv's to Postgresql 

In [60]:
# CLEAN OUT TABLES (IF NEEDED)
#pg_curs.execute('DROP TABLE tiny_data_100x100')
#pg_curs.execute('DROP TABLE tiny_random100')
#pg_curs.execute('DROP TABLE tiny_neg100')
#pg_curs.execute('DROP TABLE tiny_pos100')
#pg_curs.execute('DROP TABLE tiny_neg100_per_user')
#pg_curs.execute('DROP TABLE tiny_pos100_per_user')

UndefinedTable: table "tiny_data" does not exist


In [58]:
# ROLLBACK (IF NEEDED)
pg_curs.execute("ROLLBACK")

In [50]:
import sqlite3
import psycopg2

In [51]:
# CONFIGURE YOUR OWN USER AND PASSWORD

dbname = 'dfoaqcagkvg30j'
user = ''
password = ''
host = 'ec2-174-129-252-226.compute-1.amazonaws.com'

In [52]:
pg_conn = psycopg2.connect(dbname=dbname,user=user,password=password,host=host)

In [53]:
pg_curs = pg_conn.cursor()

In [63]:
create_tiny_data_100x100 = '''
CREATE TABLE tiny_data_100x100 (
    comment_ID SERIAL PRIMARY KEY,
    author VARCHAR(100),
    comment_Text VARCHAR(10000),
    time DATE,
    neg FLOAT,
    pos FLOAT,
    neu FLOAT
)
'''

pg_curs.execute(create_tiny_data_100x100)

In [64]:
import csv

In [65]:
# Insert tiny_data.csv line by line into table
start = time.time()

with open('output/tiny_data_100x100.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # Skip first row of column names
    for row in reader:
        pg_curs.execute('INSERT INTO tiny_data_100x100 VALUES \
                        (%s, %s, %s, %s, %s, %s, %s)', row)
        
end = time.time()
print(end - start)

103.07400226593018


In [66]:
# NOW DO THE SAME FOR OTHER TABLES...

# THESE SHOULD BE DONW WITH A FUNCTION!!!!!

In [67]:
# Tiny_random100
create_tiny_random100 = '''
CREATE TABLE tiny_random100 (
    comment_ID SERIAL PRIMARY KEY,
    author VARCHAR(100),
    comment_Text VARCHAR(10000),
    time DATE,
    neg FLOAT,
    pos FLOAT,
    neu FLOAT
)
'''

pg_curs.execute(create_tiny_random100)

In [68]:
with open('output/tiny_random100.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # Skip first row of column names
    for row in reader:
        pg_curs.execute('INSERT INTO tiny_random100 VALUES \
                        (%s, %s, %s, %s, %s, %s, %s)', row)

In [69]:
# Tiny_neg100
create_tiny_neg100 = '''
CREATE TABLE tiny_neg100 (
    comment_ID SERIAL PRIMARY KEY,
    author VARCHAR(100),
    comment_Text VARCHAR(10000),
    time DATE,
    neg FLOAT,
    pos FLOAT,
    neu FLOAT
)
'''

pg_curs.execute(create_tiny_neg100)

In [70]:
with open('output/tiny_neg100.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # Skip first row of column names
    for row in reader:
        pg_curs.execute('INSERT INTO tiny_neg100 VALUES \
                        (%s, %s, %s, %s, %s, %s, %s)', row)

In [71]:
# Tiny_pos100
create_tiny_pos100 = '''
CREATE TABLE tiny_pos100 (
    comment_ID SERIAL PRIMARY KEY,
    author VARCHAR(100),
    comment_Text VARCHAR(10000),
    time DATE,
    neg FLOAT,
    pos FLOAT,
    neu FLOAT
)
'''

pg_curs.execute(create_tiny_pos100)

In [72]:
with open('output/tiny_pos100.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # Skip first row of column names
    for row in reader:
        pg_curs.execute('INSERT INTO tiny_pos100 VALUES \
                        (%s, %s, %s, %s, %s, %s, %s)', row)

In [73]:
# Tiny_neg100_per_user
create_tiny_neg100_per_user = '''
CREATE TABLE tiny_neg100_per_user (
    comment_ID SERIAL PRIMARY KEY,
    author VARCHAR(100),
    comment_Text VARCHAR(10000),
    time DATE,
    neg FLOAT,
    pos FLOAT,
    neu FLOAT
)
'''

pg_curs.execute(create_tiny_neg100_per_user)

In [74]:
with open('output/tiny_neg100_per_user.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # Skip first row of column names
    for row in reader:
        pg_curs.execute('INSERT INTO tiny_neg100_per_user VALUES \
                        (%s, %s, %s, %s, %s, %s, %s)', row)

In [75]:
# Tiny_pos100_per_user
create_tiny_pos100_per_user = '''
CREATE TABLE tiny_pos100_per_user (
    comment_ID SERIAL PRIMARY KEY,
    author VARCHAR(100),
    comment_Text VARCHAR(10000),
    time DATE,
    neg FLOAT,
    pos FLOAT,
    neu FLOAT
)
'''

pg_curs.execute(create_tiny_pos100_per_user)

In [76]:
with open('output/tiny_pos100_per_user.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # Skip first row of column names
    for row in reader:
        pg_curs.execute('INSERT INTO tiny_pos100_per_user VALUES \
                        (%s, %s, %s, %s, %s, %s, %s)', row)