In [1]:
# Register the Google Cloud JSON credential file in the process environment.
# NOTE(review): the path is relative, so it is resolved against the notebook's working directory.
import os

os.environ.update({'GOOGLE_APPLICATION_CREDENTIALS': 'my_key_file.json'})

In [2]:
from google.cloud import bigquery
import time
import pandas as pd
import numpy as np

In [3]:
# Set up the BigQuery client. No arguments are passed, so credentials are
# presumably picked up from GOOGLE_APPLICATION_CREDENTIALS set in the first cell.
bq_client = bigquery.Client() 

In [4]:
# SQL QUERY: FETCH EVERY ITEM WRITTEN BY 100 USERS (RANKS 10,301-10,400) OF THE USER TABLE SORTED BY ITEM COUNT,
# I.E. THE 100 LEAST ACTIVE OF THE TOP 10,400 USERS (ABOUT 180 ITEMS EACH IN TOTAL)

QUERY = '''
        SELECT E.*, C.comments_count
        FROM `bigquery-public-data.hacker_news.full_201510` as E
        JOIN(
            SELECT *
            FROM(
                SELECT *
                FROM(
                    SELECT  `bigquery-public-data.hacker_news.full_201510`.by, COUNT(*) as comments_count
                    FROM `bigquery-public-data.hacker_news.full_201510`
                    GROUP BY `bigquery-public-data.hacker_news.full_201510`.by
                )  
                ORDER BY comments_count DESC
                LIMIT 10400
            )
            ORDER BY comments_count 
            LIMIT 100            
        ) C 
        ON E.by = C.by
        
        '''

In [5]:
import time

# Time the round trip: submit the query, then pull the result into a DataFrame
start = time.time()
query_job = bq_client.query(QUERY)
df = query_job.to_dataframe()
end = time.time()
print(end - start)

5.598115682601929


In [6]:
df.shape

(18193, 14)

In [7]:
# Distinct authors in the raw query result
users = df['by'].unique()
len(users)

100

In [8]:
df.head()

Unnamed: 0,by,score,time,title,type,url,text,parent,deleted,dead,descendants,id,ranking,comments_count
0,frade33,,1398021228,,comment,,I rather gave an example of chimp which is not...,7616693.0,,,,7618101,,182
1,reginaldo,,1359990428,,comment,,The article is talking about <i>temperate</i> ...,5164285.0,,,,5164420,,181
2,robbles,,1360023062,,comment,,"Really handy tool, thanks! I second the sugges...",5166346.0,,,,5168163,,183
3,vlucas,,1307804237,,comment,,My overall reaction after reading this piece i...,2643568.0,,,,2644040,,182
4,jbenz,,1252435760,,comment,,"Skins! Yes, it was actually in the original g...",811575.0,,,,811603,,181


In [9]:
df['by'].value_counts().head(10)

adambenayoun     183
NY_USA_Hacker    183
imd23            183
robodale         183
michaelkeenan    183
yaeger           183
akhilcacharya    183
daralthus        183
dvdhsu           183
robbles          183
Name: by, dtype: int64

In [10]:
# DATA WRANGLING

# Drop unnecessary columns
df_drop = df.drop(columns=['score', 'title', 'url', 'deleted', 'dead', 'descendants', 'ranking'])

# Keep only comments in the dataframe.
# .copy() makes df_drop an independent frame, so the column assignments below
# do not trigger pandas' SettingWithCopyWarning.
df_drop = df_drop[df_drop['type'] == 'comment'].copy()

# Remove html tags from string
import re
def remove_html_tags(text):
    """Return `text` with every `<...>` span removed.

    The match is non-greedy and `.` does not cross newlines, so a tag that is
    split over two lines is left in place.
    """
    return re.sub('<.*?>', '', text)

# Clean the comment text
# NOTE(review): str() turns a missing text value into the literal 'None'/'nan' - confirm that is intended.
import html
df_drop['text'] = df_drop['text'].apply(str)
# Strip markup BEFORE decoding entities. In the opposite order an escaped
# "&lt; ... &gt;" typed by the author is decoded to "< ... >" and then deleted
# as if it were a tag, silently removing part of the comment.
df_drop['text'] = df_drop['text'].apply(remove_html_tags)
# Fix weird text encodings (HTML entities such as &#x27; or &gt;)
df_drop['text'] = df_drop['text'].apply(html.unescape)


# Convert unix time to datetime object with date
from datetime import datetime
df_drop['time'] = pd.to_datetime(df_drop['time'], unit='s')  # for accuracy secs
#df_drop['time']=pd.to_datetime(df_drop['time']).dt.date # for just date

In [11]:
# Check only analyzing comments
df_drop['type'].value_counts()

comment    14844
Name: type, dtype: int64

In [12]:
# Distinct authors left after wrangling; `users` is what the per-user loop further down iterates over
users = df_drop['by'].unique()
print(len(users))

100


In [13]:
df_drop.head()

Unnamed: 0,by,time,type,text,parent,id,comments_count
0,frade33,2014-04-20 19:13:48,comment,I rather gave an example of chimp which is not...,7616693.0,7618101,182
1,reginaldo,2013-02-04 15:07:08,comment,The article is talking about temperate climate...,5164285.0,5164420,181
2,robbles,2013-02-05 00:11:02,comment,"Really handy tool, thanks! I second the sugges...",5166346.0,5168163,183
3,vlucas,2011-06-11 14:57:17,comment,My overall reaction after reading this piece i...,2643568.0,2644040,182
4,jbenz,2009-09-08 18:49:20,comment,"Skins! Yes, it was actually in the original g...",811575.0,811603,181


In [14]:
# Create the VADER analyzer whose polarity_scores() is used below to fill the sentiment columns
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

In [15]:
# Pre-create the three sentiment columns, each filled with 0.0
n_rows = df_drop.shape[0]
for score_col in ('neg', 'pos', 'neu'):
    df_drop[score_col] = np.zeros(n_rows)

In [16]:
# OLD METHOD

# Adding sentiments to dataframe is computationally intensive
# On Martin's local computer 12,811 comments took ~24 min 

#import time
#start = time.time()

# populate vader sentiments in additional cols
#for idx,x in df_drop['text'].iteritems():
#    #print (idx,x)
    
#    df_drop['neg'][idx] = sia.polarity_scores(x)['neg']
#    df_drop['pos'][idx] = sia.polarity_scores(x)['pos']
#    df_drop['neu'][idx] = sia.polarity_scores(x)['neu']    
#end = time.time()
#print(end - start)

In [17]:
# NEW METHOD

# Series.apply with whole-column assignment handled 12,811 comments in ~30s,
# against ~24 min for the element-by-element assignment of the old method.
# Each comment is scored ONCE and all three components are read from that one
# result; calling sia.polarity_scores separately per column tripled the work.

now = time.time()
scores = pd.DataFrame(
    df_drop['text'].apply(sia.polarity_scores).tolist(),  # one dict per comment
    index=df_drop.index,            # keep row alignment with df_drop
    columns=['neg', 'pos', 'neu'],  # keep only the components stored in the frame
)
for col in ('neg', 'pos', 'neu'):
    df_drop[col] = scores[col]
print('sentiment cal took:', round(time.time() - now, 2), 's')

sentiment cal took: 47.89 s


In [18]:
df_drop.head()

Unnamed: 0,by,time,type,text,parent,id,comments_count,neg,pos,neu
0,frade33,2014-04-20 19:13:48,comment,I rather gave an example of chimp which is not...,7616693.0,7618101,182,0.021,0.151,0.829
1,reginaldo,2013-02-04 15:07:08,comment,The article is talking about temperate climate...,5164285.0,5164420,181,0.0,0.0,1.0
2,robbles,2013-02-05 00:11:02,comment,"Really handy tool, thanks! I second the sugges...",5166346.0,5168163,183,0.0,0.178,0.822
3,vlucas,2011-06-11 14:57:17,comment,My overall reaction after reading this piece i...,2643568.0,2644040,182,0.088,0.058,0.854
4,jbenz,2009-09-08 18:49:20,comment,"Skins! Yes, it was actually in the original g...",811575.0,811603,181,0.0,0.201,0.799


In [19]:
# Shrink the dataframe to 100 comments per user, where comment is not a pure neg or pos so that
# comment displayed is somewhat interesting and not just one word.

In [20]:
# Build three frames from df_drop:
#   df_final         - up to 100 non-"pure" comments per user, grouped in `users` order
#   df_user_most_neg - per user, the comment with the highest 'neg' score among those
#   df_user_most_pos - per user, the comment with the highest 'pos' score among those
# Pieces are collected in lists and concatenated once at the end: DataFrame.append
# was removed in pandas 2.0 and re-copied the growing frame on every iteration.
final_parts = []
most_neg_parts = []
most_pos_parts = []

for user in users:
    # Create a dataframe of only one user
    df_user = df_drop[df_drop['by'] == user]

    # Remove pure sentiments (removes pure one-word comments)
    df_user_unpure = df_user[(df_user['neg'] != 1) & (df_user['pos'] != 1)]

    # Keep the first 100 comments of the unpure sentiments
    df_temp = df_user_unpure[:100]
    if df_temp.empty:
        # Nothing left for this user; idxmax() would raise on an empty Series
        continue
    final_parts.append(df_temp)

    # Max unpure neg/pos comment per user, kept as one-row frames so dtypes survive
    ix_neg = df_temp['neg'].idxmax()
    most_neg_parts.append(df_temp.loc[[ix_neg]])
    ix_pos = df_temp['pos'].idxmax()
    most_pos_parts.append(df_temp.loc[[ix_pos]])

# Fall back to an empty frame with the right columns when no user contributed rows
empty_frame = pd.DataFrame(data=None, columns=df_drop.columns)
df_final = pd.concat(final_parts) if final_parts else empty_frame.copy()
df_user_most_neg = pd.concat(most_neg_parts) if most_neg_parts else empty_frame.copy()
df_user_most_pos = pd.concat(most_pos_parts) if most_pos_parts else empty_frame.copy()

df_final.shape

(9305, 10)

In [21]:
# Same expression as the earlier `users` cell; re-evaluated here to re-check the author count
users = df_drop['by'].unique()
len(users)

100

In [22]:
df_final.head(10)

Unnamed: 0,by,time,type,text,parent,id,comments_count,neg,pos,neu
0,frade33,2014-04-20 19:13:48,comment,I rather gave an example of chimp which is not...,7616693.0,7618101,182,0.021,0.151,0.829
261,frade33,2015-05-16 19:33:54,comment,"couple of years back, adwords suspended my acc...",9556908.0,9557320,182,0.113,0.118,0.769
277,frade33,2014-04-25 21:53:43,comment,>That is when the money will really be earned....,7649224.0,7649245,182,0.043,0.201,0.757
306,frade33,2014-03-29 18:51:46,comment,> but they would never dieChallenge Accepted. ...,7493231.0,7493434,182,0.294,0.086,0.62
596,frade33,2014-03-28 02:11:50,comment,I do not recall being addictive to any iOS gam...,7484106.0,7484887,182,0.0,0.0,1.0
744,frade33,2014-04-17 22:13:40,comment,"it's 3am, here. up from 10am. so i'd request p...",7606727.0,7606814,182,0.075,0.061,0.865
767,frade33,2014-02-15 17:26:38,comment,Social networking overall has run its course. ...,7243851.0,7244430,182,0.061,0.096,0.843
984,frade33,2014-03-21 12:15:09,comment,"@sauuki, Thanks for the in depth reply, and th...",7439991.0,7441776,182,0.0,0.284,0.716
1344,frade33,2014-03-25 18:36:47,comment,"this is where, Google should keep focusing tha...",7467165.0,7467833,182,0.025,0.311,0.663
1675,frade33,2015-01-21 22:01:31,comment,Computer. Tv. Phone. Camera. Radio. \nInternet...,8926036.0,8926071,182,0.0,0.0,1.0


In [23]:
# Random comments from the 100 users.
# random_state pins the draw so re-running the notebook yields the same sample;
# min() keeps sample() from raising when df_final holds fewer than 100 rows.
df_random100 = df_final.sample(min(100, len(df_final)), random_state=42)
df_random100.head(10)

Unnamed: 0,by,time,type,text,parent,id,comments_count,neg,pos,neu
3559,0x006A,2014-11-21 09:57:38,comment,at best he might speak for some sysadmins. His...,8639739.0,8640568,181,0.159,0.14,0.701
7227,vlokshin,2013-03-25 18:14:52,comment,"You CAN stop, it's just very difficult to hold...",5438594.0,5438614,181,0.22,0.214,0.566
2264,robocat,2013-04-18 23:11:03,comment,I too hate the disabling of pinch-to-zoom on w...,5570348.0,5573761,183,0.099,0.054,0.847
8117,jessep,2009-08-28 21:58:49,comment,"Nice work. I just put two of our sites, getmir...",792313.0,792372,182,0.0,0.128,0.872
10037,jan_g,2012-11-14 06:49:36,comment,One particular action won't demand that kind o...,4782054.0,4782316,183,0.0,0.11,0.89
120,hatu,2012-08-19 10:28:33,comment,"I hate that hacks have become ""ninja"" or ""clev...",4402207.0,4403722,182,0.179,0.0,0.821
969,jiaaro,2012-11-26 22:55:09,comment,"I use safari, and I use the ""two-finger double...",4834389.0,4834481,182,0.054,0.0,0.946
4487,ankhmoop,2009-06-26 19:02:30,comment,"If that's the case, then the market is unsusta...",675775.0,675899,181,0.157,0.077,0.767
7346,rcxdude,2014-02-12 12:43:34,comment,I believe the main reason it wasn't really con...,7222770.0,7223943,183,0.0,0.0,1.0
215,jiaaro,2013-08-01 18:32:13,comment,"If I understand correctly, iPhone and iPad run...",6140333.0,6142375,182,0.0,0.0,1.0


In [24]:
# The 100 comments with the highest 'neg' score
neg_sorted = df_final.sort_values(by='neg', ascending=False)
df_neg100 = neg_sorted.head(100)
df_neg100.head(10)

Unnamed: 0,by,time,type,text,parent,id,comments_count,neg,pos,neu
2202,vlucas,2014-03-26 18:33:06,comment,April fools!,7475278.0,7475350,182,0.777,0.0,0.223
1300,dan_bk,2014-03-19 20:41:37,comment,Simply disgusting.,7430877.0,7431713,181,0.773,0.0,0.227
1488,scottmagdalein,2013-07-05 02:21:13,comment,Racist? C'mon dude.,5993071.0,5993209,181,0.667,0.0,0.333
9630,adambenayoun,2012-09-21 16:40:40,comment,Right - my bad - missed that,4554255.0,4554424,183,0.655,0.0,0.345
7466,rcxdude,2014-12-27 19:48:46,comment,Most projects fail.,8802025.0,8803685,183,0.654,0.0,0.346
7805,jdorfman,2013-08-19 21:41:09,comment,word my bad,6240017.0,6240084,182,0.636,0.0,0.364
6935,ankhmoop,2009-04-05 21:32:04,comment,A false dilemma.,547417.0,547775,181,0.63,0.0,0.37
6758,jusob,2013-09-02 18:33:06,comment,A guide for spamming!,6316214.0,6316434,183,0.629,0.0,0.371
2392,_asummers,2015-07-27 04:50:05,comment,"Whoah, that's frightening.",9953397.0,9953402,182,0.615,0.0,0.385
3496,chrisdone,2015-02-15 23:10:14,comment,"Oh, that is evil.",9054521.0,9054574,182,0.595,0.0,0.405


In [25]:
# The 100 comments with the highest 'pos' score
pos_sorted = df_final.sort_values(by='pos', ascending=False)
df_pos100 = pos_sorted.head(100)
df_pos100.head(10)

Unnamed: 0,by,time,type,text,parent,id,comments_count,neg,pos,neu
7562,earnubs,2010-10-09 22:04:17,comment,"Wow, I love this.http://markup.io/v/d3tjg1xn1eqx",1775990.0,1776009,182,0.0,0.889,0.111
12286,robbiea,2012-12-27 20:32:57,comment,"great, thank you.",4974904.0,4974919,182,0.0,0.868,0.132
253,imd23,2013-11-06 11:18:53,comment,Totally agree. :),6682094.0,6682100,183,0.0,0.859,0.141
9586,hastur,2012-01-22 13:30:05,comment,Very cool. :),3496485.0,3496517,183,0.0,0.854,0.146
12878,jdorfman,2013-03-13 01:25:52,comment,all good <3,5366188.0,5366203,182,0.0,0.853,0.147
1960,redact207,2013-09-23 12:52:40,comment,"certainly did, thanks",6430450.0,6430455,182,0.0,0.841,0.159
373,jeffclark,2010-12-31 19:58:04,comment,GREAT idea!,2056274.0,2056287,182,0.0,0.837,0.163
8385,tsax,2014-03-10 14:45:48,comment,Awesome thanks! It is useful,7367524.0,7373164,181,0.0,0.836,0.164
657,jdorfman,2014-04-04 15:33:59,comment,fucking awesome,7531140.0,7532027,182,0.0,0.815,0.185
5036,scottmagdalein,2013-02-25 18:50:59,comment,Love it.,5280483.0,5280836,181,0.0,0.808,0.192


In [26]:
# Single most negative comment for each user
df_user_most_neg.head(10)

Unnamed: 0,by,time,type,text,parent,id,comments_count,neg,pos,neu
306,frade33,2014-03-29 18:51:46,comment,> but they would never dieChallenge Accepted. ...,7493231.0,7493434,182,0.294,0.086,0.62
7938,reginaldo,2012-05-15 18:23:32,comment,People are saying lots of things about the dea...,3977441.0,3977971,181,0.194,0.092,0.714
90,robbles,2014-04-29 07:20:36,comment,It would still be vulnerable to a CSRF attack....,7665359.0,7665482,183,0.308,0.0,0.692
2202,vlucas,2014-03-26 18:33:06,comment,April fools!,7475278.0,7475350,182,0.777,0.0,0.223
1216,jbenz,2015-08-21 18:24:58,comment,"If you're choosing between ""should I commit a ...",10099182.0,10099331,181,0.193,0.149,0.659
2819,eikenberry,2013-07-12 18:57:43,comment,Not to argue that gender bias doesn't exist. B...,6033567.0,6034423,181,0.249,0.108,0.643
7805,jdorfman,2013-08-19 21:41:09,comment,word my bad,6240017.0,6240084,182,0.636,0.0,0.364
2233,daralthus,2011-06-26 11:58:41,comment,I used to but when I get these problems I real...,2697054.0,2697866,183,0.392,0.0,0.608
3505,blored,2007-10-01 20:14:13,comment,We're a bitch in IE though.,61718.0,61723,181,0.487,0.0,0.513
3330,robbiea,2012-07-02 16:51:38,comment,sorry guys. horrible host... all my fault.. mo...,4189376.0,4189637,182,0.286,0.0,0.714


In [27]:
# Single most positive comment for each user
df_user_most_pos.head(10)

Unnamed: 0,by,time,type,text,parent,id,comments_count,neg,pos,neu
10465,frade33,2014-03-25 18:52:07,comment,Yes indeed.,7467067.0,7467989,182,0.0,0.73,0.27
5246,reginaldo,2014-01-23 13:21:54,comment,You're actually pretty close.,7107238.0,7107947,181,0.0,0.516,0.484
577,robbles,2015-01-17 23:48:39,comment,Looks like it's www.faceyourmanga.com.,8905541.0,8906022,183,0.0,0.455,0.545
6665,vlucas,2010-09-15 18:59:44,comment,Cool. Looks like http://keynot.es is doing the...,1649260.0,1695429,182,0.0,0.463,0.537
8834,jbenz,2009-09-08 19:10:00,comment,"Yes, this would be a terrific feature. We'll ...",811490.0,811657,181,0.0,0.473,0.527
240,eikenberry,2013-02-10 09:25:00,comment,> Oracle Corporation decided to sacrifice the ...,5192617.0,5195906,181,0.0,0.534,0.466
12878,jdorfman,2013-03-13 01:25:52,comment,all good <3,5366188.0,5366203,182,0.0,0.853,0.147
8983,daralthus,2012-11-21 19:08:23,comment,This is a pretty good one: http://ocdevel.com/...,4815089.0,4815751,183,0.0,0.604,0.396
12475,blored,2008-03-18 19:43:52,comment,Congrats guys!,140025.0,140105,181,0.0,0.787,0.213
12286,robbiea,2012-12-27 20:32:57,comment,"great, thank you.",4974904.0,4974919,182,0.0,0.868,0.132


In [28]:
# Clean up columns and columns order to match desired PostgreSQL output

In [29]:
# Column names of the final frame as a plain Python list
cols = list(df_final.columns)
cols

['by',
 'time',
 'type',
 'text',
 'parent',
 'id',
 'comments_count',
 'neg',
 'pos',
 'neu']

In [30]:
# Column order used for the exported files and PostgreSQL tables:
# id, author, text, time, then the three sentiment scores.
# The columns are named explicitly instead of being sliced out of `cols` by
# position, so a change in the upstream column order cannot silently select
# the wrong columns.
cols_final = ['id', 'by', 'text', 'time', 'neg', 'pos', 'neu']
cols_final

['id', 'by', 'text', 'time', 'neg', 'pos', 'neu']

In [31]:
# Project every result frame onto the export column order (cols_final)
(df_final_form,
 df_random100_form,
 df_neg100_form,
 df_pos100_form,
 df_user_most_neg_form,
 df_user_most_pos_form) = (
    frame[cols_final]
    for frame in (df_final, df_random100, df_neg100, df_pos100,
                  df_user_most_neg, df_user_most_pos)
)


In [32]:
df_final_form.head(10)

Unnamed: 0,id,by,text,time,neg,pos,neu
0,7618101,frade33,I rather gave an example of chimp which is not...,2014-04-20 19:13:48,0.021,0.151,0.829
261,9557320,frade33,"couple of years back, adwords suspended my acc...",2015-05-16 19:33:54,0.113,0.118,0.769
277,7649245,frade33,>That is when the money will really be earned....,2014-04-25 21:53:43,0.043,0.201,0.757
306,7493434,frade33,> but they would never dieChallenge Accepted. ...,2014-03-29 18:51:46,0.294,0.086,0.62
596,7484887,frade33,I do not recall being addictive to any iOS gam...,2014-03-28 02:11:50,0.0,0.0,1.0
744,7606814,frade33,"it's 3am, here. up from 10am. so i'd request p...",2014-04-17 22:13:40,0.075,0.061,0.865
767,7244430,frade33,Social networking overall has run its course. ...,2014-02-15 17:26:38,0.061,0.096,0.843
984,7441776,frade33,"@sauuki, Thanks for the in depth reply, and th...",2014-03-21 12:15:09,0.0,0.284,0.716
1344,7467833,frade33,"this is where, Google should keep focusing tha...",2014-03-25 18:36:47,0.025,0.311,0.663
1675,8926071,frade33,Computer. Tv. Phone. Camera. Radio. \nInternet...,2015-01-21 22:01:31,0.0,0.0,1.0


In [33]:
df_random100_form.head(10)

Unnamed: 0,id,by,text,time,neg,pos,neu
3559,8640568,0x006A,at best he might speak for some sysadmins. His...,2014-11-21 09:57:38,0.159,0.14,0.701
7227,5438614,vlokshin,"You CAN stop, it's just very difficult to hold...",2013-03-25 18:14:52,0.22,0.214,0.566
2264,5573761,robocat,I too hate the disabling of pinch-to-zoom on w...,2013-04-18 23:11:03,0.099,0.054,0.847
8117,792372,jessep,"Nice work. I just put two of our sites, getmir...",2009-08-28 21:58:49,0.0,0.128,0.872
10037,4782316,jan_g,One particular action won't demand that kind o...,2012-11-14 06:49:36,0.0,0.11,0.89
120,4403722,hatu,"I hate that hacks have become ""ninja"" or ""clev...",2012-08-19 10:28:33,0.179,0.0,0.821
969,4834481,jiaaro,"I use safari, and I use the ""two-finger double...",2012-11-26 22:55:09,0.054,0.0,0.946
4487,675899,ankhmoop,"If that's the case, then the market is unsusta...",2009-06-26 19:02:30,0.157,0.077,0.767
7346,7223943,rcxdude,I believe the main reason it wasn't really con...,2014-02-12 12:43:34,0.0,0.0,1.0
215,6142375,jiaaro,"If I understand correctly, iPhone and iPad run...",2013-08-01 18:32:13,0.0,0.0,1.0


In [34]:
df_neg100_form.head(10)

Unnamed: 0,id,by,text,time,neg,pos,neu
2202,7475350,vlucas,April fools!,2014-03-26 18:33:06,0.777,0.0,0.223
1300,7431713,dan_bk,Simply disgusting.,2014-03-19 20:41:37,0.773,0.0,0.227
1488,5993209,scottmagdalein,Racist? C'mon dude.,2013-07-05 02:21:13,0.667,0.0,0.333
9630,4554424,adambenayoun,Right - my bad - missed that,2012-09-21 16:40:40,0.655,0.0,0.345
7466,8803685,rcxdude,Most projects fail.,2014-12-27 19:48:46,0.654,0.0,0.346
7805,6240084,jdorfman,word my bad,2013-08-19 21:41:09,0.636,0.0,0.364
6935,547775,ankhmoop,A false dilemma.,2009-04-05 21:32:04,0.63,0.0,0.37
6758,6316434,jusob,A guide for spamming!,2013-09-02 18:33:06,0.629,0.0,0.371
2392,9953402,_asummers,"Whoah, that's frightening.",2015-07-27 04:50:05,0.615,0.0,0.385
3496,9054574,chrisdone,"Oh, that is evil.",2015-02-15 23:10:14,0.595,0.0,0.405


In [35]:
df_pos100_form.head(10)

Unnamed: 0,id,by,text,time,neg,pos,neu
7562,1776009,earnubs,"Wow, I love this.http://markup.io/v/d3tjg1xn1eqx",2010-10-09 22:04:17,0.0,0.889,0.111
12286,4974919,robbiea,"great, thank you.",2012-12-27 20:32:57,0.0,0.868,0.132
253,6682100,imd23,Totally agree. :),2013-11-06 11:18:53,0.0,0.859,0.141
9586,3496517,hastur,Very cool. :),2012-01-22 13:30:05,0.0,0.854,0.146
12878,5366203,jdorfman,all good <3,2013-03-13 01:25:52,0.0,0.853,0.147
1960,6430455,redact207,"certainly did, thanks",2013-09-23 12:52:40,0.0,0.841,0.159
373,2056287,jeffclark,GREAT idea!,2010-12-31 19:58:04,0.0,0.837,0.163
8385,7373164,tsax,Awesome thanks! It is useful,2014-03-10 14:45:48,0.0,0.836,0.164
657,7532027,jdorfman,fucking awesome,2014-04-04 15:33:59,0.0,0.815,0.185
5036,5280836,scottmagdalein,Love it.,2013-02-25 18:50:59,0.0,0.808,0.192


In [37]:
df_user_most_neg_form.head(10)

Unnamed: 0,id,by,text,time,neg,pos,neu
306,7493434,frade33,> but they would never dieChallenge Accepted. ...,2014-03-29 18:51:46,0.294,0.086,0.62
7938,3977971,reginaldo,People are saying lots of things about the dea...,2012-05-15 18:23:32,0.194,0.092,0.714
90,7665482,robbles,It would still be vulnerable to a CSRF attack....,2014-04-29 07:20:36,0.308,0.0,0.692
2202,7475350,vlucas,April fools!,2014-03-26 18:33:06,0.777,0.0,0.223
1216,10099331,jbenz,"If you're choosing between ""should I commit a ...",2015-08-21 18:24:58,0.193,0.149,0.659
2819,6034423,eikenberry,Not to argue that gender bias doesn't exist. B...,2013-07-12 18:57:43,0.249,0.108,0.643
7805,6240084,jdorfman,word my bad,2013-08-19 21:41:09,0.636,0.0,0.364
2233,2697866,daralthus,I used to but when I get these problems I real...,2011-06-26 11:58:41,0.392,0.0,0.608
3505,61723,blored,We're a bitch in IE though.,2007-10-01 20:14:13,0.487,0.0,0.513
3330,4189637,robbiea,sorry guys. horrible host... all my fault.. mo...,2012-07-02 16:51:38,0.286,0.0,0.714


In [36]:
df_user_most_pos_form.head(10)

Unnamed: 0,id,by,text,time,neg,pos,neu
10465,7467989,frade33,Yes indeed.,2014-03-25 18:52:07,0.0,0.73,0.27
5246,7107947,reginaldo,You're actually pretty close.,2014-01-23 13:21:54,0.0,0.516,0.484
577,8906022,robbles,Looks like it's www.faceyourmanga.com.,2015-01-17 23:48:39,0.0,0.455,0.545
6665,1695429,vlucas,Cool. Looks like http://keynot.es is doing the...,2010-09-15 18:59:44,0.0,0.463,0.537
8834,811657,jbenz,"Yes, this would be a terrific feature. We'll ...",2009-09-08 19:10:00,0.0,0.473,0.527
240,5195906,eikenberry,> Oracle Corporation decided to sacrifice the ...,2013-02-10 09:25:00,0.0,0.534,0.466
12878,5366203,jdorfman,all good <3,2013-03-13 01:25:52,0.0,0.853,0.147
8983,4815751,daralthus,This is a pretty good one: http://ocdevel.com/...,2012-11-21 19:08:23,0.0,0.604,0.396
12475,140105,blored,Congrats guys!,2008-03-18 19:43:52,0.0,0.787,0.213
12286,4974919,robbiea,"great, thank you.",2012-12-27 20:32:57,0.0,0.868,0.132


In [40]:
# Save to local json

# Create the target folder first: to_json does not create missing directories,
# so a fresh checkout without output/ would fail on the first write.
os.makedirs('output', exist_ok=True)

df_final_form.to_json(r'output/tiny_data_100x100.json')
df_random100_form.to_json(r'output/tiny_random100.json')
df_neg100_form.to_json(r'output/tiny_neg100.json')
df_pos100_form.to_json(r'output/tiny_pos100.json')
df_user_most_neg_form.to_json(r'output/tiny_neg100_per_user.json')
df_user_most_pos_form.to_json(r'output/tiny_pos100_per_user.json')

In [41]:
# Save to local csv (index column omitted, header row kept)

csv_exports = [
    (df_final_form, 'output/tiny_data_100x100.csv'),
    (df_random100_form, 'output/tiny_random100.csv'),
    (df_neg100_form, 'output/tiny_neg100.csv'),
    (df_pos100_form, 'output/tiny_pos100.csv'),
    (df_user_most_neg_form, 'output/tiny_neg100_per_user.csv'),
    (df_user_most_pos_form, 'output/tiny_pos100_per_user.csv'),
]
for frame, csv_path in csv_exports:
    frame.to_csv(csv_path, index=False)

In [42]:
# Load csv's to Postgresql 

In [58]:
# CLEAN OUT TABLES (IF NEEDED)
#pg_curs.execute('DROP TABLE tiny_data_100x100')
#pg_curs.execute('DROP TABLE tiny_random100')
#pg_curs.execute('DROP TABLE tiny_neg100')
#pg_curs.execute('DROP TABLE tiny_pos100')
#pg_curs.execute('DROP TABLE tiny_neg100_per_user')
#pg_curs.execute('DROP TABLE tiny_pos100_per_user')

In [53]:
# ROLLBACK (IF NEEDED)
# Sends ROLLBACK through the cursor to abandon the connection's current transaction.
# NOTE: pg_curs is only created in the cursor cell further down (In [46]); this cell
# cannot run on a fresh kernel before that one has been executed.
pg_curs.execute("ROLLBACK")

In [43]:
import sqlite3
import psycopg2

In [44]:
# CONFIGURE YOUR OWN USER AND PASSWORD
#
# Each setting is taken from the corresponding libpq-style environment variable
# when it is set, so credentials never have to be typed into (and saved with)
# the notebook. The fallbacks are the values that used to be hard-coded here.

dbname = os.environ.get('PGDATABASE', 'dfoaqcagkvg30j')
user = os.environ.get('PGUSER', '')
password = os.environ.get('PGPASSWORD', '')
host = os.environ.get('PGHOST', 'ec2-174-129-252-226.compute-1.amazonaws.com')

In [45]:
# Open the PostgreSQL connection with the settings from the configuration cell
pg_conn = psycopg2.connect(dbname=dbname,user=user,password=password,host=host)

In [46]:
# Cursor through which the SQL statements in the following cells are executed
pg_curs = pg_conn.cursor()

In [54]:
# Table for the full per-user sample. Its seven columns follow the order of the
# exported CSV columns (id, by, text, time, neg, pos, neu).
# NOTE(review): `time` is DATE, so the time-of-day of the exported timestamps is
# presumably dropped on insert - confirm DATE (not TIMESTAMP) is intended.
# NOTE(review): comment_ID is SERIAL although the CSV carries an explicit id for
# every row - confirm an auto-incrementing key is wanted.
create_tiny_data_100x100 = '''
CREATE TABLE tiny_data_100x100 (
    comment_ID SERIAL PRIMARY KEY,
    author VARCHAR(100),
    comment_Text VARCHAR(10000),
    time DATE,
    neg FLOAT,
    pos FLOAT,
    neu FLOAT
)
'''

pg_curs.execute(create_tiny_data_100x100)

In [55]:
import csv

In [56]:
# Insert tiny_data_100x100.csv line by line into table tiny_data_100x100
# NOTE(review): no pg_conn.commit() appears in this part of the notebook -
# confirm the inserted rows are committed somewhere.
start = time.time()

with open('output/tiny_data_100x100.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # Skip first row of column names
    for row in reader:
        # The table created in this notebook is tiny_data_100x100; the statement
        # used to name tiny_data, which no cell here creates.
        pg_curs.execute('INSERT INTO tiny_data_100x100 VALUES \
                        (%s, %s, %s, %s, %s, %s, %s)', row)

end = time.time()
print(end - start)

In [None]:
# NOW DO THE SAME FOR OTHER TABLES...

# THESE SHOULD BE DONE WITH A FUNCTION!!!!!

In [59]:
# Tiny_random100
# Table for the random 100-comment sample; same seven-column layout as the
# exported CSV (id, by, text, time, neg, pos, neu).
# NOTE(review): `time` is DATE - confirm that dropping the time-of-day is intended.
create_tiny_random100 = '''
CREATE TABLE tiny_random100 (
    comment_ID SERIAL PRIMARY KEY,
    author VARCHAR(100),
    comment_Text VARCHAR(10000),
    time DATE,
    neg FLOAT,
    pos FLOAT,
    neu FLOAT
)
'''

pg_curs.execute(create_tiny_random100)

In [60]:
# Load output/tiny_random100.csv into tiny_random100, one INSERT per data row
insert_row_sql = 'INSERT INTO tiny_random100 VALUES \
                        (%s, %s, %s, %s, %s, %s, %s)'
with open('output/tiny_random100.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # header line holds the column names, not data
    pg_curs.executemany(insert_row_sql, reader)

In [61]:
# Tiny_neg100
# Table for the 100 most negative comments; same seven-column layout as the
# exported CSV (id, by, text, time, neg, pos, neu).
# NOTE(review): `time` is DATE - confirm that dropping the time-of-day is intended.
create_tiny_neg100 = '''
CREATE TABLE tiny_neg100 (
    comment_ID SERIAL PRIMARY KEY,
    author VARCHAR(100),
    comment_Text VARCHAR(10000),
    time DATE,
    neg FLOAT,
    pos FLOAT,
    neu FLOAT
)
'''

pg_curs.execute(create_tiny_neg100)

In [62]:
# Load output/tiny_neg100.csv into tiny_neg100, one INSERT per data row
insert_row_sql = 'INSERT INTO tiny_neg100 VALUES \
                        (%s, %s, %s, %s, %s, %s, %s)'
with open('output/tiny_neg100.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # header line holds the column names, not data
    pg_curs.executemany(insert_row_sql, reader)

In [63]:
# Tiny_pos100
# Table for the 100 most positive comments; same seven-column layout as the
# exported CSV (id, by, text, time, neg, pos, neu).
# NOTE(review): `time` is DATE - confirm that dropping the time-of-day is intended.
create_tiny_pos100 = '''
CREATE TABLE tiny_pos100 (
    comment_ID SERIAL PRIMARY KEY,
    author VARCHAR(100),
    comment_Text VARCHAR(10000),
    time DATE,
    neg FLOAT,
    pos FLOAT,
    neu FLOAT
)
'''

pg_curs.execute(create_tiny_pos100)

In [64]:
# Load output/tiny_pos100.csv into tiny_pos100, one INSERT per data row
insert_row_sql = 'INSERT INTO tiny_pos100 VALUES \
                        (%s, %s, %s, %s, %s, %s, %s)'
with open('output/tiny_pos100.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # header line holds the column names, not data
    pg_curs.executemany(insert_row_sql, reader)

In [65]:
# Tiny_neg100_per_user
# Table for each user's single most negative comment; same seven-column layout
# as the exported CSV (id, by, text, time, neg, pos, neu).
# NOTE(review): `time` is DATE - confirm that dropping the time-of-day is intended.
create_tiny_neg100_per_user = '''
CREATE TABLE tiny_neg100_per_user (
    comment_ID SERIAL PRIMARY KEY,
    author VARCHAR(100),
    comment_Text VARCHAR(10000),
    time DATE,
    neg FLOAT,
    pos FLOAT,
    neu FLOAT
)
'''

pg_curs.execute(create_tiny_neg100_per_user)

In [66]:
# Load output/tiny_neg100_per_user.csv into tiny_neg100_per_user, one INSERT per data row
insert_row_sql = 'INSERT INTO tiny_neg100_per_user VALUES \
                        (%s, %s, %s, %s, %s, %s, %s)'
with open('output/tiny_neg100_per_user.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # header line holds the column names, not data
    pg_curs.executemany(insert_row_sql, reader)

In [67]:
# Tiny_pos100_per_user
# Table for each user's single most positive comment; same seven-column layout
# as the exported CSV (id, by, text, time, neg, pos, neu).
# NOTE(review): `time` is DATE - confirm that dropping the time-of-day is intended.
create_tiny_pos100_per_user = '''
CREATE TABLE tiny_pos100_per_user (
    comment_ID SERIAL PRIMARY KEY,
    author VARCHAR(100),
    comment_Text VARCHAR(10000),
    time DATE,
    neg FLOAT,
    pos FLOAT,
    neu FLOAT
)
'''

pg_curs.execute(create_tiny_pos100_per_user)

In [68]:
# Load output/tiny_pos100_per_user.csv into tiny_pos100_per_user, one INSERT per data row
insert_row_sql = 'INSERT INTO tiny_pos100_per_user VALUES \
                        (%s, %s, %s, %s, %s, %s, %s)'
with open('output/tiny_pos100_per_user.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # header line holds the column names, not data
    pg_curs.executemany(insert_row_sql, reader)