In [12]:
!pip install praw
!pip install tensorflow_hub
!pip install tf-nightly



# Imports

In [16]:
# Used to pull data from Reddit
import praw
import pandas as pd
import datetime as dt

import nltk
import tensorflow as tf
import tensorflow_hub as hub
#nltk.download('punkt')

In [2]:
reddit = praw.Reddit(client_id='OMnAyXELD9zyJA', 
                     client_secret='fW7tokngC-W9lCtDET4-9n0f5QY', 
                     user_agent='Reddit WebScrapping')

### Data from r/democrats Subreddit

In [3]:
democrats = reddit.subreddit('democrats')

In [27]:
posts = []
# for post in democrats.top(limit = 1):
for post in democrats.hot(limit = 1000):
    posts.append([post.title, post.score, post.id, post.subreddit, post.url, 
                  post.num_comments, 
                  post.selftext,
                  post.created, 
                  post.is_self,
                  post.link_flair_text])

# Column names
columns = ['title', 
           'score',
           'id', 
           'subreddit',
           'url',
           'num_comments', 
           'body', 
           'created', 
           'is_self', 
           'link_flair_text']
    
dem_df = pd.DataFrame(posts, columns=columns)
dem_df['created'] = dem_df['created'].apply(lambda x: dt.datetime.fromtimestamp(x))

In [28]:
# These are the posts that are self created by the user and contains only text
# For these, we will use 'title' and 'body'
dem_df[dem_df['is_self'] == True].head(3)

Unnamed: 0,title,score,id,subreddit,url,num_comments,body,created,is_self,link_flair_text
11,Change,2,cb9esc,democrats,https://www.reddit.com/r/democrats/comments/cb...,0,For some reason for sometime now I have felt t...,2019-07-10 08:32:09,True,
26,Complicit Legal Definition.,1,cb3pkp,democrats,https://www.reddit.com/r/democrats/comments/cb...,0,“Complicity in criminal law refers to when som...,2019-07-10 01:11:53,True,
42,How do you feel about bussing?,0,cavuqn,democrats,https://www.reddit.com/r/democrats/comments/ca...,3,"After reading Ta-Nehisi Coates, I’m less convi...",2019-07-09 12:18:15,True,


In [29]:
# These are the posts that are simply a posted link and do not have body text
# For these, we will use 'title'
dem_df[dem_df['is_self'] == False].head(3)

Unnamed: 0,title,score,id,subreddit,url,num_comments,body,created,is_self,link_flair_text
0,Amy McGrath says she will take on Mitch McConn...,813,cazris,democrats,https://www.courier-journal.com/story/news/pol...,48,,2019-07-09 19:42:33,False,
1,Hillary Clinton: When the Trump administration...,38,cb5c7v,democrats,https://twitter.com/HillaryClinton/status/1148...,1,,2019-07-10 03:11:23,False,
2,Democrat and Former Marine pilot Amy McGrath a...,85,cb0c8d,democrats,https://www.rollcall.com/news/campaigns/mcgrat...,2,,2019-07-09 20:39:50,False,


### Tokenizing

### Tokenizing at Sentence Level

In [30]:
# Tokenize at sentence level the title and body of each post
dem_df['sent_tokenized_title'] = dem_df['title'].apply(nltk.sent_tokenize)
dem_df['sent_tokenized_body'] = dem_df['body'].apply(nltk.sent_tokenize)

# Calculate sentence length for each post
dem_df['sent_count_body'] = dem_df['sent_tokenized_body'].apply(lambda x: len(x))
dem_df['sent_count_title'] = dem_df['sent_tokenized_title'].apply(lambda x: len(x))

print('Total body sentence count:', dem_df['sent_count_body'].sum())
print('Total title sentence count:', dem_df['sent_count_title'].sum())

Total body sentence count: 650
Total title sentence count: 1488


In [78]:
dem_df['sent_count_title'].value_counts()

1    706
2    168
3     64
4     32
5     15
6      5
7      3
Name: sent_count_title, dtype: int64

In [77]:
dem_df['sent_count_body'].value_counts()

0      924
2       12
4        9
1        8
3        6
5        6
6        5
8        4
7        4
11       3
15       2
58       1
9        1
12       1
13       1
18       1
19       1
24       1
25       1
33       1
170      1
Name: sent_count_body, dtype: int64

### Sentence Clean Up

In [65]:
dem_df[dem_df['sent_count_body'] > 0]['sent_tokenized_body'].head()

11    [For some reason for sometime now I have felt ...
26    [“Complicity in criminal law refers to when so...
42    [After reading Ta-Nehisi Coates, I’m less conv...
46    [A little introduction, I am a registered Demo...
82    [My mother has very little time on her hands, ...
Name: sent_tokenized_body, dtype: object

In [75]:
test = dem_df[dem_df['sent_count_body'] > 0]['sent_tokenized_body'].apply(pd.Series).stack().to_frame()
test.reset_index(inplace=True)
test.set_index('level_0', inplace=True)
test.drop('level_1', axis=1, inplace=True)
test['score'] = dem_df[dem_df['sent_count_body'] > 0]['score']

In [None]:
# TODO remove symbols, contractions
# TODO lowercase words?
# TODO Treat numbers as something else
# Add beginning and end of sentence?
# TODO how do we deal with stop words?


In [32]:
# Put tokenized body into a list
sent_list = list(dem_df[dem_df['sent_count_body'] > 0]['sent_tokenized_body'].apply(pd.Series).stack().unique())

# Add tokenized title into the list
sent_list += list(dem_df['sent_tokenized_title'].apply(pd.Series).stack().unique())

# Put list into dataframe
ds_df = pd.DataFrame(sent_list, columns=['sentence'])

# Tokenize each sentence at the word level
ds_df['word_token'] = ds_df['sentence'].apply(nltk.word_tokenize)

In [38]:
dem_df[dem_df['sent_count_body'] > 0]['sent_tokenized_body'].iloc[0]

['For some reason for sometime now I have felt that the message behind being a Democrat has changed.',
 'I don’t know if it’s all of the sometimes ridiculous things I see on TV or social media but I feel like it has been radicalized by the youth.']

11   0     For some reason for sometime now I have felt t...
     1     I don’t know if it’s all of the sometimes ridi...
26   0     “Complicity in criminal law refers to when som...
     1     Criminal complicity may arise in the following...
42   0     After reading Ta-Nehisi Coates, I’m less convi...
46   0     A little introduction, I am a registered Democ...
     1     I have voted for the Democratic Party my whole...
     2     I am not a concern troll but a real American t...
     3     I am frustrated with the story of what Jeffere...
     4     What was he charged in was it 2008 only to be ...
     5          This is such a let down from our government.
     6     How could the judicial system  allow his behav...
     7     How in the world was the FBI and other intelli...
     8     Did they actually turn the other way in the fa...
     9     We deserve better from our government involvin...
     10    I expect to get few upvotes for this post and ...
     11                 

### ELMo

In [124]:
#download the model to local so it can be used again and again
!mkdir module_elmo

# Download the module, and uncompress it to the destination folder. 
!curl -L "https://tfhub.dev/google/elmo/2?tf-hub-format=compressed" | tar -zxvC module_elmo

In [24]:
elmo = hub.Module("module_elmo", trainable=False)
embeddings = elmo([demsent_df.iloc[0]['sentence']], signature="default", as_dict=True)["elmo"]

with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    message_embeddings = session.run(embeddings)

In [23]:
len(message_embeddings[0])

12

# Questions
* Sentiment analysis labeled data
* How would we go about scoring the performance of our models?
    * I proposed to use the scores that are available on the reddit
* Should we train the sentiment classifier at a sentence or post level?
    * Each post may contain multiple sentences


### Data from r/Republican Subreddit

In [91]:
len(list(dem_posts['sent_tokenized_title'].apply(pd.Series).stack()))

1497

In [92]:
1497 + 714

2211

In [90]:
len(test)

2211

In [61]:
republicans = reddit.subreddit('Republican')

In [76]:
posts = []
# for post in democrats.top(limit = 1):
for post in republicans.hot(limit = 1000):
    posts.append([post.title, post.score, post.id, post.subreddit, post.url, post.num_comments, post.selftext, post.created, post.is_self, post.link_flair_text])
rep_posts = pd.DataFrame(posts,columns=['title', 'score', 'id', 'subreddit', 'url', 'num_comments', 'body', 'created', 'is_self', 'link_flair_text'])
created = rep_posts["created"].apply(lambda x: dt.datetime.fromtimestamp(x))
rep_posts['created'] = created