# Imports

In [2]:
# Used to pull data from Reddit
import praw
import pandas as pd
import datetime as dt

from gensim.models import Word2Vec
from nltk.corpus import brown
import nltk
import tensorflow as tf
import tensorflow_hub as hub
import multiprocessing
from google.cloud import bigquery

#nltk.download('punkt')
#nltk.download('brown')

In [None]:
# Monthly comment tables to read; each suffix names one
# `fh-bigquery.reddit_comments.<suffix>` table.
COMMENT_TABLE_MONTHS = ['2018_10', '2018_11', '2018_12', '2019_01', '2019_02', '2019_03']

# Every branch of a UNION ALL must return the same number of columns, so all
# months are generated from one template. (The previous hand-written query
# selected an extra `date_month` column in the first branch only, which
# BigQuery rejects; the month is derived in pandas below instead.)
monthly_select = '''
SELECT
    r.subreddit,
    r.created_utc,
    r.body
FROM
  `fh-bigquery.reddit_comments.{month}` r
WHERE
  r.subreddit IN ('democrats', 'progressive', 'Conservative', 'republicans')
'''

sql_query = 'UNION ALL'.join(monthly_select.format(month=month)
                             for month in COMMENT_TABLE_MONTHS)

# Put query results into df
comment_df = pd.read_gbq(sql_query,
                         project_id='w266-240122',
                         dialect='standard')

# Convert the epoch-seconds timestamp into a proper date/time.
# The conversion is pinned to US/Eastern (the zone the original query named)
# rather than the machine's local timezone, so the result is the same
# wherever the notebook runs. tz info is dropped afterwards so that
# to_period() below works on naive wall-clock times.
comment_df['created_dt_tm'] = (
    pd.to_datetime(comment_df['created_utc'], unit='s', utc=True)
      .dt.tz_convert('US/Eastern')
      .dt.tz_localize(None)
)

# Create field for month (first day of the month the comment was created in)
comment_df['created_dt_month'] = comment_df['created_dt_tm'].dt.to_period('M').dt.to_timestamp()

In [4]:
# Pull title, self-text and the is_self flag for posts in the four political
# subreddits. Only the 2019_03 posts table is read here.
# NOTE: this reuses (and overwrites) the `sql_query` name.
sql_query = """
SELECT 
    r.subreddit,
    r.title,
    r.selftext,
    r.is_self
FROM 
  `fh-bigquery.reddit_posts.2019_03` r
WHERE
  r.subreddit IN ('democrats', 'progressive', 'Conservative', 'republicans')
"""

# Put query results into df
post_df = pd.read_gbq(sql_query,
                      project_id='w266-240122',
                      dialect='standard')

# Basic EDA

In [51]:
# Number of comments per calendar month.
# NOTE(review): only the 2018_10 .. 2019_03 tables are queried, so the small
# 2018-09 bucket presumably comes from the timezone conversion applied when
# created_dt_tm is derived — confirm.
comment_df['created_dt_month'].value_counts()

2019-03-01    109492
2018-10-01    107297
2019-01-01    100792
2018-11-01     91601
2019-02-01     88902
2018-12-01     73725
2018-09-01       765
Name: created_dt_month, dtype: int64

# Embedding from Democratic-Leaning Subreddits

### Text PreProcessing

In [67]:
def pre_process(comment_df, post_df, subreddit):
    """Collect the de-duplicated text corpus for a group of subreddits.

    The corpus is made of, in this order:
      1. the self-text of self posts,
      2. the titles of self posts,
      3. the bodies of comments,
    all restricted to the requested subreddit(s). Moderation placeholders
    ('[removed]' / '[deleted]') and missing values are left out.

    Parameters
    ----------
    comment_df : pandas.DataFrame
        Comments; must have 'subreddit' and 'body' columns.
    post_df : pandas.DataFrame
        Posts; must have 'subreddit', 'title', 'selftext' and 'is_self' columns.
    subreddit : list of str or str
        Subreddit name(s) to keep. A single name may be passed as a string.

    Returns
    -------
    pandas.DataFrame
        One column, 'text', holding each distinct piece of text once.
    """
    # Series.isin() rejects a bare string, so wrap a single name in a list
    if isinstance(subreddit, str):
        subreddit = [subreddit]

    placeholders = ['[removed]', '[deleted]']

    # Self posts from the requested subreddits only. The same filtered frame
    # feeds both the body and the title text, so titles from other subreddits
    # can no longer leak into the corpus.
    self_posts = post_df[(post_df['is_self'] == True) &
                         post_df['subreddit'].isin(subreddit)]

    # Body text from post dataframe
    body_text = self_posts.loc[~self_posts['selftext'].isin(placeholders), 'selftext']

    # Text from title
    title_text = self_posts['title']

    # Text from post comments
    comments = comment_df[comment_df['subreddit'].isin(subreddit)]
    comment_text = comments.loc[~comments['body'].isin(placeholders), 'body']

    # Put all text into one column; drop missing values (they cannot be
    # tokenized later) and duplicates
    all_text = pd.concat([body_text, title_text, comment_text], ignore_index=True)
    text_df = all_text.dropna().drop_duplicates().to_frame(name='text')

    return text_df

In [60]:
def train_embedding(text_df, size=100, window=5, min_count=5, negative=15, epochs=10, workers=6):
    """Train a Word2Vec embedding on the sentences found in ``text_df['text']``.

    Each text is split into sentences, duplicate sentences are dropped
    (first occurrence wins), each remaining sentence is split into word
    tokens, and the token lists are used as the Word2Vec training corpus.

    Side effect: as before, the columns 'sent_tokenized' and 'sent_count'
    are added to ``text_df`` in place.

    Parameters
    ----------
    text_df : pandas.DataFrame
        Must have a 'text' column of strings.
    size, window, min_count, negative, workers : int
        Forwarded to ``Word2Vec`` under the same keyword names.
    epochs : int
        Forwarded to ``Word2Vec`` as ``iter``.

    Returns
    -------
    The trained ``Word2Vec`` model.
    """
    # Tokenize at sentence level
    text_df['sent_tokenized'] = text_df['text'].apply(nltk.sent_tokenize)
    text_df['sent_count'] = text_df['sent_tokenized'].apply(len)

    # Flatten the per-text sentence lists and keep each distinct sentence once,
    # in first-seen order (dict keys preserve insertion order)
    non_empty = text_df.loc[text_df['sent_count'] > 0, 'sent_tokenized']
    sent_list = list(dict.fromkeys(sentence
                                   for sentences in non_empty
                                   for sentence in sentences))

    # Tokenize each sentence at the word level
    word_tokens = [nltk.word_tokenize(sentence) for sentence in sent_list]

    # train word embedding on this function's own corpus (the previous version
    # read the undefined names `dem_sent_list` / `ds_df` here)
    embedding = Word2Vec(word_tokens,
                         size=size,
                         window=window,
                         min_count=min_count,
                         negative=negative,
                         iter=epochs,
                         workers=workers)

    return embedding

In [69]:
# Build the text corpus for the two Democratic-leaning subreddits and
# train a word embedding on it.
test_df = pre_process(comment_df, post_df, ['democrats', 'progressive'])
word_embedding = train_embedding(test_df)

In [None]:
# TODO remove symbols, contractions
# TODO lowercase words?
# TODO Treat numbers as something else
# TODO Add beginning- and end-of-sentence markers?
# TODO how do we deal with stop words?

### Skipgram Embedding

In [149]:
# train word embedding
# NOTE(review): `ds_df` is not defined in any cell visible in this notebook, so
# this cell fails on a fresh kernel (Restart & Run All). The hyperparameters
# match the ones used inside train_embedding().
dem_embedding = Word2Vec(list(ds_df['word_token']), size=100, window=5, min_count=5, negative=15, iter=10, workers=6)

In [151]:
# Words closest to 'marijuana' in the embedding, with their similarity scores
dem_embedding.wv.similar_by_word('marijuana')

[('protections', 0.7453681826591492),
 ('legalization', 0.7333822250366211),
 ('Healthcare', 0.7216455340385437),
 ('violations', 0.7168678641319275),
 ('protection', 0.7053380012512207),
 ('procedures', 0.7013863325119019),
 ('disability', 0.6971272230148315),
 ('**Rescinds**', 0.6955320835113525),
 ('restricting', 0.6952570676803589),
 ('prisons', 0.6950944662094116)]

In [163]:
# Words closest to 'Harris' in the embedding, with their similarity scores
dem_embedding.wv.similar_by_word('Harris')

[('Booker', 0.8538322448730469),
 ('Warren', 0.8496584892272949),
 ('Biden', 0.8336505889892578),
 ('Klobuchar', 0.8247814178466797),
 ('Kamala', 0.8058872222900391),
 ('Beto', 0.8028988838195801),
 ('Gillibrand', 0.7822431325912476),
 ('Elizabeth', 0.7690243721008301),
 ('Sanders', 0.7555172443389893),
 ("O'Rourke", 0.7544282674789429)]

In [150]:
# Words closest to 'Trump' in the embedding, with their similarity scores
dem_embedding.wv.similar_by_word('Trump')

[('trump', 0.8671435117721558),
 ('Putin', 0.6433150172233582),
 ('Bernie', 0.6169761419296265),
 ('Maduro', 0.5872288346290588),
 ('Russia', 0.5814106464385986),
 ('Trumpski', 0.5781972408294678),
 ('he', 0.5778998732566833),
 ('him', 0.5740189552307129),
 ('bernie', 0.5607132911682129),
 ('tRump', 0.56041419506073)]

In [143]:
# Same 'Trump' query as the In [150] cell above.
# NOTE(review): this cell's execution count (143) is lower than that of the
# training cell (149), so the output below presumably reflects an earlier
# version of dem_embedding — confirm, or re-run / remove.
dem_embedding.wv.similar_by_word('Trump')

[('trump', 0.7485096454620361),
 ('he', 0.6877127885818481),
 ('president', 0.6715365648269653),
 ('Russia', 0.6475684642791748),
 ('He', 0.6389645338058472),
 ('McConnell', 0.6372545957565308),
 ('Clinton', 0.6367909908294678),
 ('Hillary', 0.6154500246047974),
 ('President', 0.612384557723999),
 ('She', 0.6113145351409912)]

In [123]:
# Words closest to 'McConnell' in the embedding, with their similarity scores.
# NOTE(review): execution count 123 is lower than that of the training cell
# (149); the output below presumably reflects an earlier version of
# dem_embedding — confirm, or re-run / remove.
dem_embedding.wv.similar_by_word('McConnell')

[('nobody', 0.7864992022514343),
 ('Cornyn', 0.7798274755477905),
 ('drop', 0.7716587781906128),
 ('necessary', 0.762731671333313),
 ('fun', 0.7519711256027222),
 ('Trumps', 0.7489361763000488),
 ('investigated', 0.7456454038619995),
 ('Apple', 0.7444183826446533),
 ('9/11', 0.7405687570571899),
 ('Ivanka', 0.7376561760902405)]

In [144]:
# Same 'Harris' query as the In [163] cell above.
# NOTE(review): execution count 144 is lower than that of the training cell
# (149); the output below presumably reflects an earlier version of
# dem_embedding — confirm, or re-run / remove.
dem_embedding.wv.similar_by_word('Harris')

[('Warren', 0.888292670249939),
 ('Biden', 0.8418302536010742),
 ('Sanders', 0.8297291398048401),
 ('Elizabeth', 0.8213909864425659),
 ('Klobuchar', 0.7796796560287476),
 ('Booker', 0.7661500573158264),
 ('congressman', 0.7622315883636475),
 ('VP', 0.7596662640571594),
 ('Kamala', 0.7543649077415466),
 ('Gabbard', 0.7347267270088196)]

### ELMo

In [20]:
# Download ELMo to a local directory
!mkdir module_elmo

# Download the module, and uncompress it to the destination folder. 
!curl -L "https://tfhub.dev/google/elmo/2?tf-hub-format=compressed" | tar -zxvC module_elmo

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0  331M    0   415    0     0    369      0  10d 21h  0:00:01  10d 21h   369x assets/
x saved_model.pb
x tfhub_module.pb
x variables/
x variables/variables.index
100  331M  100  331M    0     0  2054k      0  0:02:45  0:02:45 --:--:-- 2379k 2614kk      0  0:02:59  0:01:26  0:01:33 2755k   0  2087k      0  0:02:42  0:01:43  0:00:59 3369k      0  0:02:41  0:01:51  0:00:50 2300k  0:01:56  0:00:44 2391k0:02:44  0:02:25  0:00:19 2082k



In [3]:
# NOTE(review): this cell is repeated in the In [5] cell below, which
# additionally displays message_embeddings.shape; consider keeping only one.

# Download ELMo to a local directory
# NOTE(review): plain `mkdir` errors when module_elmo already exists, and the
# archive is downloaded again on every run of this cell.
!mkdir module_elmo

# Download the module, and uncompress it to the destination folder.
!curl -L "https://tfhub.dev/google/elmo/2?tf-hub-format=compressed" | tar -zxvC module_elmo

# Initialize elmo from the local copy of the module
elmo = hub.Module('module_elmo', trainable=True)
# Build the op that embeds one example sentence; the 'elmo' entry of the
# returned dict is the output selected here
embeddings = elmo(['This is an example sentence that I want an embedding for'], signature="default", as_dict=True)['elmo']

# Get word embedding: initialise variables and lookup tables, then evaluate
with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    message_embeddings = session.run(embeddings)

Instructions for updating:
Colocations handled automatically by placer.


W0715 08:37:23.398818 4789093824 deprecation.py:323] From /anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0715 08:37:24.135161 4789093824 saver.py:1483] Saver not created because there are no variables in the graph to restore


In [5]:
# Download ELMo to a local directory
!mkdir module_elmo

# Download the module, and uncompress it to the destination folder. 
!curl -L "https://tfhub.dev/google/elmo/2?tf-hub-format=compressed" | tar -zxvC module_elmo

# Initialize elmo
elmo = hub.Module('module_elmo', trainable=True)
embeddings = elmo(['This is an example sentence that I want an embedding for'], signature="default", as_dict=True)['elmo']

# Get word embedding
with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    message_embeddings = session.run(embeddings)

message_embeddings.shape

(1, 11, 1024)

# Questions
* Sentiment analysis labeled data
* How would we go about scoring the performance of our models?
    * I proposed using the scores that are available on Reddit
* Should we train the sentiment classifier at a sentence or post level?
    * Each post may contain multiple sentences


### Data from r/Republican Subreddit

In [91]:
# Total number of entries across all lists in dem_posts['sent_tokenized_title'].
# NOTE(review): `dem_posts` is not defined in any cell visible in this notebook.
len(list(dem_posts['sent_tokenized_title'].apply(pd.Series).stack()))

1497

In [92]:
# Scratch check: 1497 is the count displayed by the previous cell.
# NOTE(review): 714 is not derived in any visible cell — presumably another
# sentence count; confirm or replace with the expression that produces it.
1497 + 714

2211

In [90]:
# NOTE(review): `test` is not defined in any cell visible in this notebook; its
# length (2211) equals the 1497 + 714 sum computed in the cell above.
len(test)

2211

In [61]:
# Handle for the r/Republican subreddit.
# NOTE(review): `reddit` is not created in any cell visible in this notebook —
# presumably a praw.Reddit client; confirm it is initialised before this cell.
republicans = reddit.subreddit('Republican')

In [76]:
# Pull up to 1000 of the subreddit's current "hot" submissions into a DataFrame.
post_columns = ['title', 'score', 'id', 'subreddit', 'url', 'num_comments',
                'body', 'created', 'is_self', 'link_flair_text']

# One row per submission.
# - str(post.subreddit) stores the subreddit's name instead of a live PRAW
#   object inside the frame.
# - created_utc is the creation time in Unix time (UTC); it replaces the
#   `created` attribute used previously.
posts = [
    [post.title, post.score, post.id, str(post.subreddit), post.url,
     post.num_comments, post.selftext, post.created_utc, post.is_self,
     post.link_flair_text]
    for post in republicans.hot(limit=1000)
]
rep_posts = pd.DataFrame(posts, columns=post_columns)

# Convert epoch seconds to timestamps. The result is naive UTC, so it no
# longer depends on the timezone of the machine running the notebook.
created = pd.to_datetime(rep_posts['created'], unit='s')
rep_posts['created'] = created