In [34]:
import pandas as pd
import numpy as np
import re
import time
import requests
import json
import datetime
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

In [35]:
dates_list = []

# Making list of date ranges to use (from 2018 to 2020)
for i in range(18,21):
    dates_list.append('01/01/20'+str(i))
    dates_list.append('01/6/20'+str(i))

#convert them to a timestamp for pushshift API
def convertToTimestamp(date):
    return time.mktime(datetime.datetime.strptime(date, "%d/%m/%Y").timetuple())

dates = [int(convertToTimestamp(date)) for date in dates_list]

In [36]:
def getRedditData(after, before, subreddit):
  #use pushshift api to get subreddit data
    url = ('https://api.pushshift.io/reddit/search/submission/?size=1500&after='+
           str(after)+'&before='+str(before)+'&subreddit='+str(subreddit)+'&sort_type=score'+'&sort=desc')
    r = requests.get(url)
    data = json.loads(r.text)
    return data['data']

In [37]:
def getTitles(subreddit):
  #get just the post titles
    titles_new = []
    titles = []

    for i in range(len(dates)-1):
        #the date range we are fetching data for
        after  = dates[i]
        before = dates[i+1]

        # Getting subreddit data between the dates after and before
        raw_json = getRedditData(after,before,subreddit)

        # Extracting just the title
        titles_new = [post['title'] for post in raw_json]

        # Appending new data on
        titles = titles + titles_new

    # Use set to get rid of duplicates
    titles = list(set(titles))
    return titles

conservative = getTitles('conservative')
liberal = getTitles('liberal')

print(len(conservative))
print(len(liberal))

494
497


In [38]:
df1= pd.DataFrame({'text':conservative})
df1['label'] = 1 #set 1 to conservative posts

df2 = pd.DataFrame({'text':liberal})
df2['label'] = 0 #set 0 to liberal posts

# Combining both datasets
df = pd.concat([df1,df2])

# Shuffling the dataset
df = df.sample(frac=1).reset_index(drop=True)

# Converting all text to lowercase, fixing ampersands and getting rid
# of dashes and apostrophes as they can mess up the dictionary
df['text'] = df['text'].str.lower()
df['text'] = df['text'].str.replace(r'&amp;', 'and')
df['text'] = df['text'].str.replace(r'-', ' ')
df['text'] = df['text'].str.replace(r'[^\s\w]','')

df.head()

Unnamed: 0,text,label
0,navy seal make a wish,1
1,vindman says white house deleted trumps refere...,0
2,republicans defend trump argument that accepti...,0
3,democrats alarmed by trumps promise of pardons...,0
4,mike rowe americas suffering from an epidemic ...,1


In [39]:

vocab_set = set()
sentence_lengths = []

for i in range(len(df)):
    # Get the words from the text, and the lengths of the sentences
    sentence_words = re.split(r'\s',df.iloc[i]['text'])
    vocab_set.update(sentence_words)
    sentence_lengths.append(len(sentence_words))

In [40]:
#get the unique words
vocab_list = list(vocab_set)
vocab_dict = {vocab_list[i-1]: i for i in range(1, len(vocab_list)+1)}

In [41]:
max_length = max(sentence_lengths)
#map words ot numbers
def toNumbers(row):
    words = re.findall(r'([\w]+)', row['text'])
    nums =  np.array([vocab_dict[words[j]] for j in range(len(words))])
    #pad to max length
    return np.pad(nums, (0, max_length - len(nums)), mode='constant')

In [42]:
nums = df.apply(lambda row: toNumbers(row), axis=1) 
df['nums'] = nums

df['nums'].head()

0    [808, 2073, 3236, 276, 1867, 0, 0, 0, 0, 0, 0,...
1    [3225, 3449, 3678, 932, 759, 3669, 265, 1010, ...
2    [2703, 110, 1202, 2643, 3670, 134, 3344, 2485,...
3    [200, 901, 2873, 3669, 11, 1831, 667, 1010, 31...
4    [2701, 54, 2128, 685, 2385, 163, 256, 1831, 28...
Name: nums, dtype: object

In [43]:
labels = np.asarray(df['label'].values)
features = np.stack(df['nums'].values)

features.shape, labels.shape

((991, 51), (991,))

In [44]:
def get_compiled_model():
  #fiddling around with the model more should allow for higher accuracy, right
  #now it is around 83%
    embedding_dim=16

    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(len(vocab_set)+1, 64), #embedding layer important for NLP
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,  return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.4),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])


    model.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])
    
    return model

model = get_compiled_model()
model.fit(features, labels, batch_size=32, epochs=4, verbose=2, validation_split=0.2);


Epoch 1/4
25/25 - 4s - loss: 0.6588 - accuracy: 0.6301 - val_loss: 0.5727 - val_accuracy: 0.6734
Epoch 2/4
25/25 - 2s - loss: 0.4710 - accuracy: 0.7967 - val_loss: 0.4362 - val_accuracy: 0.8040
Epoch 3/4
25/25 - 2s - loss: 0.2254 - accuracy: 0.9356 - val_loss: 0.5248 - val_accuracy: 0.8141
Epoch 4/4
25/25 - 2s - loss: 0.0772 - accuracy: 0.9798 - val_loss: 0.6356 - val_accuracy: 0.8141
