In [1]:
import requests
import pandas as pd
import numpy as np
import json
import html
import io

In [2]:
def getPushshiftData(sub=None, before=None, after=None, ids=None, getSubmissions=True, getComments=False, timeout=60):
    """Fetch one page of search results from the Pushshift Reddit API.

    The request always carries ``sort=desc&size=1500``; the optional filters
    below are appended to the query string only when they are not ``None``.

    Parameters
    ----------
    sub : str, optional
        Value sent as the ``subreddit`` query parameter.
    before, after : optional
        Values sent (via ``str()``) as the ``before`` / ``after`` parameters.
    ids : str or iterable, optional
        One id or several ids; sent comma-separated as the ``ids`` parameter.
    getSubmissions : bool
        When false, the comment endpoint is queried instead of submissions.
    getComments : bool
        When true, the comment endpoint is queried.
    timeout : float
        Seconds to wait for the server before ``requests`` gives up.

    Returns
    -------
    tuple
        ``(data, prev_end_date)`` where ``data`` is the decoded JSON response
        and ``prev_end_date`` is the ``created_utc`` of the last item in
        ``data['data']``, or ``None`` when that list is empty.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If no response arrives within ``timeout`` seconds.
    """
    # Comments win whenever they are requested or submissions are disabled.
    searchType = 'comment' if (getComments or not getSubmissions) else 'submission'

    suffix = ''
    if before is not None:
        suffix += '&before=' + str(before)
    if after is not None:
        suffix += '&after=' + str(after)
    if sub is not None:
        suffix += '&subreddit=' + sub
    if ids is not None:
        # A bare string would otherwise be split into single characters by
        # join(); non-string ids (e.g. ints) would make join() raise.
        if isinstance(ids, str):
            ids = [ids]
        suffix += '&ids=' + ','.join(str(one_id) for one_id in ids)

    url = 'https://api.pushshift.io/reddit/search/' + searchType + '?sort=desc&size=1500' + suffix
    print('loading ' + url)

    # Without a timeout a stalled connection blocks the paging loop forever.
    r = requests.get(url, timeout=timeout)
    # Fail loudly on rate limiting / server errors instead of trying to parse
    # an error page as JSON.
    r.raise_for_status()
    data = json.loads(r.content)

    rows = data['data']
    prev_end_date = rows[-1]['created_utc'] if len(rows) > 0 else None
    return (data, prev_end_date)

In [3]:
def clean_submissions(df):
    """Return the usable ``selftext`` values of a submissions frame.

    Rows whose ``selftext`` is null, empty, ``'[removed]'`` or ``'[deleted]'``
    are dropped; the surviving values keep their original index labels.
    """
    selftext = df['selftext']
    keep = selftext.notnull()
    # Placeholder strings that carry no usable text.
    for placeholder in ('', '[removed]', '[deleted]'):
        keep = keep & selftext.ne(placeholder)
    return selftext[keep]

def clean_comments(df):
    """Return the usable ``body`` values of a comments frame.

    Rows are dropped when ``body`` is null, empty, ``'[removed]'`` or
    ``'[deleted]'``, or when ``author`` is ``'AutoModerator'``. The surviving
    values keep their original index labels.
    """
    body = df['body']
    keep = body.notnull() & df['author'].ne('AutoModerator')
    # Placeholder strings that carry no usable text.
    for placeholder in ('', '[removed]', '[deleted]'):
        keep = keep & body.ne(placeholder)
    return body[keep]

## Submission Data

In [4]:
# Collect r/Republican submissions page by page. Each response is sorted
# newest-first, so the timestamp of its last item marks where the next
# (older) page has to start.
sub_republican = 'Republican'
submissions_tmp, prev_end_date = getPushshiftData(sub=sub_republican, after='180d')
republican_submissions = submissions_tmp['data']
while prev_end_date is not None:
    # NOTE(review): if the API treats `before` as exclusive, the "- 1" skips
    # items stamped exactly one second older than the last one seen - confirm.
    submissions_tmp, prev_end_date = getPushshiftData(
        sub=sub_republican,
        before=prev_end_date - 1,
        after='180d',
    )
    if prev_end_date is None:
        # Empty page: nothing older is left to fetch.
        break
    republican_submissions += submissions_tmp['data']

loading https://api.pushshift.io/reddit/search/submission?sort=desc&size=1500&after=180d&subreddit=Republican
loading https://api.pushshift.io/reddit/search/submission?sort=desc&size=1500&before=1590114332&after=180d&subreddit=Republican
loading https://api.pushshift.io/reddit/search/submission?sort=desc&size=1500&before=1589080051&after=180d&subreddit=Republican
loading https://api.pushshift.io/reddit/search/submission?sort=desc&size=1500&before=1588023300&after=180d&subreddit=Republican
loading https://api.pushshift.io/reddit/search/submission?sort=desc&size=1500&before=1587065553&after=180d&subreddit=Republican
loading https://api.pushshift.io/reddit/search/submission?sort=desc&size=1500&before=1585836713&after=180d&subreddit=Republican
loading https://api.pushshift.io/reddit/search/submission?sort=desc&size=1500&before=1584364259&after=180d&subreddit=Republican
loading https://api.pushshift.io/reddit/search/submission?sort=desc&size=1500&before=1582655666&after=180d&subreddit=Repub

In [5]:
# Tabulate the raw submission records, then keep only the self-text that is
# neither null, empty, removed nor deleted.
republican_submissions_df = pd.DataFrame(data=republican_submissions)
republican_submissions_text = republican_submissions_df.pipe(clean_submissions)
republican_submissions_text

211                                   Did i miss anything?
337                    joined years ago. this sub is lost.
360      recently on tiktok conservatives have come for...
394      That sounds entirely incompetent, and shows so...
573      https://www.instagram.com/tv/CArBIGwglZO/?utm\...
                               ...                        
10936    While the majority of Republicans now support ...
10955    So I don’t align myself with any political par...
10968    I just wanted to thank the republican party fo...
10970    During the impeachment hearing, the three Demo...
10994    The evidence and Trump's behavior is clear, bu...
Name: selftext, Length: 190, dtype: object

In [6]:
# Collect r/democrats submissions page by page. Each response is sorted
# newest-first, so the timestamp of its last item marks where the next
# (older) page has to start.
sub_democrats = 'democrats'
submissions_tmp, prev_end_date = getPushshiftData(sub=sub_democrats, after='180d')
democrats_submissions = submissions_tmp['data']
while prev_end_date is not None:
    # NOTE(review): if the API treats `before` as exclusive, the "- 1" skips
    # items stamped exactly one second older than the last one seen - confirm.
    submissions_tmp, prev_end_date = getPushshiftData(
        sub=sub_democrats,
        before=prev_end_date - 1,
        after='180d',
    )
    if prev_end_date is None:
        # Empty page: nothing older is left to fetch.
        break
    democrats_submissions += submissions_tmp['data']

loading https://api.pushshift.io/reddit/search/submission?sort=desc&size=1500&after=180d&subreddit=democrats
loading https://api.pushshift.io/reddit/search/submission?sort=desc&size=1500&before=1589828976&after=180d&subreddit=democrats
loading https://api.pushshift.io/reddit/search/submission?sort=desc&size=1500&before=1588348260&after=180d&subreddit=democrats
loading https://api.pushshift.io/reddit/search/submission?sort=desc&size=1500&before=1586678908&after=180d&subreddit=democrats
loading https://api.pushshift.io/reddit/search/submission?sort=desc&size=1500&before=1585273527&after=180d&subreddit=democrats
loading https://api.pushshift.io/reddit/search/submission?sort=desc&size=1500&before=1583889119&after=180d&subreddit=democrats
loading https://api.pushshift.io/reddit/search/submission?sort=desc&size=1500&before=1582681252&after=180d&subreddit=democrats
loading https://api.pushshift.io/reddit/search/submission?sort=desc&size=1500&before=1581208010&after=180d&subreddit=democrats
lo

In [7]:
# Tabulate the raw submission records, then keep only the self-text that is
# neither null, empty, removed nor deleted.
democrats_submissions_df = pd.DataFrame(data=democrats_submissions)
democrats_submissions_text = democrats_submissions_df.pipe(clean_submissions)
democrats_submissions_text

0       Just going by how conservative news and any co...
9       H.Res 988 is a bill introduced June 1st, 2020,...
16      Everything going on in the country at the mome...
30      I wanted to create a thread to channel the cur...
33      (Fox news and some other news places keep clai...
                              ...                        
9076    This campaign 'promise' is one of the most abs...
9351    Yang has some good ideas overall, I think, but...
9397    I'm of two minds: one hand, scream it from the...
9781    With all the horrid news about Boris looking t...
9880    Once again we've proved that trickle down econ...
Name: selftext, Length: 190, dtype: object

## Comment Data

In [8]:
# Collect r/Republican comments page by page. Each response is sorted
# newest-first, so the timestamp of its last item marks where the next
# (older) page has to start.
comments_tmp, prev_end_date = getPushshiftData(sub=sub_republican, after='180d', getComments=True)
republican_comments = comments_tmp['data']
while prev_end_date is not None:
    # NOTE(review): if the API treats `before` as exclusive, the "- 1" skips
    # items stamped exactly one second older than the last one seen - confirm.
    comments_tmp, prev_end_date = getPushshiftData(
        sub=sub_republican,
        before=prev_end_date - 1,
        after='180d',
        getComments=True,
    )
    if prev_end_date is None:
        # Empty page: nothing older is left to fetch.
        break
    republican_comments += comments_tmp['data']

loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&after=180d&subreddit=Republican
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1591023747&after=180d&subreddit=Republican
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1590892067&after=180d&subreddit=Republican
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1590715561&after=180d&subreddit=Republican
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1590349343&after=180d&subreddit=Republican
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1590083441&after=180d&subreddit=Republican
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1589814872&after=180d&subreddit=Republican
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1589545468&after=180d&subreddit=Republican
loading https://ap

In [9]:
# Tabulate the raw comment records, then keep only bodies that are neither
# null, empty, removed, deleted, nor written by AutoModerator.
republican_comments_df = pd.DataFrame(data=republican_comments)
republican_comments_text = republican_comments_df.pipe(clean_comments)
republican_comments_text

1                   Downtown or Santa Monica or something?
2        I don't live in USA. I'm from Europe. But seei...
4             they are not the same, one is a toy!!!!!!!!!
5                                                  Source?
6        Jesus, this topic has been researched and stud...
                               ...                        
51639    Is this meant to be anti Trump or just hilario...
51641    Sounds like wishful thinking. People still tal...
51642    No clue, but not a lot of this is fact.  It's ...
51643    Look, there is a struggle that the conservativ...
51648             ..hopefully for a few election cycles...
Name: body, Length: 40069, dtype: object

In [10]:
# Collect r/democrats comments page by page. Each response is sorted
# newest-first, so the timestamp of its last item marks where the next
# (older) page has to start.
comments_tmp, prev_end_date = getPushshiftData(sub=sub_democrats, after='180d', getComments=True)
democrats_comments = comments_tmp['data']
while prev_end_date is not None:
    # NOTE(review): if the API treats `before` as exclusive, the "- 1" skips
    # items stamped exactly one second older than the last one seen - confirm.
    comments_tmp, prev_end_date = getPushshiftData(
        sub=sub_democrats,
        before=prev_end_date - 1,
        after='180d',
        getComments=True,
    )
    if prev_end_date is None:
        # Empty page: nothing older is left to fetch.
        break
    democrats_comments += comments_tmp['data']

loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&after=180d&subreddit=democrats
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1591027952&after=180d&subreddit=democrats
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1590805440&after=180d&subreddit=democrats
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1590666514&after=180d&subreddit=democrats
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1590521410&after=180d&subreddit=democrats
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1590329479&after=180d&subreddit=democrats
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1590160112&after=180d&subreddit=democrats
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1589981053&after=180d&subreddit=democrats
loading https://api.pushsh

loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1578946593&after=180d&subreddit=democrats
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1578637074&after=180d&subreddit=democrats
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1578342654&after=180d&subreddit=democrats
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1578070308&after=180d&subreddit=democrats
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1577811822&after=180d&subreddit=democrats
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1577577183&after=180d&subreddit=democrats
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1577143514&after=180d&subreddit=democrats
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1576890756&after=180d&subreddit=democrats
loading 

In [11]:
# Tabulate the raw comment records, then keep only bodies that are neither
# null, empty, removed, deleted, nor written by AutoModerator.
democrats_comments_df = pd.DataFrame(data=democrats_comments)
democrats_comments_text = democrats_comments_df.pipe(clean_comments)
democrats_comments_text

0        Lol... God, first link on Google. Wrong!\n\nht...
2        Ya, I don't think many people in the usa do that.
3        There's a republican subreddit if you are unaw...
4                    Trololol you're so salty.  Delicious.
5        Trump has signed into law more gun restriction...
                               ...                        
79143    Make an arrangement with McKinley and update h...
79144    While fair, the primary is the best time to do...
79145    She basically made the old Kanye joke: she cou...
79146    Oh, you must either not be american or you hav...
79147                        So what do you want him to do
Name: body, Length: 63516, dtype: object

## Combining respective subreddit data and exporting

In [12]:
# Base directory that the export paths are built from.
ROOT_DIR = '../'

# Stack comments first, then submissions, for each subreddit.
# Series.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the equivalent call. Index labels are kept as-is (not reset), so they may
# repeat between the two parts.
republican_data = pd.concat([republican_comments_text, republican_submissions_text])
democrats_data = pd.concat([democrats_comments_text, democrats_submissions_text])

In [13]:
def combine_text(data):
    """Merge every text in ``data`` into one space-separated string.

    Parameters
    ----------
    data : pandas.Series
        Series of strings; only ``data.values`` is read, so index labels
        (including duplicates) are irrelevant.

    Returns
    -------
    str
        The texts joined by single spaces, with HTML entities
        (``&amp;``, ``&lt;`` ...) converted back to their characters.
        An empty input yields ``''``.
    """
    # str.join builds the result in one pass; appending with += inside a loop
    # re-copies the growing string on every iteration.
    result = ' '.join(data.values)

    # Cleaning text
    return html.unescape(result)

In [14]:
# Flatten all Republican comments and submissions into a single
# space-separated, HTML-unescaped string.
republican_result = combine_text(republican_data)

In [15]:
# Write the combined Republican text to disk as UTF-8, replacing any
# existing file at that path.
with open(ROOT_DIR + "data/republican_result.txt", mode="w", encoding="utf-8") as f:
    f.write(republican_result)

In [16]:
# Flatten all democrats comments and submissions into a single
# space-separated, HTML-unescaped string.
democrats_result = combine_text(democrats_data)

In [17]:
# Write the combined democrats text to disk as UTF-8, replacing any
# existing file at that path.
with open(ROOT_DIR + "data/democrats_result.txt", mode="w", encoding="utf-8") as f:
    f.write(democrats_result)