In [1]:
import pandas as pd
import datetime as dt
import time
import requests

In [52]:
def query_pushshift(subreddit, kind = 'submission', day_window = 50, n = 10):
    """Collect posts from one subreddit through the Pushshift search API.

    Sends ``n`` GET requests to
    ``https://api.pushshift.io/reddit/search/{kind}``. Request ``i``
    (1-based) carries ``size=500`` and ``after=<day_window * i>d``, so each
    successive request uses a cut-off that lies ``day_window`` days further
    back. Consecutive requests pause for two seconds to stay polite to the API.

    NOTE(review): this notebook defines ``query_pushshift`` twice; the other
    definition differs only in its default ``n`` (20). Whichever cell was
    executed last is the one in effect.

    Parameters
    ----------
    subreddit : str
        Subreddit name; interpolated into the query string as-is.
    kind : str, default 'submission'
        Pushshift search endpoint to query. Only ``'submission'`` results are
        trimmed to ``SUBFIELDS``, de-duplicated and restricted to rows whose
        ``is_self`` is True; any other kind keeps every field the API sent.
    day_window : int, default 50
        Step, in days, between the ``after`` cut-offs of successive requests.
    n : int, default 10
        Number of requests to issue.

    Returns
    -------
    pandas.DataFrame
        One row per retained post with a uniquely labelled index, plus a
        ``timestamp`` column holding the ``datetime.date`` derived from
        ``created_utc``. When no records are returned (or ``n <= 0``) the
        frame is empty but still carries the expected columns.

    Raises
    ------
    requests.HTTPError
        If any response comes back with a status code other than 200.
    """
    SUBFIELDS = ['title', 'selftext', 'subreddit', 'created_utc', 'author', 'num_comments', 'score', 'is_self']
    # Seconds to wait on a single request; without a timeout a stalled
    # connection would block the notebook indefinitely.
    REQUEST_TIMEOUT = 30

    # establish base url and stem
    BASE_URL = f"https://api.pushshift.io/reddit/search/{kind}" # also known as the "API endpoint"
    stem = f"{BASE_URL}?subreddit={subreddit}&size=500" # always pulling max of 500

    # Accumulate raw record dicts and build a single DataFrame at the end;
    # this avoids the duplicate index labels produced by concatenating one
    # frame per request.
    records = []

    for i in range(1, n + 1):
        URL = "{}&after={}d".format(stem, day_window * i)
        print("Querying from: " + URL)
        response = requests.get(URL, timeout=REQUEST_TIMEOUT)
        # Explicit check instead of `assert`: asserts vanish under `python -O`
        # and give no hint about which request failed.
        if response.status_code != 200:
            raise requests.HTTPError(
                "Pushshift returned HTTP {} for {}".format(response.status_code, URL),
                response=response,
            )
        records.extend(response.json()['data'])
        # Throttle between requests only; sleeping after the last one is wasted time.
        if i < n:
            time.sleep(2)

    full = pd.DataFrame(records)
    if full.empty:
        # Nothing came back: provide the columns the steps below rely on so
        # the caller gets an empty frame instead of a KeyError.
        full = pd.DataFrame(columns=SUBFIELDS if kind == "submission" else ["created_utc"])

    # if submission
    if kind == "submission":
        # select desired columns and drop duplicates (overlapping cut-offs
        # can return the same post more than once)
        full = full[SUBFIELDS].drop_duplicates()
        # select `is_self` == True
        full = full.loc[full['is_self'] == True]

    # create `timestamp` column; `assign` returns a new frame, so no
    # SettingWithCopyWarning is raised on the filtered slice.
    # NOTE(review): dt.date.fromtimestamp converts using the machine's local
    # timezone, so dates near midnight can differ between machines.
    # Presumably `created_utc` is epoch seconds -- confirm against the API.
    full = full.assign(timestamp=full["created_utc"].map(dt.date.fromtimestamp))

    print("Query Complete!")
    return full

In [53]:
# Collect r/Liberal submissions using query_pushshift's default kind,
# day_window and n; the function prints one "Querying from" line per request.
liberal_results_df = query_pushshift("Liberal")

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=Liberal&size=500&after=50d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=Liberal&size=500&after=100d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=Liberal&size=500&after=150d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=Liberal&size=500&after=200d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=Liberal&size=500&after=250d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=Liberal&size=500&after=300d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=Liberal&size=500&after=350d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=Liberal&size=500&after=400d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=Liberal&size=500&after=450d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit

In [54]:
# Sanity check: (rows, columns) of the collected r/Liberal frame.
liberal_results_df.shape

(1742, 9)

In [55]:
# Persist the r/Liberal posts to a CSV in the current working directory,
# leaving the DataFrame index out of the file.
liberal_results_df.to_csv('liberal_results_df.csv',index=False)

In [27]:
def query_pushshift(subreddit, kind = 'submission', day_window = 50, n = 20):
    """Collect posts from one subreddit through the Pushshift search API.

    Sends ``n`` GET requests to
    ``https://api.pushshift.io/reddit/search/{kind}``. Request ``i``
    (1-based) carries ``size=500`` and ``after=<day_window * i>d``, so each
    successive request uses a cut-off that lies ``day_window`` days further
    back. Consecutive requests pause for two seconds to stay polite to the API.

    NOTE(review): this notebook defines ``query_pushshift`` twice; the other
    definition differs only in its default ``n`` (10). Whichever cell was
    executed last is the one in effect.

    Parameters
    ----------
    subreddit : str
        Subreddit name; interpolated into the query string as-is.
    kind : str, default 'submission'
        Pushshift search endpoint to query. Only ``'submission'`` results are
        trimmed to ``SUBFIELDS``, de-duplicated and restricted to rows whose
        ``is_self`` is True; any other kind keeps every field the API sent.
    day_window : int, default 50
        Step, in days, between the ``after`` cut-offs of successive requests.
    n : int, default 20
        Number of requests to issue.

    Returns
    -------
    pandas.DataFrame
        One row per retained post with a uniquely labelled index, plus a
        ``timestamp`` column holding the ``datetime.date`` derived from
        ``created_utc``. When no records are returned (or ``n <= 0``) the
        frame is empty but still carries the expected columns.

    Raises
    ------
    requests.HTTPError
        If any response comes back with a status code other than 200.
    """
    SUBFIELDS = ['title', 'selftext', 'subreddit', 'created_utc', 'author', 'num_comments', 'score', 'is_self']
    # Seconds to wait on a single request; without a timeout a stalled
    # connection would block the notebook indefinitely.
    REQUEST_TIMEOUT = 30

    # establish base url and stem
    BASE_URL = f"https://api.pushshift.io/reddit/search/{kind}" # also known as the "API endpoint"
    stem = f"{BASE_URL}?subreddit={subreddit}&size=500" # always pulling max of 500

    # Accumulate raw record dicts and build a single DataFrame at the end;
    # this avoids the duplicate index labels produced by concatenating one
    # frame per request.
    records = []

    for i in range(1, n + 1):
        URL = "{}&after={}d".format(stem, day_window * i)
        print("Querying from: " + URL)
        response = requests.get(URL, timeout=REQUEST_TIMEOUT)
        # Explicit check instead of `assert`: asserts vanish under `python -O`
        # and give no hint about which request failed.
        if response.status_code != 200:
            raise requests.HTTPError(
                "Pushshift returned HTTP {} for {}".format(response.status_code, URL),
                response=response,
            )
        records.extend(response.json()['data'])
        # Throttle between requests only; sleeping after the last one is wasted time.
        if i < n:
            time.sleep(2)

    full = pd.DataFrame(records)
    if full.empty:
        # Nothing came back: provide the columns the steps below rely on so
        # the caller gets an empty frame instead of a KeyError.
        full = pd.DataFrame(columns=SUBFIELDS if kind == "submission" else ["created_utc"])

    # if submission
    if kind == "submission":
        # select desired columns and drop duplicates (overlapping cut-offs
        # can return the same post more than once)
        full = full[SUBFIELDS].drop_duplicates()
        # select `is_self` == True
        full = full.loc[full['is_self'] == True]

    # create `timestamp` column; `assign` returns a new frame, so no
    # SettingWithCopyWarning is raised on the filtered slice.
    # NOTE(review): dt.date.fromtimestamp converts using the machine's local
    # timezone, so dates near midnight can differ between machines.
    # Presumably `created_utc` is epoch seconds -- confirm against the API.
    full = full.assign(timestamp=full["created_utc"].map(dt.date.fromtimestamp))

    print("Query Complete!")
    return full

In [46]:
# Collect r/Conservative submissions using query_pushshift's default arguments.
# NOTE(review): query_pushshift is defined twice in this notebook with
# different default `n` (10 vs 20), and the cell execution counts are out of
# order, so the default in effect here depends on which definition ran last.
conservative_results_df = query_pushshift("Conservative")

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=Conservative&size=500&after=50d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=Conservative&size=500&after=100d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=Conservative&size=500&after=150d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=Conservative&size=500&after=200d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=Conservative&size=500&after=250d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=Conservative&size=500&after=300d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=Conservative&size=500&after=350d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=Conservative&size=500&after=400d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=Conservative&size=500&after=450d
Querying from: https://api.pu

In [47]:
# Sanity check: (rows, columns) of the collected r/Conservative frame.
conservative_results_df.shape

(1422, 9)

In [56]:
# Persist the r/Conservative posts to a CSV in the current working directory,
# leaving the DataFrame index out of the file.
# NOTE(review): this file name has no `_df` suffix, unlike
# 'liberal_results_df.csv' written earlier -- confirm downstream readers
# expect both names before unifying them.
conservative_results_df.to_csv('conservative_results.csv',index=False)