In [None]:
import pandas as pd
import time
import requests
import datetime as dt


def get_date(created):
    """Return the UTC calendar date of a Unix timestamp.

    The conversion is done explicitly in UTC so the result does not depend on
    the timezone of the machine running the code (``date.fromtimestamp`` would
    use local time, shifting dates near midnight).

    Args:
        created: Seconds since the Unix epoch (int or float).

    Returns:
        A ``datetime.date`` for the UTC day containing ``created``.
    """
    return dt.datetime.fromtimestamp(created, tz=dt.timezone.utc).date()



def query_pushshift(subreddit, kind='submission', skip=5, times=50,
                    subfield=('title', 'selftext', 'subreddit', 'created_utc', 'author', 'num_comments',
                              'score', 'is_self'),
                    comfields=('body', 'score', 'created_utc')):
    """Download items from a subreddit through the Pushshift search API.

    Sends ``times`` requests. Request number ``x`` (starting at 1) asks for up
    to 500 items with ``after={skip * x}d``. Items whose text field is missing,
    empty, or 10 characters or shorter are discarded. The text field is
    ``body`` when ``kind == 'comment'`` and ``selftext`` otherwise.

    Args:
        subreddit: Subreddit name, inserted into the request URL as-is.
        kind: Pushshift search endpoint, e.g. ``'submission'`` or ``'comment'``.
        skip: Number of days added to the ``after`` window per request.
        times: Number of requests to send.
        subfield: Columns kept when ``kind == 'submission'``.
        comfields: Columns kept when ``kind == 'comment'``.

    Returns:
        A ``pandas.DataFrame`` of the collected items with an extra
        ``timestamp`` column holding ``get_date(created_utc)``. For submissions
        and comments only the requested columns are kept and duplicate rows are
        dropped; submissions are additionally restricted to ``is_self == True``.
        For any other ``kind`` all returned columns are kept. If nothing was
        collected, an empty frame with the expected columns is returned.

    Raises:
        AssertionError: If a response status code is not 200.
        KeyError: If a requested column (or ``created_utc``) is absent from
            the collected data.
    """
    # Pick the text field used for filtering and the columns to keep.
    if kind == 'comment':
        text_field, fields = 'body', list(comfields)
    elif kind == 'submission':
        text_field, fields = 'selftext', list(subfield)
    else:
        text_field, fields = 'selftext', None

    # Base URL for the chosen endpoint and subreddit, 500 items per request.
    stem = f"https://api.pushshift.io/reddit/search/{kind}/?subreddit={subreddit}&size=500"

    # One DataFrame per request that yielded at least one usable item.
    frames = []

    for x in range(1, times + 1):
        # NOTE(review): only `after` is set, so successive windows presumably
        # overlap; duplicate rows are removed further down.
        url = f"{stem}&after={skip * x}d"
        print(url)

        # A timeout keeps a stalled connection from hanging the whole run.
        response = requests.get(url, timeout=30)

        # Raised explicitly (rather than via `assert`) so the check still runs
        # under `python -O`; the exception type is kept for existing callers.
        if response.status_code != 200:
            raise AssertionError(
                f"Pushshift returned HTTP {response.status_code} for {url}"
            )

        # Keep only items with more than 10 characters of text; `or ''` treats
        # a missing or null text field as blank instead of raising.
        kept = [item for item in response.json()['data']
                if len(item.get(text_field) or '') > 10]

        # Skip empty pages so they cannot influence dtypes during concat.
        if kept:
            frames.append(pd.DataFrame.from_dict(kept))

        # Pause between requests to avoid overloading the API.
        time.sleep(3)

    if frames:
        full = pd.concat(frames, sort=False)
    else:
        # Nothing collected: return an empty frame with the expected columns
        # instead of failing inside concat or on column selection.
        full = pd.DataFrame(columns=fields if fields is not None else ['created_utc'])

    if fields is not None:
        # Keep the requested columns and drop exact duplicate rows.
        full = full[fields].drop_duplicates()

    if kind == 'submission':
        # Keep self (text) posts only.
        full = full.loc[full['is_self'].eq(True)]

    # Add the calendar date of each item; `assign` builds a new frame, which
    # avoids writing into a filtered slice.
    full = full.assign(timestamp=full['created_utc'].apply(get_date))
    print(full.shape)
    return full

