# Collect Reddit Data Script

As an experiment, change the `total_start_date` and `total_end_date` variables in the function below to different month or year-long periods. Let's see how long it takes/how much data you can collect for different authors and different time periods.

In [33]:
def format_date(date):
    return int(datetime.datetime.strptime(date, "%m-%d-%Y").timestamp())

In [42]:
import requests
import json
import pandas as pd
import datetime
import time


def format_date(date):
    return int(datetime.datetime.strptime(date, "%m-%d-%Y").timestamp())

def collect_reddit_submissions(query, search_start_date, search_end_date):
    
    collect_start_time = datetime.datetime.now()
    
    query = query
    size = 500
    
    search_start_date = datetime.datetime.strptime(search_start_date, "%m-%d-%Y")
    search_end_date = datetime.datetime.strptime(search_end_date, "%m-%d-%Y")
    #We can only get 500 results per request, so we're requesting data for one-week increments between total_start_date and total_end_date
    total_start_date = datetime.date(search_start_date.year, search_start_date.month, search_start_date.day)
    total_end_date = datetime.date(search_end_date.year, search_end_date.month, search_end_date.day)
    week_increment = datetime.timedelta(days=7)
    
    #Make date range for the eventual CSV filename
    date_range = f"{total_start_date}-{total_end_date}"
    
    #A master list where we're appending all the Reddit data, which we will eventually write to a CSV file
    all_reddit_data = []
    
    #Looping through all the one-week increment
    while total_start_date <= total_end_date:
        
        week_start_date = total_start_date.strftime('%m-%d-%Y')
        week_end_date = (total_start_date+week_increment).strftime('%m-%d-%Y')
        
        url = f"""https://api.pushshift.io/reddit/search/submission?q={query}
        &size={size}&after={format_date(week_start_date)}&before={format_date(week_end_date)}"""
        
        response = requests.get(url)
        
        #Making an exception if we get a 429 error that says we've made too many requests
        if response.status_code == 429:
            print("Too many requests. Sleeping for one minute before trying again 😴😴😴")
            time.sleep(60)
            response = requests.get(url)
            reddit_data = response.json()['data']
            for reddit_post in reddit_data:
                all_reddit_data.append(reddit_post)
                
        elif response.status_code != 200:
            print(f'{response.status_code} error resulting from Reddit data request between {week_start_date} and {week_end_date} API request: {url}')
            continue
            
        else:
            reddit_data = response.json()['data']
            for reddit_post in reddit_data:
                all_reddit_data.append(reddit_post)
            
        #Increase the total start date by one week, so we can keep looping through all the data
        total_start_date += week_increment
        
        print(f"{len(reddit_data)} Reddit posts collected between {week_start_date} and {week_end_date}")


    all_reddit_data = pd.DataFrame([reddit_post for reddit_post in all_reddit_data])
    all_reddit_data['full_date'] = pd.to_datetime(all_reddit_data['created_utc'], utc=True, unit='s')
    all_reddit_data['date'] = all_reddit_data['full_date'].dt.strftime("%Y-%m-%d")
           
        
    query_formatted = query.replace(' ', '-')
    filename = f'{query_formatted}-{total_start_date}-{total_end_date}-Reddit-data.csv'
    
    all_reddit_data.to_csv(filename, index=False)
    
    print(f"\n✨✨✨\n{len(all_reddit_data)} total Reddits posts collected\n\nThis Reddit data was written to the file:\n{filename}")
    print(f"\n\nTotal time it took to collect this data:\n{str(datetime.datetime.now() - collect_start_time)}✨✨✨")

    return 

In [44]:
collect_reddit_submissions("David Foster Wallace", "12-31-2009", "3-14-2010")

1 Reddit posts collected between 12-31-2009 and 01-07-2010
0 Reddit posts collected between 01-07-2010 and 01-14-2010
3 Reddit posts collected between 01-14-2010 and 01-21-2010
2 Reddit posts collected between 01-21-2010 and 01-28-2010
4 Reddit posts collected between 01-28-2010 and 02-04-2010
1 Reddit posts collected between 02-04-2010 and 02-11-2010
0 Reddit posts collected between 02-11-2010 and 02-18-2010
1 Reddit posts collected between 02-18-2010 and 02-25-2010
1 Reddit posts collected between 02-25-2010 and 03-04-2010
2 Reddit posts collected between 03-04-2010 and 03-11-2010
4 Reddit posts collected between 03-11-2010 and 03-18-2010

✨✨✨
19 total Reddits posts collected

This Reddit data was written to the file:
David-Foster-Wallace-2010-03-18-2010-03-14-Reddit-data.csv


Total time it took to collect this data:
0:00:06.996641✨✨✨
