# Project 3 Web APIs & NLP

# Import libraries

In [1]:
import requests
import time
import pandas as pd

# Get the data from Reddit
Use the Pushshift Reddit API to download submissions from r/sewing and r/3Dprinting.

In [2]:
# Endpoint and query parameters for the r/sewing submissions
sew_url = 'https://api.pushshift.io/reddit/search/submission'
sew_params = dict(
    subreddit='sewing',
    size=100,
    before=None,  # cutoff timestamp; None for the initial request
)

In [3]:
# Endpoint and query parameters for the r/3Dprinting submissions
printing3d_url = 'https://api.pushshift.io/reddit/search/submission'
printing3d_params = dict(
    subreddit='3Dprinting',
    size=100,
    before=None,  # cutoff timestamp; None for the initial request
)

In [4]:
# Get data from reddit
def get_data(url, params, iteration=20, pause=2):
    """Collect posts from a paginated listing into a DataFrame.

    Each request is sent through ``get_retry``. After a successful,
    non-empty page, the ``created_utc`` of the last post on that page is
    used as the ``before`` value of the next request.

    Parameters
    ----------
    url : str
        Endpoint to query.
    params : dict
        Query parameters for the first request. The dict is copied, so the
        caller's object is never modified.
    iteration : int, default 20
        Maximum number of requests to make.
    pause : float, default 2
        Seconds to wait between consecutive requests.

    Returns
    -------
    pandas.DataFrame
        One row per collected post; empty if nothing was collected.
    """
    # Work on a copy: writing 'before' into the caller's dict would silently
    # change where a later call with the same dict starts.
    query = dict(params)
    data_list = []

    for _ in range(iteration):

        res = get_retry(url, query, 3)

        if res.status_code != 200:
            # Still failing after the retries: wait before asking again
            # instead of immediately sending another request.
            time.sleep(pause)
            continue

        # convert the data
        posts = res.json().get('data') or []

        # An empty page gives no timestamp to continue from, and indexing
        # posts[-1] below would raise IndexError and lose what was collected.
        if not posts:
            break

        data_list.extend(posts)

        # get the utc for the before parameter from the last of the list
        # one response holds a limited number of posts, so keep requesting
        query['before'] = posts[-1]['created_utc']

        # decrease the loads for the website
        time.sleep(pause)

    return pd.DataFrame(data_list)

# if the status codes were 500, 502, 505, retry
def get_retry(url, params, retry_times, timeout=30):
    """GET ``url`` and retry a bounded number of times on failure.

    A request is retried when the response status is 500, 502 or 505, or
    when ``requests`` raises a ``RequestException`` (connection error,
    timeout, ...). There is a 2 second wait before every retry.

    Parameters
    ----------
    url : str
        Endpoint to query.
    params : dict
        Query parameters passed to ``requests.get``.
    retry_times : int
        Number of retries after the first attempt. Values below 0 are
        treated as 0, so at least one request is always made.
    timeout : float, default 30
        Seconds passed to ``requests.get`` as ``timeout`` so a stalled
        connection cannot block forever.

    Returns
    -------
    requests.Response
        The first response whose status is not in the retried set, or the
        response of the final attempt whatever its status.

    Raises
    ------
    requests.RequestException
        If the final attempt fails at the transport level.
    """
    # NOTE(review): 505 is "HTTP Version Not Supported"; 503/504 may have
    # been intended here - confirm before changing the set.
    retry_statuses = (500, 502, 505)

    # Guard against a negative count, which would skip the loop entirely
    # and make the function return None.
    attempts = max(retry_times, 0) + 1

    for attempt in range(attempts):
        is_last = attempt == attempts - 1

        try:
            res = requests.get(url, params, timeout=timeout)
        except requests.RequestException:
            if is_last:
                raise
            time.sleep(2)
            continue

        if res.status_code in retry_statuses and not is_last:
            time.sleep(2)
            continue

        return res
    

In [5]:
# Scrape each subreddit into its own DataFrame, using get_data's default iteration count
data_sew = get_data(url=sew_url, params=sew_params)
data_3dprint = get_data(url=printing3d_url, params=printing3d_params)

# Save the raw data

In [9]:
# Write both raw scrapes to CSV, leaving out the DataFrame index
for _frame, _csv_path in (
    (data_sew, '../data/sewing_raw.csv'),
    (data_3dprint, '../data/3dprinting_raw.csv'),
):
    _frame.to_csv(_csv_path, index=False)