# Project 3: Reddit API to CSV
   ---
  *By Ethan Koh, 18 May 2020*

## Import Libraries

In [10]:
import requests
import pandas as pd
import time
import random
from tqdm.notebook import tqdm

## Scrape Reddit API and save as CSV

In [11]:
# JSON listing endpoint of the 'CasualConversation' subreddit to scrape from
url = "https://www.reddit.com/r/CasualConversation.json"

In [12]:
# Collect the raw post dicts from every page into `posts`.
# `after` is the pagination cursor returned with each page; it is None until
# the first page has been fetched.
posts = []
after = None

# Checkpoint file, rewritten after every page so that a later status error
# still leaves everything retrieved so far on disk.
checkpoint_path = '../data/CasualConversation.csv'

# number of pages retrieved
for page in tqdm(range(15)):

    # first page has no cursor; subsequent pages continue from `after`
    if after is None:
        current_url = url + '?&limit=50'
    else:
        current_url = url + '?after=' + after + '&limit=50'

    # View url scraped
    print('Retrieved from : ' + current_url)

    # Request API
    res = requests.get(current_url, headers={'User-agent': 'Pony Inc 1.0'})

    # Check for status error; pages collected so far remain in `posts`
    if res.status_code != 200:
        print('Status error', res.status_code)
        break

    # if no error, continue.
    # Each child of the listing wraps the actual post under its 'data' key
    current_dict = res.json()
    current_posts = [p['data'] for p in current_dict['data']['children']]

    # View number of posts retrieved from page
    print('Number of posts : ' + str(len(current_posts)))
    posts.extend(current_posts)

    # Save progress: overwrite the checkpoint with all posts gathered so far
    pd.DataFrame(posts).to_csv(checkpoint_path, index=False)

    # store next page cursor
    after = current_dict['data']['after']

    # Without a cursor the next iteration would request the first page again
    # and append duplicate posts, so stop here instead.
    if after is None:
        print('No further pages available')
        break

    # generate a random sleep duration to look more 'natural'
    sleep_duration = random.randint(2, 8)
    time.sleep(sleep_duration)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))

Retrieved from : https://www.reddit.com/r/CasualConversation.json?&limit=50
Number of posts : 52
Retrieved from : https://www.reddit.com/r/CasualConversation.json?after=t3_gioe5v&limit=50
Number of posts : 50
Retrieved from : https://www.reddit.com/r/CasualConversation.json?after=t3_gioteo&limit=50
Number of posts : 50
Retrieved from : https://www.reddit.com/r/CasualConversation.json?after=t3_gil7d6&limit=50
Number of posts : 50
Retrieved from : https://www.reddit.com/r/CasualConversation.json?after=t3_gi3enz&limit=50
Number of posts : 50
Retrieved from : https://www.reddit.com/r/CasualConversation.json?after=t3_gi2ed7&limit=50
Number of posts : 50
Retrieved from : https://www.reddit.com/r/CasualConversation.json?after=t3_ghpcys&limit=50
Number of posts : 50
Retrieved from : https://www.reddit.com/r/CasualConversation.json?after=t3_ghnmu1&limit=50
Number of posts : 50
Retrieved from : https://www.reddit.com/r/CasualConversation.json?after=t3_gh3kkh&limit=50
Number of posts : 50
Retriev

In [13]:
# Total number of posts accumulated in `posts` across all retrieved pages
len(posts)

752

In [14]:
# Write every collected post to the CasualConversation CSV (row index omitted)
pd.DataFrame(posts).to_csv(
    '../data/CasualConversation.csv',
    index=False,
)

In [6]:
# JSON listing endpoint of the 'Parenting' subreddit to scrape from
url_2 = "https://www.reddit.com/r/Parenting.json"

In [7]:
# Collect the raw post dicts from every page into `posts`.
# `after` is the pagination cursor returned with each page; it is None until
# the first page has been fetched.
posts = []
after = None

# Checkpoint file for THIS subreddit, rewritten after every page so that a
# later status error still leaves everything retrieved so far on disk.
# (Must not point at the CasualConversation file, or that dataset would be
# overwritten with Parenting posts.)
checkpoint_path = '../data/Parenting.csv'

# number of pages retrieved
for page in tqdm(range(15)):

    # first page has no cursor; subsequent pages continue from `after`
    if after is None:
        current_url = url_2 + '?&limit=50'
    else:
        current_url = url_2 + '?after=' + after + '&limit=50'

    # View url scraped
    print('Retrieved from : ' + current_url)

    # Request API
    res = requests.get(current_url, headers={'User-agent': 'Pony Inc 1.0'})

    # Check for status error; pages collected so far remain in `posts`
    if res.status_code != 200:
        print('Status error', res.status_code)
        break

    # if no error, continue.
    # Each child of the listing wraps the actual post under its 'data' key
    current_dict = res.json()
    current_posts = [p['data'] for p in current_dict['data']['children']]

    # View number of posts retrieved from page
    print('Number of posts : ' + str(len(current_posts)))
    posts.extend(current_posts)

    # Save progress: overwrite the checkpoint with all posts gathered so far
    pd.DataFrame(posts).to_csv(checkpoint_path, index=False)

    # store next page cursor
    after = current_dict['data']['after']

    # Without a cursor the next iteration would request the first page again
    # and append duplicate posts, so stop here instead.
    if after is None:
        print('No further pages available')
        break

    # generate a random sleep duration to look more 'natural'
    sleep_duration = random.randint(2, 8)
    time.sleep(sleep_duration)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))

Retrieved from : https://www.reddit.com/r/Parenting.json?&limit=50
Number of posts : 52
Retrieved from : https://www.reddit.com/r/Parenting.json?after=t3_gi5d6w&limit=50
Number of posts : 50
Retrieved from : https://www.reddit.com/r/Parenting.json?after=t3_ghlhqh&limit=50
Number of posts : 50
Retrieved from : https://www.reddit.com/r/Parenting.json?after=t3_gggoxr&limit=50
Number of posts : 50
Retrieved from : https://www.reddit.com/r/Parenting.json?after=t3_gfv930&limit=50
Number of posts : 50
Retrieved from : https://www.reddit.com/r/Parenting.json?after=t3_gf63wk&limit=50
Number of posts : 50
Retrieved from : https://www.reddit.com/r/Parenting.json?after=t3_gfm4mj&limit=50
Number of posts : 50
Retrieved from : https://www.reddit.com/r/Parenting.json?after=t3_gf7h45&limit=50
Number of posts : 50
Retrieved from : https://www.reddit.com/r/Parenting.json?after=t3_gettsy&limit=50
Number of posts : 50
Retrieved from : https://www.reddit.com/r/Parenting.json?after=t3_gejh78&limit=50
Number

In [8]:
# Total number of posts accumulated in `posts` across all retrieved pages
len(posts)

752

In [9]:
# Write every collected post to the Parenting CSV (row index omitted)
pd.DataFrame(posts).to_csv(
    '../data/Parenting.csv',
    index=False,
)