# Getting the Data from Reddit

In [1]:
import pandas as pd
import numpy as np
import requests

import time

In [2]:
# Settings for pulling submissions from the Pushshift search API.
# The download function defined below reads `base_url` from module scope.
#
# (All of this can go into our functions.py file.)

endpoint = 'reddit/submission/search'
base_url = 'https://api.pushshift.io/reddit/submission/search'

# Current Unix time in whole seconds; used as the initial "before" cutoff
before_timestamp = round(time.time())

# Query-string parameters sent with each request
params = {
    "sort": 'desc',
    "subreddit": 'datascience',
    "size": 1000,
    "user_removed": False,
    "mod_removed": False,
    "after": None,
    "before": before_timestamp,
    "title:not": "[removed]",
    "selftext:not": "[removed]",
    # Filters not currently in use:
    #   "author": "[removed]"
    #   "link_flair_text"
}

def subreddit_to_dataframe(endpoint, params, max_requests=20, pause_seconds=6, timeout=30):
    """Page backwards through submissions and collect them into one DataFrame.

    Each request is sent to the module-level ``base_url`` with a copy of
    ``params`` whose ``'before'`` value is replaced: first by the current
    time, then by the ``created_utc`` of the last submission on the previous
    page. Paging stops after ``max_requests`` requests, on the first non-200
    response, or when a page comes back empty. Pages collected before a
    failure are kept rather than discarded.

    Parameters
    ----------
    endpoint : str
        Accepted for backward compatibility but not used; the request URL is
        taken from the module-level ``base_url``.
    params : dict
        Query parameters for the request. Must contain ``'subreddit'``, which
        is used to name the output file. The dict is not modified.
    max_requests : int, default 20
        Maximum number of pages to request.
    pause_seconds : float, default 6
        Seconds to sleep after each successful page.
    timeout : float, default 30
        Per-request timeout in seconds passed to ``requests.get``.

    Returns
    -------
    pandas.DataFrame or None
        All collected pages concatenated with a fresh 0..N-1 index, restricted
        to the columns in ``features``. ``None`` if no page was collected; in
        that case no file is written.

    Side effects
    ------------
    Prints progress for every request and writes the combined frame to
    ``../data/<subreddit>.csv`` (without the index).
    """
    features = ['title', 'selftext', 'created_utc', 'num_comments',
                'num_crossposts', 'score', 'subreddit']

    # Work on a copy so the caller's dict keeps its original 'before' value
    query = dict(params)

    # Grab time right now, so we'll get the most recent data
    before_timestamp = round(time.time())
    all_dfs = []

    for _ in range(max_requests):
        print(time.ctime(before_timestamp))

        # Move the window back to the oldest submission seen so far
        query['before'] = before_timestamp

        # A timeout keeps a stalled connection from hanging the notebook
        res = requests.get(base_url, params=query, timeout=timeout)

        if res.status_code != 200:
            print(f'ERROR CODE FOR RES: {res.status_code}')
            # Stop paging but keep the pages that were already downloaded
            break

        print(f'Pulling Data, Status Code: {res.status_code}')

        data = res.json()['data']
        if not data:
            # An empty page means there is nothing older left to fetch
            break

        # The last item is taken as the oldest one on the page; this assumes
        # newest-first ordering (e.g. params["sort"] == 'desc')
        before_timestamp = data[-1]['created_utc']

        # reindex (rather than [features]) fills a column that is absent from
        # this page with NaN instead of raising KeyError
        all_dfs.append(pd.DataFrame(data).reindex(columns=features))
        print(len(all_dfs))
        time.sleep(pause_seconds)

    if not all_dfs:
        # pd.concat raises on an empty list, so bail out explicitly
        print('No data collected; nothing saved.')
        return None

    # ignore_index avoids repeating 0..size-1 once per page in the result
    df_full = pd.concat(all_dfs, ignore_index=True)

    # Create file path to save out
    file_path = f"../data/{query['subreddit']}.csv"

    # Save out
    df_full.to_csv(file_path, index=False)
    return df_full

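Let's pull down submissions using parameters that exclude removed posts: anything removed by the user or a moderator, and anything whose title or selftext is `[removed]`.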

In [3]:
# Query parameters for this pull (same filters as the defaults defined above)
params = {
    "sort": 'desc',
    "subreddit": 'datascience',
    "size": 1000,
    "user_removed": False,
    "mod_removed": False,
    "after": None,
    "before": before_timestamp,
    "title:not": "[removed]",
    "selftext:not": "[removed]",
    # Filters not currently in use:
    #   "author": "[removed]"
    #   "link_flair_text"
}

# Run the download and keep the combined result
df = subreddit_to_dataframe(endpoint, params)

Fri Apr 24 18:48:48 2020
Pulling Data, Status Code: 200
1
Wed Mar 25 06:09:43 2020
Pulling Data, Status Code: 200
2
Thu Feb 27 04:50:49 2020
Pulling Data, Status Code: 200
3
Sat Feb  1 13:29:09 2020
Pulling Data, Status Code: 200
4
Tue Dec 31 00:09:33 2019
Pulling Data, Status Code: 200
5
Wed Nov 27 02:13:58 2019
Pulling Data, Status Code: 200
6
Tue Oct 22 18:38:44 2019
Pulling Data, Status Code: 200
7
Wed Sep 18 02:36:44 2019
Pulling Data, Status Code: 200
8
Mon Aug 12 21:16:45 2019
Pulling Data, Status Code: 200
9
Thu Jul  4 08:14:57 2019
Pulling Data, Status Code: 200
10
Sat May 25 10:20:13 2019
Pulling Data, Status Code: 200
11
Tue Apr 16 08:44:39 2019
Pulling Data, Status Code: 200
12
Mon Mar 11 02:12:20 2019
Pulling Data, Status Code: 200
13
Tue Feb  5 01:44:00 2019
Pulling Data, Status Code: 200
14
Fri Jan  4 09:25:21 2019
Pulling Data, Status Code: 200
15
Mon Dec  3 17:38:17 2018
Pulling Data, Status Code: 200
16
Fri Nov  2 15:01:01 2018
Pulling Data, Status Code: 200
17
Fri Se

In [4]:
# Preview the first five rows of the pulled submissions
df.head(5)

Unnamed: 0,title,selftext,created_utc,num_comments,num_crossposts,score,subreddit
0,[D] How Efficient is EfficientNet?,,1587773504,0,0,1,datascience
1,If I had to start learning Data Science again....,,1587772554,0,0,1,datascience
2,data with 200 columns (mixed data types). How ...,I have a table of data with about 200columns a...,1587768353,0,0,1,datascience
3,How To Know Data Science Is For You?,,1587767893,1,0,1,datascience
4,Hello! I create a subreddit for people applyin...,It is called r/DataScienceAdmissions ! Everyon...,1587766971,0,0,1,datascience
