In [37]:
# Load .env enviroment variables
import os
from dotenv import load_dotenv
# Load your .env file
load_dotenv("/Users/ludovicschneider/Bootcamp/LS.env")

import pandas as pd
import datetime as dt

# Import Pushift API
from psaw import PushshiftAPI
api = PushshiftAPI()

# Import requests
import requests

## 1st API: Retrieving the data via the Pushift Database ##
This is a better way to extract large amount of data

In [120]:

def pushiftapi (subreddit, start_year, start_month, start_date, end_year, end_month, end_date, max_posts):
    '''Returns pd.DataFrame with psts for the given time-window and subbreddit from Pushift API/database
    ...
    
    Parameters
    ----------
    subreddit : str() - name of the subreddit you want to fetch the data from - example : wallstreetbets
    start_year : int() - year in the format YYYY
    start_month : int() - year in the format MM
    start_date : int() - year in the format DD
    end_year : int() - year in the format YYYY
    end_month : int() - year in the format MM
    end_date : int() - year in the format DD 
    
    Returns
    -------
    df_clean : pd.DataFrame with list of posts with the attributes ''title', 'score', 'body', 'timestamp', 'author'' price data
    Print number of row + start and end time
    generate a CSV file
    
    '''
    
    #Set the parametters to fetch the data from the database
    #Set teh time window
    start_time= int(dt.datetime(start_year,start_month,start_date).timestamp())
    end_time= int(dt.datetime(end_year,end_month,end_date).timestamp())

    # Set the request by specifying the filters/data we want to extract, 
    # in our case we will focus on Author - Title - Selftext(body) - score
    # the score will be always 1 because this database contain only the fist submission/post and hence don't have the following "events" (comments/votes)

    wsb_data = api.search_submissions(after=start_time, before=end_time,
                                      subreddit= 'wallstreetbets',
                                      filter=['author', 'title', 'selftext', 'score'],
                                      limit= 200000
                                     )
    # Check that we have no errors
    print(wsb_data)
    
    # Use "thing.d_" a dictionary containing all of the data attributes attached to the variable called "thing" 
    # (which otherwise would be accessed via dot notation). 
    # This is a quick way to pushing results into a pandas dataframe 

    df = pd.DataFrame([thing.d_ for thing in wsb_data])
    
    # transforming utc_date
    df_clean = df.drop(columns='created').rename(columns={'created_utc':'timestamp', 'selftext':'body'})
    df_clean['timestamp'] = df_clean['timestamp'].apply(lambda x: dt.datetime.utcfromtimestamp(x))
    df_clean = df_clean[['title', 'score', 'body', 'timestamp', 'author']]
    
    # Creating the CSV file for the data analysis
    df_clean.to_csv('wsb_pushshift_data.csv')
    
    # Print results
    print(f'Shape of the Data_Frame : {df_clean.shape(1)}')
    print(f'Start window : {df_clean.timestamp.iat[0]}')
    print(f'End window : {df_clean.timestamp.iat[-1]}')
    
    return
    

In [121]:
pushiftapi('wallstreetbets', 2021, 1, 1, 2021, 7, 10, 200000)

<generator object PushshiftAPIMinimal._search at 0x7fcc1b72eed0>
(200000, 5)
start window : 2021-07-10 03:59:25
end window : 2021-03-29 09:37:35


## 2nd API: Retrieving the data via Reddit API directly ##
It is not the best way to retrieve large dataset but it is a good way to stream real time and to keep updates on posts. As exlpain, the first API doesn't "update" to take into accounts the comments and votes of a post, it only retrieves the intial post/submission.

We are not going to use this API in our example but we want to give teh user teh flexibility to actually retrieve all teh information if you want to includes comments/votes information. 

The constraints of this API is that you are limited to 100 posts per requests. Hence we had to create a loop to allow the user to go back in time as much  as desired. However this limitation makes it harder/longer to build a large dataset to work on for our machine learning exercise. 

Therefore this part will be commented out but can be activated if needed.

In [133]:
def reddit_direct_api(subreddit, max_batch, limit_posts):
    ''' Returns a pd.DataFrame with posts fetched from the direct Reddit API. 
        Reddit API lets you fetch only 100 posts at a time and hence you need to specify how many batchs of 100
        you want to use to build your DF
        
        To run the function you need Reddit API keys and your Reddit account password saved in an .env
        format : 
                client_id = os.getenv("REDDIT_CLIENT_ID")
                reddit_secret_key = os.getenv("REDDIT_SECRET_KEY")
                reddit_pw = os.getenv('REDDIT_PW')
    
    Parameters
    ----------
    subreddit : str() - name of the subreddit you want to fetch the data from - example : wallstreetbets
    max_batch: int() - max of loops we want to fetch (will determine how far back you want to go)
                     (note two woudl be equal to 3 batches)

    limit_posts : int() - max posts to retrieve per batch
    
    Returns
    -------
    df_reddit : pd.DataFrame with list of posts with the attributes ''title', 'score', 'body', 'timestamp', 'author'' price data
    Print number of row + start and end time
    generate a CSV file
    '''
    
    # Set Reddit API public and secret keys + Reddit account password
    # We need to Request a temporary OAuth token from Reddit to access the API,
    # to do so we need our keys and password:

    client_id = os.getenv("REDDIT_CLIENT_ID")
    reddit_secret_key = os.getenv("REDDIT_SECRET_KEY")
    reddit_pw = os.getenv('REDDIT_PW')

    # Prepare the authorization
    auth = requests.auth.HTTPBasicAuth(client_id, reddit_secret_key)
    
    # Setting up your Reddit logins
    data_id = {'grant_type': 'password',
               'username': 'diveride',
               'password': reddit_pw
              }
    # As per Reddit API doc, we create the headers needed to access the website
    headers = {'User-Agent': 'API_project'}

    # Retrieve the access_token
    res = requests.post('https://www.reddit.com/api/v1/access_token', auth=auth, data=data_id, headers=headers)

    # Store the access_token to a variable
    access_token = res.json()['access_token']
    
    # Adding the access token to our headers and format it as a string
    headers['authorization'] = f'bearer {access_token}'

    # Print the access_token to make sure it worked in a proper format
    print(f'Access Token confirmation:{headers}')
    
    # Checking that we can access the website with a result 200
    # Important - the access Token is only valid for an hour
    requests.get('https://oauth.reddit.com/api/v1/me', headers=headers)

    # Retreiving the data from WSB
    # This is where we created the loop function allowing you to go back as much as needed by batches of 100 posts(max authorized by the API)
    # Since by default the request function will fetch the last 100 posts, we need to force the request to start before a specific post ID
    # hence we create a last_id varable which will set the start of teh next batch
    last_id=""

    # Set the "token to keep track on number of batches"
    num=float()

    # The loop code
    df_res= pd.DataFrame()
    while num <=max_batch:
        num += 1
        wsb_data1 = requests.get('https://oauth.reddit.com/r/'+ subreddit, headers=headers, params={'limit': limit_posts, 'after': 'last_id'})
        for post in wsb_data1.json()['data']['children']:
            df_res= df_res.append({
                'author': post['data']['author'],
                'title': post['data']['title'],
                'selftext': post['data']['selftext'],
                'score': post['data']['score'],
                'time': post['data']['created_utc']
            }, ignore_index=True)
        last_id= post['kind'] + '_' + post['data']['id']
        
    # Fomating the unix time into days
    df_res['time'] = df_res['time'].apply(lambda x: dt.datetime.utcfromtimestamp(x))
    df_res = df_res.rename(columns={'time':'timestamp', 'selftext':'body'})
    df_reddit = df_res[['title', 'score', 'body', 'timestamp', 'author']]
    
    # Creating the CSV file for the data analysis
    df_reddit.to_csv('wsb_reddit_api_data.csv')
    
    # Print results
    print(df_reddit.shape)
    print(f'Start window : {df_reddit.timestamp.iat[0]}')
    print(f'End window : {df_reddit.timestamp.iat[-1]}')
    
    return

In [134]:
reddit_direct_api('wallstreetbets', 10, 100)

Access Token confirmation:{'User-Agent': 'API_project', 'authorization': 'bearer 656422675188-fJPMrpxP__rI-eM5_7jXJU8hYxUSnA'}
(1122, 5)
start window : 2021-07-09 20:00:17
end window : 2021-07-08 18:56:30
