In [1]:
# Importing relevant libraries

import requests
import requests.auth
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Credentials

clientid = '*****'
secret = '*****'
username = '*****'
password = '*****'

In [3]:
# Acquiring web token to access Reddit API

client_auth = requests.auth.HTTPBasicAuth(f'{clientid}', f'{secret}')
post_data = {"grant_type": "password", "username": f"{username}", "password": f"{password}"}
headers = {"User-Agent": f"script:com.example.WallStBetsSentiment:v1.0.0 (by u/{username})"}
response = requests.post("https://www.reddit.com/api/v1/access_token", auth=client_auth, data=post_data, headers=headers)
token = response.json()['access_token']
token

'61805754-9UsTwvaFbUVSw1Oe0rDLK0Z-rT0pww'

# Data Mining

In [5]:
# 100 most recent posts and 100 most recent comments made by User - OliveInvestor

subreddits = {}

for user in ['OliveInvestor']:
    
    # Creating a list of lists with a single element containing strings that will serve as column names
   
    user_activity = [['subreddit', 'score', 'awards', 'comments']]
    
    # Posts made by User

    headers = {"Authorization": f"bearer {token}", "User-Agent": f"script:com.example.WallStBetsSentiment:v1.0.0 (by u/{username})"}
    posts = requests.get(f"https://oauth.reddit.com/user/{user}/submitted?limit=100", headers=headers)
    
    # User may not have made any posts
    
    try:
        
        # User has made at least one post
    
        for x in posts.json()['data']['children']:

            p = []
            # The subreddit the post was published in
            p.append(x['data']['subreddit'])
            # The total score the post has received thus far 
            p.append(x['data']['score'])
            # The total number of awards the post has received thus far
            p.append(x['data']['total_awards_received'])
            # The number of comments the post has received thus far
            p.append(x['data']['num_comments'])

            user_activity.append(p)
        
    except:
        
        # User has made no posts 
        
        pass
        
        
    # Comments made by User

    headers = {"Authorization": f"bearer {token}", "User-Agent": f"script:com.example.WallStBetsSentiment:v1.0.0 (by u/{username})"}
    comments = requests.get(f"https://oauth.reddit.com/user/{user}/comments?limit=100", headers=headers)

    try:
        
        # User has made at least one comment
    
        for x in comments.json()['data']['children']:

            c = []
            # The subreddit the comment was published in
            c.append(x['data']['subreddit'])
            # The total score the comment has received thus far 
            c.append(x['data']['score'])
            # The total number of awards the comment has received thus far
            c.append(x['data']['total_awards_received'])
            # Inserting NaN as there is no data regarding number of comments received on a comment
            c.append(np.nan)

            user_activity.append(c)
            
    except:
        
        # User has made no comments 
        
        pass
        
    # Establishing a key : value pair of the format username : list of lists
        
    subreddits[user] = user_activity
    
# If a user has no posts and no comments, it is likely because their account has been deleted


In [7]:
subreddits['OliveInvestor']

[['subreddit', 'score', 'awards', 'comments'],
 ['options', 453, 14, 91],
 ['u_OliveInvestor', 2, 0, 2],
 ['wallstreetbets', 45, 0, 32],
 ['wallstreetbets', 44, 2, 42],
 ['wallstreetbets', 136, 3, 188],
 ['StockMarket', 17, 0, 15],
 ['StockMarket', 1, 0, 0],
 ['options', 16, 1, 20],
 ['StockMarket', 7, 0, 0],
 ['StockMarket', 479, 1, 85],
 ['tech', 1, 0, 0],
 ['FluentInFinance', 14, 1, 14],
 ['StockMarket', 6, 0, 7],
 ['wallstreetbets', 207, 0, 91],
 ['StockMarket', 7, 0, 7],
 ['StockMarket', 15, 0, 12],
 ['StockMarket', 17, 0, 10],
 ['news', 19, 0, 27],
 ['StockMarket', 2, 0, 12],
 ['wallstreetbets', 205, 2, 26],
 ['news', 4, 0, 3],
 ['CryptoCurrency', 1, 0, 0],
 ['tech', 1, 0, 0],
 ['stocks', 16, 0, 24],
 ['wallstreetbets', 2, 0, 4],
 ['StockMarket', 30, 0, 29],
 ['stocks', 99, 0, 134],
 ['stocks', 1, 0, 0],
 ['StockMarket', 738, 1, 150],
 ['stocks', 39, 0, 69],
 ['StockMarket', 1, 0, 0],
 ['wallstreetbets', 154, 2, 78],
 ['stocks', 1, 0, 0],
 ['stocks', 1, 0, 1],
 ['options', 54, 1,

In [9]:
# Defining a empty dictionary

random_dict = {}

# Coverting list of lists into pandas DataFrame for the user

for user in subreddits.keys():
    
    random_dict[user] = pd.DataFrame(subreddits[user][1:], columns=subreddits[user][0])

## Data Dictionary 

Subreddit: subreddit in which post/comment was made

Score: net up / down votes received on post/comment

Awards: total number of awards received on post/comment

Comments: number of comments received on post (NaN if interaction is a comment)

In [10]:
# Example dataframe where each row is a unique interaction

random_dict['OliveInvestor']

Unnamed: 0,subreddit,score,awards,comments
0,options,453,14,91.0
1,u_OliveInvestor,2,0,2.0
2,wallstreetbets,45,0,32.0
3,wallstreetbets,44,2,42.0
4,wallstreetbets,136,3,188.0
...,...,...,...,...
158,FluentInFinance,6,0,
159,smallstreetbets,12,0,
160,smallstreetbets,2,0,
161,FluentInFinance,4,0,


# Feature Engineering

In [20]:
for user in random_dict.keys():
    
    # Creating a list of lists with a single element containing strings that will serve as column names
    
    pss = [['subreddit', 'interactions', 'score', 'awards', 'comments']]

    x = random_dict[user]

    for sub in x.subreddit.unique():

        if sub != 'wallstreetbets':
        
            ps = []
            # The name of the subreddit
            ps.append(sub)
            # The number of interactions with that subreddit
            ps.append(x.subreddit.value_counts()[sub])
            # The mean score received across all interactions with that subreddit
            ps.append(x[x.subreddit == sub].score.mean())
            # The mean number of awards received across all interactions with that subreddit
            ps.append(x[x.subreddit == sub].awards.mean())
            # The mean number of comments received across all interactions with that subreddit
            ps.append(x[x.subreddit == sub].comments.mean())

            pss.append(ps)

    # Converting the list of lists into a pandas dataframe
            
    x = pd.DataFrame(pss[1:], columns=pss[0])

In [21]:
# Example dataframe with data aggregated by subreddit

x

Unnamed: 0,subreddit,interactions,score,awards,comments
0,options,24,25.291667,0.666667,33.1
1,u_OliveInvestor,6,1.166667,0.0,1.0
2,StockMarket,21,64.714286,0.142857,25.333333
3,tech,2,1.0,0.0,0.0
4,FluentInFinance,8,5.0,0.125,14.0
5,news,3,8.666667,0.0,15.0
6,CryptoCurrency,2,1.0,0.0,0.0
7,stocks,9,35.0,0.111111,50.0
8,dividends,4,68.75,0.25,52.25
9,personalfinance,2,4.0,0.0,11.5


## Scoring Methodology:

Number of posts/comments made on subreddit

x ( 1 + (score on subreddit / sum of all scores received) )

x ( 1 + (awards on subreddit / sum of all awards received) ) 

x ( 1 + (comments received on subreddit / sum of all comments received) )

In [22]:
# Defining an empty list

a = []

for i in x.index:
    
    # If a given row/subreddit does not contain a NaN value in the 'comments' column
    # the user has published at least one post in that subreddit

    if x.comments.isna()[i] == False: # Posts

        # Using the methodology defined above, a users score/value for a given subreddit will be calculated
            
        # We will be using normalised values so we must account for cases where a user has never received 
        # any awards or comments as this would result in division by 0

        if x.awards.sum() != 0 and x.comments.sum() != 0 :

            # Creating a list containing the elements that will be used to calculate a single value

            aa = [x.interactions[i], 1 + (x.score[i] / x.score.sum()), 1 + (x.awards[i] / x.awards.sum()), 
                  1 + (x.comments[i] / x.comments.sum())]

        elif x.comments.sum() == 0:

            # Creating a list containing the elements that will be used to calculate a single value
            
            aa = [x.interactions[i], 1 + (x.score[i] / x.score.sum()), 
                  1 + (x.awards[i] / x.awards.sum())]

        elif x.awards.sum() == 0:

            # Creating a list containing the elements that will be used to calculate a single value

            aa = [x.interactions[i], 1 + (x.score[i] / x.score.sum()), 
                 1 + (x.comments[i] / x.comments.sum())]

        # Finally calculating the single score/value by taking the product of the elements in the list
            
        aaa = np.prod(aa)

        
    # If a given row/subreddit contains a NaN value in the 'comments' column
    # the user has not published any posts in that subreddit

    else: # Comments

        x.comments[i] = 0

        if x.awards.sum() != 0 and x.comments.sum() != 0 :

            # Creating a list containing the elements that will be used to calculate a single value

            aa = [x.interactions[i], 1 + (x.score[i] / x.score.sum()), 1 + (x.awards[i] / x.awards.sum()), 
                 1 + (x.comments[i] / x.comments.sum())]

        elif x.comments.sum() == 0:

            # Creating a list containing the elements that will be used to calculate a single value

            aa = [x.interactions[i], 1 + (x.score[i] / x.score.sum()), 
                  1 + (x.awards[i] / x.awards.sum())]

        elif x.awards.sum() == 0:
            
            # Creating a list containing the elements that will be used to calculate a single value

            aa = [x.interactions[i], 1 + (x.score[i] / x.score.sum()), 
                 1 + (x.comments[i] / x.comments.sum())]

        # Finally calculating the single score/value by taking the product of the elements in the list
            
        # As the user has never posted on this subreddit but only commented, the final value is half weighted
            
        aaa = np.prod(aa) / 2
        
# MADE MISTAKE ABOVE, END UP WITH SOMETHING SIMILAR TO INTENDED RESULT BUT NOT QUITE
# ROWS WHERE 'COMMENTS' = NAN REPRESENT SUBREDDITS WHERE ALL INTERACTIONS HAVE BEEN COMMENTS - THESE ARE HALF WEIGHTED
# IF SUBREDDIT HAS ONLY BEEN COMMENTED IN (NO POSTS MADE BY USER) SUBREDDIT SCORE IS HALF WEIGHTED

    # Appending the final value to the empty list defined above

    a.append(aaa)

# Inserting final value into a new column in the dataframe 

x['value'] = a

# Reducing the dataframe to just the columns of interest

x = x[['subreddit', 'value']]  

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x.comments[i] = 0


In [23]:
# Example dataframe containing a set of unique subreddits and calculated values 
# representing the user's level of interaction with each subreddit

x

Unnamed: 0,subreddit,value
0,options,45.769703
1,u_OliveInvestor,6.054051
2,StockMarket,32.535267
3,tech,2.007928
4,FluentInFinance,9.49215
5,news,3.306165
6,CryptoCurrency,2.007928
7,stocks,13.555316
8,dividends,7.45673
9,personalfinance,2.133661


In [24]:
new_dict = {}

# Setting the subreddit names as the dataframe index
x.reset_index(inplace=True, drop=True)
x.set_index('subreddit', inplace=True)
x.index.name = None

# Redefining the 'value' column name as the user's username
x.columns = [user]

# Transposing the dataframe from a single column into a single row
x = x.T

# User HAS interacted with 'wallstreetbets' therefore 'Target' = 1
x['Target'] = 1

new_dict[user] = x

In [25]:
# Example dataframe representing a single row in the final dataset

new_dict['OliveInvestor']

Unnamed: 0,options,u_OliveInvestor,StockMarket,tech,FluentInFinance,news,CryptoCurrency,stocks,dividends,personalfinance,...,smallstreetbets,VIAC,ValueInvesting,science,Superstonk,technology,TheRaceTo10Million,ETFs,AMD_Stock,Target
OliveInvestor,45.769703,6.054051,32.535267,2.007928,9.49215,3.306165,2.007928,13.555316,7.45673,2.133661,...,3.053517,0.503964,1.003964,0.501982,0.503964,0.501982,0.501982,0.501982,0.519821,1
