# GA DSI 21 - Capstone Project 
## Ayman al Amery

## Hypothesis

Can we predict whether a reddit user has interacted / will interact with a particular subreddit (e.g. wallstreetbets) based on their previous interactions with other subreddits?

Where an interaction with a subreddit is defined as publishing a post or comment in that subreddit. 

## Executive Summary

- Used OAUTH Web Token to access Reddit API from which data was sourced
- Definined several seed subreddits from which I collected the authors of the 100 most recent posts
- Then collected the authors of the comments on each of the 100 most recent posts


- For each author, I accessed the metadata of each of their 100 most recent posts and 100 most recent comments
- From the metadata I collected the subreddit, score, number of awards & number of comments for each post/comment (Note: comment metadata contains no information regarding number of comments/replies made on author's comment so filled with NaN)
- Reddit does not provide data regarding which subreddits a user is subscribed to, as such this alternative was used to serve as proxy


- For each given user I aggregated the data by subreddit calculating the number of interactions by the user with that particular subreddit and mean of each of the other features mentioned above
- Then for each subreddit interacted with by the given user, I used the various attributes to create a single value for that subreddit and user pair
- Finally I transformed the results into a single row of a DataFrame for each user


- Something about EDA
- Removing outliers
- Something about Feature Selection


- Something about Modelling and Analysis/Evaluation

# Data Mining

In [7]:
# Importing libraries

import requests
import requests.auth
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [8]:
# Credentials

clientid = '*****'
secret = '*****'
app_name = '*****'

username = '*****'
password = '*****'

In [17]:
# Getting Web Token to access Reddit API

client_auth = requests.auth.HTTPBasicAuth(f'{clientid}', f'{secret}')
post_data = {"grant_type": "password", "username": f"{username}", "password": f"{password}"}
headers = {"User-Agent": f"script:com.example.{app_name}:v1.0.0 (by u/{username})"}
response = requests.post("https://www.reddit.com/api/v1/access_token", auth=client_auth, data=post_data, headers=headers)
token = response.json()['access_token']
token

'61805754-br2x43JsLFQuYL1fKa-piAOoDbrtrA'

In [9]:
# 'wallstreetbets', 'aww', 'food', 'science'

seed_subreddit = 'science'

users = []

In [10]:
# Getting the authors and URLs of the 100 most recent posts in the seed subreddit

links = []

headers = {"Authorization": f"bearer {token}", "User-Agent": f"script:com.example.{app_name}:v1.0.0 (by u/{username})"}
response = requests.get(f"https://oauth.reddit.com/r/{seed_subreddit}?limit=100", headers=headers)


for i in range(len(response.json()['data']['children'])):
    
    author = response.json()['data']['children'][i]['data']['author']
    link = response.json()['data']['children'][i]['data']['permalink']
    
    users.append(author)
    links.append(link)

In [11]:
# Getting the authors of comments on each of the 100 most recent posts 

for post in links:
    
    headers = {"Authorization": f"bearer {token}", "User-Agent": f"script:com.example.{app_name}:v1.0.0 (by u/{username})"}
    response = requests.get(f"https://oauth.reddit.com{post}?limit=1000", headers=headers)
    
    for i in range(len(response.json()[1]['data']['children'])):
        
        try:
        
            author = response.json()[1]['data']['children'][i]['data']['author']
        
            users.append(author)
            
        except:
            
            pass

    time.sleep(1)

In [12]:
# Identifying number of unique users

len(users), len(set(users))

(1302, 959)

In [13]:
# Saving unique users to CSV

unique_users = list(set(users))
users = pd.DataFrame(unique_users, columns=['user'])
users.to_csv(f'{seed_subreddit}_users.csv')

In [16]:
# Defining an empty dictionary 

subreddits = {}

In [19]:
# 100 most recent posts and 100 most recent comments made by each user

for user in unique_users:
    
    print(user)
    
    # Creating a list of lists with a single element containing strings that will serve as column names
   
    user_activity = [['subreddit', 'score', 'awards', 'comments']]
    
    # Posts made by User

    headers = {"Authorization": f"bearer {token}", "User-Agent": f"script:com.example.{app_name}:v1.0.0 (by u/{username})"}
    posts = requests.get(f"https://oauth.reddit.com/user/{user}/submitted?limit=100", headers=headers)
    
    # User may not have made any posts
    
    try:
        
        # User has made at least one post
    
        for x in posts.json()['data']['children']:

            
            p = []
            # The subreddit the post was published in
            p.append(x['data']['subreddit'])
            # The total score the post has received thus far 
            p.append(x['data']['score'])
            # The total number of awards the post has received thus far
            p.append(x['data']['total_awards_received'])
            # The number of comments the post has received thus far
            p.append(x['data']['num_comments'])

            user_activity.append(p)
        
    except:
        
        # User has made no posts 
        
        pass
        
    # Comments made by User

    headers = {"Authorization": f"bearer {token}", "User-Agent": f"script:com.example.{app_name}:v1.0.0 (by u/{username})"}
    comments = requests.get(f"https://oauth.reddit.com/user/{user}/comments?limit=100", headers=headers)

    # User may not have made any comments
    
    try:
        
        # User has made at least one comment
        
        for x in comments.json()['data']['children']:

            c = []
            # The subreddit the comment was published in
            c.append(x['data']['subreddit'])
            # The total score the comment has received thus far 
            c.append(x['data']['score'])
            # The total number of awards the comment has received thus far
            c.append(x['data']['total_awards_received'])
            # Inserting NaN as there is no data regarding number of comments received on a comment 
            c.append(np.nan)

            user_activity.append(c)
            
    except:
        
        # User has made no comments 
        
        pass
        
    # Establishing a key : value pair of the format username : list of lists
    
    subreddits[user] = user_activity
    
# If a user has no posts and no comments, it is likely because their account has been deleted


SmokeyBare
machine_yearning
LetReasonRing
TomorrowWeKillToday
Rupertfitz
flyunderradar
Zupheal
LoverOfPricklyPear
hubertortiz
grey_seal77
icecreamlava
paulfromatlanta
nothaut
woNubvJjg5KtwoHAOg1A
WildGooseCarolinian
bitskewer
IOnlySayMeanThings
MisterEChops
cockroachjuice
unabashedboy
differentiatedpans
Luffy507
Lord_Augastus
Not_Legal_Advice_Pod
MissionComfortable47
NicNoletree
Exastiken
Callec254
DrunkenSealPup
PistisDeKrisis
megalink5713
Gallionella
ImAlsoAHooman
Bubotuberpuss
getupkitten
cbbuntz
Mythril_Bahaumut
Fun-Dragonfruit2999
kingofwale
rustoo
Darth_Kahuna
Hieb
systemprocessing
ExtraArrogantBastard
mcninja77
PhD_Pwnology
ConcussionsOfAParot
shartymcqueef
kink-dinka-link
KillerJupe
JTiB
fusiformgyrus
jaguarthrone
LiCHtsLiCH
Rowdycc
Azahk101
SteinersGrave
vivaramones
DeadPoster
MilkofGuthix
rbalduf1818
devnull791101
Nervous-Violinist-32
ThePathToOne
FartandSminal
DreddPirateJonesy
pencile5
YetAnotherEden
AmiInderSchweiz
SpyTheRedEye
Slapppyface
NightlyWry
MacduffFifesNo1Thane
M

ReasonablyBadass
eric9495
LONEGOAT13_
alnueman1
Philmeiweep
Davo-80
_bobby_tables_
daemn42
bindermichi
Ok-File2825
BeginningTower2486
Alt_throwaway1020
reddititty69
Not_A_Bird11
HammofGlob
dodsbo
siriously1234
SamJackson01
shinypennyonthefloor
Rispy_Girl
Arkeband
Wolfram_Hebmuller
vrosej10
crostrom
Stuart66
TedTyro
ArchitectNebulous
Brynhilr
ghaldos
Johnmagee33
leftoverpotatosalad
Individual_Radio4523
internetpointsbitch
Artistic_Sound848
ooofest
narrative_device
I_never_post_but
PegasusFolley
LynxDiscombobulated6
issastrayngewerld
TonLoc1281
deadeye619
Visual_Tumbleweed644
Viddycentt
StoicOptom
Dessamba_Redux
CurlSagan
LordMooGoo
errantgrammar
skoltroll
PistachioMarsupial
socruisemebabe
stupidhoes
literallair
CaseyBoogies
kickassdonkey
DarkEvilHedgehog
Ragnar_Dragonfyre
amesydragon
virtuzoso
totallylambert
goosebattle
ToxDocUSA
Dd0G91
canadianprotoss
jorrylee
Swoshu
travelnman85
jebrennan
MediumProfessorX
spoobydoo
dvdmaven
Lokiwastxtonly
mymar101
QuantumHope
inbetween_inbetween
jimbo

In [21]:
# Defining a empty dictionary

random_dict = {}

In [22]:
# Coverting list of lists into pandas DataFrame for each user

for user in subreddits.keys():
    
    random_dict[user] = pd.DataFrame(subreddits[user][1:], columns=subreddits[user][0])

In [23]:
len(random_dict.keys())

959

In [24]:
# Removing users who have made no posts or comments (ie accounts have been deleted)

to_del = []

for user in random_dict.keys():
    
    if len(random_dict[user]) == 0:
        
        to_del.append(user)
        
for user in to_del:
    
    del random_dict[user]

In [25]:
len(random_dict.keys())

957

In [27]:
# Removing users sourced from other subreddits ('aww', 'science', 'food') who have interacted with 'wallstreetbets'

if seed_subreddit != 'wallstreetbets':

    to_del = []

    for user in random_dict.keys():

        if 'wallstreetbets' in random_dict[user].subreddit:

            to_del.append(user)

    for user in to_del:

        del random_dict[user]

In [28]:
len(random_dict.keys())

957

# Feature Engineering

See example walkthrough with single user

### Scoring Methodology

Number of posts/comments made on subreddit

x ( 1 + (score on subreddit / sum of all scores received) )

x ( 1 + (awards on subreddit / sum of all awards received) ) 

x ( 1 + (comments received on subreddit / sum of all comments received) )

In [29]:
# Defining an empty dictionary

new_dict = {}

In [30]:
for user in random_dict.keys():
    
    # Creating a list of lists with a single element containing strings that will serve as column names
    
    pss = [['subreddit', 'interactions', 'score', 'awards', 'comments']]

    x = random_dict[user]

    for sub in x.subreddit.unique():
        
        if sub != 'wallstreetbets':

            ps = []
            # The name of the subreddit
            ps.append(sub)
            # The number of interactions with that subreddit
            ps.append(x.subreddit.value_counts()[sub])
            # The mean score received across all interactions with that subreddit
            ps.append(x[x.subreddit == sub].score.mean())
            # The mean number of awards received across all interactions with that subreddit
            ps.append(x[x.subreddit == sub].awards.mean())
            # The mean number of comments received across all interactions with that subreddit
            ps.append(x[x.subreddit == sub].comments.mean())

            pss.append(ps)

    # Converting the list of lists into a pandas dataframe
    
    x = pd.DataFrame(pss[1:], columns=pss[0])

    # Defining an empty list
    
    a = []

    for i in x.index:
        
        # If a given row/subreddit does not contain a NaN value in the 'comments' column
        # the user has published at least one post in that subreddit

        if x.comments.isna()[i] == False: # Posts 

            # Using the methodology defined above, a users score/value for a given subreddit will be calculated
            
            # We will be using normalised values so we must account for cases where a user has never received 
            # any awards or comments as this would result in division by 0
            
            if x.awards.sum() != 0 and x.comments.sum() != 0 :

                # Creating a list containing the elements that will be used to calculate a single value

                aa = [x.interactions[i], 1 + (x.score[i] / x.score.sum()), 1 + (x.awards[i] / x.awards.sum()), 
                      1 + (x.comments[i] / x.comments.sum())]

            elif x.comments.sum() == 0:

                # Creating a list containing the elements that will be used to calculate a single value
                
                aa = [x.interactions[i], 1 + (x.score[i] / x.score.sum()), 
                      1 + (x.awards[i] / x.awards.sum())]

            elif x.awards.sum() == 0:

                # Creating a list containing the elements that will be used to calculate a single value

                aa = [x.interactions[i], 1 + (x.score[i] / x.score.sum()), 
                     1 + (x.comments[i] / x.comments.sum())]

            # Finally calculating the single score/value by taking the product of the elements in the list
                
            aaa = np.prod(aa)


        # If a given row/subreddit contains a NaN value in the 'comments' column
        # the user has not published any posts in that subreddit
            
        else: # Comments

            x.comments[i] = 0

            if x.awards.sum() != 0 and x.comments.sum() != 0 :
                
                # Creating a list containing the elements that will be used to calculate a single value

                aa = [x.interactions[i], 1 + (x.score[i] / x.score.sum()), 1 + (x.awards[i] / x.awards.sum()), 
                     1 + (x.comments[i] / x.comments.sum())]

            elif x.comments.sum() == 0:

                # Creating a list containing the elements that will be used to calculate a single value

                aa = [x.interactions[i], 1 + (x.score[i] / x.score.sum()), 
                      1 + (x.awards[i] / x.awards.sum())]

            elif x.awards.sum() == 0:

                # Creating a list containing the elements that will be used to calculate a single value
                
                aa = [x.interactions[i], 1 + (x.score[i] / x.score.sum()), 
                     1 + (x.comments[i] / x.comments.sum())]
                
            # Finally calculating the single score/value by taking the product of the elements in the list
            
            # As the user has never posted on this subreddit but only commented, the final value is half weighted

            aaa = np.prod(aa) / 2


        # Appending the final value to the empty list defined above

        a.append(aaa)

    # Inserting final value into a new column in the dataframe 

    x['value'] = a
    
    # Reducing the dataframe to just the columns of interest
    x = x[['subreddit', 'value']]
    
    # Setting the subreddit names as the dataframe index
    x.reset_index(inplace=True, drop=True)
    x.set_index('subreddit', inplace=True)
    x.index.name = None
    
    # Redefining the 'value' column name as the user's username
    x.columns = [user]
    
    # Transposing the dataframe from a single column into a single row
    x = x.T
    
    # Assigning target classes
    
    if seed_subreddit == 'wallstreetbets':
        
        x['Target'] = 1
        
    else:
        
        x['Target'] = 0

        
    new_dict[user] = x

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x.comments[i] = 0
  1 + (x.awards[i] / x.awards.sum())]
  1 + (x.awards[i] / x.awards.sum())]


In [31]:
# Creating a list of all subreddits users have interacted with

subs_list = []

for x in new_dict.keys():
    
    for y in new_dict[x].columns:
        
        subs_list.append(y)

In [32]:
# Checking number of unique subreddits

len(subs_list), len(set(subs_list))

(38329, 9396)

In [33]:
# Getting a list of unique list of subreddits users have interacted with (+ 'Target')

subs_set = list(set(subs_list))

In [34]:
# Creating an empty (NaN) DataFrame with users as rows and subreddits + Target as columns

df = pd.DataFrame(np.nan, columns=subs_set, index=new_dict.keys())

In [39]:
# Populating the empty DataFrame with user/subreddit values

for x in new_dict.keys():
    
    for y in new_dict[x].columns:
        
        df.loc[x, y] = new_dict[x].loc[x, y]

In [64]:
df.shape

(862, 9395)

In [66]:
# Checking for empty columns

len(df.isna().sum()[df.isna().sum() == df.shape[0]].index)

849

In [67]:
# Removing empty columns

drop_cols = list(df.isna().sum()[df.isna().sum() == df.shape[0]].index)
df.drop(columns=drop_cols, inplace=True)

In [69]:
df.shape

(862, 8546)

In [70]:
# Making sure 'wallstreetbets' is not in the DataFrame

'wallstreetbets' in df.columns

False

In [74]:
# Making sure 'Target' is in the DataFrame

'Target' in df.columns

True

In [76]:
# Taking a look at the populated DataFrame

df.head()

Unnamed: 0,RedditDayOf,InsideTheSoulStone,WatchPeopleDieInside,xcmtb,Irony,bristol,nocontext,u_EmbarrassedHelp,AskDoctorSmeeee,Dodocodes,...,ryzen,BPDlovedones,dirtypenpals,crtgaming,minipainting,EverythingScience,CatsISUOTTATFO,EulaMains,u_ExmoColdDodger,NarcoticsAnonymous
SmokeyBare,,,,,,,,,,,...,,,,,,,,,,
machine_yearning,,,,,,,,,,,...,,,,,,,,,,
LetReasonRing,,,,,,,,,,,...,,,,,,,,,,
TomorrowWeKillToday,,,,,,,,,,,...,,,,,,,,,,
Rupertfitz,,,,,,,,,,,...,,,,,,3.504052,,,,


In [78]:
# Saving DataFrame to CSV

if seed_subreddit = 'wallstreetbets':
    
    df.to_csv(f'target_1.csv')
    
else:

    df.to_csv(f'target_0_{seed_subreddit}.csv')

# Concatinating seperate DataFrames into complete dataset

In [None]:
# Reading CSVs into DataFrames

wsb = pd.read_csv('target_1.csv')
aww = pd.read_csv('target_0_aww.csv')
science = pd.read_csv('target_0_science.csv')
food = pd.read_csv('target_0_food.csv')

In [None]:
# Setting users as index

wsb.set_index('Unnamed: 0', inplace=True)
wsb.index.name = None

aww.set_index('Unnamed: 0', inplace=True)
aww.index.name = None

science.set_index('Unnamed: 0', inplace=True)
science.index.name = None

food.set_index('Unnamed: 0', inplace=True)
food.index.name = None

In [None]:
# Concatinating seperate DataFrames into complete dataset 

df = pd.concat([wsb, aww, science, food])

In [None]:
df.shape

In [None]:
df.Target.value_counts(normalize=True)

In [None]:
# Removing duplicate users
# Would have used .drop_duplicates(), however it is possible that a user has had additional interactions since 

drop_rows = df.index.value_counts()[df.index.value_counts() > 1].index
df2.drop(index=drop_rows, inplace=True)

In [None]:
# Removing empty columns

drop_cols = list(df.isna().sum()[df.isna().sum() == df.shape[0]].index)
df.drop(columns=drop_cols, inplace=True)

In [None]:
# Checking out DataFrame

df.head()

In [None]:
# Saving complete dataset to CSV

df.to_csv('reddit_class_data.csv')