Now that the Reddit posts are labeled with the stocks they are about, let's gather some statistics on how stock-related posts are distributed across the different subreddits.

In [41]:
import pandas as pd
import os

# Load the labeled posts; the first CSV column is used as the row index.
posts_path = os.path.join(os.getcwd(), 'data', 'full_combined.csv')
df = pd.read_csv(posts_path, index_col=0)
df.tail()

Unnamed: 0,created,title,selftext,upvote_ratio,score,gilded,total_awards_received,num_comments,stock,subreddit
771454,2021-12-24 15:18:30,NVAX - Lambo or Ramen (Part II),Here's a link to my part I - [LINK](https://ww...,0.92,69,1,3,53,"SII,NVAX",wallstreetbets
771559,2021-12-24 20:01:26,Suspicions about TSLA,Why are people so keen on buying stocks and op...,0.56,19,0,0,214,GM,wallstreetbets
771567,2021-12-24 20:18:09,I’ve Had an Epiphany (Or am I retarted? You te...,I came up with a strategy that seems like a no...,0.8,59,0,0,148,ABCL,wallstreetbets
771580,2021-12-24 21:12:42,Best EV play - Bullish as F ….. MB-private ana...,If I were a TESLA investor... though I'm not.....,0.48,0,0,0,47,"TSLA,FORD",wallstreetbets
771602,2021-12-24 22:22:52,China's Currency Trouble.,The Chinese Yuan is a volatile currency and th...,0.79,68,0,0,68,PBC,wallstreetbets


In [42]:
# Number of posts in each subreddit, displayed from least to most active.
posts_per_subreddit = df.groupby('subreddit').size()
posts_per_subreddit.sort_values()

subreddit
securityanalysis             3
robinhood                    7
financialindependence       11
forex                       36
robinhoodpennystocks       138
investing                  218
stockmarket                411
options                    633
pennystocks                807
stocks                    2008
personalfinance           6966
wallstreetbets           16809
gme                      20702
dtype: int64

In [43]:
stocks = df['stock']

# Collect every distinct ticker mentioned in any post; each entry of
# `stocks` is a comma-separated list of symbols.
stock_set = set()
for tickers in stocks:
    stock_set.update(tickers.split(','))

len(stock_set)

2898

In [44]:
import pprint

# Tally how many posts mention each ticker symbol.
symbol_counts = {}
for tickers in stocks:
    for symb in tickers.split(','):
        symbol_counts[symb] = symbol_counts.get(symb, 0) + 1

# (count, symbol) pairs, most frequently mentioned first.
n_occurrences = sorted(
    ((count, symb) for symb, count in symbol_counts.items()),
    reverse=True,
)

pprint.pprint(n_occurrences)


[(25874, 'GME'),
 (3834, 'AMC'),
 (1671, 'AMZN'),
 (1559, 'IRS'),
 (1444, 'TSLA'),
 (1266, 'AAPL'),
 (1184, 'TWTR'),
 (1019, 'BB'),
 (869, 'RC'),
 (849, 'NOK'),
 (779, 'NDAQ'),
 (728, 'MSFT'),
 (709, 'TD'),
 (708, 'CC'),
 (664, 'CS'),
 (636, 'CCF'),
 (540, 'PLTR'),
 (521, 'AMD'),
 (506, 'BC'),
 (472, 'MSM'),
 (466, 'HYT'),
 (455, 'NFLX'),
 (403, 'FORD'),
 (393, 'RSI'),
 (393, 'ATH'),
 (375, 'IBKR'),
 (374, 'GSBD'),
 (374, 'BAC'),
 (366, 'FICO'),
 (341, 'MS'),
 (341, 'DTC'),
 (333, 'APR'),
 (325, 'NVDA'),
 (275, 'GM'),
 (262, 'PFE'),
 (262, 'EFX'),
 (261, 'SI'),
 (235, 'INTC'),
 (233, 'MA'),
 (218, 'SPCE'),
 (213, 'SOFI'),
 (212, 'TRU'),
 (202, 'NIO'),
 (196, 'CLOV'),
 (193, 'ALLY'),
 (190, 'TLRY'),
 (180, 'SNDL'),
 (174, 'EBAY'),
 (172, 'UI'),
 (168, 'DOW'),
 (164, 'CRSR'),
 (157, 'V'),
 (154, 'IEX'),
 (153, 'TGT'),
 (153, 'FB'),
 (140, 'TBC'),
 (139, 'DKNG'),
 (136, 'ABNB'),
 (132, 'WKHS'),
 (132, 'GS'),
 (127, 'GE'),
 (125, 'TECH'),
 (120, 'ROKU'),
 (116, 'VIAC'),
 (116, 'LFG'),
 (11

Let's limit the study to the 100 most commonly discussed stocks.

In [45]:
# n_occurrences holds (count, symbol) pairs sorted by descending count,
# so the first 100 entries are the most frequently mentioned symbols.
stock_subset = [symbol for _, symbol in n_occurrences[:100]]
print(len(df))
def drop_stocks_from_df(x, subset):
    """Filter a comma-separated ticker string down to the symbols in ``subset``.

    Parameters
    ----------
    x : str
        Comma-separated ticker symbols, e.g. ``'TSLA,FORD'``.
    subset : container of str
        Symbols to keep; any object supporting ``in``.

    Returns
    -------
    str
        The kept symbols joined by commas in their original order, or ``''``
        when none of the symbols is in ``subset``.
    """
    kept = [ticker for ticker in x.split(',') if ticker in subset]
    return ','.join(kept)

# Build the lookup set once so each membership test is a hash lookup rather
# than a scan of the 100-element list for every symbol of every post.
allowed_symbols = set(stock_subset)
df['stock'] = df['stock'].apply(lambda x: drop_stocks_from_df(x, allowed_symbols))

# Keep only posts that still mention at least one retained stock. A boolean
# mask is used instead of df.drop(<index labels>): dropping by label would
# also remove unrelated rows if the index read from the CSV contains
# duplicate labels.
df = df[df['stock'] != '']
print(len(df))

48749
41332


In [46]:
# Sanity check: after filtering, the df should mention exactly 100 distinct stocks.
stocks = df['stock']
stock_set = {symb for tickers in stocks for symb in tickers.split(',')}

len(stock_set)

100

In [47]:
# Recount posts per subreddit now that only the selected stocks remain.
posts_per_subreddit = df.groupby('subreddit').size()
posts_per_subreddit.sort_values()

subreddit
robinhood                    2
securityanalysis             3
financialindependence        4
forex                       19
robinhoodpennystocks        32
investing                  103
stockmarket                231
pennystocks                292
options                    315
stocks                    1224
personalfinance           5033
wallstreetbets           14485
gme                      19589
dtype: int64

r/financialindependence, r/forex, r/robinhood, r/robinhoodpennystocks and r/securityanalysis contain fewer than 50 posts each. Because the data for these subreddits is so sparse, they are removed.

In [48]:
# Subreddits left with fewer than 50 posts after the stock filter.
sparse_subreddits = [
    'financialindependence',
    'forex',
    'robinhood',
    'robinhoodpennystocks',
    'securityanalysis',
]

# Match names exactly with isin(). A regex substring match
# (str.contains('forex|robinhood|...')) would also drop any subreddit whose
# name merely contains one of these strings, and it yields NaN rather than
# False for missing names, which breaks the boolean negation.
df = df[~df['subreddit'].isin(sparse_subreddits)]
df.groupby('subreddit').size().sort_values()

subreddit
investing            103
stockmarket          231
pennystocks          292
options              315
stocks              1224
personalfinance     5033
wallstreetbets     14485
gme                19589
dtype: int64

In [61]:
from datetime import datetime
# Parse the creation timestamps so the calendar dates can be extracted.
df['created'] = pd.to_datetime(df['created'])

# First and last calendar day on which a post was created.
created_dates = df['created'].dt.date
start_date, end_date = created_dates.min(), created_dates.max()
print('first date:', start_date, ', end date:', end_date)

# Persist the filtered posts.
filtered_posts_path = os.path.join(os.getcwd(), 'data', 'filtered_posts.csv')
df.to_csv(filtered_posts_path)

first date: 2021-01-02 , end date: 2021-12-25


Now that we have limited ourselves to 100 stocks, let's fetch the stock market data in the form of daily adjusted close prices.

In [62]:
import yfinance as yf

# yfinance treats `end` as exclusive, so request one extra day; otherwise
# prices for the last date on which a post was created are never fetched.
download_end = (pd.Timestamp(end_date) + pd.Timedelta(days=1)).date()

# auto_adjust=False is passed explicitly so the result always contains an
# 'Adj Close' column, independent of the installed yfinance default.
data = yf.download(stock_subset, start=start_date, end=download_end, auto_adjust=False)

[*********************100%***********************]  100 of 100 completed


In [66]:
# Keep only the adjusted close prices from the downloaded frame; the result
# has one column per ticker and one row per date.
# Symbols without price data show up as NaN (e.g. APR in the preview below),
# so downstream code should expect missing values.
prices = data['Adj Close']
prices.head()

Unnamed: 0_level_0,AAPL,ABNB,AC,ALLY,AMC,AMD,AMZN,APR,ATH,BABA,...,TWTR,UBER,UBS,UI,UWMC,V,VIAC,WKHS,WMT,XOM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-04,128.617111,139.149994,34.782536,34.772446,2.01,92.300003,3186.629883,,41.990002,227.850006,...,54.529999,51.139999,13.963018,269.026337,12.299272,216.430527,35.790138,21.42,144.236282,39.095947
2021-01-05,130.207306,148.300003,34.275127,35.145397,1.98,92.769997,3218.51001,,42.869999,240.399994,...,53.880001,54.009998,14.275696,276.219971,11.995352,213.200378,36.171513,22.43,143.468491,40.980091
2021-01-06,125.824318,142.770004,33.608524,36.283863,2.01,90.330002,3138.379883,,46.009998,227.610001,...,53.259998,52.48,14.754484,266.035614,11.397008,211.321899,39.310478,23.65,144.364258,42.025787
2021-01-07,130.117859,151.270004,35.658066,36.519409,2.05,95.160004,3162.159912,,46.209999,226.899994,...,52.330002,56.130001,15.184416,255.960526,11.805402,212.504654,39.23225,27.6,144.354416,42.355515
2021-01-08,131.240936,149.770004,36.304771,36.676437,2.14,94.580002,3182.699951,,45.16,236.190002,...,51.48,53.279999,14.940136,255.354431,12.061833,214.134628,39.173576,25.565001,144.334717,42.826546


In [67]:
# Persist the adjusted close prices next to the filtered posts.
filtered_prices_path = os.path.join(os.getcwd(), 'data', 'filtered_prices.csv')
prices.to_csv(filtered_prices_path)

Visualizations