### Code to render charts for "Inside the Hivemind of r/wallstreetbets - July 2021"

In [1]:
import beneath
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go

from config import BLACKLIST, MODERATORS
from layout import make_layout
from datetime import datetime

## Stock mentions

In [4]:
# Daily mention counts per stock symbol, combining posts and comments.
# Each CTE counts rows per (symbol, day) in one of the two stock-mention tables.
# The full join keeps symbol/day pairs that occur in only one of them, and
# ifnull(..., 0) makes the missing side contribute zero to the sum.
# Symbol/day pairs with no mention in either table produce no row at all.
mentions = await beneath.query_warehouse("""
with
    stock_mentions_posts as (
        select 
            symbol, 
            timestamp_trunc(timestamp, day) as day, 
            count(*) as num_mentions
        from `examples/wallstreetbets-analytics/r-wallstreetbets-posts-stock-mentions`
        group by symbol, timestamp_trunc(timestamp, day)
    ),
    stock_mentions_comments as (
        select 
            symbol, 
            timestamp_trunc(timestamp, day) as day, 
            count(*) as num_mentions
        from `examples/wallstreetbets-analytics/r-wallstreetbets-comments-stock-mentions`
        group by symbol, timestamp_trunc(timestamp, day)
    )
select 
    coalesce(p.symbol, c.symbol) as symbol,
    coalesce(p.day, c.day) as day,
    ifnull(p.num_mentions, 0) + ifnull(c.num_mentions,0) as num_mentions
from stock_mentions_posts p
full join stock_mentions_comments c on p.symbol = c.symbol and p.day = c.day
order by symbol, day
""")

In [5]:
# Drop every row whose symbol appears in BLACKLIST (imported from config).
is_blacklisted = mentions['symbol'].isin(BLACKLIST)
mentions = mentions.loc[~is_blacklisted]

Keep track of the top symbols

In [6]:
def _rank_symbols_by_mentions(frame):
    """Sum `num_mentions` per symbol and return the totals, largest first.

    Returns a DataFrame with columns `symbol` and `num_mentions`.
    """
    return (
        frame
        .groupby('symbol')['num_mentions']
        .sum()
        .reset_index()
        .sort_values('num_mentions', ascending=False)
    )


# Ranking over the whole data set; keep the 100 leading symbols.
top_symbols_alltime_df = _rank_symbols_by_mentions(mentions)
top_symbols_alltime = top_symbols_alltime_df['symbol'].head(100)

# Ranking for July 2021 only. The year is checked together with the month:
# a month-only test would also match July of any other year in the data.
in_july_2021 = (mentions['day'].dt.year == 2021) & (mentions['day'].dt.month == 7)
top_symbols_month_df = _rank_symbols_by_mentions(mentions.loc[in_july_2021])
top_symbols_month = top_symbols_month_df['symbol'].head(100)

In [62]:
# Horizontal bar chart of the ten leading rows of the monthly ranking,
# one colour per symbol, with the mention count printed on each bar.
july_top10 = top_symbols_month_df.head(10)
bar_labels = {'num_mentions': 'Mentions', 'symbol': 'Symbol'}
fig = px.bar(
    july_top10,
    x="num_mentions", y="symbol",
    text="num_mentions", color='symbol',
    orientation='h',
    labels=bar_labels,
)

bar_layout = make_layout(
    title="Top 10 Most Popular Symbols in July",
    subtitle="Number of mentions in posts and comments",
    legend=False,
    source_hidden=False,
    override={
        "bargap": 0,
        'showlegend': False,
        'yaxis_title': '',
        'xaxis_zeroline': False
    },
)
fig.update_layout(bar_layout)

# Remove the outline drawn around each bar.
fig.update_traces(marker_line_width=0)
fig.show()

In [86]:
# Daily mention counts during July 2021 for the month's five leading symbols.
# Both year and month are checked: a month-only test would also pick up July of
# any other year present in `mentions`, while the tick values below are 2021 dates.
mention_days = mentions['day']
in_july_2021 = (mention_days.dt.year == 2021) & (mention_days.dt.month == 7)
is_top_symbol = mentions['symbol'].isin(top_symbols_month[:5])

fig = px.line(
    mentions.loc[is_top_symbol & in_july_2021],
    x="day",
    y="num_mentions",
    line_group="symbol",
    color="symbol",
    labels={'num_mentions': 'Mentions', 'day': 'Day', 'symbol': 'Symbol'},
)
fig.update_layout(
    make_layout(
        title="Stock mentions in July",
        subtitle="Number of mentions in posts and comments",
        source_hidden=False,
        legend=True,
        override={
            'xaxis': {
                # One tick per week instead of plotly's automatic ticks.
                'tickmode': 'array',
                'tickvals': ["2021-07-07", "2021-07-14", "2021-07-21", "2021-07-28"],
                'title': ''
            }
        },
    )
)
fig.show()

Compute each symbol's share of all mentions on each day

In [79]:
# Total mentions per day across all symbols.
total_daily_mentions = mentions.groupby('day')['num_mentions'].sum().reset_index()

# Attach the daily total to every symbol/day row; the suffixes tell the
# per-symbol count (`_stock`) apart from the day's total (`_total`).
tmp = (
    mentions
    .merge(total_daily_mentions, on="day", suffixes=('_stock', '_total'))
    .sort_values(['symbol', 'day'])
)
tmp['fraction_of_mentions'] = tmp['num_mentions_stock'] / tmp['num_mentions_total']


def _mean_of_last_seven_rows(values):
    """Rolling mean over up to seven consecutive rows (fewer at the start)."""
    return values.rolling(window=7, min_periods=1).mean()


# NOTE(review): window=7 counts rows, not calendar days. A symbol with no row
# on some days is averaged over a span longer than a week.
tmp['fraction_of_mentions_MA'] = (
    tmp.groupby('symbol')['fraction_of_mentions'].transform(_mean_of_last_seven_rows)
)

In [91]:
# Unsmoothed daily share of mentions for AMC and GME.
symbols_to_graph = ['AMC', 'GME']
share_rows = tmp.loc[tmp['symbol'].isin(symbols_to_graph)]
share_rows = share_rows.sort_values('day', ascending=False)

fig = px.line(
    share_rows,
    x="day",
    y="fraction_of_mentions",
    line_group="symbol",
    color="symbol",
    labels={'day': 'Day', 'fraction_of_mentions': 'Fraction of mentions', 'symbol': 'Symbol'},
)

chart_layout = make_layout(
    title="Relative interest in GME and AMC has declined",
    subtitle="Share of mentions across all stocks on r/wallstreetbets",
    source_hidden=False,
    legend=True,
    override={'xaxis': {'title': ''}},
)
fig.update_layout(chart_layout)
fig.show()

In [105]:
# Unsmoothed daily share of mentions for CLOV and SPCE.
symbols_to_graph = ['CLOV', 'SPCE']
share_rows = tmp.loc[tmp['symbol'].isin(symbols_to_graph)]
share_rows = share_rows.sort_values('day', ascending=False)

fig = px.line(
    share_rows,
    x="day",
    y="fraction_of_mentions",
    line_group="symbol",
    color="symbol",
    labels={'day': 'Day', 'fraction_of_mentions': 'Fraction of mentions', 'symbol': 'Symbol'},
)

chart_layout = make_layout(
    title="Relative interest in CLOV and SPCE has increased",
    subtitle="Share of mentions across all stocks on r/wallstreetbets",
    source_hidden=False,
    legend=True,
    override={'xaxis': {'title': ''}},
)
fig.update_layout(chart_layout)
fig.show()

In [82]:
# Smoothed share of mentions for the ten leading symbols of the monthly ranking.
# TODO: the subtitle is still the "..." placeholder.
# NOTE(review): the y-axis title reads '% of all mentions' while the plotted
# column is a 0-1 fraction - confirm whether make_layout formats the ticks.
in_monthly_top10 = tmp['symbol'].isin(top_symbols_month.head(10))
smoothed_rows = tmp.loc[in_monthly_top10].sort_values('day', ascending=False)

fig = px.line(
    smoothed_rows,
    x="day",
    y="fraction_of_mentions_MA",
    line_group="symbol",
    color="symbol",
    labels={'day': 'Day', 'fraction_of_mentions_MA': 'Fraction of mentions (7 day rolling average)', 'symbol': 'Symbol'},
)

chart_layout = make_layout(
    title="Fraction of mentions (7 day rolling average)",
    subtitle="...",
    source_hidden=False,
    legend=True,
    override={
        'xaxis': {'title': ''},
        'yaxis': {'title': '% of all mentions'},
    },
)
fig.update_layout(chart_layout)
fig.show()

In [83]:
# For every symbol, locate the row where its smoothed share of mentions is
# highest and remember that row's day as the symbol's peak date.
peak_row_labels = tmp.groupby(['symbol'])["fraction_of_mentions_MA"].idxmax()
symbol_peaks = (
    tmp.loc[peak_row_labels, ['symbol', 'day']]
    .rename(columns={'day': 'date_of_peak_popularity'})
)

# Attach the peak date to every row of the symbol and express each row's day
# as a whole-day offset from it (negative before the peak).
tmp2 = tmp.merge(symbol_peaks, on='symbol')
day_offset = tmp2['day'] - tmp2['date_of_peak_popularity']
tmp2['days_from_peak'] = day_offset.dt.days
tmp2.head()

Unnamed: 0,symbol,day,num_mentions_stock,@meta.timestamp,num_mentions_total,fraction_of_mentions,fraction_of_mentions_MA,date_of_peak_popularity,days_from_peak
0,A,2021-03-10 00:00:00+00:00,1,2021-07-26 13:57:37.227000+00:00,36129,2.8e-05,2.8e-05,2021-05-11 00:00:00+00:00,-62
1,A,2021-03-16 00:00:00+00:00,1,2021-07-26 13:57:37.227000+00:00,19681,5.1e-05,3.9e-05,2021-05-11 00:00:00+00:00,-56
2,A,2021-03-19 00:00:00+00:00,1,2021-07-26 13:57:37.227000+00:00,16490,6.1e-05,4.6e-05,2021-05-11 00:00:00+00:00,-53
3,A,2021-03-22 00:00:00+00:00,2,2021-07-26 13:57:37.227000+00:00,14202,0.000141,7e-05,2021-05-11 00:00:00+00:00,-50
4,A,2021-03-23 00:00:00+00:00,1,2021-07-26 13:57:37.227000+00:00,18362,5.4e-05,6.7e-05,2021-05-11 00:00:00+00:00,-49


In [85]:
# Daily mention counts of the ten leading all-time symbols, aligned on each
# symbol's peak date (x = days relative to the peak).
# TODO: the subtitle is still the "..." placeholder.
in_alltime_top10 = tmp2['symbol'].isin(top_symbols_alltime.head(10))
aligned_rows = tmp2.loc[in_alltime_top10].sort_values('day', ascending=False)

fig = px.line(
    aligned_rows,
    x="days_from_peak",
    y="num_mentions_stock",
    line_group="symbol",
    color="symbol",
    labels={'days_from_peak': 'Days from peak', 'num_mentions_stock': 'Mentions', 'symbol': 'Symbol'},
)

chart_layout = make_layout(
    title="Days from peak",
    subtitle="...",
    source_hidden=False,
    legend=True,
    override={},
)
fig.update_layout(chart_layout)
fig.show()

## Interesting discussions

In [2]:
# Show up to 100 characters per cell so post titles and text stay readable.
# The fully qualified option name is used because pandas only resolves
# abbreviated names by pattern match, which can break if a similarly named
# option is added later.
pd.set_option('display.max_colwidth', 100)

In [12]:
# Per-post discussion statistics for posts created in July 2021.
# The `posts` and `comments` CTEs add text lengths and counts of two emoji
# code points (U+1F680 -> num_rockets, U+1F48E -> num_diamonds). The final
# select aggregates each post's comments: count, total and average length,
# emoji totals (post + comments) and number of distinct commenters.
# Comments are also restricted to July 2021 and joined with an inner join, so
# a post without any such comment is not returned.
# NOTE(review): the group-by key does not include p.id, so posts identical in
# every grouped column would be merged into one row - confirm this is acceptable.
posts = await beneath.query_warehouse("""
with
    posts as (
        select 
            created_on, 
            id, 
            author, 
            title, 
            text, 
            flair, 
            permalink,
            length(text) as post_length,
            array_length(regexp_extract_all(title, r"\\x{1F680}")) + array_length(regexp_extract_all(text, r"\\x{1F680}")) as num_rockets,
            array_length(regexp_extract_all(title, r"\\x{1F48E}")) + array_length(regexp_extract_all(text, r"\\x{1F48E}")) as num_diamonds,
        from `examples/reddit/r-wallstreetbets-posts`
        where timestamp_trunc(created_on, month) = "2021-07-01"
    ),
    comments as (
        select 
            created_on, 
            id, 
            post_id, 
            author, 
            text,
            length(text) as comment_length,
            array_length(regexp_extract_all(c.text, r"\\x{1F680}")) as num_rockets,
            array_length(regexp_extract_all(c.text, r"\\x{1F48E}")) as num_diamonds,
        from `examples/reddit/r-wallstreetbets-comments` c
        where timestamp_trunc(created_on, month) = "2021-07-01"
    )
select 
    p.created_on, 
    p.title, 
    p.text, 
    p.author, 
    p.flair, 
    p.permalink,
    p.post_length,
    count(c.id) as num_comments,
    sum(c.comment_length) as sum_comments_length, 
    sum(c.comment_length)/count(c.id) as avg_comment_length,
    p.num_rockets + sum(c.num_rockets) as num_rockets,
    p.num_diamonds + sum(c.num_diamonds) as num_diamonds,
    count(distinct c.author) as nunique_commenters,
from posts p
join comments c on p.id = c.post_id
group by p.created_on, p.title, p.text, p.author, p.flair, p.permalink, p.post_length, p.num_rockets, p.num_diamonds
""")

Exclude moderators

In [15]:
# Keep only posts whose author is not listed in MODERATORS (imported from config).
# A vectorised isin mask replaces the per-row Python membership test.
posts_no_mods = posts[~posts['author'].isin(MODERATORS)]

Post with most comments

In [16]:
# Three non-moderator posts with the largest comment counts.
posts_no_mods.sort_values(by='num_comments', ascending=False).iloc[:3]

Unnamed: 0,created_on,title,text,author,flair,permalink,post_length,num_comments,sum_comments_length,avg_comment_length,num_rockets,num_diamonds,nunique_commenters,@meta.timestamp
6713,2021-07-16 21:29:47+00:00,Zillow is aiming to kill real estate agents,Six fucking percent for a week of work. That's what realtors want in this market. 3% for the b...,zer0sumgames,Discussion,/r/wallstreetbets/comments/olqhjh/zillow_is_aiming_to_kill_real_estate_agents/,2229,4345,1010619,232.593556,8,0,2903,2021-07-23 13:57:35.735000+00:00
823,2021-07-09 16:01:26+00:00,"$90 > $24,000 in one day.",,kado63,Gain,/r/wallstreetbets/comments/ogym6z/90_24000_in_one_day/,0,3670,279444,76.142779,16,2,2785,2021-07-23 13:57:30.323000+00:00
5063,2021-07-04 17:03:18+00:00,If the DD involves a “short squeeze”… ignore it. Stop. Get help. Learn to use your brain.,I’m just an autist that wants to help my fellow dudes make money. Have I been guilty of this in ...,iyioi,Discussion,/r/wallstreetbets/comments/odoir9/if_the_dd_involves_a_short_squeeze_ignore_it_stop/,2191,2409,317987,131.999585,59,17,1475,2021-07-23 13:57:34.593000+00:00


Longest discussion

In [17]:
# Three non-moderator posts whose comments add up to the most characters.
posts_no_mods.sort_values(by='sum_comments_length', ascending=False).iloc[:3]

Unnamed: 0,created_on,title,text,author,flair,permalink,post_length,num_comments,sum_comments_length,avg_comment_length,num_rockets,num_diamonds,nunique_commenters,@meta.timestamp
6713,2021-07-16 21:29:47+00:00,Zillow is aiming to kill real estate agents,Six fucking percent for a week of work. That's what realtors want in this market. 3% for the b...,zer0sumgames,Discussion,/r/wallstreetbets/comments/olqhjh/zillow_is_aiming_to_kill_real_estate_agents/,2229,4345,1010619,232.593556,8,0,2903,2021-07-23 13:57:35.735000+00:00
5063,2021-07-04 17:03:18+00:00,If the DD involves a “short squeeze”… ignore it. Stop. Get help. Learn to use your brain.,I’m just an autist that wants to help my fellow dudes make money. Have I been guilty of this in ...,iyioi,Discussion,/r/wallstreetbets/comments/odoir9/if_the_dd_involves_a_short_squeeze_ignore_it_stop/,2191,2409,317987,131.999585,59,17,1475,2021-07-23 13:57:34.593000+00:00
823,2021-07-09 16:01:26+00:00,"$90 > $24,000 in one day.",,kado63,Gain,/r/wallstreetbets/comments/ogym6z/90_24000_in_one_day/,0,3670,279444,76.142779,16,2,2785,2021-07-23 13:57:30.323000+00:00


Post with the highest avg comment length (at least 15 comments)

In [30]:
# Only posts with 15 or more comments are ranked by average comment length.
has_enough_comments = posts_no_mods['num_comments'] >= 15
posts_no_mods.loc[has_enough_comments].sort_values(by='avg_comment_length', ascending=False).iloc[:3]

Unnamed: 0,created_on,title,text,author,flair,permalink,post_length,num_comments,sum_comments_length,avg_comment_length,num_rockets,num_diamonds,nunique_commenters,@meta.timestamp
5069,2021-07-17 20:06:16+00:00,What Quantitative Easing actually is [For Retards],"I've seen a lot of people spreading misinformation about how the Fed actually works, and misrepr...",Traditional_Fee_8828,Discussion,/r/wallstreetbets/comments/ombo3v/what_quantitative_easing_actually_is_for_retards/,1580,63,30891,490.333333,0,0,26,2021-07-23 13:57:34.593000+00:00
565,2021-07-15 01:36:36+00:00,"💚 Andy on EV Charge Stocks: I Like $EVGO a lot, and it's on sale today!","Hi, I'm Andy.\n\nI know next-to-nothing about money markets, but I know a lot about EVs and the ...",andy-broker,DD,/r/wallstreetbets/comments/okiliv/andy_on_ev_charge_stocks_i_like_evgo_a_lot_and/,4976,45,19964,443.644444,11,0,21,2021-07-23 13:57:30.323000+00:00
6723,2021-07-10 17:59:23+00:00,The Housing Market Is Not In A Bubble: Popping the Hopium Bubble,"Alright, so I have recently seen an uptick in stupidity regarding the housing market, and I have...",Hani95,Discussion,/r/wallstreetbets/comments/ohniqc/the_housing_market_is_not_in_a_bubble_popping_the/,4974,413,176308,426.895884,3,0,182,2021-07-23 13:57:35.735000+00:00


Post with the most rockets (including comments)

In [19]:
# Three non-moderator posts with the highest rocket count (post plus its comments).
posts_no_mods.sort_values(by='num_rockets', ascending=False).iloc[:3]

Unnamed: 0,created_on,title,text,author,flair,permalink,post_length,num_comments,sum_comments_length,avg_comment_length,num_rockets,num_diamonds,nunique_commenters,@meta.timestamp
5101,2021-07-01 20:23:39+00:00,"Daily option statistics for AMC, GME, WISH, MU, CLOV, AMD, NIO, PLTR, CLNE, and others.","Hey guys, \n\nAs usual here is the daily option statistics for the most popular stocks on WSB. E...",Chillznday,Discussion,/r/wallstreetbets/comments/obud5h/daily_option_statistics_for_amc_gme_wish_mu_clov/,1726,231,19873,86.030303,137,12,153,2021-07-23 13:57:34.593000+00:00
230,2021-07-09 22:18:16+00:00,I'm a retired hedge fund manager and decided to pull up the sleeves for one last O'hare play (Yo...,\n\n### [🚀](https://emojipedia.org/rocket/)[🚀](https://emojipedia.org/rocket/)[🚀](https://emoji...,wetdirtkurt,DD,/r/wallstreetbets/comments/oh62by/im_a_retired_hedge_fund_manager_and_decided_to/,4616,13,1471,113.153846,132,0,13,2021-07-23 13:57:30.322000+00:00
6347,2021-07-06 18:06:24+00:00,"Let me clear the air here regarding GME, AMC, & CLOV","Alright you retards, I'm taking the initiative to clear the air. I'm probably going to get a lot...",ShortChecker,Discussion,/r/wallstreetbets/comments/of0hls/let_me_clear_the_air_here_regarding_gme_amc_clov/,5012,1743,210471,120.752151,129,63,1320,2021-07-23 13:57:35.735000+00:00


Post with the most diamonds (including comments)

In [20]:
# Three non-moderator posts with the highest diamond count (post plus its comments).
posts_no_mods.sort_values(by='num_diamonds', ascending=False).iloc[:3]

Unnamed: 0,created_on,title,text,author,flair,permalink,post_length,num_comments,sum_comments_length,avg_comment_length,num_rockets,num_diamonds,nunique_commenters,@meta.timestamp
6347,2021-07-06 18:06:24+00:00,"Let me clear the air here regarding GME, AMC, & CLOV","Alright you retards, I'm taking the initiative to clear the air. I'm probably going to get a lot...",ShortChecker,Discussion,/r/wallstreetbets/comments/of0hls/let_me_clear_the_air_here_regarding_gme_amc_clov/,5012,1743,210471,120.752151,129,63,1320,2021-07-23 13:57:35.735000+00:00
2602,2021-07-11 17:53:34+00:00,"💎💎💎💎💎💎💎💎💎💎💎💎💎💎💎SPCE, 💎AMC, 💎GME💎💎💎💎💎💎💎💎💎💎💎💎💎💎💎💎",,Rockyturki,Meme,/r/wallstreetbets/comments/oi95zc/spce_amc_gme/,0,1,736,736.0,0,33,1,2021-07-23 13:57:32.379000+00:00
3966,2021-07-06 16:12:32+00:00,I heard we’re back to YOLOing $GME …🌍👨🏻‍🚀 🔫👨🏽‍🚀,,Gold_Flake,YOLO,/r/wallstreetbets/comments/oey4yt/i_heard_were_back_to_yoloing_gme/,0,804,56340,70.074627,57,20,612,2021-07-23 13:57:32.966000+00:00


## Degenerate shoutouts

In [21]:
# Per-author activity for July 2021: number of posts and comments, word counts
# (title/text split on single spaces) and counts of the U+1F680 (num_rockets)
# and U+1F48E (num_diamonds) emoji.
# Post-side and comment-side statistics are aggregated separately and then
# full-joined on author, so authors who only posted or only commented are kept;
# ifnull(..., 0) makes the missing side count as zero.
authors = await beneath.query_warehouse("""
with
    posts_enhanced as (
        select *,
            array_length(split(title, " ")) as num_words_title,
            array_length(split(text, " ")) as num_words_body,
            array_length(regexp_extract_all(title, r"\\x{1F680}")) as num_rockets_title,
            array_length(regexp_extract_all(text, r"\\x{1F680}")) as num_rockets_body,
            array_length(regexp_extract_all(title, r"\\x{1F48E}")) as num_diamonds_title,
            array_length(regexp_extract_all(text, r"\\x{1F48E}")) as num_diamonds_body
        from `examples/reddit/r-wallstreetbets-posts`
        where timestamp_trunc(created_on, month) = "2021-07-01"
    ),
    comments_enhanced as (
        select *,
            array_length(split(text, " ")) as num_words,
            array_length(regexp_extract_all(text, r"\\x{1F680}")) as num_rockets,
            array_length(regexp_extract_all(text, r"\\x{1F48E}")) as num_diamonds
        from `examples/reddit/r-wallstreetbets-comments`
        where timestamp_trunc(created_on, month) = "2021-07-01"
    ),
    author_posts_stats as (
        select 
            author, 
            count(*) as num_posts,
            sum(num_words_title) + sum(num_words_body) as num_words,
            sum(num_rockets_title) + sum(num_rockets_body) as num_rockets,
            sum(num_diamonds_title) + sum(num_diamonds_body) as num_diamonds,
        from posts_enhanced
        group by author
    ),
    author_comments_stats as (
        select
            author,
            count(*) as num_comments,
            sum(num_words) as num_words,
            sum(num_rockets) as num_rockets,
            sum(num_diamonds) as num_diamonds
        from comments_enhanced
        group by author
    )
select 
    coalesce(p.author, c.author) as author,
    ifnull(p.num_posts, 0) as num_posts,
    ifnull(c.num_comments, 0) as num_comments,
    ifnull(p.num_words, 0) + ifnull(c.num_words, 0) as num_words,
    ifnull(p.num_rockets, 0) + ifnull(c.num_rockets, 0) as num_rockets,
    ifnull(p.num_diamonds, 0) + ifnull(c.num_diamonds, 0) as num_diamonds,
from author_posts_stats p
full join author_comments_stats c on p.author = c.author
""")

In [22]:
# Keep only authors that are not listed in MODERATORS (imported from config).
# A vectorised isin mask replaces the per-row Python membership test.
authors_no_mods = authors[~authors['author'].isin(MODERATORS)]

Author with the most posts

In [23]:
# Three non-moderator authors with the most posts.
authors_no_mods.sort_values(by='num_posts', ascending=False).iloc[:3]

Unnamed: 0,author,num_posts,num_comments,num_words,num_rockets,num_diamonds,@meta.timestamp
18469,margin_call_rep,33,697,8293,0,0,2021-07-23 14:14:41.315000+00:00
10651,Throwaway1forall,32,391,3119,0,0,2021-07-23 14:14:38.416000+00:00
20936,Ratchet_as_fuck,27,248,4036,0,0,2021-07-23 14:14:42.006000+00:00


Author with the most comments

In [24]:
# Three non-moderator authors with the most comments.
authors_no_mods.sort_values(by='num_comments', ascending=False).iloc[:3]

Unnamed: 0,author,num_posts,num_comments,num_words,num_rockets,num_diamonds,@meta.timestamp
4860,cashflow_,1,3700,38610,0,0,2021-07-23 14:14:36.066000+00:00
7722,quarantrader,0,3266,24637,3,0,2021-07-23 14:14:37.202000+00:00
24839,Historical-Egg3243,0,2057,27443,0,0,2021-07-23 14:14:43.881000+00:00


Author who wrote the most words

In [25]:
# Three non-moderator authors with the highest combined word count.
authors_no_mods.sort_values(by='num_words', ascending=False).iloc[:3]

Unnamed: 0,author,num_posts,num_comments,num_words,num_rockets,num_diamonds,@meta.timestamp
4860,cashflow_,1,3700,38610,0,0,2021-07-23 14:14:36.066000+00:00
1870,NotSm4rt,0,1594,31334,4,0,2021-07-23 14:14:34.510000+00:00
24839,Historical-Egg3243,0,2057,27443,0,0,2021-07-23 14:14:43.881000+00:00


Author who posted the most rockets

In [26]:
# Three non-moderator authors with the highest rocket count.
authors_no_mods.sort_values(by='num_rockets', ascending=False).iloc[:3]

Unnamed: 0,author,num_posts,num_comments,num_words,num_rockets,num_diamonds,@meta.timestamp
6407,Dark_Prinzz,0,12,1356,864,0,2021-07-23 14:14:36.789000+00:00
25328,mateace,0,6,60,720,0,2021-07-23 14:14:44.243000+00:00
23802,Jokerboi129,0,41,144,299,2,2021-07-23 14:14:43.453000+00:00


Author who posted the most diamonds

In [27]:
# Three non-moderator authors with the highest diamond count.
authors_no_mods.sort_values(by='num_diamonds', ascending=False).iloc[:3]

Unnamed: 0,author,num_posts,num_comments,num_words,num_rockets,num_diamonds,@meta.timestamp
4242,Minute-Ad-2749,0,135,1685,0,149,2021-07-23 14:14:36.066000+00:00
13781,GoldGlad2495,0,84,592,171,147,2021-07-23 14:14:39.511000+00:00
25873,Tazzer57,0,135,1493,90,78,2021-07-23 14:14:44.243000+00:00
