In [1]:
import altair as alt
import pandas as pd

# Switch Altair to the theme registered under the name "dark" for the charts
# rendered in this notebook.
# NOTE(review): several text layers further down hard-code color="white",
# which presumably relies on this dark background - confirm before changing.
alt.themes.enable("dark")

ThemeRegistry.enable('dark')

In [2]:
# Load the processed Reddit table; the path is relative to the notebook's
# working directory.
posts_path = "../data/processed/reddit_posts_2.parquet"
df = pd.read_parquet(posts_path)

# Preview the first five rows (the default for head()) to check the schema.
df.head()

Unnamed: 0,id,parent_id,author,body,created,depth,edited,score,search_query,subreddit,title,url,num_comments
0,1fz46jo,,writtey,,2024-10-08 16:36:10,-1,,257,Ethereum ETH,r/CryptoCurrency,This User Paid $700K for a Single Ethereum Tra...,https://cointab.com/user-paid-700k-ethereum-tr...,137.0
1,lqyulca,1fz46jo,Baecchus,Average gas fee back in 2021,2024-10-08 17:15:39,0,,208,Ethereum ETH,r/CryptoCurrency,,,
2,lqzv2yy,lqyulca,partymsl,Ah. The good old forced hodl days due to gas f...,2024-10-08 20:38:34,1,,39,Ethereum ETH,r/CryptoCurrency,,,
3,lr09t08,lqzv2yy,itsaBazinga,I always had the StarCraft we require more ves...,2024-10-08 22:06:13,2,,20,Ethereum ETH,r/CryptoCurrency,,,
4,lr0jjil,lr09t08,Atyzzze,You must construct additional pylons! (stake a...,2024-10-08 23:12:21,3,,6,Ethereum ETH,r/CryptoCurrency,,,


In [3]:
# Number of rows collected for each search query, largest first.
search_queries = df["search_query"]
search_queries.value_counts()

search_query
Bitcoin BTC           233353
Ethereum ETH           89614
Cosmos ATOM            52126
Safe Moon SAFEMOON     38399
Avalanche AVAX         16983
FTX Token FTT           2180
Name: count, dtype: Int64

In [4]:
# Count rows for every (coin, subreddit) pair. The unstack -> fillna -> melt
# round trip makes pairs without any rows appear explicitly with a count of 0.
pair_sizes = df.groupby(["search_query", "subreddit"]).size()
subreddit_query = (
    pair_sizes.unstack()
    .fillna(0)
    .astype(int)
    .reset_index()
    .melt(id_vars="search_query")
)

# Upper bound of the fixed x axis; both layers below use the same domain.
x_max = 250_000

count_tooltip = [
    alt.Tooltip("search_query:O", title="Coin"),
    alt.Tooltip("subreddit:N", title="Subreddit"),
    alt.Tooltip("value:Q", title="# of Comments"),
]

# Horizontal bars: one row per coin, coloured by subreddit.
bars = alt.Chart(subreddit_query).mark_bar()
bars = bars.encode(
    x=alt.X("value:Q", title="Number of Comments", scale={"domain": [0, x_max]}),
    y=alt.Y("search_query:O", title="Coin"),
    color=alt.Color("subreddit:N", title="Subreddit"),
    tooltip=count_tooltip,
)
bars = bars.properties(
    width=800,
    height=400,
    title=alt.Title(text="Number of Comments per Subreddit"),
)

# Text layer showing each coin's summed count next to its bar.
# NOTE(review): the `customtooltip` field calculated here is not referenced by
# any encoding in this cell.
totals = (
    alt.Chart(subreddit_query)
    .mark_text(align="left", dx=5, color="white")
    .transform_calculate(customtooltip="datum.value")
    .encode(
        x=alt.X("sum(value):Q", scale={"domain": [0, x_max]}),
        y=alt.Y("search_query:O"),
        text=alt.Text("sum(value):Q"),
    )
)

bars + totals

In [5]:
# Convert the per-(coin, subreddit) counts into shares of each coin's total.
# `posts_per_coin` holds, for every row, the summed count of that row's coin.
posts_per_coin = subreddit_query.groupby("search_query")["value"].transform("sum")
subreddit_query_normalized = subreddit_query.assign(
    value=subreddit_query["value"] / posts_per_coin
)

# Both layers expose the same tooltip so hovering a cell or its label matches.
share_tooltip = [
    alt.Tooltip("search_query:O", title="Coin"),
    alt.Tooltip("subreddit:N", title="Subreddit"),
    alt.Tooltip("value:Q", title=r"% of Comments", format=".4%"),
]

# Heatmap: one cell per (subreddit, coin), shaded by the coin's share.
heatmap = alt.Chart(subreddit_query_normalized).mark_rect()
heatmap = heatmap.encode(
    x=alt.X("subreddit:N", title="Subreddit"),
    y=alt.Y("search_query:O", title="Coin"),
    color=alt.Color(
        "value:Q",
        title="Number of Comments",
        scale=alt.Scale(scheme="blues"),
        legend=None,
    ),
    tooltip=share_tooltip,
)
heatmap = heatmap.properties(
    width=800,
    height=400,
    title=alt.Title(text="Number of Comments per Subreddit (Normalized by Coin)"),
)

# Labels switch to white once the share exceeds 0.7, black otherwise.
label_color = alt.condition(
    alt.datum.value > 0.7, alt.value("white"), alt.value("black")
)

# NOTE(review): the `customtooltip` field calculated here is not referenced by
# any encoding in this cell.
labels = (
    alt.Chart(subreddit_query_normalized)
    .mark_text(align="center")
    .transform_calculate(customtooltip="datum.value")
    .encode(
        x=alt.X("subreddit:N"),
        y=alt.Y("search_query:O"),
        text=alt.Text("value:Q", format=".2%"),
        color=label_color,
        tooltip=share_tooltip,
    )
)

heatmap + labels

In [6]:
# Count original posts (rows with depth == -1) for every (coin, subreddit)
# pair. The unstack -> fillna -> melt round trip makes pairs without any posts
# appear explicitly with a count of 0.
subreddit_query_posts = (
    df.query("depth == -1")
    .groupby(["search_query", "subreddit"])
    .size()
    .unstack()
    .fillna(0)
    .astype(int)
    .reset_index()
    .melt(id_vars="search_query")
)

# Bars per coin, coloured by subreddit. The data here is posts only, so the
# axis, tooltip and chart titles are labelled "Posts" (they previously said
# "Comments", which described a different chart).
c = (
    alt.Chart(subreddit_query_posts)
    .mark_bar()
    .encode(
        x=alt.X("value:Q", title="Number of Posts", scale={"domain": [0, 3_350]}),
        y=alt.Y("search_query:O", title="Coin"),
        color=alt.Color("subreddit:N", title="Subreddit"),
        tooltip=[
            alt.Tooltip("search_query:O", title="Coin"),
            alt.Tooltip("subreddit:N", title="Subreddit"),
            alt.Tooltip("value:Q", title="# of Posts"),
        ],
    )
    .properties(
        width=800,
        height=400,
        title=alt.Title(text="Number of Posts per Subreddit"),
    )
)

# Text layer showing each coin's summed post count next to its bar; it uses the
# same fixed x domain as the bars. The former `customtooltip` calculate
# transform was dropped because no encoding referenced it.
text = (
    alt.Chart(subreddit_query_posts)
    .mark_text(align="left", dx=5, color="white")
    .encode(
        x=alt.X("sum(value):Q", scale={"domain": [0, 3_350]}),
        y=alt.Y("search_query:O"),
        text=alt.Text("sum(value):Q"),
    )
)

c + text

In [7]:
# Convert the per-(coin, subreddit) post counts into shares of each coin's
# total. `posts_per_coin` holds, for every row, the summed count of its coin.
posts_per_coin = subreddit_query_posts.groupby("search_query")["value"].transform(
    "sum"
)
subreddit_query_posts_normalized = subreddit_query_posts.assign(
    value=subreddit_query_posts["value"] / posts_per_coin
)

# Both layers expose the same tooltip so hovering a cell or its label matches.
share_tooltip = [
    alt.Tooltip("search_query:O", title="Coin"),
    alt.Tooltip("subreddit:N", title="Subreddit"),
    alt.Tooltip("value:Q", title=r"% of Posts", format=".4%"),
]

# Heatmap: one cell per (subreddit, coin), shaded by the coin's share of posts.
heatmap = alt.Chart(subreddit_query_posts_normalized).mark_rect()
heatmap = heatmap.encode(
    x=alt.X("subreddit:N", title="Subreddit"),
    y=alt.Y("search_query:O", title="Coin"),
    color=alt.Color(
        "value:Q",
        title="Number of Posts",
        scale=alt.Scale(scheme="blues"),
        legend=None,
    ),
    tooltip=share_tooltip,
)
heatmap = heatmap.properties(
    width=800,
    height=400,
    title=alt.Title(text="Number of Posts per Subreddit (Normalized by Coin)"),
)

# Labels switch to white once the share exceeds 0.7, black otherwise.
label_color = alt.condition(
    alt.datum.value > 0.7, alt.value("white"), alt.value("black")
)

# NOTE(review): the `customtooltip` field calculated here is not referenced by
# any encoding in this cell.
labels = (
    alt.Chart(subreddit_query_posts_normalized)
    .mark_text(align="center")
    .transform_calculate(customtooltip="datum.value")
    .encode(
        x=alt.X("subreddit:N"),
        y=alt.Y("search_query:O"),
        text=alt.Text("value:Q", format=".2%"),
        color=label_color,
        tooltip=share_tooltip,
    )
)

heatmap + labels

In [8]:
# Number of rows at each nesting depth, ordered by depth.
depth_df = df["depth"].value_counts().sort_index()

# Flatten the counts into a two-column frame once and feed it to both layers.
depth_counts = depth_df.reset_index()

# Upper bound of the fixed y axis; both layers below use the same domain.
y_max = 185_000

# One bar per depth level.
bars = alt.Chart(depth_counts).mark_bar()
bars = bars.encode(
    x=alt.X("depth:O", title="Depth (-1 is the original post)"),
    y=alt.Y("count:Q", title="Number of Comments", scale={"domain": [0, y_max]}),
    tooltip=[
        alt.Tooltip("depth:O", title="Depth"),
        alt.Tooltip("count:Q", title="# of Comments"),
    ],
)
bars = bars.properties(
    width=800,
    height=400,
    title=alt.Title(text="Number of Comments per Depth"),
)

# Count label drawn 8px above the top of each bar.
# NOTE(review): the `customtooltip` field calculated here is not referenced by
# any encoding in this cell.
labels = (
    alt.Chart(depth_counts)
    .mark_text(align="center", dy=-8, color="white")
    .transform_calculate(customtooltip="datum.count")
    .encode(
        x=alt.X("depth:O"),
        y=alt.Y("count:Q", scale={"domain": [0, y_max]}),
        text=alt.Text("count:Q"),
    )
)

bars + labels

In [9]:
# Rows written by each author, then how many authors share each row count.
comments_per_user = df.groupby("author").size()
distribution = comments_per_user.value_counts().sort_index()
distribution_df = distribution.reset_index().set_axis(
    ["comments_per_user", "number_of_users"], axis=1
)

# Tick marks at 0 and then on a 1-2-5 progression up to 100,000.
user_ticks = [0, *(m * 10**e for e in range(5) for m in (1, 2, 5)), 100_000]

# The x domain is pinned to 1..100 comments per user. The y axis uses a symlog
# scale starting at 0, presumably because user counts span several orders of
# magnitude.
histogram = alt.Chart(distribution_df).mark_bar()
histogram = histogram.encode(
    x=alt.X(
        "comments_per_user:O",
        title="Number of Comments per User",
        scale={"domain": range(1, 101)},
    ),
    y=alt.Y(
        "number_of_users:Q",
        title="Number of Users",
        scale=alt.Scale(type="symlog", domain=[0, 100_000]),
        axis=alt.Axis(values=user_ticks),
    ),
    tooltip=[
        alt.Tooltip("comments_per_user:Q", title="# of Comments per User"),
        alt.Tooltip("number_of_users:Q", title="# of Users"),
    ],
)
c = histogram.properties(
    width=1000,
    height=400,
    title=alt.Title(text="Number of Comments per User"),
)

c

In [10]:
# Ten authors with the most rows, largest first.
# NOTE(review): the top entry in the saved output is the author "None",
# presumably deleted or removed accounts lumped together - confirm.
comments_per_user.sort_values(ascending=False).iloc[:10]

author
None                    38237
donut-bot                8769
AutoModerator            2744
kirtash93                1769
BinanceCSHelp            1057
Every_Hunt_160            987
CointestMod               908
Objective_Digit           601
goldyluckinblokchain      580
MrPuma86                  572
dtype: int64

In [11]:
# Original posts (rows with depth == -1) written by each author, then how many
# authors share each post count.
posts_per_user = df.query("depth == -1").groupby("author").size()
distribution = posts_per_user.value_counts().sort_index()
distribution_df = distribution.reset_index()
distribution_df.columns = ["posts_per_user", "number_of_users"]

# The x domain is pinned to 1..100 posts per user. The y axis plots the
# `number_of_users` column, so it is titled "Number of Users" (it previously
# read "Number of Posts", contradicting the field and its tooltip). It uses a
# symlog scale starting at 0 with ticks on a 1-2-5 progression.
c = (
    alt.Chart(distribution_df)
    .mark_bar()
    .encode(
        x=alt.X(
            "posts_per_user:O",
            title="Number of Posts per User",
            scale={"domain": range(1, 101)},
        ),
        y=alt.Y(
            "number_of_users:Q",
            title="Number of Users",
            scale=alt.Scale(
                type="symlog",
                domain=[0, 5_000],
                zero=True,
            ),
            axis=alt.Axis(
                values=[0, 1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000, 2_000, 5_000]
            ),
        ),
        tooltip=[
            alt.Tooltip("posts_per_user:Q", title="# of Posts per User"),
            alt.Tooltip("number_of_users:Q", title="# of Users"),
        ],
    )
    .properties(
        width=1000,
        height=400,
        title=alt.Title(text="Number of Posts per User"),
    )
)

c

In [12]:
# Ten authors with the most original posts, largest first.
# NOTE(review): the top entry in the saved output is the author "None",
# presumably deleted or removed accounts lumped together - confirm.
posts_per_user.sort_values(ascending=False).iloc[:10]

author
None                    333
AutoModerator            70
kirtash93                52
goldyluckinblokchain     32
hiorea                   30
InclineDumbbellPress     27
Ok_Source4689            24
Every_Hunt_160           20
Creative_Ad7831          20
Downtown_Yam9137         18
dtype: int64