In [1]:
import glob
import pandas as pd
import plotly.express as px
from pathlib import Path

In [2]:
def merge_csvs_into_df(path: Path) -> pd.DataFrame:
    """Load every ``*.csv`` file directly inside ``path`` into one DataFrame.

    Only the tweet columns listed in ``column_types`` are read from each file,
    and ``created_at`` is parsed as a date.

    Args:
        path: Directory containing the CSV files.

    Returns:
        The per-file frames concatenated row-wise, in sorted file-name order.
        Each file's original row index is kept.

    Raises:
        ValueError: If ``path`` contains no ``.csv`` files.
    """
    column_types = {
        "id": str,
        "created_at": str,
        "text": str,
        "retweet_count": int,
        "reply_count": int,
        "like_count": int,
        "quote_count": int,
        "impression_count": int,
    }

    # "*.csv" (with the dot) so that names merely ending in "csv" are not
    # picked up; sorted so the row order does not depend on the filesystem.
    files = sorted(glob.glob((path / "*.csv").as_posix()))
    if not files:
        raise ValueError(f"No CSV files found in {path.as_posix()}")

    frames = [
        pd.read_csv(
            file,
            usecols=list(column_types),
            dtype=column_types,
            parse_dates=["created_at"],
        )
        for file in files
    ]
    return pd.concat(frames)

## Thread analysis

### Prepare data

In [6]:
# Location of the raw threads export, relative to the notebook directory.
thread_path = Path("..", "data", "01_raw", "IAmPascio", "threads.csv")

In [7]:
# Read the threads CSV: ids and text stay strings, the engagement counters
# are integers, and "created_at" is parsed as a date.
df_threads = pd.read_csv(
    thread_path.as_posix(),
    dtype=dict(
        id=str,
        created_at=str,
        text=str,
        retweet_count=int,
        reply_count=int,
        like_count=int,
        quote_count=int,
        impression_count=int,
    ),
    parse_dates=["created_at"],
)

In [9]:
# Aggregate the threads into weekly bins anchored on Monday: thread count
# plus mean and sum of every engagement counter.
# "W-MON" is the documented upper-case spelling of the anchored weekly alias;
# other casings are not the canonical form and may warn on newer pandas.
thread_week = df_threads.resample("W-MON", on="created_at").agg(
    {
        "id": ["count"],
        "retweet_count": ["mean", "sum"],
        "reply_count": ["mean", "sum"],
        "like_count": ["mean", "sum"],
        "quote_count": ["mean", "sum"],
        "impression_count": ["mean", "sum"],
    }
)
# Flatten the (column, aggregation) pairs into single names such as
# "like_count mean" so the plotting cells can index them directly.
thread_week.columns = [" ".join(col).strip() for col in thread_week.columns.values]

### The number of threads per week

In [29]:
# Area chart of the weekly thread count.
fig_thread_num = px.area(x=thread_week.index, y=thread_week["id count"])

# Centered title, labelled axes, fixed canvas size.
fig_thread_num.update_layout(
    title_text="Number of threads per week",
    title_x=0.5,
    title_xanchor="center",
    title_yanchor="top",
    title_font_size=24,
    xaxis_title_text="Time",
    xaxis_title_font_size=16,
    yaxis_title_text="Number of threads",
    yaxis_title_font_size=16,
    width=900,
    height=500,
    template="plotly_white",
)

# Save the figure as a PNG in the reporting folder.
fig_thread_num.write_image(
    Path("..", "data", "08_reporting", "thread_numbers.png").as_posix()
)

### Average engagement per week (likes, quotes, retweets, impressions)

#### Likes

In [42]:
# Area chart of the weekly mean like count, drawn on a logarithmic y-axis.
fig_avg_like = px.area(
    x=thread_week.index,
    y=thread_week["like_count mean"],
    log_y=True,
)

# Centered title, labelled axes, fixed canvas size.
fig_avg_like.update_layout(
    title_text="Average thread likes per week (log scale)",
    title_x=0.5,
    title_xanchor="center",
    title_yanchor="top",
    title_font_size=24,
    xaxis_title_text="Time",
    xaxis_title_font_size=16,
    yaxis_title_text="Avg likes",
    yaxis_title_font_size=16,
    width=900,
    height=500,
    template="plotly_white",
)

# Save the figure as a PNG in the reporting folder.
fig_avg_like.write_image(
    Path("..", "data", "08_reporting", "avg_likes_log.png").as_posix()
)

In [41]:
# Area chart of the weekly mean like count (linear y-axis).
fig_avg_like = px.area(x=thread_week.index, y=thread_week["like_count mean"])

# Centered title, labelled axes, fixed canvas size.
fig_avg_like.update_layout(
    title_text="Average thread likes per week",
    title_x=0.5,
    title_xanchor="center",
    title_yanchor="top",
    title_font_size=24,
    xaxis_title_text="Time",
    xaxis_title_font_size=16,
    yaxis_title_text="Avg likes",
    yaxis_title_font_size=16,
    width=900,
    height=500,
    template="plotly_white",
)

# Save the figure as a PNG in the reporting folder.
fig_avg_like.write_image(
    Path("..", "data", "08_reporting", "avg_likes.png").as_posix()
)

#### Quotes

In [45]:
# Area chart of the weekly mean quote count.
# NOTE: the variable is still called `fig_avg_like` although it holds the
# quotes figure here; the name is kept so the notebook namespace is unchanged.
fig_avg_like = px.area(x=thread_week.index, y=thread_week["quote_count mean"])

# Centered title, labelled axes, fixed canvas size.
fig_avg_like.update_layout(
    title_text="Average thread quotes per week",
    title_x=0.5,
    title_xanchor="center",
    title_yanchor="top",
    title_font_size=24,
    xaxis_title_text="Time",
    xaxis_title_font_size=16,
    yaxis_title_text="Avg quotes",
    yaxis_title_font_size=16,
    width=900,
    height=500,
    template="plotly_white",
)

# Save the figure as a PNG in the reporting folder.
fig_avg_like.write_image(
    Path("..", "data", "08_reporting", "avg_quote.png").as_posix()
)

#### Retweets

In [46]:
# Area chart of the weekly mean retweet count (linear y-axis).
# NOTE: the variable is still called `fig_avg_like` although it holds the
# retweets figure here; the name is kept so the notebook namespace is unchanged.
fig_avg_like = px.area(x=thread_week.index, y=thread_week["retweet_count mean"])

# Centered title, labelled axes, fixed canvas size.
fig_avg_like.update_layout(
    title_text="Average thread retweets per week",
    title_x=0.5,
    title_xanchor="center",
    title_yanchor="top",
    title_font_size=24,
    xaxis_title_text="Time",
    xaxis_title_font_size=16,
    yaxis_title_text="Avg retweets",
    yaxis_title_font_size=16,
    width=900,
    height=500,
    template="plotly_white",
)

# Save the figure as a PNG in the reporting folder.
fig_avg_like.write_image(
    Path("..", "data", "08_reporting", "avg_retweet.png").as_posix()
)

In [47]:
# Area chart of the weekly mean retweet count on a logarithmic y-axis.
# NOTE: the variable is still called `fig_avg_like` although it holds the
# retweets figure here; the name is kept so the notebook namespace is unchanged.
fig_avg_like = px.area(
    x=thread_week.index,
    y=thread_week["retweet_count mean"],
    log_y=True,
)

# Centered title, labelled axes, fixed canvas size.
fig_avg_like.update_layout(
    title_text="Average thread retweets per week (log scale)",
    title_x=0.5,
    title_xanchor="center",
    title_yanchor="top",
    title_font_size=24,
    xaxis_title_text="Time",
    xaxis_title_font_size=16,
    yaxis_title_text="Avg retweets",
    yaxis_title_font_size=16,
    width=900,
    height=500,
    template="plotly_white",
)

# Save the figure as a PNG in the reporting folder.
fig_avg_like.write_image(
    Path("..", "data", "08_reporting", "avg_retweet_log.png").as_posix()
)

#### Impressions

In [48]:
# Area chart of the weekly mean impression count (linear y-axis).
# NOTE: the variable is still called `fig_avg_like` although it holds the
# impressions figure here; the name is kept so the notebook namespace is unchanged.
fig_avg_like = px.area(x=thread_week.index, y=thread_week["impression_count mean"])

# Centered title, labelled axes, fixed canvas size.
fig_avg_like.update_layout(
    title_text="Average thread impressions per week",
    title_x=0.5,
    title_xanchor="center",
    title_yanchor="top",
    title_font_size=24,
    xaxis_title_text="Time",
    xaxis_title_font_size=16,
    yaxis_title_text="Avg impressions",
    yaxis_title_font_size=16,
    width=900,
    height=500,
    template="plotly_white",
)

# Save the figure as a PNG in the reporting folder.
fig_avg_like.write_image(
    Path("..", "data", "08_reporting", "avg_impressions.png").as_posix()
)

In [49]:
# Area chart of the weekly mean impression count on a logarithmic y-axis.
# NOTE: the variable is still called `fig_avg_like` although it holds the
# impressions figure here; the name is kept so the notebook namespace is unchanged.
fig_avg_like = px.area(
    x=thread_week.index,
    y=thread_week["impression_count mean"],
    log_y=True,
)

# Centered title, labelled axes, fixed canvas size.
fig_avg_like.update_layout(
    title_text="Average thread impressions per week (log scale)",
    title_x=0.5,
    title_xanchor="center",
    title_yanchor="top",
    title_font_size=24,
    xaxis_title_text="Time",
    xaxis_title_font_size=16,
    yaxis_title_text="Avg impressions",
    yaxis_title_font_size=16,
    width=900,
    height=500,
    template="plotly_white",
)

# Save the figure as a PNG in the reporting folder.
fig_avg_like.write_image(
    Path("..", "data", "08_reporting", "avg_impressions_log.png").as_posix()
)

### Most viral threads

In [55]:
# Show the threads ordered from most to least liked.
df_threads.sort_values("like_count", ascending=False)

Unnamed: 0,id,created_at,text,retweet_count,reply_count,like_count,quote_count,impression_count
64,1589890468594167809,2022-11-08 08:00:01+00:00,"16 year old kids are making $300,000/month wit...",3968,627,17285,120,0
21,1612434013393948677,2023-01-09 13:00:01+00:00,ChatGPT is a FREE employee.\n\nBut most people...,3660,387,15464,92,1747071
47,1597863000697733120,2022-11-30 08:00:00+00:00,"Kids are making $20,000/month with no-code.\n\...",3075,299,12721,47,0
41,1601554825778319360,2022-12-10 12:30:00+00:00,"21 year olds are making $125,000/month using o...",2196,378,9329,47,0
0,1618881501025943553,2023-01-27 08:00:01+00:00,ChatGPT is the future.\n\nBut most people are ...,2220,199,8139,36,659137
...,...,...,...,...,...,...,...,...
142,1476750993245278215,2021-12-31 03:04:08+00:00,Here's a curated list of 3 high quality must-f...,0,2,2,1,0
149,1466703155404722183,2021-12-03 09:37:36+00:00,"One week ago, I jumped straight into becoming ...",0,1,2,0,0
135,1484601764338884612,2022-01-21 19:00:17+00:00,I gathered the best ressources for your Notion...,0,1,1,0,0
132,1486625054666469383,2022-01-27 09:00:07+00:00,3 𝗡𝗼𝘁𝗶𝗼𝗻 𝗛𝗮𝗰𝗸𝘀 (𝗔 𝗡𝗼𝘁𝗶𝗼𝗻 𝗖𝗵𝗲𝗮𝘁 𝗦𝗵𝗲𝗲𝘁) 👇,2,1,1,0,0


### Comparing threads between Jun-Nov 2022 and Nov 2022 - Jan 2023

In [70]:
# Period 1: threads created on or after 2022-06-01 and before 2022-11-01.
# Only the text and id columns are kept, with a fresh 0-based index.
thread_period_1 = df_threads.loc[
    lambda frame: (frame["created_at"] >= "2022-06-01")
    & (frame["created_at"] < "2022-11-01"),
    ["text", "id"],
].reset_index(drop=True)

# Period 2: everything created on or after 2022-11-01.
thread_period_2 = df_threads.loc[
    lambda frame: frame["created_at"] >= "2022-11-01",
    ["text", "id"],
].reset_index(drop=True)

In [71]:
# Write the first-period threads to the intermediate data folder.
thread_period_1.to_csv(
    Path("..", "data", "02_intermediate", "threads1.csv").as_posix()
)

In [72]:
# Write the second-period threads to the intermediate data folder.
thread_period_2.to_csv(
    Path("..", "data", "02_intermediate", "threads2.csv").as_posix()
)

#### Comparing thread categories between Jun-Nov 2022 and Nov 2022 - Jan 2023
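To run the code below, first save the `threads1` and `threads2` sheets of this spreadsheet as `threads1.csv` and `threads2.csv` in `data/03_primary/` (the cells below read them from there and expect a `type` column holding each thread's category): https://docs.google.com/spreadsheets/d/1O5td_vQY_ofjIaO1jaX2lfA65YLlXbfG3KEzbv0gGFw/edit?usp=sharing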

In [130]:
# Load the two per-period sheets from the primary data folder, keeping
# the thread ids as strings.
period_1 = pd.read_csv(
    Path("..", "data", "03_primary", "threads1.csv").as_posix(),
    dtype={"id": str},
)
period_2 = pd.read_csv(
    Path("..", "data", "03_primary", "threads2.csv").as_posix(),
    dtype={"id": str},
)

In [131]:
# Attach the thread metrics to each period sheet: both sides are indexed by
# "id" and combined column-wise, keeping only ids present in both.
# NOTE(review): this cell overwrites period_1/period_2 with values derived from
# themselves, so it is meant to run once right after the loading cell.
# NOTE(review): the first label says "Jul" while the date filter starts on
# 2022-06-01 -- confirm which one is intended.
period_1 = (
    pd.concat(
        [
            period_1.set_index("id"),
            df_threads.loc[
                lambda frame: (frame["created_at"] >= "2022-06-01")
                & (frame["created_at"] < "2022-11-01")
            ].set_index("id"),
        ],
        axis=1,
        join="inner",
    )
    .reset_index()
    .assign(period="Jul-Nov 2022")
)
period_2 = (
    pd.concat(
        [
            period_2.set_index("id"),
            df_threads.loc[
                lambda frame: frame["created_at"] >= "2022-11-01"
            ].set_index("id"),
        ],
        axis=1,
        join="inner",
    )
    .reset_index()
    .assign(period="Nov 2022-present")
)

In [132]:
# Stack the two labelled periods row-wise into a single frame.
both_periods = pd.concat((period_1, period_2), axis=0)

In [137]:
# Per (period, type) group: number of threads and mean like count.
grouped = both_periods.groupby(by=["period", "type"]).agg(
    {
        "id": ["count"],
        "like_count": ["mean"],
    }
)

In [139]:
# Flatten the (column, aggregation) pairs into single names such as
# "like_count mean". Names that are already plain strings are left untouched,
# so re-running this cell does not split them into space-separated characters.
grouped.columns = [
    " ".join(col).strip() if isinstance(col, tuple) else col
    for col in grouped.columns.values
]

In [142]:
# Turn the group keys in the index back into ordinary columns.
grouped = grouped.reset_index(drop=False)

In [149]:
# Thread counts per "type", with one bar per period drawn side by side.
fig = px.histogram(
    both_periods,
    x="type",
    color="period",
    barmode="group",
)

# Centered title, labelled axes, fixed canvas size.
fig.update_layout(
    title_text="Number of threads per category and period",
    title_x=0.5,
    title_xanchor="center",
    title_yanchor="top",
    title_font_size=24,
    xaxis_title_text="category",
    xaxis_title_font_size=16,
    yaxis_title_text="count",
    yaxis_title_font_size=16,
    width=1400,
    height=700,
    template="plotly_white",
)

# Save the figure as a PNG in the reporting folder.
fig.write_image(
    Path("..", "data", "08_reporting", "thread_cat_count.png").as_posix()
)

In [153]:
# Split the aggregated table by period label.
grouped_period_1 = grouped.loc[grouped["period"].eq("Jul-Nov 2022")]
grouped_period_2 = grouped.loc[grouped["period"].eq("Nov 2022-present")]

In [156]:
# Bar chart of the mean like count per "type" for the first period.
fig = px.bar(
    grouped_period_1,
    x="type",
    y="like_count mean",
)

# Centered title, labelled axes, fixed canvas size.
fig.update_layout(
    title_text="AVG thread likes per category - Jul-Nov 2022",
    title_x=0.5,
    title_xanchor="center",
    title_yanchor="top",
    title_font_size=24,
    xaxis_title_text="category",
    xaxis_title_font_size=16,
    yaxis_title_text="AVG likes",
    yaxis_title_font_size=16,
    width=1400,
    height=700,
    template="plotly_white",
)

# Save the figure as a PNG in the reporting folder.
fig.write_image(
    Path("..", "data", "08_reporting", "thread_cat_avg_period_1.png").as_posix()
)

In [157]:
# Bar chart of the mean like count per "type" for the second period.
fig = px.bar(
    grouped_period_2,
    x="type",
    y="like_count mean",
)

# Centered title, labelled axes, fixed canvas size.
fig.update_layout(
    title_text="AVG thread likes per category - Nov 2022-present",
    title_x=0.5,
    title_xanchor="center",
    title_yanchor="top",
    title_font_size=24,
    xaxis_title_text="category",
    xaxis_title_font_size=16,
    yaxis_title_text="AVG likes",
    yaxis_title_font_size=16,
    width=1400,
    height=700,
    template="plotly_white",
)

# Save the figure as a PNG in the reporting folder.
fig.write_image(
    Path("..", "data", "08_reporting", "thread_cat_avg_period_2.png").as_posix()
)