In [44]:
import pandas as pd

## Datasets & Cleaning

### Twitch Viewership Data from TwitchTracker

In [45]:
cs_twitch_data = pd.read_csv("../assets/raw/TwitchTracker Counter-Strike.csv")
cs2_twitch_data = pd.read_csv("../assets/raw/TwitchTracker Counter-Strike 2.csv")
cscz_twitch_data = pd.read_csv("../assets/raw/TwitchTracker Counter-Strike Condition Zero.csv")
css_twitch_data = pd.read_csv("../assets/raw/TwitchTracker Counter-Strike Source.csv")

twitch_fields = ['Month', 'Avg Viewers', 'Gain', 'Peak Viewers', 'Avg Streams', 'Gain.1', 'Peak Streams', 'Hours Watched']

In [46]:
twitch_data_merged = pd.concat([cs_twitch_data, cs2_twitch_data, cscz_twitch_data, css_twitch_data], ignore_index=True)
twitch_data_merged = twitch_data_merged[twitch_fields]
twitch_data_merged

Unnamed: 0,Month,Avg Viewers,Gain,Peak Viewers,Avg Streams,Gain.1,Peak Streams,Hours Watched
0,Nov-16,18.0,-,164,2.0,-,7.0,3.2K
1,Dec-16,20.0,2,509,2.0,-,10.0,8.0K
2,Jan-17,19.0,-1,543,4.0,2,13.0,8.8K
3,Feb-17,30.0,11,5782,2.0,-2,11.0,16.0K
4,Mar-17,18.0,-12,815,2.0,-,10.0,6.9K
...,...,...,...,...,...,...,...,...
316,Jan-17,18,-1,215,1,-,3,357
317,Dec-16,19,4,216,1,-,3,946
318,Nov-16,15,-,147,1,-,3,336
319,Aug-25,14,-,1213,2,-,8,4.4K


In [47]:
# Transforming Hours Watched format
def convert_k(val):
    if isinstance(val, str):
        if val.endswith("K"):
            return float(val[:-1].replace(".", ""))* 1000
        elif val.endswith("M"):
            return float(val[:-1].replace(".", "")) * 1000000
    return float(val)

twitch_data_merged["Hours Watched"] = twitch_data_merged["Hours Watched"].map(convert_k)
twitch_data_merged

Unnamed: 0,Month,Avg Viewers,Gain,Peak Viewers,Avg Streams,Gain.1,Peak Streams,Hours Watched
0,Nov-16,18.0,-,164,2.0,-,7.0,32000.0
1,Dec-16,20.0,2,509,2.0,-,10.0,80000.0
2,Jan-17,19.0,-1,543,4.0,2,13.0,88000.0
3,Feb-17,30.0,11,5782,2.0,-2,11.0,160000.0
4,Mar-17,18.0,-12,815,2.0,-,10.0,69000.0
...,...,...,...,...,...,...,...,...
316,Jan-17,18,-1,215,1,-,3,357.0
317,Dec-16,19,4,216,1,-,3,946.0
318,Nov-16,15,-,147,1,-,3,336.0
319,Aug-25,14,-,1213,2,-,8,44000.0


In [48]:
numeric_fields = [x for x in twitch_fields if x != "Month"]
twitch_data_merged[numeric_fields] = twitch_data_merged[numeric_fields].replace({",": "", "-": "0"}, regex=True).astype(float)

twitch_data_merged["Month"] = pd.to_datetime(twitch_data_merged["Month"], format="%b-%y")
twitch_data_merged = twitch_data_merged.sort_values("Month")
twitch_data_merged = twitch_data_merged.rename(columns={"Month": "month"})
twitch_data_merged.head()

Unnamed: 0,month,Avg Viewers,Gain,Peak Viewers,Avg Streams,Gain.1,Peak Streams,Hours Watched
0,2016-11-01,18.0,0.0,164.0,2.0,0.0,7.0,32000.0
318,2016-11-01,15.0,0.0,147.0,1.0,0.0,3.0,336.0
212,2016-11-01,41089.0,0.0,236838.0,800.0,0.0,1723.0,305000000.0
211,2016-12-01,97356.0,56267.0,1153507.0,937.0,137.0,2172.0,721000000.0
1,2016-12-01,20.0,2.0,509.0,2.0,0.0,10.0,80000.0


In [49]:
twitch_data_merged[twitch_data_merged['month'].dt.year == 2023]

Unnamed: 0,month,Avg Viewers,Gain,Peak Viewers,Avg Streams,Gain.1,Peak Streams,Hours Watched
244,2023-01-01,3.0,1.0,45.0,1.0,0.0,4.0,319.0
74,2023-01-01,131.0,15.0,2109.0,13.0,1.0,36.0,875000.0
138,2023-01-01,82042.0,29722.0,576269.0,1176.0,80.0,2245.0,545000000.0
75,2023-02-01,139.0,8.0,7415.0,13.0,0.0,38.0,103000.0
137,2023-02-01,88932.0,6890.0,883067.0,1340.0,164.0,2957.0,655000000.0
243,2023-02-01,2.0,1.0,30.0,1.0,0.0,3.0,412.0
136,2023-03-01,112611.0,23679.0,579500.0,1580.0,240.0,3049.0,808000000.0
242,2023-03-01,2.0,0.0,66.0,1.0,0.0,4.0,536.0
76,2023-03-01,152.0,13.0,9555.0,13.0,0.0,35.0,109000.0
77,2023-04-01,146.0,6.0,5348.0,13.0,0.0,33.0,109000.0


In [51]:
twitch_data_merged_grouped = twitch_data_merged.groupby(twitch_data_merged.columns[0]).sum()
twitch_data_merged_grouped = twitch_data_merged_grouped.reset_index()
twitch_data_merged_grouped = twitch_data_merged_grouped.rename(columns={"Gain": "gain viewers", "Peak Viewers": "peak viewers", "Peak Streams": "peak streams", "Gain.1": "gain streams"})
twitch_data_merged_grouped.columns = twitch_data_merged_grouped.columns.str.lower()

twitch_data_merged_grouped['gain viewers'] = twitch_data_merged_grouped['peak viewers']-twitch_data_merged_grouped['peak viewers'].shift(1) # Recalculating gain to be based on peak views rather than average
twitch_data_merged_grouped['gain streams'] = twitch_data_merged_grouped['peak streams']-twitch_data_merged_grouped['peak streams'].shift(1) # Recalculating gain to be based on peak views rather than average
twitch_data_merged_grouped.iloc[0, [2, 5]] = 0 # Setting the first value of first row of gains to 0

twitch_data_merged_grouped.to_csv("../assets/clean/Twitch_Counter-Strike_Clean.csv", encoding="utf-8", index=False, header=True)
twitch_data_merged_grouped

Unnamed: 0,month,avg viewers,gain viewers,peak viewers,avg streams,gain streams,peak streams,hours watched
0,2016-11-01,41122.0,0.0,237149.0,803.0,0.0,1733.0,305032336.0
1,2016-12-01,97395.0,917083.0,1154232.0,940.0,452.0,2185.0,721080946.0
2,2017-01-01,60177.0,-692895.0,461337.0,1014.0,-88.0,2097.0,404088357.0
3,2017-02-01,49606.0,51119.0,512456.0,946.0,3.0,2100.0,368160397.0
4,2017-03-01,50868.0,-94180.0,418276.0,879.0,-270.0,1830.0,364069433.0
...,...,...,...,...,...,...,...,...
101,2025-04-01,99822.0,-105695.0,722031.0,1601.0,-252.0,3272.0,741902172.0
102,2025-05-01,117782.0,424796.0,1146827.0,1481.0,-219.0,3053.0,848466647.0
103,2025-06-01,86343.0,-590801.0,556026.0,1519.0,-55.0,2998.0,642565317.0
104,2025-07-01,91794.0,178269.0,734295.0,1558.0,46.0,3044.0,682532909.0
