In [1]:
import pandas as pd

## Datasets & Cleaning

### Twitch Viewership Data from TwitchTracker

In [2]:
# Loading the datasets: one TwitchTracker CSV export per Counter-Strike title,
# read in the same order as the names on the left-hand side.
cs_twitch_data, cs2_twitch_data, cscz_twitch_data, css_twitch_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../assets/raw/TwitchTracker Counter-Strike.csv",
        "../assets/raw/TwitchTracker Counter-Strike 2.csv",
        "../assets/raw/TwitchTracker Counter-Strike Condition Zero.csv",
        "../assets/raw/TwitchTracker Counter-Strike Source.csv",
    )
)

In [3]:
# Fields of interest
twitch_fields = [
    'Month',
    'Avg Viewers',
    'Gain',
    'Peak Viewers',
    'Avg Streams',
    'Gain.1',
    'Peak Streams',
    'Hours Watched',
]

# Merging and cleaning the datasets: stack the per-title frames on a fresh
# 0..n-1 index, then keep only the fields of interest (in the order above).
per_title_frames = [cs_twitch_data, cs2_twitch_data, cscz_twitch_data, css_twitch_data]
twitch_data_merged = pd.concat(per_title_frames, ignore_index=True)[twitch_fields]

In [4]:
# Transforming Hours Watched format to float
def convert_k(val):
    """Convert an abbreviated count such as ``"1.5K"`` or ``"3.29M"`` to a float.

    Parameters
    ----------
    val : str or number
        A string ending in ``"K"`` (thousands) or ``"M"`` (millions), a plain
        numeric string, or an already-numeric value.

    Returns
    -------
    float
        The value scaled by its suffix; values without a recognised suffix
        are passed through ``float()`` unchanged.

    Raises
    ------
    ValueError
        If the numeric part cannot be parsed by ``float()``.
    """
    suffix_multipliers = {"K": 1000, "M": 1000000}
    if isinstance(val, str):
        for suffix, multiplier in suffix_multipliers.items():
            if val.endswith(suffix):
                # Keep the decimal point: "1.5K" is 1500, not 15000. Stripping
                # "." made the scale depend on how many decimals were present.
                return float(val[:-1]) * multiplier
    return float(val)

# Parse every "Hours Watched" entry with convert_k and write the column back.
hours_watched_raw = twitch_data_merged["Hours Watched"]
twitch_data_merged["Hours Watched"] = hours_watched_raw.map(convert_k)

In [5]:
# Further cleaning
numeric_fields = [x for x in twitch_fields if x != "Month"]

# Strip "," (presumably a thousands separator) and turn a cell that holds only
# "-" (presumably a missing-value placeholder) into "0" before casting.
# The dash pattern is anchored on purpose: an unanchored "-" would also rewrite
# the minus sign of negative numbers ("-1,234" -> "01234" -> 1234.0).
twitch_data_merged[numeric_fields] = (
    twitch_data_merged[numeric_fields]
    .replace({",": "", r"^\s*-\s*$": "0"}, regex=True)
    .astype(float)
)

# Converting Month to datetime and sorting
twitch_data_merged["Month"] = pd.to_datetime(twitch_data_merged["Month"], format="%b-%y")
twitch_data_merged = twitch_data_merged.sort_values("Month")
twitch_data_merged = twitch_data_merged.rename(columns={"Month": "month"})

In [6]:
# Grouping by month: sum every numeric column over the rows that share the
# value of the first column, then restore that column as a regular field.
month_key = twitch_data_merged.columns[0]
twitch_data_merged_grouped = (
    twitch_data_merged
    .groupby(month_key)
    .sum()
    .reset_index()
    .rename(columns={"Gain": "gain viewers", "Gain.1": "gain streams"})
)
twitch_data_merged_grouped.columns = twitch_data_merged_grouped.columns.str.lower()

# Recalculating gains as the row-over-row change of the peak columns
# (peak viewers / peak streams) rather than of the averages.
for gain_col, peak_col in (
    ("gain viewers", "peak viewers"),
    ("gain streams", "peak streams"),
):
    twitch_data_merged_grouped[gain_col] = twitch_data_merged_grouped[peak_col].diff()

# The first row has no predecessor, so its gains are set to 0.
# Positions 2 and 5 are the "gain viewers" and "gain streams" columns.
twitch_data_merged_grouped.iloc[0, [2, 5]] = 0

In [7]:
# Saving the cleaned data (header row included, index column omitted)
clean_csv_path = "../assets/clean/Twitch_Counter-Strike_Clean.csv"
twitch_data_merged_grouped.to_csv(
    clean_csv_path,
    index=False,
    header=True,
    encoding="utf-8",
)