In [1]:
import pandas as pd

## Datasets & Cleaning

### Call of Duty

In [2]:
# Reading every raw SteamDB export in one pass: the n-th name on the left
# is bound to the frame read from the n-th path on the right.
(
    codbo3_data,
    codiw_data,
    codwwii_data,
    codbo4_data,
    codmw_data,
    codbocw_data,
    codv_data,
    codlast3games_data,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../assets/raw/SteamDB Call of Duty Black Ops 3.csv",
        "../assets/raw/SteamDB Call of Duty Infinite Warfare.csv",
        "../assets/raw/SteamDB Call of Duty WWII.csv",
        "../assets/raw/SteamDB Call of Duty Black Ops 4.csv",
        "../assets/raw/SteamDB Call of Duty Modern Warfare.csv",
        "../assets/raw/SteamDB Call of Duty Black Ops Cold War.csv",
        "../assets/raw/SteamDB Call of Duty Vanguard.csv",
        "../assets/raw/SteamDB Call of Duty last3games.csv",
    )
)

In [3]:
fields_to_keep = ['month', 'peak', 'gain', '% gain'] # Fields we want to keep
list_of_cod_data = [codbo3_data, codiw_data, codwwii_data, codbo4_data, codmw_data, codbocw_data, codv_data, codlast3games_data] # Raw datasets, one per loaded CSV

# Cleaning the datasets in list_of_cod_data: each entry of the list is
# replaced by its cleaned counterpart.
for i, df in enumerate(list_of_cod_data):
    # Lower-case the headers so they match fields_to_keep.
    # NOTE: this renames the columns of the raw frame itself (in place).
    df.columns = df.columns.str.lower()
    # Cells that are exactly "-" become "0" so the float conversion below succeeds
    df = df.replace("-", "0")
    # Strip "," and "%" from the numeric columns, then convert them to float
    df[['peak', 'gain', '% gain']] = df[['peak', 'gain', '% gain']].replace({",": "", "%": ""}, regex=True).astype(float)
    # Getting rid of first row of Last 30 days. The explicit .copy() makes the
    # result an independent frame, so assigning the "month" column below does
    # not trigger pandas' SettingWithCopyWarning.
    df = df[fields_to_keep].iloc[1:].copy()
    df["month"] = pd.to_datetime(df["month"], format="%b-%y")
    df = df.sort_values("month")
    list_of_cod_data[i] = df


In [4]:
# Stack all cleaned per-game frames into a single frame with a fresh 0..n-1
# index, keeping only the columns listed in fields_to_keep (in that order).
cod_merged = pd.concat(
    list_of_cod_data,
    ignore_index=True,
)[fields_to_keep]

In [5]:
# Grouping by month and summing the numeric columns (peak, gain, % gain)
# across all datasets, then restoring "month" as a regular column in
# chronological order.
SteamDB_cod_merged_grouped = (
    cod_merged
    .groupby("month")
    .sum()
    .reset_index()
    .sort_values("month")
)

# Recalculate % gain from the summed peaks: the summed per-dataset percentages
# are not meaningful, so each month is compared with the previous month's peak.
# NOTE(review): "gain" is left as the sum of the per-dataset gains and is not
# recomputed from the summed peak -- confirm this is intended.
previous_peak = SteamDB_cod_merged_grouped['peak'].shift(1)
SteamDB_cod_merged_grouped['% gain'] = (
    (SteamDB_cod_merged_grouped['peak'] - previous_peak) * 100 / previous_peak
).round(1)

# Setting the first % gain to 0 (it has no previous month to compare with).
# Label-based access keeps this correct if the column order ever changes, and
# the guard avoids an IndexError when there are no rows at all.
if not SteamDB_cod_merged_grouped.empty:
    SteamDB_cod_merged_grouped.loc[SteamDB_cod_merged_grouped.index[0], '% gain'] = 0

# Keeping only rows from June 01 2020 onwards (earlier months are dropped)
SteamDB_cod_merged_grouped = SteamDB_cod_merged_grouped[SteamDB_cod_merged_grouped["month"] >= "2020-06-01"]

In [6]:
# Write the grouped monthly frame to the clean-assets folder as UTF-8 CSV,
# with a header row and without the DataFrame index.
SteamDB_cod_merged_grouped.to_csv(
    "../assets/clean/SteamDB_Call_of_Duty_Clean.csv",
    index=False,
    header=True,
    encoding="utf-8",
)