In [1]:
import pandas as pd

## Datasets & Cleaning

### Rainbow Six Siege

In [2]:
# Read the raw Rainbow Six Siege CSV export into a DataFrame
rss_data = pd.read_csv(
    filepath_or_buffer="../assets/raw/GG Rainbow Six Siege.csv",
)

In [3]:
fields_to_keep = ['month', 'peak', 'gain', '% gain'] # Fields we want to keep
numeric_fields = ['peak', 'gain', '% gain'] # Fields that must be converted to floats

# Cleaning the dataset
# Normalise column names: "Peak Players" becomes "peak", everything else is lower-cased
rss_data = rss_data.rename(columns={"Peak Players": "peak"})
rss_data.columns = rss_data.columns.str.lower()

# Cells that consist solely of a dash (short - or long —) stand for "no value" -> "0".
# This is a whole-cell match (no regex), so negative numbers such as "-123" are untouched.
rss_data = rss_data.replace("-", "0").replace("—", "0")

# Strip thousands separators and percent signs, then convert to float
rss_data[numeric_fields] = (
    rss_data[numeric_fields]
    .replace({",": "", "%": ""}, regex=True)
    .astype(float)
)

# Keep only the wanted fields and get rid of the first row (Last 30 days).
# .copy() makes the result an independent frame, so the column assignment below
# does not write into a slice of a temporary (SettingWithCopyWarning on pandas < 3).
rss_data = rss_data[fields_to_keep].iloc[1:].copy()

# Parse month labels with the "%B %Y" format (full month name + year) and order chronologically
rss_data["month"] = pd.to_datetime(rss_data["month"], format="%B %Y")
rss_data = rss_data.sort_values("month")


In [4]:
# Recalculating gain to be based on peak players rather than unique players
previous_peak = rss_data['peak'].shift(1) # Peak of the preceding row (NaN for the first row)
rss_data['gain'] = rss_data['peak'] - previous_peak
rss_data['% gain'] = (rss_data['gain'] * 100 / previous_peak).round(1) # Recalculating gain %

# Further cleaning
# Dropping the first row that has only 1 player; .copy() gives an independent frame
# so the in-place write below does not target a slice (SettingWithCopyWarning on pandas < 3).
rss_data = rss_data.iloc[1:].copy()

# Setting the first value of gain and % gain to 0.
# Columns are resolved by label rather than by position, so this no longer
# depends on the column order of the frame.
gain_positions = rss_data.columns.get_indexer(['gain', '% gain'])
rss_data.iloc[0, gain_positions] = 0

# Keeping only rows from June 01 2020 onward (earlier months are filtered out).
# NOTE(review): the zeroing above is applied before this filter, so if the data starts
# before June 2020 the zeroed row is removed here — confirm this is the intended order.
rss_data = rss_data[rss_data["month"] >= "2020-06-01"]

In [5]:
# Write the cleaned frame to disk as UTF-8 CSV, keeping the header row and omitting the index
rss_data.to_csv(
    "../assets/clean/GG_Rainbow_Six_Siege_Clean.csv",
    index=False,
    header=True,
    encoding="utf-8",
)