# Notebook for Data Cleaning and Imputation

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from utils.data_processing_utils import london_cleaner, hamburg_cleaner, stockholm_cleaner, boston_cleaner, chicago_cleaner, houston_cleaner, save_df, valid_df
# from utils.data_processing_utils import *

## Constants 

### Paths

In [2]:
# Root folders for the raw exports and for the cleaned outputs (relative paths).
RAW_DATA_PATH: str = "Marathons_Data/Raw"
CLN_DATA_PATH: str = "Marathons_Data/Clean"

# Every city below gets three constants: its name, and "<root>/<City>" for
# the raw and the clean root respectively.

# London
LDN: str = "London"
LDN_RAW_PATH: str = RAW_DATA_PATH + "/" + LDN
LDN_CLN_PATH: str = CLN_DATA_PATH + "/" + LDN

# Hamburg
HAM: str = "Hamburg"
HAM_RAW_PATH: str = RAW_DATA_PATH + "/" + HAM
HAM_CLN_PATH: str = CLN_DATA_PATH + "/" + HAM

# Houston
HOU: str = "Houston"
HOU_RAW_PATH: str = RAW_DATA_PATH + "/" + HOU
HOU_CLN_PATH: str = CLN_DATA_PATH + "/" + HOU

# Stockholm
STO: str = "Stockholm"
STO_RAW_PATH: str = RAW_DATA_PATH + "/" + STO
STO_CLN_PATH: str = CLN_DATA_PATH + "/" + STO

# Boston
BOS: str = "Boston"
BOS_RAW_PATH: str = RAW_DATA_PATH + "/" + BOS
BOS_CLN_PATH: str = CLN_DATA_PATH + "/" + BOS

# Chicago
CHI: str = "Chicago"
CHI_RAW_PATH: str = RAW_DATA_PATH + "/" + CHI
CHI_CLN_PATH: str = CLN_DATA_PATH + "/" + CHI

### Years & Splits

In [3]:
# Race years, kept as strings because they are spliced directly into the
# directory and file names built further down in this notebook.
YEAR_13: str = "2013"
YEAR_14: str = "2014"
YEAR_15: str = "2015"
YEAR_16: str = "2016"
YEAR_17: str = "2017"
YEAR_18: str = "2018"
YEAR_19: str = "2019"
YEAR_21: str = "2021"
YEAR_22: str = "2022"
YEAR_23: str = "2023"
YEARS: list[str] = [YEAR_13, YEAR_14, YEAR_15, YEAR_16, YEAR_17, YEAR_18, YEAR_19, YEAR_21, YEAR_22, YEAR_23]

# Split keys; each one is the prefix of that split's `_time`, `_pace` and
# `_speed` columns in COLS_ORDER.
SPLITS_KEYS: list[str] = ["k_5", "k_10", "k_15", "k_20", "k_half", "k_25", "k_30", "k_35", "k_40", "k_finish"]

# Column order handed to the city cleaners: four descriptive columns followed
# by time/pace/speed for every split. The per-split part is derived from
# SPLITS_KEYS so the two constants cannot drift apart; the resulting list is
# identical to the previously hand-written one.
COLS_ORDER: list[str] = ["age_cat", "gender", "race_state", "last_split"] + [
    f"{split_key}_{metric}"
    for split_key in SPLITS_KEYS
    for metric in ("time", "pace", "speed")
]

# Maps each `<split>_time` column name to a short split label; passed to the
# Hamburg, Houston and Stockholm cleaners.
# NOTE(review): the insertion order puts 'k_25_time' before 'k_half_time',
# unlike SPLITS_KEYS. It is kept as-is here -- confirm whether any cleaner
# relies on the order before changing it.
SPLIT_NAME_DICT: dict[str, str] = {
    'k_5_time': '5K',
    'k_10_time': '10K',
    'k_15_time': '15K',
    'k_20_time': '20K',
    'k_25_time': '25K',
    'k_half_time': 'HALF',
    'k_30_time': '30K',
    'k_35_time': '35K',
    'k_40_time': '40K',
    'k_finish_time': 'Finish time',
}

## London

In [4]:
# Column names passed to `london_cleaner` as the columns to drop.
LDN_COLS_TO_DROP: list[str] = [
    "idp",
    "half",
    "finish",
    "run_no",
]

### 2014

In [5]:
# London 2014: raw input CSV and cleaned output CSV.
ldn_file_raw_path = "/".join([LDN_RAW_PATH, LDN + YEAR_14, LDN + YEAR_14 + "_full.csv"])
ldn_file_cln_path = "/".join([LDN_CLN_PATH, LDN + YEAR_14, LDN + YEAR_14 + "_clean.csv"])

#### Cleaning

In [6]:
# Read the raw results from `ldn_file_raw_path` and clean them in a single step.
df_ldn = london_cleaner(
    pd.read_csv(ldn_file_raw_path),
    SPLITS_KEYS,
    LDN_COLS_TO_DROP,
    COLS_ORDER,
)
# Write the cleaned frame only when it passes `valid_df`; otherwise nothing is saved.
if valid_df(df_ldn):
    save_df(df_ldn, ldn_file_cln_path)
    print(f"** File has been saved in: `{ldn_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 41677 || New rows count: 36289 || Dropped Rows: 5388
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 36289 || New rows count: 36283 || Dropped rows based on   age_cat  : 6
Original rows count: 36283 || New rows count: 36283 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80-84', '80+', '85+' by '70+'
** File has been saved in: `Marathons_Data/Clean/London/London2014/London2014_clean.csv`


#### Imputation

In [7]:
# Unbind this year's raw/clean path variables; `df_ldn` itself is kept.
del ldn_file_raw_path
del ldn_file_cln_path

### 2015

In [None]:
# London 2015: raw input CSV and cleaned output CSV.
ldn_file_raw_path = "/".join([LDN_RAW_PATH, LDN + YEAR_15, LDN + YEAR_15 + "_full.csv"])
ldn_file_cln_path = "/".join([LDN_CLN_PATH, LDN + YEAR_15, LDN + YEAR_15 + "_clean.csv"])

#### Cleaning

In [None]:
# Read the raw results from `ldn_file_raw_path` and clean them in a single step.
df_ldn = london_cleaner(
    pd.read_csv(ldn_file_raw_path),
    SPLITS_KEYS,
    LDN_COLS_TO_DROP,
    COLS_ORDER,
)
# Write the cleaned frame only when it passes `valid_df`; otherwise nothing is saved.
if valid_df(df_ldn):
    save_df(df_ldn, ldn_file_cln_path)
    print(f"** File has been saved in: `{ldn_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 43741 || New rows count: 37879 || Dropped Rows: 5862
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 37879 || New rows count: 37853 || Dropped rows based on   age_cat  : 26
Original rows count: 37853 || New rows count: 37853 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80-84', '80+', '85+' by '70+'
** File has been saved in: `Marathons_Data/Clean/London/London2015/London2015_clean.csv`


#### Imputation

In [None]:
# Unbind this year's raw/clean path variables; `df_ldn` itself is kept.
del ldn_file_raw_path
del ldn_file_cln_path

### 2016

In [None]:
# London 2016: raw input CSV and cleaned output CSV.
ldn_file_raw_path = "/".join([LDN_RAW_PATH, LDN + YEAR_16, LDN + YEAR_16 + "_full.csv"])
ldn_file_cln_path = "/".join([LDN_CLN_PATH, LDN + YEAR_16, LDN + YEAR_16 + "_clean.csv"])

#### Cleaning

In [None]:
# Read the raw results from `ldn_file_raw_path` and clean them in a single step.
df_ldn = london_cleaner(
    pd.read_csv(ldn_file_raw_path),
    SPLITS_KEYS,
    LDN_COLS_TO_DROP,
    COLS_ORDER,
)
# Write the cleaned frame only when it passes `valid_df`; otherwise nothing is saved.
if valid_df(df_ldn):
    save_df(df_ldn, ldn_file_cln_path)
    print(f"** File has been saved in: `{ldn_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 45202 || New rows count: 39217 || Dropped Rows: 5985
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 39217 || New rows count: 39217 || Dropped rows based on   age_cat  : 0
Original rows count: 39217 || New rows count: 39217 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80-84', '80+', '85+' by '70+'
** File has been saved in: `Marathons_Data/Clean/London/London2016/London2016_clean.csv`


#### Imputation

In [None]:
# Unbind this year's raw/clean path variables; `df_ldn` itself is kept.
del ldn_file_raw_path
del ldn_file_cln_path

### 2017

In [None]:
# London 2017: raw input CSV and cleaned output CSV.
ldn_file_raw_path = "/".join([LDN_RAW_PATH, LDN + YEAR_17, LDN + YEAR_17 + "_full.csv"])
ldn_file_cln_path = "/".join([LDN_CLN_PATH, LDN + YEAR_17, LDN + YEAR_17 + "_clean.csv"])

#### Cleaning

In [None]:
# Read the raw results from `ldn_file_raw_path` and clean them in a single step.
df_ldn = london_cleaner(
    pd.read_csv(ldn_file_raw_path),
    SPLITS_KEYS,
    LDN_COLS_TO_DROP,
    COLS_ORDER,
)
# Write the cleaned frame only when it passes `valid_df`; otherwise nothing is saved.
if valid_df(df_ldn):
    save_df(df_ldn, ldn_file_cln_path)
    print(f"** File has been saved in: `{ldn_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 45155 || New rows count: 39692 || Dropped Rows: 5463
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 39692 || New rows count: 39691 || Dropped rows based on   age_cat  : 1
Original rows count: 39691 || New rows count: 39691 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80-84', '80+', '85+' by '70+'
** File has been saved in: `Marathons_Data/Clean/London/London2017/London2017_clean.csv`


#### Imputation

In [None]:
# Unbind this year's raw/clean path variables; `df_ldn` itself is kept.
del ldn_file_raw_path
del ldn_file_cln_path

### 2018

In [None]:
# London 2018: raw input CSV and cleaned output CSV.
ldn_file_raw_path = "/".join([LDN_RAW_PATH, LDN + YEAR_18, LDN + YEAR_18 + "_full.csv"])
ldn_file_cln_path = "/".join([LDN_CLN_PATH, LDN + YEAR_18, LDN + YEAR_18 + "_clean.csv"])

#### Cleaning

In [None]:
# Read the raw results from `ldn_file_raw_path` and clean them in a single step.
df_ldn = london_cleaner(
    pd.read_csv(ldn_file_raw_path),
    SPLITS_KEYS,
    LDN_COLS_TO_DROP,
    COLS_ORDER,
)
# Write the cleaned frame only when it passes `valid_df`; otherwise nothing is saved.
if valid_df(df_ldn):
    save_df(df_ldn, ldn_file_cln_path)
    print(f"** File has been saved in: `{ldn_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 47667 || New rows count: 40773 || Dropped Rows: 6894
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 40773 || New rows count: 40761 || Dropped rows based on   age_cat  : 12
Original rows count: 40761 || New rows count: 40761 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80-84', '80+', '85+' by '70+'
** File has been saved in: `Marathons_Data/Clean/London/London2018/London2018_clean.csv`


#### Imputation

In [None]:
# Unbind this year's raw/clean path variables; `df_ldn` itself is kept.
del ldn_file_raw_path
del ldn_file_cln_path

### 2019

In [None]:
# London 2019: raw input CSV and cleaned output CSV.
ldn_file_raw_path = "/".join([LDN_RAW_PATH, LDN + YEAR_19, LDN + YEAR_19 + "_full.csv"])
ldn_file_cln_path = "/".join([LDN_CLN_PATH, LDN + YEAR_19, LDN + YEAR_19 + "_clean.csv"])

#### Cleaning

In [None]:
# Read the raw results from `ldn_file_raw_path` and clean them in a single step.
df_ldn = london_cleaner(
    pd.read_csv(ldn_file_raw_path),
    SPLITS_KEYS,
    LDN_COLS_TO_DROP,
    COLS_ORDER,
)
# Write the cleaned frame only when it passes `valid_df`; otherwise nothing is saved.
if valid_df(df_ldn):
    save_df(df_ldn, ldn_file_cln_path)
    print(f"** File has been saved in: `{ldn_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 49318 || New rows count: 42737 || Dropped Rows: 6581
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 42737 || New rows count: 42737 || Dropped rows based on   age_cat  : 0
Original rows count: 42737 || New rows count: 42737 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80-84', '80+', '85+' by '70+'
** File has been saved in: `Marathons_Data/Clean/London/London2019/London2019_clean.csv`


#### Imputation

In [None]:
# Unbind this year's raw/clean path variables; `df_ldn` itself is kept.
del ldn_file_raw_path
del ldn_file_cln_path

### 2021

In [None]:
# London 2021: raw input CSV and cleaned output CSV.
ldn_file_raw_path = "/".join([LDN_RAW_PATH, LDN + YEAR_21, LDN + YEAR_21 + "_full.csv"])
ldn_file_cln_path = "/".join([LDN_CLN_PATH, LDN + YEAR_21, LDN + YEAR_21 + "_clean.csv"])

#### Cleaning

In [None]:
# Read the raw results from `ldn_file_raw_path` and clean them in a single step.
df_ldn = london_cleaner(
    pd.read_csv(ldn_file_raw_path),
    SPLITS_KEYS,
    LDN_COLS_TO_DROP,
    COLS_ORDER,
)
# Write the cleaned frame only when it passes `valid_df`; otherwise nothing is saved.
if valid_df(df_ldn):
    save_df(df_ldn, ldn_file_cln_path)
    print(f"** File has been saved in: `{ldn_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 41594 || New rows count: 36129 || Dropped Rows: 5465
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 36129 || New rows count: 36126 || Dropped rows based on   age_cat  : 3
Original rows count: 36126 || New rows count: 36126 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80-84', '80+', '85+' by '70+'
** File has been saved in: `Marathons_Data/Clean/London/London2021/London2021_clean.csv`


#### Imputation

In [None]:
# Unbind this year's raw/clean path variables; `df_ldn` itself is kept.
del ldn_file_raw_path
del ldn_file_cln_path

### 2022

In [None]:
# London 2022: raw input CSV and cleaned output CSV.
ldn_file_raw_path = "/".join([LDN_RAW_PATH, LDN + YEAR_22, LDN + YEAR_22 + "_full.csv"])
ldn_file_cln_path = "/".join([LDN_CLN_PATH, LDN + YEAR_22, LDN + YEAR_22 + "_clean.csv"])

#### Cleaning

In [None]:
# Read the raw results from `ldn_file_raw_path` and clean them in a single step.
df_ldn = london_cleaner(
    pd.read_csv(ldn_file_raw_path),
    SPLITS_KEYS,
    LDN_COLS_TO_DROP,
    COLS_ORDER,
)
# Write the cleaned frame only when it passes `valid_df`; otherwise nothing is saved.
if valid_df(df_ldn):
    save_df(df_ldn, ldn_file_cln_path)
    print(f"** File has been saved in: `{ldn_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 46993 || New rows count: 40812 || Dropped Rows: 6181
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 40812 || New rows count: 40811 || Dropped rows based on   age_cat  : 1
Original rows count: 40811 || New rows count: 40811 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80-84', '80+', '85+' by '70+'
** File has been saved in: `Marathons_Data/Clean/London/London2022/London2022_clean.csv`


#### Imputation

In [None]:
# Unbind this year's raw/clean path variables; `df_ldn` itself is kept.
del ldn_file_raw_path
del ldn_file_cln_path

### 2023

In [None]:
# London 2023: raw input CSV and cleaned output CSV.
ldn_file_raw_path = "/".join([LDN_RAW_PATH, LDN + YEAR_23, LDN + YEAR_23 + "_full.csv"])
ldn_file_cln_path = "/".join([LDN_CLN_PATH, LDN + YEAR_23, LDN + YEAR_23 + "_clean.csv"])

#### Cleaning

In [None]:
# Read the raw results from `ldn_file_raw_path` and clean them in a single step.
df_ldn = london_cleaner(
    pd.read_csv(ldn_file_raw_path),
    SPLITS_KEYS,
    LDN_COLS_TO_DROP,
    COLS_ORDER,
)
# Write the cleaned frame only when it passes `valid_df`; otherwise nothing is saved.
if valid_df(df_ldn):
    save_df(df_ldn, ldn_file_cln_path)
    print(f"** File has been saved in: `{ldn_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 53077 || New rows count: 49083 || Dropped Rows: 3994
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 49083 || New rows count: 49083 || Dropped rows based on   age_cat  : 0
Original rows count: 49083 || New rows count: 49083 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80-84', '80+', '85+' by '70+'
** File has been saved in: `Marathons_Data/Clean/London/London2023/London2023_clean.csv`


#### Imputation

In [None]:
# Unbind this year's raw/clean path variables; `df_ldn` itself is kept.
del ldn_file_raw_path
del ldn_file_cln_path

## Hamburg

In [None]:
# Column names passed to `hamburg_cleaner` as the columns to drop.
HAM_COLS_TO_DROP: list[str] = [
    "idp",
    "finish",
    "run_no",
]

### 2013

In [None]:
# Hamburg 2013: raw input CSV and cleaned output CSV.
ham_file_raw_path = "/".join([HAM_RAW_PATH, HAM + YEAR_13, HAM + YEAR_13 + "_full.csv"])
ham_file_cln_path = "/".join([HAM_CLN_PATH, HAM + YEAR_13, HAM + YEAR_13 + "_clean.csv"])

#### Cleaning

In [None]:
# Read the raw results from `ham_file_raw_path` and clean them in a single step.
df_ham = hamburg_cleaner(
    pd.read_csv(ham_file_raw_path),
    SPLITS_KEYS,
    HAM_COLS_TO_DROP,
    SPLIT_NAME_DICT,
    COLS_ORDER,
)
# Write the cleaned frame only when it passes `valid_df`; otherwise nothing is saved.
if valid_df(df_ham):
    save_df(df_ham, ham_file_cln_path)
    print(f"** File has been saved in: `{ham_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 15135 || New rows count: 11872 || Dropped Rows: 3263
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 11872 || New rows count: 11872 || Dropped rows based on   age_cat  : 0
Original rows count: 11872 || New rows count: 11872 || Dropped rows based on    gender  : 0
** File has been saved in: `Marathons_Data/Clean/Hamburg/Hamburg2013/Hamburg2013_clean.csv`


#### Imputation

In [None]:
# Unbind this year's raw/clean path variables; `df_ham` itself is kept.
del ham_file_raw_path
del ham_file_cln_path

### 2014

In [None]:
# Hamburg 2014: raw input CSV and cleaned output CSV.
ham_file_raw_path = "/".join([HAM_RAW_PATH, HAM + YEAR_14, HAM + YEAR_14 + "_full.csv"])
ham_file_cln_path = "/".join([HAM_CLN_PATH, HAM + YEAR_14, HAM + YEAR_14 + "_clean.csv"])

#### Cleaning

In [None]:
# Read the raw results from `ham_file_raw_path` and clean them in a single step.
df_ham = hamburg_cleaner(
    pd.read_csv(ham_file_raw_path),
    SPLITS_KEYS,
    HAM_COLS_TO_DROP,
    SPLIT_NAME_DICT,
    COLS_ORDER,
)
# Write the cleaned frame only when it passes `valid_df`; otherwise nothing is saved.
if valid_df(df_ham):
    save_df(df_ham, ham_file_cln_path)
    print(f"** File has been saved in: `{ham_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 16695 || New rows count: 13296 || Dropped Rows: 3399
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 13296 || New rows count: 13295 || Dropped rows based on   age_cat  : 1
Original rows count: 13295 || New rows count: 13295 || Dropped rows based on    gender  : 0
** File has been saved in: `Marathons_Data/Clean/Hamburg/Hamburg2014/Hamburg2014_clean.csv`


#### Imputation

In [None]:
# Unbind this year's raw/clean path variables; `df_ham` itself is kept.
del ham_file_raw_path
del ham_file_cln_path

### 2015

In [None]:
# Hamburg 2015: raw input CSV and cleaned output CSV.
ham_file_raw_path = "/".join([HAM_RAW_PATH, HAM + YEAR_15, HAM + YEAR_15 + "_full.csv"])
ham_file_cln_path = "/".join([HAM_CLN_PATH, HAM + YEAR_15, HAM + YEAR_15 + "_clean.csv"])

#### Cleaning

In [None]:
# Read the raw results from `ham_file_raw_path` and clean them in a single step.
df_ham = hamburg_cleaner(
    pd.read_csv(ham_file_raw_path),
    SPLITS_KEYS,
    HAM_COLS_TO_DROP,
    SPLIT_NAME_DICT,
    COLS_ORDER,
)
# Write the cleaned frame only when it passes `valid_df`; otherwise nothing is saved.
if valid_df(df_ham):
    save_df(df_ham, ham_file_cln_path)
    print(f"** File has been saved in: `{ham_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 19205 || New rows count: 15259 || Dropped Rows: 3946
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 15259 || New rows count: 15257 || Dropped rows based on   age_cat  : 2
Original rows count: 15257 || New rows count: 15257 || Dropped rows based on    gender  : 0
** File has been saved in: `Marathons_Data/Clean/Hamburg/Hamburg2015/Hamburg2015_clean.csv`


#### Imputation

In [None]:
# Unbind this year's raw/clean path variables; `df_ham` itself is kept.
del ham_file_raw_path
del ham_file_cln_path

### 2016

In [None]:
# Hamburg 2016: raw input CSV and cleaned output CSV.
ham_file_raw_path = "/".join([HAM_RAW_PATH, HAM + YEAR_16, HAM + YEAR_16 + "_full.csv"])
ham_file_cln_path = "/".join([HAM_CLN_PATH, HAM + YEAR_16, HAM + YEAR_16 + "_clean.csv"])

#### Cleaning

In [None]:
# Read the raw results from `ham_file_raw_path` and clean them in a single step.
df_ham = hamburg_cleaner(
    pd.read_csv(ham_file_raw_path),
    SPLITS_KEYS,
    HAM_COLS_TO_DROP,
    SPLIT_NAME_DICT,
    COLS_ORDER,
)
# Write the cleaned frame only when it passes `valid_df`; otherwise nothing is saved.
if valid_df(df_ham):
    save_df(df_ham, ham_file_cln_path)
    print(f"** File has been saved in: `{ham_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 16011 || New rows count: 12540 || Dropped Rows: 3471
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 12540 || New rows count: 12537 || Dropped rows based on   age_cat  : 3
Original rows count: 12537 || New rows count: 12537 || Dropped rows based on    gender  : 0
** File has been saved in: `Marathons_Data/Clean/Hamburg/Hamburg2016/Hamburg2016_clean.csv`


#### Imputation

In [None]:
# Unbind this year's raw/clean path variables; `df_ham` itself is kept.
del ham_file_raw_path
del ham_file_cln_path

### 2017

In [None]:
# Hamburg 2017: raw input CSV and cleaned output CSV.
ham_file_raw_path = "/".join([HAM_RAW_PATH, HAM + YEAR_17, HAM + YEAR_17 + "_full.csv"])
ham_file_cln_path = "/".join([HAM_CLN_PATH, HAM + YEAR_17, HAM + YEAR_17 + "_clean.csv"])

#### Cleaning

In [None]:
# Read the raw results from `ham_file_raw_path` and clean them in a single step.
df_ham = hamburg_cleaner(
    pd.read_csv(ham_file_raw_path),
    SPLITS_KEYS,
    HAM_COLS_TO_DROP,
    SPLIT_NAME_DICT,
    COLS_ORDER,
)
# Write the cleaned frame only when it passes `valid_df`; otherwise nothing is saved.
if valid_df(df_ham):
    save_df(df_ham, ham_file_cln_path)
    print(f"** File has been saved in: `{ham_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 15638 || New rows count: 12396 || Dropped Rows: 3242
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 12396 || New rows count: 12391 || Dropped rows based on   age_cat  : 5
Original rows count: 12391 || New rows count: 12391 || Dropped rows based on    gender  : 0
** File has been saved in: `Marathons_Data/Clean/Hamburg/Hamburg2017/Hamburg2017_clean.csv`


#### Imputation

In [None]:
# Unbind this year's raw/clean path variables; `df_ham` itself is kept.
del ham_file_raw_path
del ham_file_cln_path

### 2018

In [None]:
# Hamburg 2018: raw input CSV and cleaned output CSV.
ham_file_raw_path = "/".join([HAM_RAW_PATH, HAM + YEAR_18, HAM + YEAR_18 + "_full.csv"])
ham_file_cln_path = "/".join([HAM_CLN_PATH, HAM + YEAR_18, HAM + YEAR_18 + "_clean.csv"])

#### Cleaning

In [None]:
# Read the raw results from `ham_file_raw_path` and clean them in a single step.
df_ham = hamburg_cleaner(
    pd.read_csv(ham_file_raw_path),
    SPLITS_KEYS,
    HAM_COLS_TO_DROP,
    SPLIT_NAME_DICT,
    COLS_ORDER,
)
# Write the cleaned frame only when it passes `valid_df`; otherwise nothing is saved.
if valid_df(df_ham):
    save_df(df_ham, ham_file_cln_path)
    print(f"** File has been saved in: `{ham_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 14010 || New rows count: 10670 || Dropped Rows: 3340
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 10670 || New rows count: 10668 || Dropped rows based on   age_cat  : 2
Original rows count: 10668 || New rows count: 10668 || Dropped rows based on    gender  : 0
** File has been saved in: `Marathons_Data/Clean/Hamburg/Hamburg2018/Hamburg2018_clean.csv`


#### Imputation

In [None]:
# Unbind this year's raw/clean path variables; `df_ham` itself is kept.
del ham_file_raw_path
del ham_file_cln_path

### 2019

In [None]:
# Hamburg 2019: raw input CSV and cleaned output CSV.
ham_file_raw_path = "/".join([HAM_RAW_PATH, HAM + YEAR_19, HAM + YEAR_19 + "_full.csv"])
ham_file_cln_path = "/".join([HAM_CLN_PATH, HAM + YEAR_19, HAM + YEAR_19 + "_clean.csv"])

#### Cleaning

In [None]:
# Read the raw results from `ham_file_raw_path` and clean them in a single step.
df_ham = hamburg_cleaner(
    pd.read_csv(ham_file_raw_path),
    SPLITS_KEYS,
    HAM_COLS_TO_DROP,
    SPLIT_NAME_DICT,
    COLS_ORDER,
)
# Write the cleaned frame only when it passes `valid_df`; otherwise nothing is saved.
if valid_df(df_ham):
    save_df(df_ham, ham_file_cln_path)
    print(f"** File has been saved in: `{ham_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 13498 || New rows count: 10468 || Dropped Rows: 3030
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 10468 || New rows count: 10453 || Dropped rows based on   age_cat  : 15
Original rows count: 10453 || New rows count: 10453 || Dropped rows based on    gender  : 0
** File has been saved in: `Marathons_Data/Clean/Hamburg/Hamburg2019/Hamburg2019_clean.csv`


#### Imputation

In [None]:
# Unbind this year's raw/clean path variables; `df_ham` itself is kept.
del ham_file_raw_path
del ham_file_cln_path

### 2022

In [None]:
# Hamburg 2022: raw input CSV and cleaned output CSV.
ham_file_raw_path = "/".join([HAM_RAW_PATH, HAM + YEAR_22, HAM + YEAR_22 + "_full.csv"])
ham_file_cln_path = "/".join([HAM_CLN_PATH, HAM + YEAR_22, HAM + YEAR_22 + "_clean.csv"])

#### Cleaning

In [None]:
# Read the raw results from `ham_file_raw_path` and clean them in a single step.
df_ham = hamburg_cleaner(
    pd.read_csv(ham_file_raw_path),
    SPLITS_KEYS,
    HAM_COLS_TO_DROP,
    SPLIT_NAME_DICT,
    COLS_ORDER,
)
# Write the cleaned frame only when it passes `valid_df`; otherwise nothing is saved.
if valid_df(df_ham):
    save_df(df_ham, ham_file_cln_path)
    print(f"** File has been saved in: `{ham_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 10416 || New rows count: 6888 || Dropped Rows: 3528
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 6888 || New rows count: 6840 || Dropped rows based on   age_cat  : 48
Original rows count: 6840 || New rows count: 6840 || Dropped rows based on    gender  : 0
** File has been saved in: `Marathons_Data/Clean/Hamburg/Hamburg2022/Hamburg2022_clean.csv`


#### Imputation

In [None]:
# Unbind this year's raw/clean path variables; `df_ham` itself is kept.
del ham_file_raw_path
del ham_file_cln_path

### 2023

In [None]:
# Hamburg 2023: raw input CSV and cleaned output CSV.
ham_file_raw_path = "/".join([HAM_RAW_PATH, HAM + YEAR_23, HAM + YEAR_23 + "_full.csv"])
ham_file_cln_path = "/".join([HAM_CLN_PATH, HAM + YEAR_23, HAM + YEAR_23 + "_clean.csv"])

#### Cleaning

In [None]:
# Read the raw results from `ham_file_raw_path` and clean them in a single step.
df_ham = hamburg_cleaner(
    pd.read_csv(ham_file_raw_path),
    SPLITS_KEYS,
    HAM_COLS_TO_DROP,
    SPLIT_NAME_DICT,
    COLS_ORDER,
)
# Write the cleaned frame only when it passes `valid_df`; otherwise nothing is saved.
if valid_df(df_ham):
    save_df(df_ham, ham_file_cln_path)
    print(f"** File has been saved in: `{ham_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 11757 || New rows count: 9002 || Dropped Rows: 2755
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 9002 || New rows count: 8998 || Dropped rows based on   age_cat  : 4
Original rows count: 8998 || New rows count: 8998 || Dropped rows based on    gender  : 0
** File has been saved in: `Marathons_Data/Clean/Hamburg/Hamburg2023/Hamburg2023_clean.csv`


#### Imputation

In [None]:
# Unbind this year's raw/clean path variables; `df_ham` itself is kept.
del ham_file_raw_path
del ham_file_cln_path

## Houston
##### Pace and speed have been converted from min/mile and miles/h to sec/km and km/h, respectively.

In [None]:
# Column names passed to `houston_cleaner` as the columns to drop.
HOU_COLS_TO_DROP: list[str] = [
    "idp",
    "finish",
    "run_no",
]

### 2018

In [None]:
# Houston 2018: raw input CSV and cleaned output CSV.
hou_file_raw_path = "/".join([HOU_RAW_PATH, HOU + YEAR_18, HOU + YEAR_18 + "_full.csv"])
hou_file_cln_path = "/".join([HOU_CLN_PATH, HOU + YEAR_18, HOU + YEAR_18 + "_clean.csv"])

#### Cleaning

In [None]:
# Read the raw results from `hou_file_raw_path` and clean them in a single step.
df_hou = houston_cleaner(
    pd.read_csv(hou_file_raw_path),
    SPLITS_KEYS,
    HOU_COLS_TO_DROP,
    SPLIT_NAME_DICT,
    COLS_ORDER,
)
# Write the cleaned frame only when it passes `valid_df`; otherwise nothing is saved.
if valid_df(df_hou):
    save_df(df_hou, hou_file_cln_path)
    print(f"** File has been saved in: `{hou_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 7547 || New rows count: 7526 || Dropped Rows: 21
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 7526 || New rows count: 7526 || Dropped rows based on   age_cat  : 0
Original rows count: 7526 || New rows count: 7526 || Dropped rows based on    gender  : 0
** Dropping rows with invalid age categories [12-15, 16-19, Elites]:
Original rows count: 7526 || New rows count: 7388 || Dropped rows: 138
** Replacing these age categories '20-24', '25-29', '30-34', and '35-39' by '18-39'
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** Dropping rows with invalid race state ['Other', 'DQ - No Reason Was Given', 'DQ - SWITCH from HALF to MARA']:
Original rows count: 7388 || New rows count: 7374 || Dropped rows: 14
** File has been saved in: `Marathons_Data/Clean/Houston/Houston2018/Houston2018_clean.csv`


#### Imputation

In [None]:
# Unbind this year's raw/clean path variables; `df_hou` itself is kept.
del hou_file_raw_path
del hou_file_cln_path

### 2019

In [None]:
# Houston 2019: raw input CSV and cleaned output CSV.
hou_file_raw_path = "/".join([HOU_RAW_PATH, HOU + YEAR_19, HOU + YEAR_19 + "_full.csv"])
hou_file_cln_path = "/".join([HOU_CLN_PATH, HOU + YEAR_19, HOU + YEAR_19 + "_clean.csv"])

#### Cleaning

In [None]:
# Read the raw results from `hou_file_raw_path` and clean them in a single step.
df_hou = houston_cleaner(
    pd.read_csv(hou_file_raw_path),
    SPLITS_KEYS,
    HOU_COLS_TO_DROP,
    SPLIT_NAME_DICT,
    COLS_ORDER,
)
# Write the cleaned frame only when it passes `valid_df`; otherwise nothing is saved.
if valid_df(df_hou):
    save_df(df_hou, hou_file_cln_path)
    print(f"** File has been saved in: `{hou_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 7159 || New rows count: 7145 || Dropped Rows: 14
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 7145 || New rows count: 7145 || Dropped rows based on   age_cat  : 0
Original rows count: 7145 || New rows count: 7145 || Dropped rows based on    gender  : 0
** Dropping rows with invalid age categories [12-15, 16-19, Elites]:
Original rows count: 7145 || New rows count: 7015 || Dropped rows: 130
** Replacing these age categories '20-24', '25-29', '30-34', and '35-39' by '18-39'
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** Dropping rows with invalid race state ['Other', 'DQ - No Reason Was Given', 'DQ - SWITCH from HALF to MARA']:
Original rows count: 7015 || New rows count: 7003 || Dropped rows: 12
** File has been saved in: `Marathons_Data/Clean/Houston/Houston2019/Houston2019_clean.csv`


#### Imputation

In [None]:
# Unbind this year's raw/clean path variables; `df_hou` itself is kept.
del hou_file_raw_path
del hou_file_cln_path

## Stockholm

In [8]:
# Column names passed to `stockholm_cleaner` as the columns to drop.
STO_COLS_TO_DROP: list[str] = [
    "idp",
    "finish",
    "run_no",
]

### 2021

In [9]:
# Stockholm 2021: raw input CSV and cleaned output CSV.
sto_file_raw_path = "/".join([STO_RAW_PATH, STO + YEAR_21, STO + YEAR_21 + "_full.csv"])
sto_file_cln_path = "/".join([STO_CLN_PATH, STO + YEAR_21, STO + YEAR_21 + "_clean.csv"])

#### Cleaning

In [10]:
# Read the raw results from `sto_file_raw_path` and clean them in a single step.
# NOTE(review): `year=2021` repeats the year already encoded in
# `sto_file_raw_path`; keep the two in step when copying this cell.
df_sto = stockholm_cleaner(
    pd.read_csv(sto_file_raw_path),
    SPLITS_KEYS,
    STO_COLS_TO_DROP,
    SPLIT_NAME_DICT,
    COLS_ORDER,
    year=2021,
)
# Write the cleaned frame only when it passes `valid_df`; otherwise nothing is saved.
if valid_df(df_sto):
    save_df(df_sto, sto_file_cln_path)
    print(f"** File has been saved in: `{sto_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 12179 || New rows count: 7177 || Dropped Rows: 5002
** Dropping rows with null values in `yob` and `gender` columns:
Original rows count: 7177 || New rows count: 7126 || Dropped rows based on     yob    : 51
Original rows count: 7126 || New rows count: 7126 || Dropped rows based on    gender  : 0
** Dropping non-adult runners (age < 18):
Original rows count: 7126 || New rows count: 7125 || Dropped rows: 1
** column name `yob` changed to `age_cat`.
** File has been saved in: `Marathons_Data/Clean/Stockholm/Stockholm2021/Stockholm2021_clean.csv`


#### Imputation

In [11]:
# Unbind this year's raw/clean path variables; `df_sto` itself is kept.
del sto_file_raw_path
del sto_file_cln_path

### 2022

In [12]:
# Stockholm 2022: raw input CSV and cleaned output CSV.
sto_file_raw_path = "/".join([STO_RAW_PATH, STO + YEAR_22, STO + YEAR_22 + "_full.csv"])
sto_file_cln_path = "/".join([STO_CLN_PATH, STO + YEAR_22, STO + YEAR_22 + "_clean.csv"])

#### Cleaning

In [13]:
# Read the raw results from `sto_file_raw_path` and clean them in a single step.
# NOTE(review): `year=2022` repeats the year already encoded in
# `sto_file_raw_path`; keep the two in step when copying this cell.
df_sto = stockholm_cleaner(
    pd.read_csv(sto_file_raw_path),
    SPLITS_KEYS,
    STO_COLS_TO_DROP,
    SPLIT_NAME_DICT,
    COLS_ORDER,
    year=2022,
)
# Write the cleaned frame only when it passes `valid_df`; otherwise nothing is saved.
if valid_df(df_sto):
    save_df(df_sto, sto_file_cln_path)
    print(f"** File has been saved in: `{sto_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 13593 || New rows count: 10161 || Dropped Rows: 3432
** Dropping rows with null values in `yob` and `gender` columns:
Original rows count: 10161 || New rows count: 10057 || Dropped rows based on     yob    : 104
Original rows count: 10057 || New rows count: 10057 || Dropped rows based on    gender  : 0
** Dropping non-adult runners (age < 18):
Original rows count: 10057 || New rows count: 10057 || Dropped rows: 0
** column name `yob` changed to `age_cat`.
** File has been saved in: `Marathons_Data/Clean/Stockholm/Stockholm2022/Stockholm2022_clean.csv`


#### Imputation

In [14]:
# Unbind this year's raw/clean path variables; `df_sto` itself is kept.
del sto_file_raw_path
del sto_file_cln_path

## Boston
##### Pace and speed have been converted from min/mile and miles/h to sec/km and km/h, respectively.

In [4]:
# Column names passed to `boston_cleaner` as the columns to drop.
BOS_COLS_TO_DROP: list[str] = [
    "idp",
    "finish",
    "run_no",
]

### 2014

In [5]:
# Boston 2014: raw input CSV and cleaned output CSV.
bos_file_raw_path = "/".join([BOS_RAW_PATH, BOS + YEAR_14, BOS + YEAR_14 + "_full.csv"])
bos_file_cln_path = "/".join([BOS_CLN_PATH, BOS + YEAR_14, BOS + YEAR_14 + "_clean.csv"])

#### Cleaning

In [6]:
# Read the raw results from `bos_file_raw_path` (with pandas' chunked
# low-memory parsing turned off) and clean them in a single step.
df_bos = boston_cleaner(
    pd.read_csv(bos_file_raw_path, low_memory=False),
    SPLITS_KEYS,
    BOS_COLS_TO_DROP,
    COLS_ORDER,
)
# Write the cleaned frame only when it passes `valid_df`; otherwise nothing is saved.
if valid_df(df_bos):
    save_df(df_bos, bos_file_cln_path)
    print(f"** File has been saved in: `{bos_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 35671 || New rows count: 32447 || Dropped Rows: 3224
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 32447 || New rows count: 32447 || Dropped rows based on   age_cat  : 0
Original rows count: 32447 || New rows count: 32447 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Boston/Boston2014/Boston2014_clean.csv`


#### Imputation

In [8]:
# Unbind this year's raw/clean path variables; `df_bos` itself is kept.
del bos_file_raw_path
del bos_file_cln_path

### 2015

In [9]:
# Boston 2015: raw input CSV and cleaned output CSV.
bos_file_raw_path = "/".join([BOS_RAW_PATH, BOS + YEAR_15, BOS + YEAR_15 + "_full.csv"])
bos_file_cln_path = "/".join([BOS_CLN_PATH, BOS + YEAR_15, BOS + YEAR_15 + "_clean.csv"])

#### Cleaning

In [10]:
# Read the raw results from `bos_file_raw_path` (with pandas' chunked
# low-memory parsing turned off) and clean them in a single step.
df_bos = boston_cleaner(
    pd.read_csv(bos_file_raw_path, low_memory=False),
    SPLITS_KEYS,
    BOS_COLS_TO_DROP,
    COLS_ORDER,
)
# Write the cleaned frame only when it passes `valid_df`; otherwise nothing is saved.
if valid_df(df_bos):
    save_df(df_bos, bos_file_cln_path)
    print(f"** File has been saved in: `{bos_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 30252 || New rows count: 27159 || Dropped Rows: 3093
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 27159 || New rows count: 27159 || Dropped rows based on   age_cat  : 0
Original rows count: 27159 || New rows count: 27159 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Boston/Boston2015/Boston2015_clean.csv`


#### Imputation

In [11]:
# Unbind this year's raw/clean path variables; `df_bos` itself is kept.
del bos_file_raw_path
del bos_file_cln_path

### 2016

In [12]:
# Boston 2016: raw input CSV and cleaned output CSV.
bos_file_raw_path = "/".join([BOS_RAW_PATH, BOS + YEAR_16, BOS + YEAR_16 + "_full.csv"])
bos_file_cln_path = "/".join([BOS_CLN_PATH, BOS + YEAR_16, BOS + YEAR_16 + "_clean.csv"])

#### Cleaning

In [13]:
# Read the raw results from `bos_file_raw_path` (with pandas' chunked
# low-memory parsing turned off) and clean them in a single step.
df_bos = boston_cleaner(
    pd.read_csv(bos_file_raw_path, low_memory=False),
    SPLITS_KEYS,
    BOS_COLS_TO_DROP,
    COLS_ORDER,
)
# Write the cleaned frame only when it passes `valid_df`; otherwise nothing is saved.
if valid_df(df_bos):
    save_df(df_bos, bos_file_cln_path)
    print(f"** File has been saved in: `{bos_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 30743 || New rows count: 27487 || Dropped Rows: 3256
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 27487 || New rows count: 27487 || Dropped rows based on   age_cat  : 0
Original rows count: 27487 || New rows count: 27487 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Boston/Boston2016/Boston2016_clean.csv`


#### Imputation

In [14]:
# Unbind this year's raw/clean path variables; `df_bos` itself is kept.
del bos_file_raw_path
del bos_file_cln_path

### 2017

In [15]:
# Boston 2017: raw input CSV and cleaned output CSV.
bos_file_raw_path = "/".join([BOS_RAW_PATH, BOS + YEAR_17, BOS + YEAR_17 + "_full.csv"])
bos_file_cln_path = "/".join([BOS_CLN_PATH, BOS + YEAR_17, BOS + YEAR_17 + "_clean.csv"])

#### Cleaning

In [16]:
# Read the raw results from `bos_file_raw_path` (with pandas' chunked
# low-memory parsing turned off) and clean them in a single step.
df_bos = boston_cleaner(
    pd.read_csv(bos_file_raw_path, low_memory=False),
    SPLITS_KEYS,
    BOS_COLS_TO_DROP,
    COLS_ORDER,
)
# Write the cleaned frame only when it passes `valid_df`; otherwise nothing is saved.
if valid_df(df_bos):
    save_df(df_bos, bos_file_cln_path)
    print(f"** File has been saved in: `{bos_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 30074 || New rows count: 27220 || Dropped Rows: 2854
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 27220 || New rows count: 27220 || Dropped rows based on   age_cat  : 0
Original rows count: 27220 || New rows count: 27220 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Boston/Boston2017/Boston2017_clean.csv`


#### Imputation

In [17]:
# Release the Boston 2017 path names; they are not needed after the cleaning cell.
del bos_file_raw_path
del bos_file_cln_path

### 2018

In [18]:
# Boston 2018 file locations: raw input CSV and cleaned output CSV.
bos_file_raw_path = "/".join([BOS_RAW_PATH, f"{BOS}{YEAR_18}", f"{BOS}{YEAR_18}_full.csv"])
bos_file_cln_path = "/".join([BOS_CLN_PATH, f"{BOS}{YEAR_18}", f"{BOS}{YEAR_18}_clean.csv"])

#### Cleaning

In [19]:
# Load the raw Boston 2018 results and run them through the Boston cleaner in one chain.
# `low_memory=False` has pandas parse the whole file at once instead of in chunks.
df_bos = pd.read_csv(bos_file_raw_path, low_memory=False).pipe(
    boston_cleaner, SPLITS_KEYS, BOS_COLS_TO_DROP, COLS_ORDER
)
# Only write the cleaned CSV when `valid_df` accepts the frame; otherwise nothing is saved.
if valid_df(df_bos):
    save_df(df_bos, bos_file_cln_path)
    print(f"** File has been saved in: `{bos_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 29978 || New rows count: 26919 || Dropped Rows: 3059
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 26919 || New rows count: 26919 || Dropped rows based on   age_cat  : 0
Original rows count: 26919 || New rows count: 26919 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Boston/Boston2018/Boston2018_clean.csv`


#### Imputation

In [20]:
# Release the Boston 2018 path names; they are not needed after the cleaning cell.
del bos_file_raw_path
del bos_file_cln_path

### 2019

In [21]:
# Boston 2019 file locations: raw input CSV and cleaned output CSV.
bos_file_raw_path = "/".join([BOS_RAW_PATH, f"{BOS}{YEAR_19}", f"{BOS}{YEAR_19}_full.csv"])
bos_file_cln_path = "/".join([BOS_CLN_PATH, f"{BOS}{YEAR_19}", f"{BOS}{YEAR_19}_clean.csv"])

#### Cleaning

In [22]:
# Load the raw Boston 2019 results and run them through the Boston cleaner in one chain.
# `low_memory=False` has pandas parse the whole file at once instead of in chunks.
df_bos = pd.read_csv(bos_file_raw_path, low_memory=False).pipe(
    boston_cleaner, SPLITS_KEYS, BOS_COLS_TO_DROP, COLS_ORDER
)
# Only write the cleaned CSV when `valid_df` accepts the frame; otherwise nothing is saved.
if valid_df(df_bos):
    save_df(df_bos, bos_file_cln_path)
    print(f"** File has been saved in: `{bos_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 30234 || New rows count: 27337 || Dropped Rows: 2897
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 27337 || New rows count: 27337 || Dropped rows based on   age_cat  : 0
Original rows count: 27337 || New rows count: 27337 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Boston/Boston2019/Boston2019_clean.csv`


#### Imputation

In [None]:
# Release the Boston 2019 path names; they are not needed after the cleaning cell.
del bos_file_raw_path
del bos_file_cln_path

### 2021

In [23]:
# Boston 2021 file locations: raw input CSV and cleaned output CSV.
bos_file_raw_path = "/".join([BOS_RAW_PATH, f"{BOS}{YEAR_21}", f"{BOS}{YEAR_21}_full.csv"])
bos_file_cln_path = "/".join([BOS_CLN_PATH, f"{BOS}{YEAR_21}", f"{BOS}{YEAR_21}_clean.csv"])

#### Cleaning

In [24]:
# Load the raw Boston 2021 results and run them through the Boston cleaner in one chain.
# `low_memory=False` has pandas parse the whole file at once instead of in chunks.
df_bos = pd.read_csv(bos_file_raw_path, low_memory=False).pipe(
    boston_cleaner, SPLITS_KEYS, BOS_COLS_TO_DROP, COLS_ORDER
)
# Only write the cleaned CSV when `valid_df` accepts the frame; otherwise nothing is saved.
if valid_df(df_bos):
    save_df(df_bos, bos_file_cln_path)
    print(f"** File has been saved in: `{bos_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 18074 || New rows count: 15645 || Dropped Rows: 2429
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 15645 || New rows count: 15645 || Dropped rows based on   age_cat  : 0
Original rows count: 15645 || New rows count: 15645 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Boston/Boston2021/Boston2021_clean.csv`


#### Imputation

In [25]:
# Release the Boston 2021 path names; they are not needed after the cleaning cell.
del bos_file_raw_path
del bos_file_cln_path

### 2022

In [26]:
# Boston 2022 file locations: raw input CSV and cleaned output CSV.
bos_file_raw_path = "/".join([BOS_RAW_PATH, f"{BOS}{YEAR_22}", f"{BOS}{YEAR_22}_full.csv"])
bos_file_cln_path = "/".join([BOS_CLN_PATH, f"{BOS}{YEAR_22}", f"{BOS}{YEAR_22}_clean.csv"])

#### Cleaning

In [27]:
# Load the raw Boston 2022 results and run them through the Boston cleaner in one chain.
# `low_memory=False` has pandas parse the whole file at once instead of in chunks.
df_bos = pd.read_csv(bos_file_raw_path, low_memory=False).pipe(
    boston_cleaner, SPLITS_KEYS, BOS_COLS_TO_DROP, COLS_ORDER
)
# Only write the cleaned CSV when `valid_df` accepts the frame; otherwise nothing is saved.
if valid_df(df_bos):
    save_df(df_bos, bos_file_cln_path)
    print(f"** File has been saved in: `{bos_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 28500 || New rows count: 25217 || Dropped Rows: 3283
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 25217 || New rows count: 25217 || Dropped rows based on   age_cat  : 0
Original rows count: 25217 || New rows count: 25217 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Boston/Boston2022/Boston2022_clean.csv`


#### Imputation

In [28]:
# Release the Boston 2022 path names; they are not needed after the cleaning cell.
del bos_file_raw_path
del bos_file_cln_path

### 2023

In [29]:
# Boston 2023 file locations: raw input CSV and cleaned output CSV.
bos_file_raw_path = "/".join([BOS_RAW_PATH, f"{BOS}{YEAR_23}", f"{BOS}{YEAR_23}_full.csv"])
bos_file_cln_path = "/".join([BOS_CLN_PATH, f"{BOS}{YEAR_23}", f"{BOS}{YEAR_23}_clean.csv"])

#### Cleaning

In [30]:
# Load the raw Boston 2023 results and run them through the Boston cleaner in one chain.
# `low_memory=False` has pandas parse the whole file at once instead of in chunks.
df_bos = pd.read_csv(bos_file_raw_path, low_memory=False).pipe(
    boston_cleaner, SPLITS_KEYS, BOS_COLS_TO_DROP, COLS_ORDER
)
# Only write the cleaned CSV when `valid_df` accepts the frame; otherwise nothing is saved.
if valid_df(df_bos):
    save_df(df_bos, bos_file_cln_path)
    print(f"** File has been saved in: `{bos_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 30105 || New rows count: 27058 || Dropped Rows: 3047
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 27058 || New rows count: 27058 || Dropped rows based on   age_cat  : 0
Original rows count: 27058 || New rows count: 27058 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Boston/Boston2023/Boston2023_clean.csv`


#### Imputation

In [31]:
# Release the Boston 2023 path names; they are not needed after the cleaning cell.
del bos_file_raw_path
del bos_file_cln_path

## Chicago

In [4]:
# Column names handed to `chicago_cleaner` as its drop list for every Chicago year below.
CHI_COLS_TO_DROP: list[str] = ["idp", "finish", "run_no"]

### 2014

In [5]:
# Chicago 2014 file locations: raw input CSV and cleaned output CSV.
chi_file_raw_path = "/".join([CHI_RAW_PATH, f"{CHI}{YEAR_14}", f"{CHI}{YEAR_14}_full.csv"])
chi_file_cln_path = "/".join([CHI_CLN_PATH, f"{CHI}{YEAR_14}", f"{CHI}{YEAR_14}_clean.csv"])

#### Cleaning

In [6]:
# Load the raw Chicago 2014 results and run them through the Chicago cleaner in one chain.
df_chi = pd.read_csv(chi_file_raw_path).pipe(
    chicago_cleaner, SPLITS_KEYS, CHI_COLS_TO_DROP, SPLIT_NAME_DICT, COLS_ORDER
)
# Only write the cleaned CSV when `valid_df` accepts the frame; otherwise nothing is saved.
if valid_df(df_chi):
    save_df(df_chi, chi_file_cln_path)
    print(f"** File has been saved in: `{chi_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 50216 || New rows count: 41715 || Dropped Rows: 8501
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 41715 || New rows count: 41715 || Dropped rows based on   age_cat  : 0
Original rows count: 41715 || New rows count: 41715 || Dropped rows based on    gender  : 0
** Dropping rows with invalid age categories [W-15, M-15, 19 and under]:
Original rows count: 41715 || New rows count: 41350 || Dropped rows: 365
** Replacing these age categories '20-24', '25-29', '30-34', and '35-39' by '18-39'
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Chicago/Chicago2014/Chicago2014_clean.csv`


#### Imputation

In [7]:
# Release the Chicago 2014 path names; they are not needed after the cleaning cell.
del chi_file_raw_path
del chi_file_cln_path

### 2015

In [8]:
# Chicago 2015 file locations: raw input CSV and cleaned output CSV.
chi_file_raw_path = "/".join([CHI_RAW_PATH, f"{CHI}{YEAR_15}", f"{CHI}{YEAR_15}_full.csv"])
chi_file_cln_path = "/".join([CHI_CLN_PATH, f"{CHI}{YEAR_15}", f"{CHI}{YEAR_15}_clean.csv"])

#### Cleaning

In [9]:
# Load the raw Chicago 2015 results and run them through the Chicago cleaner in one chain.
df_chi = pd.read_csv(chi_file_raw_path).pipe(
    chicago_cleaner, SPLITS_KEYS, CHI_COLS_TO_DROP, SPLIT_NAME_DICT, COLS_ORDER
)
# Only write the cleaned CSV when `valid_df` accepts the frame; otherwise nothing is saved.
if valid_df(df_chi):
    save_df(df_chi, chi_file_cln_path)
    print(f"** File has been saved in: `{chi_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 46032 || New rows count: 39219 || Dropped Rows: 6813
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 39219 || New rows count: 39219 || Dropped rows based on   age_cat  : 0
Original rows count: 39219 || New rows count: 39219 || Dropped rows based on    gender  : 0
** Dropping rows with invalid age categories [W-15, M-15, 19 and under]:
Original rows count: 39219 || New rows count: 38887 || Dropped rows: 332
** Replacing these age categories '20-24', '25-29', '30-34', and '35-39' by '18-39'
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Chicago/Chicago2015/Chicago2015_clean.csv`


#### Imputation

In [10]:
# Release the Chicago 2015 path names; they are not needed after the cleaning cell.
del chi_file_raw_path
del chi_file_cln_path

### 2016

In [11]:
# Chicago 2016 file locations: raw input CSV and cleaned output CSV.
chi_file_raw_path = "/".join([CHI_RAW_PATH, f"{CHI}{YEAR_16}", f"{CHI}{YEAR_16}_full.csv"])
chi_file_cln_path = "/".join([CHI_CLN_PATH, f"{CHI}{YEAR_16}", f"{CHI}{YEAR_16}_clean.csv"])

#### Cleaning

In [12]:
# Load the raw Chicago 2016 results and run them through the Chicago cleaner in one chain.
df_chi = pd.read_csv(chi_file_raw_path).pipe(
    chicago_cleaner, SPLITS_KEYS, CHI_COLS_TO_DROP, SPLIT_NAME_DICT, COLS_ORDER
)
# Only write the cleaned CSV when `valid_df` accepts the frame; otherwise nothing is saved.
if valid_df(df_chi):
    save_df(df_chi, chi_file_cln_path)
    print(f"** File has been saved in: `{chi_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 49067 || New rows count: 41469 || Dropped Rows: 7598
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 41469 || New rows count: 41468 || Dropped rows based on   age_cat  : 1
Original rows count: 41468 || New rows count: 41468 || Dropped rows based on    gender  : 0
** Dropping rows with invalid age categories [W-15, M-15, 19 and under]:
Original rows count: 41468 || New rows count: 41163 || Dropped rows: 305
** Replacing these age categories '20-24', '25-29', '30-34', and '35-39' by '18-39'
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Chicago/Chicago2016/Chicago2016_clean.csv`


#### Imputation

In [13]:
# Release the Chicago 2016 path names; they are not needed after the cleaning cell.
del chi_file_raw_path
del chi_file_cln_path

### 2017

In [14]:
# Chicago 2017 file locations: raw input CSV and cleaned output CSV.
chi_file_raw_path = "/".join([CHI_RAW_PATH, f"{CHI}{YEAR_17}", f"{CHI}{YEAR_17}_full.csv"])
chi_file_cln_path = "/".join([CHI_CLN_PATH, f"{CHI}{YEAR_17}", f"{CHI}{YEAR_17}_clean.csv"])

#### Cleaning

In [15]:
# Load the raw Chicago 2017 results and run them through the Chicago cleaner in one chain.
df_chi = pd.read_csv(chi_file_raw_path).pipe(
    chicago_cleaner, SPLITS_KEYS, CHI_COLS_TO_DROP, SPLIT_NAME_DICT, COLS_ORDER
)
# Only write the cleaned CSV when `valid_df` accepts the frame; otherwise nothing is saved.
if valid_df(df_chi):
    save_df(df_chi, chi_file_cln_path)
    print(f"** File has been saved in: `{chi_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 54326 || New rows count: 45565 || Dropped Rows: 8761
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 45565 || New rows count: 45565 || Dropped rows based on   age_cat  : 0
Original rows count: 45565 || New rows count: 45565 || Dropped rows based on    gender  : 0
** Dropping rows with invalid age categories [W-15, M-15, 19 and under]:
Original rows count: 45565 || New rows count: 45325 || Dropped rows: 240
** Replacing these age categories '20-24', '25-29', '30-34', and '35-39' by '18-39'
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Chicago/Chicago2017/Chicago2017_clean.csv`


#### Imputation

In [16]:
# Release the Chicago 2017 path names; they are not needed after the cleaning cell.
del chi_file_raw_path
del chi_file_cln_path

### 2018

In [17]:
# Chicago 2018 file locations: raw input CSV and cleaned output CSV.
chi_file_raw_path = "/".join([CHI_RAW_PATH, f"{CHI}{YEAR_18}", f"{CHI}{YEAR_18}_full.csv"])
chi_file_cln_path = "/".join([CHI_CLN_PATH, f"{CHI}{YEAR_18}", f"{CHI}{YEAR_18}_clean.csv"])

#### Cleaning

In [18]:
# Load the raw Chicago 2018 results and run them through the Chicago cleaner in one chain.
df_chi = pd.read_csv(chi_file_raw_path).pipe(
    chicago_cleaner, SPLITS_KEYS, CHI_COLS_TO_DROP, SPLIT_NAME_DICT, COLS_ORDER
)
# Only write the cleaned CSV when `valid_df` accepts the frame; otherwise nothing is saved.
if valid_df(df_chi):
    save_df(df_chi, chi_file_cln_path)
    print(f"** File has been saved in: `{chi_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 55621 || New rows count: 45380 || Dropped Rows: 10241
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 45380 || New rows count: 45380 || Dropped rows based on   age_cat  : 0
Original rows count: 45380 || New rows count: 45380 || Dropped rows based on    gender  : 0
** Dropping rows with invalid age categories [W-15, M-15, 19 and under]:
Original rows count: 45380 || New rows count: 45145 || Dropped rows: 235
** Replacing these age categories '20-24', '25-29', '30-34', and '35-39' by '18-39'
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Chicago/Chicago2018/Chicago2018_clean.csv`


#### Imputation

In [None]:
# Release the Chicago 2018 path names; they are not needed after the cleaning cell.
del chi_file_raw_path
del chi_file_cln_path

### 2019

In [19]:
# Chicago 2019 file locations: raw input CSV and cleaned output CSV.
chi_file_raw_path = "/".join([CHI_RAW_PATH, f"{CHI}{YEAR_19}", f"{CHI}{YEAR_19}_full.csv"])
chi_file_cln_path = "/".join([CHI_CLN_PATH, f"{CHI}{YEAR_19}", f"{CHI}{YEAR_19}_clean.csv"])

#### Cleaning

In [20]:
# Load the raw Chicago 2019 results and run them through the Chicago cleaner in one chain.
df_chi = pd.read_csv(chi_file_raw_path).pipe(
    chicago_cleaner, SPLITS_KEYS, CHI_COLS_TO_DROP, SPLIT_NAME_DICT, COLS_ORDER
)
# Only write the cleaned CSV when `valid_df` accepts the frame; otherwise nothing is saved.
if valid_df(df_chi):
    save_df(df_chi, chi_file_cln_path)
    print(f"** File has been saved in: `{chi_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 55395 || New rows count: 46513 || Dropped Rows: 8882
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 46513 || New rows count: 46512 || Dropped rows based on   age_cat  : 1
Original rows count: 46512 || New rows count: 46512 || Dropped rows based on    gender  : 0
** Dropping rows with invalid age categories [W-15, M-15, 19 and under]:
Original rows count: 46512 || New rows count: 46269 || Dropped rows: 243
** Replacing these age categories '20-24', '25-29', '30-34', and '35-39' by '18-39'
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Chicago/Chicago2019/Chicago2019_clean.csv`


#### Imputation

In [None]:
# Release the Chicago 2019 path names; they are not needed after the cleaning cell.
del chi_file_raw_path
del chi_file_cln_path

### 2021

In [21]:
# Chicago 2021 file locations: raw input CSV and cleaned output CSV.
chi_file_raw_path = "/".join([CHI_RAW_PATH, f"{CHI}{YEAR_21}", f"{CHI}{YEAR_21}_full.csv"])
chi_file_cln_path = "/".join([CHI_CLN_PATH, f"{CHI}{YEAR_21}", f"{CHI}{YEAR_21}_clean.csv"])

#### Cleaning

In [22]:
# Load the raw Chicago 2021 results and run them through the Chicago cleaner in one chain.
df_chi = pd.read_csv(chi_file_raw_path).pipe(
    chicago_cleaner, SPLITS_KEYS, CHI_COLS_TO_DROP, SPLIT_NAME_DICT, COLS_ORDER
)
# Only write the cleaned CSV when `valid_df` accepts the frame; otherwise nothing is saved.
if valid_df(df_chi):
    save_df(df_chi, chi_file_cln_path)
    print(f"** File has been saved in: `{chi_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 33502 || New rows count: 26864 || Dropped Rows: 6638
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 26864 || New rows count: 26864 || Dropped rows based on   age_cat  : 0
Original rows count: 26864 || New rows count: 26864 || Dropped rows based on    gender  : 0
** Dropping rows with invalid age categories [W-15, M-15, 19 and under]:
Original rows count: 26864 || New rows count: 26749 || Dropped rows: 115
** Replacing these age categories '20-24', '25-29', '30-34', and '35-39' by '18-39'
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Chicago/Chicago2021/Chicago2021_clean.csv`


#### Imputation

In [None]:
# Release the Chicago 2021 path names; they are not needed after the cleaning cell.
del chi_file_raw_path
del chi_file_cln_path

### 2022

In [23]:
# Chicago 2022 file locations: raw input CSV and cleaned output CSV.
chi_file_raw_path = "/".join([CHI_RAW_PATH, f"{CHI}{YEAR_22}", f"{CHI}{YEAR_22}_full.csv"])
chi_file_cln_path = "/".join([CHI_CLN_PATH, f"{CHI}{YEAR_22}", f"{CHI}{YEAR_22}_clean.csv"])

#### Cleaning

In [24]:
# Load the raw Chicago 2022 results and run them through the Chicago cleaner in one chain.
df_chi = pd.read_csv(chi_file_raw_path).pipe(
    chicago_cleaner, SPLITS_KEYS, CHI_COLS_TO_DROP, SPLIT_NAME_DICT, COLS_ORDER
)
# Only write the cleaned CSV when `valid_df` accepts the frame; otherwise nothing is saved.
if valid_df(df_chi):
    save_df(df_chi, chi_file_cln_path)
    print(f"** File has been saved in: `{chi_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 51087 || New rows count: 39939 || Dropped Rows: 11148
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 39939 || New rows count: 39938 || Dropped rows based on   age_cat  : 1
Original rows count: 39938 || New rows count: 39938 || Dropped rows based on    gender  : 0
** Dropping rows with invalid age categories [W-15, M-15, 19 and under]:
Original rows count: 39938 || New rows count: 39797 || Dropped rows: 141
** Replacing these age categories '20-24', '25-29', '30-34', and '35-39' by '18-39'
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Chicago/Chicago2022/Chicago2022_clean.csv`


#### Imputation

In [25]:
# Release the Chicago 2022 path names; they are not needed after the cleaning cell.
del chi_file_raw_path
del chi_file_cln_path