# Notebook for Data Cleaning and Imputation

In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.preprocessing import MinMaxScaler
from sklearn.experimental import enable_iterative_imputer # noqa
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.ensemble import RandomForestRegressor
from utils.data_processing_utils import london_cleaner, hamburg_cleaner, stockholm_cleaner, boston_cleaner, \
                                        chicago_cleaner, houston_cleaner, save_df, valid_df, get_indices_of_rows_missing_data, preprocess_impute_fill, valid_splits_time
# from utils.data_processing_utils import *

## Constants 

### Paths

In [2]:
# Root folders, one per processing stage (raw -> clean -> imputed).
RAW_DATA_PATH = "Marathons_Data/Raw"
CLN_DATA_PATH = "Marathons_Data/Clean"
IMP_DATA_PATH = "Marathons_Data/Impute"

# Marathon names; each one is also the sub-folder name under every stage root.
LDN: str = "London"
HAM: str = "Hamburg"
HOU: str = "Houston"
STO: str = "Stockholm"
BOS: str = "Boston"
CHI: str = "Chicago"

# Raw-data folders per marathon.
LDN_RAW_PATH: str = f"{RAW_DATA_PATH}/{LDN}"
HAM_RAW_PATH: str = f"{RAW_DATA_PATH}/{HAM}"
HOU_RAW_PATH: str = f"{RAW_DATA_PATH}/{HOU}"
STO_RAW_PATH: str = f"{RAW_DATA_PATH}/{STO}"
BOS_RAW_PATH: str = f"{RAW_DATA_PATH}/{BOS}"
CHI_RAW_PATH: str = f"{RAW_DATA_PATH}/{CHI}"

# Cleaned-data folders per marathon.
LDN_CLN_PATH: str = f"{CLN_DATA_PATH}/{LDN}"
HAM_CLN_PATH: str = f"{CLN_DATA_PATH}/{HAM}"
HOU_CLN_PATH: str = f"{CLN_DATA_PATH}/{HOU}"
STO_CLN_PATH: str = f"{CLN_DATA_PATH}/{STO}"
BOS_CLN_PATH: str = f"{CLN_DATA_PATH}/{BOS}"
CHI_CLN_PATH: str = f"{CLN_DATA_PATH}/{CHI}"

# Imputed-data folders per marathon.
LDN_IMP_PATH: str = f"{IMP_DATA_PATH}/{LDN}"
HAM_IMP_PATH: str = f"{IMP_DATA_PATH}/{HAM}"
HOU_IMP_PATH: str = f"{IMP_DATA_PATH}/{HOU}"
STO_IMP_PATH: str = f"{IMP_DATA_PATH}/{STO}"
BOS_IMP_PATH: str = f"{IMP_DATA_PATH}/{BOS}"
CHI_IMP_PATH: str = f"{IMP_DATA_PATH}/{CHI}"

### Years & Splits

In [3]:
# Race years used as folder/file name suffixes (there is no 2020 entry).
YEAR_13: str = "2013"
YEAR_14: str = "2014"
YEAR_15: str = "2015"
YEAR_16: str = "2016"
YEAR_17: str = "2017"
YEAR_18: str = "2018"
YEAR_19: str = "2019"
YEAR_21: str = "2021"
YEAR_22: str = "2022"
YEAR_23: str = "2023"
YEARS: list[str] = [
    YEAR_13, YEAR_14, YEAR_15, YEAR_16, YEAR_17,
    YEAR_18, YEAR_19, YEAR_21, YEAR_22, YEAR_23,
]

# Split identifiers; each one prefixes the `<split>_time/_pace/_speed` column names.
SPLITS_KEYS: list[str] = [
    "k_5", "k_10", "k_15", "k_20", "k_half",
    "k_25", "k_30", "k_35", "k_40", "k_finish",
]

# Column order handed to the cleaner functions: four descriptive columns,
# then time / pace / speed for every split, in SPLITS_KEYS order.
COLS_ORDER: list[str] = ["age_cat", "gender", "race_state", "last_split"] + [
    f"{split}_{metric}"
    for split in SPLITS_KEYS
    for metric in ("time", "pace", "speed")
]

# Maps each `<split>_time` column to a short split label (passed to `hamburg_cleaner`).
# NOTE(review): '25K' is listed before 'HALF', unlike SPLITS_KEYS; the insertion
# order is kept exactly as it was in case a consumer relies on it.
SPLIT_NAME_DICT: dict = {
    'k_5_time': '5K',
    'k_10_time': '10K',
    'k_15_time': '15K',
    'k_20_time': '20K',
    'k_25_time': '25K',
    'k_half_time': 'HALF',
    'k_30_time': '30K',
    'k_35_time': '35K',
    'k_40_time': '40K',
    'k_finish_time': 'Finish time',
}

# dtype mapping given to `pd.read_csv`: the four descriptive columns are read as
# categoricals and `np.float64` is the default factory for every other column.
# NOTE(review): calling the factory yields the scalar 0.0 rather than a dtype object;
# presumably pandas still resolves that to float64 — confirm on the installed version.
DTYPE_DICT: defaultdict = defaultdict(
    np.float64,
    age_cat="category",
    gender="category",
    race_state="category",
    last_split="category",
)

### Other

In [4]:
# Initialise the scaler that is passed to every `preprocess_impute_fill` call.
mms = MinMaxScaler()

# Initialise the imputers: a KNN imputer with scikit-learn defaults, and an
# iterative imputer whose per-feature estimator is a small random forest.
knn_imputer = KNNImputer()
rfr = RandomForestRegressor(
    n_estimators=5,
    max_depth=10,
    bootstrap=True,
    max_samples=0.5,
    n_jobs=2,
    random_state=17,
)
iter_imputer = IterativeImputer(
    estimator=rfr,
    max_iter=15,
    random_state=17,
)

## London

In [5]:
# Column names handed to `london_cleaner` as its columns-to-drop argument.
LDN_COLS_TO_DROP: list[str] = ["idp", "half", "finish", "run_no"]

### 2014

In [5]:
# File locations for London 2014; the same variable names are rebound for each later year.
# Raw file path (input of the cleaning step).
ldn_file_raw_path = f"{LDN_RAW_PATH}/{LDN}{YEAR_14}/{LDN}{YEAR_14}_full.csv"
# Clean file path (output of cleaning, input of imputation).
ldn_file_cln_path = f"{LDN_CLN_PATH}/{LDN}{YEAR_14}/{LDN}{YEAR_14}_clean.csv"
# Imputed files paths (one per imputer: KNN and iterative).
ldn_file_knn_imp_path = f"{LDN_IMP_PATH}/{LDN}{YEAR_14}/{LDN}{YEAR_14}_knn_impute.csv"
ldn_file_iter_imp_path = f"{LDN_IMP_PATH}/{LDN}{YEAR_14}/{LDN}{YEAR_14}_iter_impute.csv"

#### Cleaning

In [6]:
# Load the raw London 2014 results.
df_ldn = pd.read_csv(ldn_file_raw_path)
# Clean the raw frame with the London-specific cleaner.
df_ldn = london_cleaner(df_ldn, SPLITS_KEYS, LDN_COLS_TO_DROP, COLS_ORDER)
# Save only a frame that passes validation. The skip is reported explicitly because
# the imputation step re-reads this path from disk and would otherwise silently
# pick up a stale (or missing) clean file.
if valid_df(df_ldn):
    save_df(df_ldn, ldn_file_cln_path)
    print(f"** File has been saved in: `{ldn_file_cln_path}`")
else:
    print(f"** DataFrame failed validation; nothing was saved to: `{ldn_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 41677 || New rows count: 36289 || Dropped Rows: 5388
** Dropping rows with splits that only contain time: Finished: 328 || Started: 27
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 35934 || New rows count: 35928 || Dropped rows based on   age_cat  : 6
Original rows count: 35928 || New rows count: 35928 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80-84', '80+', '85+' by '70+'
** File has been saved in: `Marathons_Data/Clean/London/London2014/London2014_clean.csv`


#### Imputation

In [6]:
# Re-read the cleaned file from disk so the dtypes in DTYPE_DICT are applied on load.
df_ldn = pd.read_csv(ldn_file_cln_path, dtype=DTYPE_DICT)
# Indices of rows with missing split data, as reported by the helper (exact criteria live in utils — confirm there).
missing_indices = get_indices_of_rows_missing_data(df_ldn, SPLITS_KEYS)

In [7]:
# Impute with the KNN imputer; `mms` is the shared MinMaxScaler instance.
# NOTE(review): `drop_invalid_splits=True` presumably discards rows with inconsistent split times — confirm in utils.
df_knn = preprocess_impute_fill(df_ldn, missing_indices, knn_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

In [8]:
# Same call as the KNN cell, but with the IterativeImputer (random-forest estimator).
df_iter = preprocess_impute_fill(df_ldn, missing_indices, iter_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)



In [9]:
# Save the KNN-imputed and iteratively-imputed DataFrames to their respective paths.
save_df(df_knn, ldn_file_knn_imp_path)
save_df(df_iter, ldn_file_iter_imp_path)

In [10]:
# Unbind this section's path strings and DataFrames so they cannot leak into the next section.
del ldn_file_raw_path, ldn_file_cln_path, ldn_file_knn_imp_path, ldn_file_iter_imp_path, df_ldn, df_knn, df_iter

### 2015

In [11]:
# File locations for London 2015: raw input, cleaned output, and the two imputed outputs (KNN, iterative).
ldn_file_raw_path = f"{LDN_RAW_PATH}/{LDN}{YEAR_15}/{LDN}{YEAR_15}_full.csv"
ldn_file_cln_path = f"{LDN_CLN_PATH}/{LDN}{YEAR_15}/{LDN}{YEAR_15}_clean.csv"
ldn_file_knn_imp_path = f"{LDN_IMP_PATH}/{LDN}{YEAR_15}/{LDN}{YEAR_15}_knn_impute.csv"
ldn_file_iter_imp_path = f"{LDN_IMP_PATH}/{LDN}{YEAR_15}/{LDN}{YEAR_15}_iter_impute.csv"

#### Cleaning

In [9]:
# Load the raw London 2015 results.
df_ldn = pd.read_csv(ldn_file_raw_path)
# Clean the raw frame with the London-specific cleaner.
df_ldn = london_cleaner(df_ldn, SPLITS_KEYS, LDN_COLS_TO_DROP, COLS_ORDER)
# Save only a frame that passes validation. The skip is reported explicitly because
# the imputation step re-reads this path from disk and would otherwise silently
# pick up a stale (or missing) clean file.
if valid_df(df_ldn):
    save_df(df_ldn, ldn_file_cln_path)
    print(f"** File has been saved in: `{ldn_file_cln_path}`")
else:
    print(f"** DataFrame failed validation; nothing was saved to: `{ldn_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 43741 || New rows count: 37879 || Dropped Rows: 5862
** Dropping rows with splits that only contain time: Finished: 846 || Started: 17
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 37016 || New rows count: 36990 || Dropped rows based on   age_cat  : 26
Original rows count: 36990 || New rows count: 36990 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80-84', '80+', '85+' by '70+'
** File has been saved in: `Marathons_Data/Clean/London/London2015/London2015_clean.csv`


#### Imputation

In [12]:
# Re-read the cleaned file from disk so the dtypes in DTYPE_DICT are applied on load.
df_ldn = pd.read_csv(ldn_file_cln_path, dtype=DTYPE_DICT)
# Indices of rows with missing split data, as reported by the helper (exact criteria live in utils — confirm there).
missing_indices = get_indices_of_rows_missing_data(df_ldn, SPLITS_KEYS)

In [13]:
# Impute with the KNN imputer; `mms` is the shared MinMaxScaler instance.
# NOTE(review): `drop_invalid_splits=True` presumably discards rows with inconsistent split times — confirm in utils.
df_knn = preprocess_impute_fill(df_ldn, missing_indices, knn_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

In [14]:
# Same call as the KNN cell, but with the IterativeImputer (random-forest estimator).
df_iter = preprocess_impute_fill(df_ldn, missing_indices, iter_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)



In [15]:
# Save the KNN-imputed and iteratively-imputed DataFrames to their respective paths.
save_df(df_knn, ldn_file_knn_imp_path)
save_df(df_iter, ldn_file_iter_imp_path)

In [16]:
# Unbind this section's path strings and DataFrames so they cannot leak into the next section.
del ldn_file_raw_path, ldn_file_cln_path, ldn_file_knn_imp_path, ldn_file_iter_imp_path, df_ldn, df_knn, df_iter

### 2016

In [17]:
# File locations for London 2016: raw input, cleaned output, and the two imputed outputs (KNN, iterative).
ldn_file_raw_path = f"{LDN_RAW_PATH}/{LDN}{YEAR_16}/{LDN}{YEAR_16}_full.csv"
ldn_file_cln_path = f"{LDN_CLN_PATH}/{LDN}{YEAR_16}/{LDN}{YEAR_16}_clean.csv"
ldn_file_knn_imp_path = f"{LDN_IMP_PATH}/{LDN}{YEAR_16}/{LDN}{YEAR_16}_knn_impute.csv"
ldn_file_iter_imp_path = f"{LDN_IMP_PATH}/{LDN}{YEAR_16}/{LDN}{YEAR_16}_iter_impute.csv"

#### Cleaning

In [12]:
# Load the raw London 2016 results.
df_ldn = pd.read_csv(ldn_file_raw_path)
# Clean the raw frame with the London-specific cleaner.
df_ldn = london_cleaner(df_ldn, SPLITS_KEYS, LDN_COLS_TO_DROP, COLS_ORDER)
# Save only a frame that passes validation. The skip is reported explicitly because
# the imputation step re-reads this path from disk and would otherwise silently
# pick up a stale (or missing) clean file.
if valid_df(df_ldn):
    save_df(df_ldn, ldn_file_cln_path)
    print(f"** File has been saved in: `{ldn_file_cln_path}`")
else:
    print(f"** DataFrame failed validation; nothing was saved to: `{ldn_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 45202 || New rows count: 39217 || Dropped Rows: 5985
** Dropping rows with splits that only contain time: Finished: 448 || Started: 9
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 38760 || New rows count: 38760 || Dropped rows based on   age_cat  : 0
Original rows count: 38760 || New rows count: 38760 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80-84', '80+', '85+' by '70+'
** File has been saved in: `Marathons_Data/Clean/London/London2016/London2016_clean.csv`


#### Imputation

In [18]:
# Re-read the cleaned file from disk so the dtypes in DTYPE_DICT are applied on load.
df_ldn = pd.read_csv(ldn_file_cln_path, dtype=DTYPE_DICT)
# Indices of rows with missing split data, as reported by the helper (exact criteria live in utils — confirm there).
missing_indices = get_indices_of_rows_missing_data(df_ldn, SPLITS_KEYS)

In [19]:
# Impute with the KNN imputer; `mms` is the shared MinMaxScaler instance.
# NOTE(review): `drop_invalid_splits=True` presumably discards rows with inconsistent split times — confirm in utils.
df_knn = preprocess_impute_fill(df_ldn, missing_indices, knn_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

In [20]:
# Same call as the KNN cell, but with the IterativeImputer (random-forest estimator).
df_iter = preprocess_impute_fill(df_ldn, missing_indices, iter_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)



In [21]:
# Save the KNN-imputed and iteratively-imputed DataFrames to their respective paths.
save_df(df_knn, ldn_file_knn_imp_path)
save_df(df_iter, ldn_file_iter_imp_path)

In [22]:
# Unbind this section's path strings and DataFrames so they cannot leak into the next section.
del ldn_file_raw_path, ldn_file_cln_path, ldn_file_knn_imp_path, ldn_file_iter_imp_path, df_ldn, df_knn, df_iter

### 2017

In [23]:
# File locations for London 2017: raw input, cleaned output, and the two imputed outputs (KNN, iterative).
ldn_file_raw_path = f"{LDN_RAW_PATH}/{LDN}{YEAR_17}/{LDN}{YEAR_17}_full.csv"
ldn_file_cln_path = f"{LDN_CLN_PATH}/{LDN}{YEAR_17}/{LDN}{YEAR_17}_clean.csv"
ldn_file_knn_imp_path = f"{LDN_IMP_PATH}/{LDN}{YEAR_17}/{LDN}{YEAR_17}_knn_impute.csv"
ldn_file_iter_imp_path = f"{LDN_IMP_PATH}/{LDN}{YEAR_17}/{LDN}{YEAR_17}_iter_impute.csv"

#### Cleaning

In [15]:
# Load the raw London 2017 results.
df_ldn = pd.read_csv(ldn_file_raw_path)
# Clean the raw frame with the London-specific cleaner.
df_ldn = london_cleaner(df_ldn, SPLITS_KEYS, LDN_COLS_TO_DROP, COLS_ORDER)
# Save only a frame that passes validation. The skip is reported explicitly because
# the imputation step re-reads this path from disk and would otherwise silently
# pick up a stale (or missing) clean file.
if valid_df(df_ldn):
    save_df(df_ldn, ldn_file_cln_path)
    print(f"** File has been saved in: `{ldn_file_cln_path}`")
else:
    print(f"** DataFrame failed validation; nothing was saved to: `{ldn_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 45155 || New rows count: 39692 || Dropped Rows: 5463
** Dropping rows with splits that only contain time: Finished: 379 || Started: 16
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 39297 || New rows count: 39296 || Dropped rows based on   age_cat  : 1
Original rows count: 39296 || New rows count: 39296 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80-84', '80+', '85+' by '70+'
** File has been saved in: `Marathons_Data/Clean/London/London2017/London2017_clean.csv`


#### Imputation

In [24]:
# Re-read the cleaned file from disk so the dtypes in DTYPE_DICT are applied on load.
df_ldn = pd.read_csv(ldn_file_cln_path, dtype=DTYPE_DICT)
# Indices of rows with missing split data, as reported by the helper (exact criteria live in utils — confirm there).
missing_indices = get_indices_of_rows_missing_data(df_ldn, SPLITS_KEYS)

In [25]:
# Impute with the KNN imputer; `mms` is the shared MinMaxScaler instance.
# NOTE(review): `drop_invalid_splits=True` presumably discards rows with inconsistent split times — confirm in utils.
df_knn = preprocess_impute_fill(df_ldn, missing_indices, knn_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

In [26]:
# Same call as the KNN cell, but with the IterativeImputer (random-forest estimator).
df_iter = preprocess_impute_fill(df_ldn, missing_indices, iter_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)



In [27]:
# Save the KNN-imputed and iteratively-imputed DataFrames to their respective paths.
save_df(df_knn, ldn_file_knn_imp_path)
save_df(df_iter, ldn_file_iter_imp_path)

In [28]:
# Unbind this section's path strings and DataFrames so they cannot leak into the next section.
del ldn_file_raw_path, ldn_file_cln_path, ldn_file_knn_imp_path, ldn_file_iter_imp_path, df_ldn, df_knn, df_iter

### 2018

In [29]:
# File locations for London 2018: raw input, cleaned output, and the two imputed outputs (KNN, iterative).
ldn_file_raw_path = f"{LDN_RAW_PATH}/{LDN}{YEAR_18}/{LDN}{YEAR_18}_full.csv"
ldn_file_cln_path = f"{LDN_CLN_PATH}/{LDN}{YEAR_18}/{LDN}{YEAR_18}_clean.csv"
ldn_file_knn_imp_path = f"{LDN_IMP_PATH}/{LDN}{YEAR_18}/{LDN}{YEAR_18}_knn_impute.csv"
ldn_file_iter_imp_path = f"{LDN_IMP_PATH}/{LDN}{YEAR_18}/{LDN}{YEAR_18}_iter_impute.csv"

#### Cleaning

In [18]:
# Load the raw London 2018 results.
df_ldn = pd.read_csv(ldn_file_raw_path)
# Clean the raw frame with the London-specific cleaner.
df_ldn = london_cleaner(df_ldn, SPLITS_KEYS, LDN_COLS_TO_DROP, COLS_ORDER)
# Save only a frame that passes validation. The skip is reported explicitly because
# the imputation step re-reads this path from disk and would otherwise silently
# pick up a stale (or missing) clean file.
if valid_df(df_ldn):
    save_df(df_ldn, ldn_file_cln_path)
    print(f"** File has been saved in: `{ldn_file_cln_path}`")
else:
    print(f"** DataFrame failed validation; nothing was saved to: `{ldn_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 47667 || New rows count: 40773 || Dropped Rows: 6894
** Dropping rows with splits that only contain time: Finished: 338 || Started: 7
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 40428 || New rows count: 40416 || Dropped rows based on   age_cat  : 12
Original rows count: 40416 || New rows count: 40416 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80-84', '80+', '85+' by '70+'
** File has been saved in: `Marathons_Data/Clean/London/London2018/London2018_clean.csv`


#### Imputation

In [30]:
# Re-read the cleaned file from disk so the dtypes in DTYPE_DICT are applied on load.
df_ldn = pd.read_csv(ldn_file_cln_path, dtype=DTYPE_DICT)
# Indices of rows with missing split data, as reported by the helper (exact criteria live in utils — confirm there).
missing_indices = get_indices_of_rows_missing_data(df_ldn, SPLITS_KEYS)

In [31]:
# Impute with the KNN imputer; `mms` is the shared MinMaxScaler instance.
# NOTE(review): `drop_invalid_splits=True` presumably discards rows with inconsistent split times — confirm in utils.
df_knn = preprocess_impute_fill(df_ldn, missing_indices, knn_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

In [32]:
# Same call as the KNN cell, but with the IterativeImputer (random-forest estimator).
df_iter = preprocess_impute_fill(df_ldn, missing_indices, iter_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)



In [33]:
# Save the KNN-imputed and iteratively-imputed DataFrames to their respective paths.
save_df(df_knn, ldn_file_knn_imp_path)
save_df(df_iter, ldn_file_iter_imp_path)

In [34]:
# Unbind this section's path strings and DataFrames so they cannot leak into the next section.
del ldn_file_raw_path, ldn_file_cln_path, ldn_file_knn_imp_path, ldn_file_iter_imp_path, df_ldn, df_knn, df_iter

### 2019

In [5]:
# File locations for London 2019: raw input, cleaned output, and the two imputed outputs (KNN, iterative).
ldn_file_raw_path = f"{LDN_RAW_PATH}/{LDN}{YEAR_19}/{LDN}{YEAR_19}_full.csv"
ldn_file_cln_path = f"{LDN_CLN_PATH}/{LDN}{YEAR_19}/{LDN}{YEAR_19}_clean.csv"
ldn_file_knn_imp_path = f"{LDN_IMP_PATH}/{LDN}{YEAR_19}/{LDN}{YEAR_19}_knn_impute.csv"
ldn_file_iter_imp_path = f"{LDN_IMP_PATH}/{LDN}{YEAR_19}/{LDN}{YEAR_19}_iter_impute.csv"

#### Cleaning

In [21]:
# Load the raw London 2019 results.
df_ldn = pd.read_csv(ldn_file_raw_path)
# Clean the raw frame with the London-specific cleaner.
df_ldn = london_cleaner(df_ldn, SPLITS_KEYS, LDN_COLS_TO_DROP, COLS_ORDER)
# Save only a frame that passes validation. The skip is reported explicitly because
# the imputation step re-reads this path from disk and would otherwise silently
# pick up a stale (or missing) clean file.
if valid_df(df_ldn):
    save_df(df_ldn, ldn_file_cln_path)
    print(f"** File has been saved in: `{ldn_file_cln_path}`")
else:
    print(f"** DataFrame failed validation; nothing was saved to: `{ldn_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 49318 || New rows count: 42737 || Dropped Rows: 6581
** Dropping rows with splits that only contain time: Finished: 0 || Started: 0
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 42737 || New rows count: 42737 || Dropped rows based on   age_cat  : 0
Original rows count: 42737 || New rows count: 42737 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80-84', '80+', '85+' by '70+'
** File has been saved in: `Marathons_Data/Clean/London/London2019/London2019_clean.csv`


#### Imputation

In [6]:
# Re-read the cleaned file from disk so the dtypes in DTYPE_DICT are applied on load.
df_ldn = pd.read_csv(ldn_file_cln_path, dtype=DTYPE_DICT)
# Indices of rows with missing split data, as reported by the helper (exact criteria live in utils — confirm there).
missing_indices = get_indices_of_rows_missing_data(df_ldn, SPLITS_KEYS)

In [7]:
# Impute with the KNN imputer; `mms` is the shared MinMaxScaler instance.
# NOTE(review): `drop_invalid_splits=True` presumably discards rows with inconsistent split times — confirm in utils.
df_knn = preprocess_impute_fill(df_ldn, missing_indices, knn_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 4 || First 3 Indices: [ 4579  7276 12700]
Invalid split time: k_10_time > k_15_time
Total Invalid: 1 || First 3 Indices: [12700]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 18 || First 3 Indices: [6343 8671 9363]
Invalid split time: k_15_time > k_20_time
Total Invalid: 1 || First 3 Indices: [19882]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 410 || First 3 Indices: [  27   81 1112]
Invalid split time: 

In [8]:
# Same call as the KNN cell, but with the IterativeImputer (random-forest estimator).
df_iter = preprocess_impute_fill(df_ldn, missing_indices, iter_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 4 || First 3 Indices: [ 4579  7276 12700]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 17 || First 3 Indices: [6343 8671 9363]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 409 || First 3 Indices: [  27 1112 1121]
Invalid split time: k_20_time > k_half_time
Total Invalid: 1 || First 3 Indices: [1121]
--------------------------------------------------
Invalid split time diff: k_25_time (non-cumulative) < (



In [9]:
# Save the KNN-imputed and iteratively-imputed DataFrames to their respective paths.
save_df(df_knn, ldn_file_knn_imp_path)
save_df(df_iter, ldn_file_iter_imp_path)

In [10]:
# Unbind this section's path strings and DataFrames so they cannot leak into the next section.
del ldn_file_raw_path, ldn_file_cln_path, ldn_file_knn_imp_path, ldn_file_iter_imp_path, df_ldn, df_knn, df_iter

### 2021

In [11]:
# File locations for London 2021: raw input, cleaned output, and the two imputed outputs (KNN, iterative).
ldn_file_raw_path = f"{LDN_RAW_PATH}/{LDN}{YEAR_21}/{LDN}{YEAR_21}_full.csv"
ldn_file_cln_path = f"{LDN_CLN_PATH}/{LDN}{YEAR_21}/{LDN}{YEAR_21}_clean.csv"
ldn_file_knn_imp_path = f"{LDN_IMP_PATH}/{LDN}{YEAR_21}/{LDN}{YEAR_21}_knn_impute.csv"
ldn_file_iter_imp_path = f"{LDN_IMP_PATH}/{LDN}{YEAR_21}/{LDN}{YEAR_21}_iter_impute.csv"

#### Cleaning

In [24]:
# Load the raw London 2021 results.
df_ldn = pd.read_csv(ldn_file_raw_path)
# Clean the raw frame with the London-specific cleaner.
df_ldn = london_cleaner(df_ldn, SPLITS_KEYS, LDN_COLS_TO_DROP, COLS_ORDER)
# Save only a frame that passes validation. The skip is reported explicitly because
# the imputation step re-reads this path from disk and would otherwise silently
# pick up a stale (or missing) clean file.
if valid_df(df_ldn):
    save_df(df_ldn, ldn_file_cln_path)
    print(f"** File has been saved in: `{ldn_file_cln_path}`")
else:
    print(f"** DataFrame failed validation; nothing was saved to: `{ldn_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 41594 || New rows count: 36129 || Dropped Rows: 5465
** Dropping rows with splits that only contain time: Finished: 2 || Started: 0
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 36127 || New rows count: 36124 || Dropped rows based on   age_cat  : 3
Original rows count: 36124 || New rows count: 36124 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80-84', '80+', '85+' by '70+'
** File has been saved in: `Marathons_Data/Clean/London/London2021/London2021_clean.csv`


#### Imputation

In [12]:
# Re-read the cleaned file from disk so the dtypes in DTYPE_DICT are applied on load.
df_ldn = pd.read_csv(ldn_file_cln_path, dtype=DTYPE_DICT)
# Indices of rows with missing split data, as reported by the helper (exact criteria live in utils — confirm there).
missing_indices = get_indices_of_rows_missing_data(df_ldn, SPLITS_KEYS)

In [13]:
# Impute with the KNN imputer; `mms` is the shared MinMaxScaler instance.
# NOTE(review): `drop_invalid_splits=True` presumably discards rows with inconsistent split times — confirm in utils.
df_knn = preprocess_impute_fill(df_ldn, missing_indices, knn_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 42 || First 3 Indices: [ 266 1610 1879]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 24 || First 3 Indices: [1685 2330 3322]
Invalid split time: k_15_time > k_20_time
Total Invalid: 1 || First 3 Indices: [27209]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 42 || First 3 Indices: [2410 3496 4840]
Invalid split time: k_20_time > k_half_time
Total Invalid: 1 || First 3 Indices: [18720]
---------------------

In [14]:
# Same call as the KNN cell, but with the IterativeImputer (random-forest estimator).
df_iter = preprocess_impute_fill(df_ldn, missing_indices, iter_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 39 || First 3 Indices: [ 266 1610 1879]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 24 || First 3 Indices: [1685 2330 3322]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 44 || First 3 Indices: [2410 3496 4840]
Invalid split time: k_20_time > k_half_time
Total Invalid: 2 || First 3 Indices: [18720 20164]
--------------------------------------------------
Invalid split time diff: k_25_time (non-cumulative)



In [15]:
# Save the KNN-imputed and iteratively-imputed DataFrames to their respective paths.
save_df(df_knn, ldn_file_knn_imp_path)
save_df(df_iter, ldn_file_iter_imp_path)

In [16]:
# Unbind this section's path strings and DataFrames so they cannot leak into the next section.
del ldn_file_raw_path, ldn_file_cln_path, ldn_file_knn_imp_path, ldn_file_iter_imp_path, df_ldn, df_knn, df_iter

### 2022

In [17]:
# File locations for London 2022: raw input, cleaned output, and the two imputed outputs (KNN, iterative).
ldn_file_raw_path = f"{LDN_RAW_PATH}/{LDN}{YEAR_22}/{LDN}{YEAR_22}_full.csv"
ldn_file_cln_path = f"{LDN_CLN_PATH}/{LDN}{YEAR_22}/{LDN}{YEAR_22}_clean.csv"
ldn_file_knn_imp_path = f"{LDN_IMP_PATH}/{LDN}{YEAR_22}/{LDN}{YEAR_22}_knn_impute.csv"
ldn_file_iter_imp_path = f"{LDN_IMP_PATH}/{LDN}{YEAR_22}/{LDN}{YEAR_22}_iter_impute.csv"

#### Cleaning

In [27]:
# Load the raw London 2022 results.
df_ldn = pd.read_csv(ldn_file_raw_path)
# Clean the raw frame with the London-specific cleaner.
df_ldn = london_cleaner(df_ldn, SPLITS_KEYS, LDN_COLS_TO_DROP, COLS_ORDER)
# Save only a frame that passes validation. The skip is reported explicitly because
# the imputation step re-reads this path from disk and would otherwise silently
# pick up a stale (or missing) clean file.
if valid_df(df_ldn):
    save_df(df_ldn, ldn_file_cln_path)
    print(f"** File has been saved in: `{ldn_file_cln_path}`")
else:
    print(f"** DataFrame failed validation; nothing was saved to: `{ldn_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 46993 || New rows count: 40812 || Dropped Rows: 6181
** Dropping rows with splits that only contain time: Finished: 1 || Started: 0
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 40811 || New rows count: 40810 || Dropped rows based on   age_cat  : 1
Original rows count: 40810 || New rows count: 40810 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80-84', '80+', '85+' by '70+'
** File has been saved in: `Marathons_Data/Clean/London/London2022/London2022_clean.csv`


#### Imputation

In [18]:
# Re-read the cleaned file from disk so the dtypes in DTYPE_DICT are applied on load.
df_ldn = pd.read_csv(ldn_file_cln_path, dtype=DTYPE_DICT)
# Indices of rows with missing split data, as reported by the helper (exact criteria live in utils — confirm there).
missing_indices = get_indices_of_rows_missing_data(df_ldn, SPLITS_KEYS)

In [19]:
# Impute with the KNN imputer; `mms` is the shared MinMaxScaler instance.
# NOTE(review): `drop_invalid_splits=True` presumably discards rows with inconsistent split times — confirm in utils.
df_knn = preprocess_impute_fill(df_ldn, missing_indices, knn_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 5 || First 3 Indices: [ 1490  5053 19207]
Invalid split time: k_10_time > k_15_time
Total Invalid: 3 || First 3 Indices: [ 1490  5053 23642]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 21 || First 3 Indices: [ 5622  5783 12849]
Invalid split time: k_15_time > k_20_time
Total Invalid: 1 || First 3 Indices: [30330]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 53 || First 3 Indices: [3372 3796 4073]
Invali

In [20]:
# Same call as the KNN cell, but with the IterativeImputer (random-forest estimator).
df_iter = preprocess_impute_fill(df_ldn, missing_indices, iter_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 5 || First 3 Indices: [ 1490  5053 19207]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 21 || First 3 Indices: [ 5622  5783 12849]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 50 || First 3 Indices: [3372 3796 6677]
--------------------------------------------------
Invalid split time diff: k_25_time (non-cumulative) < (k_25_time - k_half_time - 5) OR k_25_time (non-cumulative) > (k_25_time - k_half_time 



In [21]:
# Save the KNN-imputed and iteratively-imputed DataFrames to their respective paths.
save_df(df_knn, ldn_file_knn_imp_path)
save_df(df_iter, ldn_file_iter_imp_path)

In [22]:
# Unbind this section's path strings and DataFrames so they cannot leak into the next section.
del ldn_file_raw_path, ldn_file_cln_path, ldn_file_knn_imp_path, ldn_file_iter_imp_path, df_ldn, df_knn, df_iter

### 2023

In [23]:
# File locations for London 2023: raw input, cleaned output, and the two imputed outputs (KNN, iterative).
ldn_file_raw_path = f"{LDN_RAW_PATH}/{LDN}{YEAR_23}/{LDN}{YEAR_23}_full.csv"
ldn_file_cln_path = f"{LDN_CLN_PATH}/{LDN}{YEAR_23}/{LDN}{YEAR_23}_clean.csv"
ldn_file_knn_imp_path = f"{LDN_IMP_PATH}/{LDN}{YEAR_23}/{LDN}{YEAR_23}_knn_impute.csv"
ldn_file_iter_imp_path = f"{LDN_IMP_PATH}/{LDN}{YEAR_23}/{LDN}{YEAR_23}_iter_impute.csv"

#### Cleaning

In [30]:
# Load the raw London 2023 results.
df_ldn = pd.read_csv(ldn_file_raw_path)
# Clean the raw frame with the London-specific cleaner.
df_ldn = london_cleaner(df_ldn, SPLITS_KEYS, LDN_COLS_TO_DROP, COLS_ORDER)
# Save only a frame that passes validation. The skip is reported explicitly because
# the imputation step re-reads this path from disk and would otherwise silently
# pick up a stale (or missing) clean file.
if valid_df(df_ldn):
    save_df(df_ldn, ldn_file_cln_path)
    print(f"** File has been saved in: `{ldn_file_cln_path}`")
else:
    print(f"** DataFrame failed validation; nothing was saved to: `{ldn_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 53077 || New rows count: 49083 || Dropped Rows: 3994
** Dropping rows with splits that only contain time: Finished: 0 || Started: 0
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 49083 || New rows count: 49083 || Dropped rows based on   age_cat  : 0
Original rows count: 49083 || New rows count: 49083 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80-84', '80+', '85+' by '70+'
** File has been saved in: `Marathons_Data/Clean/London/London2023/London2023_clean.csv`


#### Imputation

In [24]:
# Re-read the cleaned file from disk so the dtypes in DTYPE_DICT are applied on load.
df_ldn = pd.read_csv(ldn_file_cln_path, dtype=DTYPE_DICT)
# Indices of rows with missing split data, as reported by the helper (exact criteria live in utils — confirm there).
missing_indices = get_indices_of_rows_missing_data(df_ldn, SPLITS_KEYS)

In [25]:
# Impute with the KNN imputer; `mms` is the shared MinMaxScaler instance.
# NOTE(review): `drop_invalid_splits=True` presumably discards rows with inconsistent split times — confirm in utils.
df_knn = preprocess_impute_fill(df_ldn, missing_indices, knn_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 64 || First 3 Indices: [2036 3951 4002]
Invalid split time: k_10_time > k_15_time
Total Invalid: 4 || First 3 Indices: [ 3951  6047 24950]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 246 || First 3 Indices: [ 918 1338 1479]
Invalid split time: k_15_time > k_20_time
Total Invalid: 4 || First 3 Indices: [ 8828 24406 45505]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 150 || First 3 Indices: [ 586  874 106

In [26]:
# Same call as the KNN cell, but with the IterativeImputer (random-forest estimator).
df_iter = preprocess_impute_fill(df_ldn, missing_indices, iter_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 60 || First 3 Indices: [2036 3951 4002]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 237 || First 3 Indices: [ 918 1338 1479]
Invalid split time: k_15_time > k_20_time
Total Invalid: 3 || First 3 Indices: [ 8828 45505 45506]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 148 || First 3 Indices: [ 586  874 1061]
Invalid split time: k_20_time > k_half_time
Total Invalid: 2 || First 3 Indices: [13156 18257]
-



In [27]:
# Save the KNN-imputed and iteratively-imputed DataFrames to their respective paths.
save_df(df_knn, ldn_file_knn_imp_path)
save_df(df_iter, ldn_file_iter_imp_path)

In [28]:
# Unbind this section's path strings and DataFrames so they cannot leak into the next section.
del ldn_file_raw_path, ldn_file_cln_path, ldn_file_knn_imp_path, ldn_file_iter_imp_path, df_ldn, df_knn, df_iter

## Hamburg

In [19]:
# Column names handed to `hamburg_cleaner` as its columns-to-drop argument.
HAM_COLS_TO_DROP: list[str] = ["idp", "finish", "run_no"]

### 2013

In [5]:
# File locations for Hamburg 2013; the same variable names are rebound for each later year.
# Raw file path (input of the cleaning step).
ham_file_raw_path = f"{HAM_RAW_PATH}/{HAM}{YEAR_13}/{HAM}{YEAR_13}_full.csv"
# Clean file path (output of cleaning, input of imputation).
ham_file_cln_path = f"{HAM_CLN_PATH}/{HAM}{YEAR_13}/{HAM}{YEAR_13}_clean.csv"
# Imputed files paths (one per imputer: KNN and iterative).
ham_file_knn_imp_path = f"{HAM_IMP_PATH}/{HAM}{YEAR_13}/{HAM}{YEAR_13}_knn_impute.csv"
ham_file_iter_imp_path = f"{HAM_IMP_PATH}/{HAM}{YEAR_13}/{HAM}{YEAR_13}_iter_impute.csv"

#### Cleaning

In [6]:
# Load the raw Hamburg 2013 results.
df_ham = pd.read_csv(ham_file_raw_path)
# Clean the raw frame with the Hamburg-specific cleaner.
df_ham = hamburg_cleaner(df_ham, SPLITS_KEYS, HAM_COLS_TO_DROP, SPLIT_NAME_DICT, COLS_ORDER)
# Save only a frame that passes validation. The skip is reported explicitly because
# the imputation step re-reads this path from disk and would otherwise silently
# pick up a stale (or missing) clean file.
if valid_df(df_ham):
    save_df(df_ham, ham_file_cln_path)
    print(f"** File has been saved in: `{ham_file_cln_path}`")
else:
    print(f"** DataFrame failed validation; nothing was saved to: `{ham_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 15135 || New rows count: 11872 || Dropped Rows: 3263
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 11872 || New rows count: 11872 || Dropped rows based on   age_cat  : 0
Original rows count: 11872 || New rows count: 11872 || Dropped rows based on    gender  : 0
** Dropping rows with splits that only contain time: Finished: 29 || Started: 1
** File has been saved in: `Marathons_Data/Clean/Hamburg/Hamburg2013/Hamburg2013_clean.csv`


#### Imputation

In [6]:
# Re-read the cleaned file from disk so the dtypes in DTYPE_DICT are applied on load.
df_ham = pd.read_csv(ham_file_cln_path, dtype=DTYPE_DICT)
# Indices of rows with missing split data, as reported by the helper (exact criteria live in utils — confirm there).
missing_indices = get_indices_of_rows_missing_data(df_ham, SPLITS_KEYS)

In [7]:
# Impute with the KNN imputer; `mms` is the shared MinMaxScaler instance.
# NOTE(review): `drop_invalid_splits=True` presumably discards rows with inconsistent split times — confirm in utils.
df_knn = preprocess_impute_fill(df_ham, missing_indices, knn_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 6 || First 3 Indices: [ 457 2003 7082]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 1 || First 3 Indices: [849]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 9 || First 3 Indices: [ 880 1616 2003]
--------------------------------------------------
Invalid split time diff: k_25_time (non-cumulative) < (k_25_time - k_half_time - 5) OR k_25_time (non-cumulative) > (k_25_time - k_half_time + 5)
Total Invalid:

In [8]:
# Same call as the KNN cell, but with the IterativeImputer (random-forest estimator).
df_iter = preprocess_impute_fill(df_ham, missing_indices, iter_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 6 || First 3 Indices: [ 457 2003 7082]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 1 || First 3 Indices: [849]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 10 || First 3 Indices: [  36  880 1616]
--------------------------------------------------
Invalid split time diff: k_25_time (non-cumulative) < (k_25_time - k_half_time - 5) OR k_25_time (non-cumulative) > (k_25_time - k_half_time + 5)
Total Invalid



In [9]:
# Write both imputed frames to this year's KNN and iterative output paths.
save_df(df_knn, ham_file_knn_imp_path)
save_df(df_iter, ham_file_iter_imp_path)

In [10]:
# Drop this year's path strings first...
del ham_file_raw_path, ham_file_cln_path, ham_file_knn_imp_path, ham_file_iter_imp_path
# ...then release the year's DataFrames (`missing_indices` is intentionally left as the original did).
del df_ham, df_knn, df_iter

### 2014

In [11]:
# Hamburg 2014 file locations: raw input, cleaned output, and the two imputed outputs (KNN and iterative).
ham_file_raw_path = f"{HAM_RAW_PATH}/{HAM}{YEAR_14}/{HAM}{YEAR_14}_full.csv"
ham_file_cln_path = f"{HAM_CLN_PATH}/{HAM}{YEAR_14}/{HAM}{YEAR_14}_clean.csv"
ham_file_knn_imp_path = f"{HAM_IMP_PATH}/{HAM}{YEAR_14}/{HAM}{YEAR_14}_knn_impute.csv"
ham_file_iter_imp_path = f"{HAM_IMP_PATH}/{HAM}{YEAR_14}/{HAM}{YEAR_14}_iter_impute.csv"

#### Cleaning

In [9]:
# Load the raw Hamburg results for this year.
df_ham = pd.read_csv(ham_file_raw_path)
# Cleaning The DataFrame
df_ham = hamburg_cleaner(df_ham, SPLITS_KEYS, HAM_COLS_TO_DROP, SPLIT_NAME_DICT, COLS_ORDER)
# Only a frame that passes `valid_df` is written to the clean path.
if valid_df(df_ham):
    save_df(df_ham, ham_file_cln_path)
    print(f"** File has been saved in: `{ham_file_cln_path}`")
else:
    # Fail loudly: the imputation cell reads `ham_file_cln_path` from disk, so a
    # silently skipped save would leave it working on a stale or missing file.
    raise ValueError(f"Cleaned Hamburg data failed `valid_df`; nothing was saved to `{ham_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 16695 || New rows count: 13296 || Dropped Rows: 3399
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 13296 || New rows count: 13295 || Dropped rows based on   age_cat  : 1
Original rows count: 13295 || New rows count: 13295 || Dropped rows based on    gender  : 0
** Dropping rows with splits that only contain time: Finished: 13 || Started: 0
** File has been saved in: `Marathons_Data/Clean/Hamburg/Hamburg2014/Hamburg2014_clean.csv`


#### Imputation

In [12]:
# Reload the cleaned Hamburg CSV from disk with the DTYPE_DICT column types,
# then ask the helper which rows it flags (per its name) as missing data for SPLITS_KEYS.
df_ham = pd.read_csv(ham_file_cln_path, dtype=DTYPE_DICT)
missing_indices = get_indices_of_rows_missing_data(df_ham, SPLITS_KEYS)

In [13]:
# Imputation pass using `knn_imputer` and the `mms` scaler (both defined outside this cell);
# the cell's log lists the split checks the helper reports as invalid.
df_knn = preprocess_impute_fill(df_ham, missing_indices, knn_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

--------------------------------------------------
--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 1 || First 3 Indices: [3212]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 1 || First 3 Indices: [1022]
--------------------------------------------------
Invalid split time diff: k_25_time (non-cumulative) < (k_25_time - k_half_time - 5) OR k_25_time (non-cumulative) > (k_25_time - k_half_time + 5)
Total Invalid: 1 || First 3 Indices: [474]
--------------------------------------------------
Invalid split time diff: k_30_time (non-cumulative) < (k_30_time - k_25_time - 5) OR k_30_time (non-cumulative) > (k_30_time 

In [14]:
# Same call as the KNN pass but with `iter_imputer`; it reuses the same `df_ham`, `missing_indices` and `mms`.
# NOTE(review): confirm the helper does not modify `df_ham` or `mms` in place, since both were already used by the KNN pass.
df_iter = preprocess_impute_fill(df_ham, missing_indices, iter_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

--------------------------------------------------
--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 1 || First 3 Indices: [3212]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 1 || First 3 Indices: [1022]
--------------------------------------------------
Invalid split time diff: k_25_time (non-cumulative) < (k_25_time - k_half_time - 5) OR k_25_time (non-cumulative) > (k_25_time - k_half_time + 5)
Total Invalid: 1 || First 3 Indices: [474]
--------------------------------------------------
Invalid split time diff: k_30_time (non-cumulative) < (k_30_time - k_25_time - 5) OR k_30_time (non-cumulative) > (k_30_time 



In [15]:
# Write both imputed frames to this year's KNN and iterative output paths.
save_df(df_knn, ham_file_knn_imp_path)
save_df(df_iter, ham_file_iter_imp_path)

In [16]:
# Drop this year's path strings first...
del ham_file_raw_path, ham_file_cln_path, ham_file_knn_imp_path, ham_file_iter_imp_path
# ...then release the year's DataFrames (`missing_indices` is intentionally left as the original did).
del df_ham, df_knn, df_iter

### 2015

In [21]:
# Hamburg 2015 file locations: raw input, cleaned output, and the two imputed outputs (KNN and iterative).
ham_file_raw_path = f"{HAM_RAW_PATH}/{HAM}{YEAR_15}/{HAM}{YEAR_15}_full.csv"
ham_file_cln_path = f"{HAM_CLN_PATH}/{HAM}{YEAR_15}/{HAM}{YEAR_15}_clean.csv"
ham_file_knn_imp_path = f"{HAM_IMP_PATH}/{HAM}{YEAR_15}/{HAM}{YEAR_15}_knn_impute.csv"
ham_file_iter_imp_path = f"{HAM_IMP_PATH}/{HAM}{YEAR_15}/{HAM}{YEAR_15}_iter_impute.csv"

#### Cleaning

In [20]:
# Load the raw Hamburg results for this year.
df_ham = pd.read_csv(ham_file_raw_path)
# Cleaning The DataFrame
df_ham = hamburg_cleaner(df_ham, SPLITS_KEYS, HAM_COLS_TO_DROP, SPLIT_NAME_DICT, COLS_ORDER)
# Only a frame that passes `valid_df` is written to the clean path.
if valid_df(df_ham):
    save_df(df_ham, ham_file_cln_path)
    print(f"** File has been saved in: `{ham_file_cln_path}`")
else:
    # Fail loudly: the imputation cell reads `ham_file_cln_path` from disk, so a
    # silently skipped save would leave it working on a stale or missing file.
    raise ValueError(f"Cleaned Hamburg data failed `valid_df`; nothing was saved to `{ham_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 19205 || New rows count: 15259 || Dropped Rows: 3946
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 15259 || New rows count: 15257 || Dropped rows based on   age_cat  : 2
Original rows count: 15257 || New rows count: 15257 || Dropped rows based on    gender  : 0
** Dropping rows with splits that only contain time: Finished: 17 || Started: 0
** File has been saved in: `Marathons_Data/Clean/Hamburg/Hamburg2015/Hamburg2015_clean.csv`


#### Imputation

In [22]:
# Reload the cleaned Hamburg CSV from disk with the DTYPE_DICT column types,
# then ask the helper which rows it flags (per its name) as missing data for SPLITS_KEYS.
df_ham = pd.read_csv(ham_file_cln_path, dtype=DTYPE_DICT)
missing_indices = get_indices_of_rows_missing_data(df_ham, SPLITS_KEYS)

In [23]:
# Imputation pass using `knn_imputer` and the `mms` scaler (both defined outside this cell);
# the cell's log lists the split checks the helper reports as invalid.
df_knn = preprocess_impute_fill(df_ham, missing_indices, knn_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 3 || First 3 Indices: [ 1354  3836 15119]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 21 || First 3 Indices: [185 545 595]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 2 || First 3 Indices: [2265 3278]
--------------------------------------------------
Invalid split time diff: k_25_time (non-cumulative) < (k_25_time - k_half_time - 5) OR k_25_time (non-cumulative) > (k_25_time - k_half_time + 5)
Total I

In [24]:
# Same call as the KNN pass but with `iter_imputer`; it reuses the same `df_ham`, `missing_indices` and `mms`.
# NOTE(review): confirm the helper does not modify `df_ham` or `mms` in place, since both were already used by the KNN pass.
df_iter = preprocess_impute_fill(df_ham, missing_indices, iter_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 3 || First 3 Indices: [ 1354  3836 15119]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 22 || First 3 Indices: [185 545 595]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 2 || First 3 Indices: [2265 3278]
--------------------------------------------------
Invalid split time diff: k_25_time (non-cumulative) < (k_25_time - k_half_time - 5) OR k_25_time (non-cumulative) > (k_25_time - k_half_time + 5)
Total I



In [25]:
# Write both imputed frames to this year's KNN and iterative output paths.
save_df(df_knn, ham_file_knn_imp_path)
save_df(df_iter, ham_file_iter_imp_path)

In [26]:
# Drop this year's path strings first...
del ham_file_raw_path, ham_file_cln_path, ham_file_knn_imp_path, ham_file_iter_imp_path
# ...then release the year's DataFrames (`missing_indices` is intentionally left as the original did).
del df_ham, df_knn, df_iter

### 2016

In [27]:
# Hamburg 2016 file locations: raw input, cleaned output, and the two imputed outputs (KNN and iterative).
ham_file_raw_path = f"{HAM_RAW_PATH}/{HAM}{YEAR_16}/{HAM}{YEAR_16}_full.csv"
ham_file_cln_path = f"{HAM_CLN_PATH}/{HAM}{YEAR_16}/{HAM}{YEAR_16}_clean.csv"
ham_file_knn_imp_path = f"{HAM_IMP_PATH}/{HAM}{YEAR_16}/{HAM}{YEAR_16}_knn_impute.csv"
ham_file_iter_imp_path = f"{HAM_IMP_PATH}/{HAM}{YEAR_16}/{HAM}{YEAR_16}_iter_impute.csv"

#### Cleaning

In [15]:
# Load the raw Hamburg results for this year.
df_ham = pd.read_csv(ham_file_raw_path)
# Cleaning The DataFrame
df_ham = hamburg_cleaner(df_ham, SPLITS_KEYS, HAM_COLS_TO_DROP, SPLIT_NAME_DICT, COLS_ORDER)
# Only a frame that passes `valid_df` is written to the clean path.
if valid_df(df_ham):
    save_df(df_ham, ham_file_cln_path)
    print(f"** File has been saved in: `{ham_file_cln_path}`")
else:
    # Fail loudly: the imputation cell reads `ham_file_cln_path` from disk, so a
    # silently skipped save would leave it working on a stale or missing file.
    raise ValueError(f"Cleaned Hamburg data failed `valid_df`; nothing was saved to `{ham_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 16011 || New rows count: 12540 || Dropped Rows: 3471
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 12540 || New rows count: 12537 || Dropped rows based on   age_cat  : 3
Original rows count: 12537 || New rows count: 12537 || Dropped rows based on    gender  : 0
** Dropping rows with splits that only contain time: Finished: 5 || Started: 0
** File has been saved in: `Marathons_Data/Clean/Hamburg/Hamburg2016/Hamburg2016_clean.csv`


#### Imputation

In [28]:
# Reload the cleaned Hamburg CSV from disk with the DTYPE_DICT column types,
# then ask the helper which rows it flags (per its name) as missing data for SPLITS_KEYS.
df_ham = pd.read_csv(ham_file_cln_path, dtype=DTYPE_DICT)
missing_indices = get_indices_of_rows_missing_data(df_ham, SPLITS_KEYS)

In [29]:
# Imputation pass using `knn_imputer` and the `mms` scaler (both defined outside this cell);
# the cell's log lists the split checks the helper reports as invalid.
df_knn = preprocess_impute_fill(df_ham, missing_indices, knn_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 12 || First 3 Indices: [  99 2354 2356]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 2 || First 3 Indices: [2992 3227]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 1 || First 3 Indices: [3719]
--------------------------------------------------
Invalid split time diff: k_25_time (non-cumulative) < (k_25_time - k_half_time - 5) OR k_25_time (non-cumulative) > (k_25_time - k_half_time + 5)
Total Invalid: 1 

In [30]:
# Same call as the KNN pass but with `iter_imputer`; it reuses the same `df_ham`, `missing_indices` and `mms`.
# NOTE(review): confirm the helper does not modify `df_ham` or `mms` in place, since both were already used by the KNN pass.
df_iter = preprocess_impute_fill(df_ham, missing_indices, iter_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 12 || First 3 Indices: [  99 2354 2356]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 2 || First 3 Indices: [2992 3227]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 1 || First 3 Indices: [3719]
--------------------------------------------------
Invalid split time diff: k_25_time (non-cumulative) < (k_25_time - k_half_time - 5) OR k_25_time (non-cumulative) > (k_25_time - k_half_time + 5)
Total Invalid: 2 



In [31]:
# Write both imputed frames to this year's KNN and iterative output paths.
save_df(df_knn, ham_file_knn_imp_path)
save_df(df_iter, ham_file_iter_imp_path)

In [32]:
# Drop this year's path strings first...
del ham_file_raw_path, ham_file_cln_path, ham_file_knn_imp_path, ham_file_iter_imp_path
# ...then release the year's DataFrames (`missing_indices` is intentionally left as the original did).
del df_ham, df_knn, df_iter

### 2017

In [33]:
# Hamburg 2017 file locations: raw input, cleaned output, and the two imputed outputs (KNN and iterative).
ham_file_raw_path = f"{HAM_RAW_PATH}/{HAM}{YEAR_17}/{HAM}{YEAR_17}_full.csv"
ham_file_cln_path = f"{HAM_CLN_PATH}/{HAM}{YEAR_17}/{HAM}{YEAR_17}_clean.csv"
ham_file_knn_imp_path = f"{HAM_IMP_PATH}/{HAM}{YEAR_17}/{HAM}{YEAR_17}_knn_impute.csv"
ham_file_iter_imp_path = f"{HAM_IMP_PATH}/{HAM}{YEAR_17}/{HAM}{YEAR_17}_iter_impute.csv"

#### Cleaning

In [34]:
# Load the raw Hamburg results for this year.
df_ham = pd.read_csv(ham_file_raw_path)
# Cleaning The DataFrame
df_ham = hamburg_cleaner(df_ham, SPLITS_KEYS, HAM_COLS_TO_DROP, SPLIT_NAME_DICT, COLS_ORDER)
# Only a frame that passes `valid_df` is written to the clean path.
if valid_df(df_ham):
    save_df(df_ham, ham_file_cln_path)
    print(f"** File has been saved in: `{ham_file_cln_path}`")
else:
    # Fail loudly: the imputation cell reads `ham_file_cln_path` from disk, so a
    # silently skipped save would leave it working on a stale or missing file.
    raise ValueError(f"Cleaned Hamburg data failed `valid_df`; nothing was saved to `{ham_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 15638 || New rows count: 12396 || Dropped Rows: 3242
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 12396 || New rows count: 12391 || Dropped rows based on   age_cat  : 5
Original rows count: 12391 || New rows count: 12391 || Dropped rows based on    gender  : 0
** Dropping rows with splits that only contain time: Finished: 15 || Started: 3
** File has been saved in: `Marathons_Data/Clean/Hamburg/Hamburg2017/Hamburg2017_clean.csv`


#### Imputation

In [35]:
# Reload the cleaned Hamburg CSV from disk with the DTYPE_DICT column types,
# then ask the helper which rows it flags (per its name) as missing data for SPLITS_KEYS.
df_ham = pd.read_csv(ham_file_cln_path, dtype=DTYPE_DICT)
missing_indices = get_indices_of_rows_missing_data(df_ham, SPLITS_KEYS)

In [36]:
# Imputation pass using `knn_imputer` and the `mms` scaler (both defined outside this cell);
# the cell's log lists the split checks the helper reports as invalid.
df_knn = preprocess_impute_fill(df_ham, missing_indices, knn_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 4 || First 3 Indices: [2798 3245 7629]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 3 || First 3 Indices: [ 620 2131 3884]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 2 || First 3 Indices: [ 295 7629]
--------------------------------------------------
Invalid split time diff: k_25_time (non-cumulative) < (k_25_time - k_half_time - 5) OR k_25_time (non-cumulative) > (k_25_time - k_half_time + 5)
Total In

In [37]:
# Same call as the KNN pass but with `iter_imputer`; it reuses the same `df_ham`, `missing_indices` and `mms`.
# NOTE(review): confirm the helper does not modify `df_ham` or `mms` in place, since both were already used by the KNN pass.
df_iter = preprocess_impute_fill(df_ham, missing_indices, iter_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 4 || First 3 Indices: [2798 3245 7629]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 3 || First 3 Indices: [ 620 2131 3884]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 2 || First 3 Indices: [ 295 7629]
--------------------------------------------------
Invalid split time diff: k_25_time (non-cumulative) < (k_25_time - k_half_time - 5) OR k_25_time (non-cumulative) > (k_25_time - k_half_time + 5)
Total In



In [38]:
# Write both imputed frames to this year's KNN and iterative output paths.
save_df(df_knn, ham_file_knn_imp_path)
save_df(df_iter, ham_file_iter_imp_path)

In [39]:
# Drop this year's path strings first...
del ham_file_raw_path, ham_file_cln_path, ham_file_knn_imp_path, ham_file_iter_imp_path
# ...then release the year's DataFrames (`missing_indices` is intentionally left as the original did).
del df_ham, df_knn, df_iter

### 2018

In [40]:
# Hamburg 2018 file locations: raw input, cleaned output, and the two imputed outputs (KNN and iterative).
ham_file_raw_path = f"{HAM_RAW_PATH}/{HAM}{YEAR_18}/{HAM}{YEAR_18}_full.csv"
ham_file_cln_path = f"{HAM_CLN_PATH}/{HAM}{YEAR_18}/{HAM}{YEAR_18}_clean.csv"
ham_file_knn_imp_path = f"{HAM_IMP_PATH}/{HAM}{YEAR_18}/{HAM}{YEAR_18}_knn_impute.csv"
ham_file_iter_imp_path = f"{HAM_IMP_PATH}/{HAM}{YEAR_18}/{HAM}{YEAR_18}_iter_impute.csv"

#### Cleaning

In [22]:
# Load the raw Hamburg results for this year.
df_ham = pd.read_csv(ham_file_raw_path)
# Cleaning The DataFrame
df_ham = hamburg_cleaner(df_ham, SPLITS_KEYS, HAM_COLS_TO_DROP, SPLIT_NAME_DICT, COLS_ORDER)
# Only a frame that passes `valid_df` is written to the clean path.
if valid_df(df_ham):
    save_df(df_ham, ham_file_cln_path)
    print(f"** File has been saved in: `{ham_file_cln_path}`")
else:
    # Fail loudly: the imputation cell reads `ham_file_cln_path` from disk, so a
    # silently skipped save would leave it working on a stale or missing file.
    raise ValueError(f"Cleaned Hamburg data failed `valid_df`; nothing was saved to `{ham_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 14010 || New rows count: 10670 || Dropped Rows: 3340
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 10670 || New rows count: 10668 || Dropped rows based on   age_cat  : 2
Original rows count: 10668 || New rows count: 10668 || Dropped rows based on    gender  : 0
** Dropping rows with splits that only contain time: Finished: 0 || Started: 0
** File has been saved in: `Marathons_Data/Clean/Hamburg/Hamburg2018/Hamburg2018_clean.csv`


#### Imputation

In [41]:
# Reload the cleaned Hamburg CSV from disk with the DTYPE_DICT column types,
# then ask the helper which rows it flags (per its name) as missing data for SPLITS_KEYS.
df_ham = pd.read_csv(ham_file_cln_path, dtype=DTYPE_DICT)
missing_indices = get_indices_of_rows_missing_data(df_ham, SPLITS_KEYS)

In [42]:
# Imputation pass using `knn_imputer` and the `mms` scaler (both defined outside this cell);
# the cell's log lists the split checks the helper reports as invalid.
df_knn = preprocess_impute_fill(df_ham, missing_indices, knn_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 7 || First 3 Indices: [238 547 874]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 6 || First 3 Indices: [1128 1435 2076]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 1 || First 3 Indices: [9268]
--------------------------------------------------
Invalid split time diff: k_25_time (non-cumulative) < (k_25_time - k_half_time - 5) OR k_25_time (non-cumulative) > (k_25_time - k_half_time + 5)
Total Invalid: 1

In [43]:
# Same call as the KNN pass but with `iter_imputer`; it reuses the same `df_ham`, `missing_indices` and `mms`.
# NOTE(review): confirm the helper does not modify `df_ham` or `mms` in place, since both were already used by the KNN pass.
df_iter = preprocess_impute_fill(df_ham, missing_indices, iter_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 7 || First 3 Indices: [238 547 874]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 6 || First 3 Indices: [1128 1435 2076]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 1 || First 3 Indices: [9268]
--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_30_time (non-cumulative) < (k_30_time - k_25_time - 5) OR k_30_time (non-cumulative)



In [44]:
# Write both imputed frames to this year's KNN and iterative output paths.
save_df(df_knn, ham_file_knn_imp_path)
save_df(df_iter, ham_file_iter_imp_path)

In [45]:
# Drop this year's path strings first...
del ham_file_raw_path, ham_file_cln_path, ham_file_knn_imp_path, ham_file_iter_imp_path
# ...then release the year's DataFrames (`missing_indices` is intentionally left as the original did).
del df_ham, df_knn, df_iter

### 2019

In [46]:
# Hamburg 2019 file locations: raw input, cleaned output, and the two imputed outputs (KNN and iterative).
ham_file_raw_path = f"{HAM_RAW_PATH}/{HAM}{YEAR_19}/{HAM}{YEAR_19}_full.csv"
ham_file_cln_path = f"{HAM_CLN_PATH}/{HAM}{YEAR_19}/{HAM}{YEAR_19}_clean.csv"
ham_file_knn_imp_path = f"{HAM_IMP_PATH}/{HAM}{YEAR_19}/{HAM}{YEAR_19}_knn_impute.csv"
ham_file_iter_imp_path = f"{HAM_IMP_PATH}/{HAM}{YEAR_19}/{HAM}{YEAR_19}_iter_impute.csv"

#### Cleaning

In [9]:
# Load the raw Hamburg results for this year.
df_ham = pd.read_csv(ham_file_raw_path)
# Cleaning The DataFrame
df_ham = hamburg_cleaner(df_ham, SPLITS_KEYS, HAM_COLS_TO_DROP, SPLIT_NAME_DICT, COLS_ORDER)
# Only a frame that passes `valid_df` is written to the clean path.
if valid_df(df_ham):
    save_df(df_ham, ham_file_cln_path)
    print(f"** File has been saved in: `{ham_file_cln_path}`")
else:
    # Fail loudly: the imputation cell reads `ham_file_cln_path` from disk, so a
    # silently skipped save would leave it working on a stale or missing file.
    raise ValueError(f"Cleaned Hamburg data failed `valid_df`; nothing was saved to `{ham_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 13498 || New rows count: 10468 || Dropped Rows: 3030
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 10468 || New rows count: 10453 || Dropped rows based on   age_cat  : 15
Original rows count: 10453 || New rows count: 10453 || Dropped rows based on    gender  : 0
** Dropping rows with splits that only contain time: Finished: 0 || Started: 0
** File has been saved in: `Marathons_Data/Clean/Hamburg/Hamburg2019/Hamburg2019_clean.csv`


#### Imputation

In [47]:
# Reload the cleaned Hamburg CSV from disk with the DTYPE_DICT column types,
# then ask the helper which rows it flags (per its name) as missing data for SPLITS_KEYS.
df_ham = pd.read_csv(ham_file_cln_path, dtype=DTYPE_DICT)
missing_indices = get_indices_of_rows_missing_data(df_ham, SPLITS_KEYS)

In [48]:
# Imputation pass using `knn_imputer` and the `mms` scaler (both defined outside this cell);
# the cell's log lists the split checks the helper reports as invalid.
df_knn = preprocess_impute_fill(df_ham, missing_indices, knn_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 24 || First 3 Indices: [ 0 61 77]
Invalid split time: k_10_time > k_15_time
Total Invalid: 1 || First 3 Indices: [0]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 3 || First 3 Indices: [ 166 2768 3028]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 32 || First 3 Indices: [255 490 577]
Invalid split time: k_20_time > k_half_time
Total Invalid: 2 || First 3 Indices: [5684 5685]
-------------------------------

In [49]:
# Same call as the KNN pass but with `iter_imputer`; it reuses the same `df_ham`, `missing_indices` and `mms`.
# NOTE(review): confirm the helper does not modify `df_ham` or `mms` in place, since both were already used by the KNN pass.
df_iter = preprocess_impute_fill(df_ham, missing_indices, iter_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 24 || First 3 Indices: [ 0 61 77]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 3 || First 3 Indices: [ 166 2768 3028]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 31 || First 3 Indices: [255 490 577]
Invalid split time: k_20_time > k_half_time
Total Invalid: 2 || First 3 Indices: [5684 5685]
--------------------------------------------------
Invalid split time diff: k_25_time (non-cumulative) < (k_25_tim



In [50]:
# Write both imputed frames to this year's KNN and iterative output paths.
save_df(df_knn, ham_file_knn_imp_path)
save_df(df_iter, ham_file_iter_imp_path)

In [51]:
# Drop this year's path strings first...
del ham_file_raw_path, ham_file_cln_path, ham_file_knn_imp_path, ham_file_iter_imp_path
# ...then release the year's DataFrames (`missing_indices` is intentionally left as the original did).
del df_ham, df_knn, df_iter

### 2022

In [52]:
# Hamburg 2022 file locations: raw input, cleaned output, and the two imputed outputs (KNN and iterative).
ham_file_raw_path = f"{HAM_RAW_PATH}/{HAM}{YEAR_22}/{HAM}{YEAR_22}_full.csv"
ham_file_cln_path = f"{HAM_CLN_PATH}/{HAM}{YEAR_22}/{HAM}{YEAR_22}_clean.csv"
ham_file_knn_imp_path = f"{HAM_IMP_PATH}/{HAM}{YEAR_22}/{HAM}{YEAR_22}_knn_impute.csv"
ham_file_iter_imp_path = f"{HAM_IMP_PATH}/{HAM}{YEAR_22}/{HAM}{YEAR_22}_iter_impute.csv"

#### Cleaning

In [16]:
# Load the raw Hamburg results for this year.
df_ham = pd.read_csv(ham_file_raw_path)
# Cleaning The DataFrame
df_ham = hamburg_cleaner(df_ham, SPLITS_KEYS, HAM_COLS_TO_DROP, SPLIT_NAME_DICT, COLS_ORDER)
# Only a frame that passes `valid_df` is written to the clean path.
if valid_df(df_ham):
    save_df(df_ham, ham_file_cln_path)
    print(f"** File has been saved in: `{ham_file_cln_path}`")
else:
    # Fail loudly: the imputation cell reads `ham_file_cln_path` from disk, so a
    # silently skipped save would leave it working on a stale or missing file.
    raise ValueError(f"Cleaned Hamburg data failed `valid_df`; nothing was saved to `{ham_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 10416 || New rows count: 6888 || Dropped Rows: 3528
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 6888 || New rows count: 6840 || Dropped rows based on   age_cat  : 48
Original rows count: 6840 || New rows count: 6840 || Dropped rows based on    gender  : 0
** Dropping rows with splits that only contain time: Finished: 0 || Started: 0
** File has been saved in: `Marathons_Data/Clean/Hamburg/Hamburg2022/Hamburg2022_clean.csv`


#### Imputation

In [53]:
# Reload the cleaned Hamburg CSV from disk with the DTYPE_DICT column types,
# then ask the helper which rows it flags (per its name) as missing data for SPLITS_KEYS.
df_ham = pd.read_csv(ham_file_cln_path, dtype=DTYPE_DICT)
missing_indices = get_indices_of_rows_missing_data(df_ham, SPLITS_KEYS)

In [54]:
# Imputation pass using `knn_imputer` and the `mms` scaler (both defined outside this cell);
# the cell's log lists the split checks the helper reports as invalid.
df_knn = preprocess_impute_fill(df_ham, missing_indices, knn_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 12 || First 3 Indices: [ 578  997 1362]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 3 || First 3 Indices: [ 203 1390 5879]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 2 || First 3 Indices: [4347 6776]
--------------------------------------------------
Invalid split time diff: k_25_time (non-cumulative) < (k_25_time - k_half_time - 5) OR k_25_time (non-cumulative) > (k_25_time - k_half_time + 5)
Total I

In [55]:
# Same call as the KNN pass but with `iter_imputer`; it reuses the same `df_ham`, `missing_indices` and `mms`.
# NOTE(review): confirm the helper does not modify `df_ham` or `mms` in place, since both were already used by the KNN pass.
df_iter = preprocess_impute_fill(df_ham, missing_indices, iter_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 11 || First 3 Indices: [ 578  997 1362]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 4 || First 3 Indices: [ 203 1390 2373]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 2 || First 3 Indices: [4347 6776]
--------------------------------------------------
Invalid split time diff: k_25_time (non-cumulative) < (k_25_time - k_half_time - 5) OR k_25_time (non-cumulative) > (k_25_time - k_half_time + 5)
Total I



In [56]:
# Write both imputed frames to this year's KNN and iterative output paths.
save_df(df_knn, ham_file_knn_imp_path)
save_df(df_iter, ham_file_iter_imp_path)

In [57]:
# Drop this year's path strings first...
del ham_file_raw_path, ham_file_cln_path, ham_file_knn_imp_path, ham_file_iter_imp_path
# ...then release the year's DataFrames (`missing_indices` is intentionally left as the original did).
del df_ham, df_knn, df_iter

### 2023

In [58]:
# Hamburg 2023 file locations: raw input, cleaned output, and the two imputed outputs (KNN and iterative).
ham_file_raw_path = f"{HAM_RAW_PATH}/{HAM}{YEAR_23}/{HAM}{YEAR_23}_full.csv"
ham_file_cln_path = f"{HAM_CLN_PATH}/{HAM}{YEAR_23}/{HAM}{YEAR_23}_clean.csv"
ham_file_knn_imp_path = f"{HAM_IMP_PATH}/{HAM}{YEAR_23}/{HAM}{YEAR_23}_knn_impute.csv"
ham_file_iter_imp_path = f"{HAM_IMP_PATH}/{HAM}{YEAR_23}/{HAM}{YEAR_23}_iter_impute.csv"

#### Cleaning

In [25]:
# Load the raw Hamburg results for this year.
df_ham = pd.read_csv(ham_file_raw_path)
# Cleaning The DataFrame
df_ham = hamburg_cleaner(df_ham, SPLITS_KEYS, HAM_COLS_TO_DROP, SPLIT_NAME_DICT, COLS_ORDER)
# Only a frame that passes `valid_df` is written to the clean path.
if valid_df(df_ham):
    save_df(df_ham, ham_file_cln_path)
    print(f"** File has been saved in: `{ham_file_cln_path}`")
else:
    # Fail loudly: the imputation cell reads `ham_file_cln_path` from disk, so a
    # silently skipped save would leave it working on a stale or missing file.
    raise ValueError(f"Cleaned Hamburg data failed `valid_df`; nothing was saved to `{ham_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 11757 || New rows count: 9002 || Dropped Rows: 2755
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 9002 || New rows count: 8998 || Dropped rows based on   age_cat  : 4
Original rows count: 8998 || New rows count: 8998 || Dropped rows based on    gender  : 0
** Dropping rows with splits that only contain time: Finished: 0 || Started: 0
** File has been saved in: `Marathons_Data/Clean/Hamburg/Hamburg2023/Hamburg2023_clean.csv`


#### Imputation

In [59]:
# Reload the cleaned Hamburg CSV from disk with the DTYPE_DICT column types,
# then ask the helper which rows it flags (per its name) as missing data for SPLITS_KEYS.
df_ham = pd.read_csv(ham_file_cln_path, dtype=DTYPE_DICT)
missing_indices = get_indices_of_rows_missing_data(df_ham, SPLITS_KEYS)

In [60]:
# Imputation pass using `knn_imputer` and the `mms` scaler (both defined outside this cell);
# the cell's log lists the split checks the helper reports as invalid.
df_knn = preprocess_impute_fill(df_ham, missing_indices, knn_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 223 || First 3 Indices: [18 27 58]
Invalid split time: k_10_time > k_15_time
Total Invalid: 1 || First 3 Indices: [7230]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 22 || First 3 Indices: [ 30  45 808]
Invalid split time: k_15_time > k_20_time
Total Invalid: 2 || First 3 Indices: [2544 3928]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 2 || First 3 Indices: [2423 7358]
Invalid split time: k_20_time > k_

In [61]:
# Same call as the KNN pass but with `iter_imputer`; it reuses the same `df_ham`, `missing_indices` and `mms`.
# NOTE(review): confirm the helper does not modify `df_ham` or `mms` in place, since both were already used by the KNN pass.
df_iter = preprocess_impute_fill(df_ham, missing_indices, iter_imputer, mms, SPLITS_KEYS, drop_invalid_splits=True)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 213 || First 3 Indices: [27 58 73]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 22 || First 3 Indices: [ 30  45 808]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 2 || First 3 Indices: [2423 7358]
--------------------------------------------------
Invalid split time diff: k_25_time (non-cumulative) < (k_25_time - k_half_time - 5) OR k_25_time (non-cumulative) > (k_25_time - k_half_time + 5)
Total Invalid:



In [62]:
# Hand each imputed Hamburg frame to save_df with its matching output path
# (KNN result first, then the iterative-imputer result).
save_df(df_knn, ham_file_knn_imp_path)
save_df(df_iter, ham_file_iter_imp_path)

In [63]:
# Remove this section's path strings and DataFrames from the kernel namespace
# before the next dataset section starts.
del (
    ham_file_raw_path,
    ham_file_cln_path,
    ham_file_knn_imp_path,
    ham_file_iter_imp_path,
    df_ham,
    df_knn,
    df_iter,
)

## Houston
##### Pace and speed have been converted from min/mile and miles/h to sec/km and km/h respectively.

In [4]:
# Column names handed to `houston_cleaner` in the cleaning cells below.
HOU_COLS_TO_DROP = [
    "idp",
    "finish",
    "run_no",
]

### 2018

In [5]:
# Houston 2018: raw input, cleaned output and imputed output locations.
hou_file_raw_path, hou_file_cln_path, hou_file_imp_path = (
    f"{HOU_RAW_PATH}/{HOU}{YEAR_18}/{HOU}{YEAR_18}_full.csv",
    f"{HOU_CLN_PATH}/{HOU}{YEAR_18}/{HOU}{YEAR_18}_clean.csv",
    f"{HOU_IMP_PATH}/{HOU}{YEAR_18}/{HOU}{YEAR_18}_imputed.csv",
)

#### Cleaning

In [6]:
# Load the raw Houston 2018 results.
df_hou = pd.read_csv(hou_file_raw_path)
# Run the Houston cleaning routine on the raw frame.
df_hou = houston_cleaner(df_hou, SPLITS_KEYS, HOU_COLS_TO_DROP, SPLIT_NAME_DICT, COLS_ORDER)
# Persist only a frame that passes validation, and report the other case
# explicitly: otherwise the only hint of a skipped save is a missing line in
# the (long) cleaning log.
if valid_df(df_hou):
    save_df(df_hou, hou_file_cln_path)
    print(f"** File has been saved in: `{hou_file_cln_path}`")
else:
    print(f"** Validation failed; file has NOT been saved in: `{hou_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 7547 || New rows count: 7526 || Dropped Rows: 21
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 7526 || New rows count: 7526 || Dropped rows based on   age_cat  : 0
Original rows count: 7526 || New rows count: 7526 || Dropped rows based on    gender  : 0
** Dropping rows with invalid age categories [12-15, 16-19, Elites]:
Original rows count: 7526 || New rows count: 7388 || Dropped rows: 138
** Replacing these age categories '20-24', '25-29', '30-34', and '35-39' by '18-39'
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** Dropping rows with invalid race state ['Other', 'DQ - No Reason Was Given', 'DQ - SWITCH from HALF to MARA']:
Original rows count: 7388 || New rows count: 7374 || Dropped rows: 14
** Dropping rows with splits that only contain time: Finished: 0 || Started: 4
** File has been saved in: `Marathons_Data/Clean/Houston/Houston2018/Houston2018_clean.c

#### Imputation (no imputation step is run for Houston 2018 in this notebook)

In [7]:
# Remove the Houston 2018 path strings and DataFrame from the kernel namespace.
del (
    hou_file_raw_path,
    hou_file_cln_path,
    hou_file_imp_path,
    df_hou,
)

### 2019

In [8]:
# Houston 2019: raw input, cleaned output and imputed output locations.
hou_file_raw_path, hou_file_cln_path, hou_file_imp_path = (
    f"{HOU_RAW_PATH}/{HOU}{YEAR_19}/{HOU}{YEAR_19}_full.csv",
    f"{HOU_CLN_PATH}/{HOU}{YEAR_19}/{HOU}{YEAR_19}_clean.csv",
    f"{HOU_IMP_PATH}/{HOU}{YEAR_19}/{HOU}{YEAR_19}_imputed.csv",
)

#### Cleaning

In [9]:
# Load the raw Houston 2019 results.
df_hou = pd.read_csv(hou_file_raw_path)
# Run the Houston cleaning routine on the raw frame.
df_hou = houston_cleaner(df_hou, SPLITS_KEYS, HOU_COLS_TO_DROP, SPLIT_NAME_DICT, COLS_ORDER)
# Persist only a frame that passes validation, and report the other case
# explicitly: otherwise the only hint of a skipped save is a missing line in
# the (long) cleaning log.
if valid_df(df_hou):
    save_df(df_hou, hou_file_cln_path)
    print(f"** File has been saved in: `{hou_file_cln_path}`")
else:
    print(f"** Validation failed; file has NOT been saved in: `{hou_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 7159 || New rows count: 7145 || Dropped Rows: 14
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 7145 || New rows count: 7145 || Dropped rows based on   age_cat  : 0
Original rows count: 7145 || New rows count: 7145 || Dropped rows based on    gender  : 0
** Dropping rows with invalid age categories [12-15, 16-19, Elites]:
Original rows count: 7145 || New rows count: 7015 || Dropped rows: 130
** Replacing these age categories '20-24', '25-29', '30-34', and '35-39' by '18-39'
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** Dropping rows with invalid race state ['Other', 'DQ - No Reason Was Given', 'DQ - SWITCH from HALF to MARA']:
Original rows count: 7015 || New rows count: 7003 || Dropped rows: 12
** Dropping rows with splits that only contain time: Finished: 0 || Started: 4
** File has been saved in: `Marathons_Data/Clean/Houston/Houston2019/Houston2019_clean.c

#### Imputation (no imputation step is run for Houston 2019 in this notebook)

In [10]:
# Remove the Houston 2019 path strings and DataFrame from the kernel namespace.
del (
    hou_file_raw_path,
    hou_file_cln_path,
    hou_file_imp_path,
    df_hou,
)

## Stockholm

In [64]:
# Column names handed to `stockholm_cleaner` in the cleaning cells below.
STO_COLS_TO_DROP = [
    "idp",
    "finish",
    "run_no",
]

### 2021

In [65]:
# Stockholm 2021: raw input and cleaned output locations.
sto_file_raw_path, sto_file_cln_path = (
    f"{STO_RAW_PATH}/{STO}{YEAR_21}/{STO}{YEAR_21}_full.csv",
    f"{STO_CLN_PATH}/{STO}{YEAR_21}/{STO}{YEAR_21}_clean.csv",
)
# One imputed output per imputer (KNN and iterative).
sto_file_knn_imp_path, sto_file_iter_imp_path = (
    f"{STO_IMP_PATH}/{STO}{YEAR_21}/{STO}{YEAR_21}_knn_impute.csv",
    f"{STO_IMP_PATH}/{STO}{YEAR_21}/{STO}{YEAR_21}_iter_impute.csv",
)

#### Cleaning

In [6]:
# Load the raw Stockholm 2021 results.
df_sto = pd.read_csv(sto_file_raw_path)
# Run the Stockholm cleaning routine; the edition year is passed explicitly.
df_sto = stockholm_cleaner(df_sto, SPLITS_KEYS, STO_COLS_TO_DROP, SPLIT_NAME_DICT, COLS_ORDER, year=2021)
# Persist only a frame that passes validation, and report the other case
# explicitly: the imputation cell re-reads `sto_file_cln_path` from disk, so a
# silently skipped save would let it pick up a stale file from an earlier run.
if valid_df(df_sto):
    save_df(df_sto, sto_file_cln_path)
    print(f"** File has been saved in: `{sto_file_cln_path}`")
else:
    print(f"** Validation failed; file has NOT been saved in: `{sto_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 12179 || New rows count: 7177 || Dropped Rows: 5002
** Dropping rows with null values in `yob` and `gender` columns:
Original rows count: 7177 || New rows count: 7126 || Dropped rows based on     yob    : 51
Original rows count: 7126 || New rows count: 7126 || Dropped rows based on    gender  : 0
** Dropping rows with splits that only contain time: Finished: 0 || Started: 0
** Dropping non-adult runners (age < 18):
Original rows count: 7126 || New rows count: 7125 || Dropped rows: 1
** column name `yob` changed to `age_cat`.
** File has been saved in: `Marathons_Data/Clean/Stockholm/Stockholm2021/Stockholm2021_clean.csv`


#### Imputation

In [66]:
# Reload the cleaned Stockholm 2021 file from disk, passing DTYPE_DICT as the column dtypes.
df_sto = pd.read_csv(
    sto_file_cln_path,
    dtype=DTYPE_DICT,
)
# Computed once here and passed to both preprocess_impute_fill calls below.
missing_indices = get_indices_of_rows_missing_data(
    df_sto,
    SPLITS_KEYS,
)

In [67]:
# Fill pipeline driven by `knn_imputer`; the result is saved to
# `sto_file_knn_imp_path` in a later cell.
df_knn = preprocess_impute_fill(
    df_sto,
    missing_indices,
    knn_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 1 || First 3 Indices: [3282]
--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 1 || First 3 Indices: [6939]
--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_30_time (non-cumulative) < (k_30_time - k_25_time - 5) OR k_30_time (non-cumulative) > (k_30_time - k_25_time + 5)
Total Invalid: 1 || First 3 Indices: [5298]
--------------------------------------------------
Invalid split time diff: k_35_time (non-cumulative) < (k_35_time - k_30_time 

In [68]:
# Same call as the KNN cell above, but with `iter_imputer`; the result is saved
# to `sto_file_iter_imp_path` in the next cell.
df_iter = preprocess_impute_fill(
    df_sto,
    missing_indices,
    iter_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 1 || First 3 Indices: [3282]
--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 1 || First 3 Indices: [6939]
--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_30_time (non-cumulative) < (k_30_time - k_25_time - 5) OR k_30_time (non-cumulative) > (k_30_time - k_25_time + 5)
Total Invalid: 1 || First 3 Indices: [5298]
--------------------------------------------------
Invalid split time diff: k_35_time (non-cumulative) < (k_35_time - k_30_time 



In [69]:
# Hand each imputed Stockholm 2021 frame to save_df with its matching output
# path (KNN result first, then the iterative-imputer result).
save_df(df_knn, sto_file_knn_imp_path)
save_df(df_iter, sto_file_iter_imp_path)

In [70]:
# Remove the Stockholm 2021 path strings and DataFrames from the kernel
# namespace before the next year reuses the same names.
del (
    sto_file_raw_path,
    sto_file_cln_path,
    sto_file_knn_imp_path,
    sto_file_iter_imp_path,
    df_sto,
    df_knn,
    df_iter,
)

### 2022

In [71]:
# Stockholm 2022: raw input and cleaned output locations.
sto_file_raw_path, sto_file_cln_path = (
    f"{STO_RAW_PATH}/{STO}{YEAR_22}/{STO}{YEAR_22}_full.csv",
    f"{STO_CLN_PATH}/{STO}{YEAR_22}/{STO}{YEAR_22}_clean.csv",
)
# One imputed output per imputer (KNN and iterative).
sto_file_knn_imp_path, sto_file_iter_imp_path = (
    f"{STO_IMP_PATH}/{STO}{YEAR_22}/{STO}{YEAR_22}_knn_impute.csv",
    f"{STO_IMP_PATH}/{STO}{YEAR_22}/{STO}{YEAR_22}_iter_impute.csv",
)

#### Cleaning

In [9]:
# Load the raw Stockholm 2022 results.
df_sto = pd.read_csv(sto_file_raw_path)
# Run the Stockholm cleaning routine; the edition year is passed explicitly.
df_sto = stockholm_cleaner(df_sto, SPLITS_KEYS, STO_COLS_TO_DROP, SPLIT_NAME_DICT, COLS_ORDER, year=2022)
# Persist only a frame that passes validation, and report the other case
# explicitly: the imputation cell re-reads `sto_file_cln_path` from disk, so a
# silently skipped save would let it pick up a stale file from an earlier run.
if valid_df(df_sto):
    save_df(df_sto, sto_file_cln_path)
    print(f"** File has been saved in: `{sto_file_cln_path}`")
else:
    print(f"** Validation failed; file has NOT been saved in: `{sto_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 13593 || New rows count: 10161 || Dropped Rows: 3432
** Dropping rows with null values in `yob` and `gender` columns:
Original rows count: 10161 || New rows count: 10057 || Dropped rows based on     yob    : 104
Original rows count: 10057 || New rows count: 10057 || Dropped rows based on    gender  : 0
** Dropping rows with splits that only contain time: Finished: 0 || Started: 0
** Dropping non-adult runners (age < 18):
Original rows count: 10057 || New rows count: 10057 || Dropped rows: 0
** column name `yob` changed to `age_cat`.
** File has been saved in: `Marathons_Data/Clean/Stockholm/Stockholm2022/Stockholm2022_clean.csv`


#### Imputation

In [72]:
# Reload the cleaned Stockholm 2022 file from disk, passing DTYPE_DICT as the column dtypes.
df_sto = pd.read_csv(
    sto_file_cln_path,
    dtype=DTYPE_DICT,
)
# Computed once here and passed to both preprocess_impute_fill calls below.
missing_indices = get_indices_of_rows_missing_data(
    df_sto,
    SPLITS_KEYS,
)

In [73]:
# Fill pipeline driven by `knn_imputer`; the result is saved to
# `sto_file_knn_imp_path` in a later cell.
df_knn = preprocess_impute_fill(
    df_sto,
    missing_indices,
    knn_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 9 || First 3 Indices: [3114 3621 4636]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 9 || First 3 Indices: [ 123 1310 3807]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 1 || First 3 Indices: [6499]
--------------------------------------------------
Invalid split time diff: k_25_time (non-cumulative) < (k_25_time - k_half_time - 5) OR k_25_time (non-cumulative) > (k_25_time - k_half_time + 5)
Total Invalid

In [74]:
# Same call as the KNN cell above, but with `iter_imputer`; the result is saved
# to `sto_file_iter_imp_path` in the next cell.
df_iter = preprocess_impute_fill(
    df_sto,
    missing_indices,
    iter_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 9 || First 3 Indices: [3114 3621 4636]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 9 || First 3 Indices: [ 123 1310 3807]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 1 || First 3 Indices: [6499]
--------------------------------------------------
Invalid split time diff: k_25_time (non-cumulative) < (k_25_time - k_half_time - 5) OR k_25_time (non-cumulative) > (k_25_time - k_half_time + 5)
Total Invalid



In [75]:
# Hand each imputed Stockholm 2022 frame to save_df with its matching output
# path (KNN result first, then the iterative-imputer result).
save_df(df_knn, sto_file_knn_imp_path)
save_df(df_iter, sto_file_iter_imp_path)

In [76]:
# Remove the Stockholm 2022 path strings and DataFrames from the kernel
# namespace before the next dataset section starts.
del (
    sto_file_raw_path,
    sto_file_cln_path,
    sto_file_knn_imp_path,
    sto_file_iter_imp_path,
    df_sto,
    df_knn,
    df_iter,
)

## Boston
##### Pace and speed have been converted from min/mile and miles/h to sec/km and km/h respectively.

In [77]:
# Column names handed to `boston_cleaner` in the cleaning cells below.
BOS_COLS_TO_DROP = [
    "idp",
    "finish",
    "run_no",
]

### 2014

In [78]:
# Boston 2014: raw input and cleaned output locations.
bos_file_raw_path, bos_file_cln_path = (
    f"{BOS_RAW_PATH}/{BOS}{YEAR_14}/{BOS}{YEAR_14}_full.csv",
    f"{BOS_CLN_PATH}/{BOS}{YEAR_14}/{BOS}{YEAR_14}_clean.csv",
)
# One imputed output per imputer (KNN and iterative).
bos_file_knn_imp_path, bos_file_iter_imp_path = (
    f"{BOS_IMP_PATH}/{BOS}{YEAR_14}/{BOS}{YEAR_14}_knn_impute.csv",
    f"{BOS_IMP_PATH}/{BOS}{YEAR_14}/{BOS}{YEAR_14}_iter_impute.csv",
)

#### Cleaning

In [8]:
# Load the raw Boston 2014 results; low_memory=False makes pandas parse the
# file in one pass rather than in chunks, avoiding mixed-type inference.
df_bos = pd.read_csv(bos_file_raw_path, low_memory=False)
# Run the Boston cleaning routine on the raw frame.
df_bos = boston_cleaner(df_bos, SPLITS_KEYS, BOS_COLS_TO_DROP, COLS_ORDER)
# Persist only a frame that passes validation, and report the other case
# explicitly: the imputation cell re-reads `bos_file_cln_path` from disk, so a
# silently skipped save would let it pick up a stale file from an earlier run.
if valid_df(df_bos):
    save_df(df_bos, bos_file_cln_path)
    print(f"** File has been saved in: `{bos_file_cln_path}`")
else:
    print(f"** Validation failed; file has NOT been saved in: `{bos_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 35671 || New rows count: 32447 || Dropped Rows: 3224
** Dropping rows with splits that only contain time: Finished: 0 || Started: 0
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 32123 || New rows count: 32123 || Dropped rows based on   age_cat  : 0
Original rows count: 32123 || New rows count: 32123 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Boston/Boston2014/Boston2014_clean.csv`


#### Imputation

In [79]:
# Reload the cleaned Boston 2014 file from disk, passing DTYPE_DICT as the column dtypes.
df_bos = pd.read_csv(
    bos_file_cln_path,
    dtype=DTYPE_DICT,
)
# Computed once here and passed to both preprocess_impute_fill calls below.
missing_indices = get_indices_of_rows_missing_data(
    df_bos,
    SPLITS_KEYS,
)

In [80]:
# Fill pipeline driven by `knn_imputer`; the result is saved to
# `bos_file_knn_imp_path` in a later cell.
df_knn = preprocess_impute_fill(
    df_bos,
    missing_indices,
    knn_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

--------------------------------------------------
Invalid split time diff: k_10_time (non-cumulative) < (k_10_time - k_5_time - 5) OR k_10_time (non-cumulative) > (k_10_time - k_5_time + 5)
Total Invalid: 17 || First 3 Indices: [4726 7897 9079]
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 17 || First 3 Indices: [4649 6553 7485]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 14 || First 3 Indices: [ 568 5386 6549]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 27 || First 3 Indices: [ 4031 13369 15375]

In [82]:
# Same call as the KNN cell above, but with `iter_imputer`; the result is saved
# to `bos_file_iter_imp_path` in the next cell.
df_iter = preprocess_impute_fill(
    df_bos,
    missing_indices,
    iter_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

--------------------------------------------------
Invalid split time diff: k_10_time (non-cumulative) < (k_10_time - k_5_time - 5) OR k_10_time (non-cumulative) > (k_10_time - k_5_time + 5)
Total Invalid: 17 || First 3 Indices: [4726 7897 9079]
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 17 || First 3 Indices: [4649 6553 7485]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 14 || First 3 Indices: [ 568 5386 6549]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 26 || First 3 Indices: [ 4031 13369 15375]



In [83]:
# Hand each imputed Boston 2014 frame to save_df with its matching output path
# (KNN result first, then the iterative-imputer result).
save_df(df_knn, bos_file_knn_imp_path)
save_df(df_iter, bos_file_iter_imp_path)

In [84]:
# Remove the Boston 2014 path strings and DataFrames from the kernel namespace
# before the next year reuses the same names.
del (
    bos_file_raw_path,
    bos_file_cln_path,
    bos_file_knn_imp_path,
    bos_file_iter_imp_path,
    df_bos,
    df_knn,
    df_iter,
)

### 2015

In [85]:
# Boston 2015: raw input and cleaned output locations.
bos_file_raw_path, bos_file_cln_path = (
    f"{BOS_RAW_PATH}/{BOS}{YEAR_15}/{BOS}{YEAR_15}_full.csv",
    f"{BOS_CLN_PATH}/{BOS}{YEAR_15}/{BOS}{YEAR_15}_clean.csv",
)
# One imputed output per imputer (KNN and iterative).
bos_file_knn_imp_path, bos_file_iter_imp_path = (
    f"{BOS_IMP_PATH}/{BOS}{YEAR_15}/{BOS}{YEAR_15}_knn_impute.csv",
    f"{BOS_IMP_PATH}/{BOS}{YEAR_15}/{BOS}{YEAR_15}_iter_impute.csv",
)

#### Cleaning

In [11]:
# Load the raw Boston 2015 results; low_memory=False makes pandas parse the
# file in one pass rather than in chunks, avoiding mixed-type inference.
df_bos = pd.read_csv(bos_file_raw_path, low_memory=False)
# Run the Boston cleaning routine on the raw frame.
df_bos = boston_cleaner(df_bos, SPLITS_KEYS, BOS_COLS_TO_DROP, COLS_ORDER)
# Persist only a frame that passes validation, and report the other case
# explicitly: the imputation cell re-reads `bos_file_cln_path` from disk, so a
# silently skipped save would let it pick up a stale file from an earlier run.
if valid_df(df_bos):
    save_df(df_bos, bos_file_cln_path)
    print(f"** File has been saved in: `{bos_file_cln_path}`")
else:
    print(f"** Validation failed; file has NOT been saved in: `{bos_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 30252 || New rows count: 27159 || Dropped Rows: 3093
** Dropping rows with splits that only contain time: Finished: 0 || Started: 0
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 26986 || New rows count: 26986 || Dropped rows based on   age_cat  : 0
Original rows count: 26986 || New rows count: 26986 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Boston/Boston2015/Boston2015_clean.csv`


#### Imputation

In [86]:
# Reload the cleaned Boston 2015 file from disk, passing DTYPE_DICT as the column dtypes.
df_bos = pd.read_csv(
    bos_file_cln_path,
    dtype=DTYPE_DICT,
)
# Computed once here and passed to both preprocess_impute_fill calls below.
missing_indices = get_indices_of_rows_missing_data(
    df_bos,
    SPLITS_KEYS,
)

In [87]:
# Fill pipeline driven by `knn_imputer`; the result is saved to
# `bos_file_knn_imp_path` in a later cell.
df_knn = preprocess_impute_fill(
    df_bos,
    missing_indices,
    knn_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 22 || First 3 Indices: [1300 1487 2492]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 4 || First 3 Indices: [ 4395 19968 21246]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 54 || First 3 Indices: [4677 5161 8290]
--------------------------------------------------
Invalid split time diff: k_25_time (non-cumulative) < (k_25_time - k_half_time - 5) OR k_25_time (non-cumulative) > (k_25_time - k_half_time + 5

In [88]:
# Same call as the KNN cell above, but with `iter_imputer`; the result is saved
# to `bos_file_iter_imp_path` in the next cell.
df_iter = preprocess_impute_fill(
    df_bos,
    missing_indices,
    iter_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 23 || First 3 Indices: [1300 1487 2492]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 3 || First 3 Indices: [ 4395 19968 21246]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 50 || First 3 Indices: [4677 5161 8290]
--------------------------------------------------
Invalid split time diff: k_25_time (non-cumulative) < (k_25_time - k_half_time - 5) OR k_25_time (non-cumulative) > (k_25_time - k_half_time + 5



In [89]:
# Hand each imputed Boston 2015 frame to save_df with its matching output path
# (KNN result first, then the iterative-imputer result).
save_df(df_knn, bos_file_knn_imp_path)
save_df(df_iter, bos_file_iter_imp_path)

In [90]:
# Remove the Boston 2015 path strings and DataFrames from the kernel namespace
# before the next year reuses the same names.
del (
    bos_file_raw_path,
    bos_file_cln_path,
    bos_file_knn_imp_path,
    bos_file_iter_imp_path,
    df_bos,
    df_knn,
    df_iter,
)

### 2016

In [91]:
# Boston 2016: raw input and cleaned output locations.
bos_file_raw_path, bos_file_cln_path = (
    f"{BOS_RAW_PATH}/{BOS}{YEAR_16}/{BOS}{YEAR_16}_full.csv",
    f"{BOS_CLN_PATH}/{BOS}{YEAR_16}/{BOS}{YEAR_16}_clean.csv",
)
# One imputed output per imputer (KNN and iterative).
bos_file_knn_imp_path, bos_file_iter_imp_path = (
    f"{BOS_IMP_PATH}/{BOS}{YEAR_16}/{BOS}{YEAR_16}_knn_impute.csv",
    f"{BOS_IMP_PATH}/{BOS}{YEAR_16}/{BOS}{YEAR_16}_iter_impute.csv",
)

#### Cleaning

In [14]:
# Load the raw Boston 2016 results; low_memory=False makes pandas parse the
# file in one pass rather than in chunks, avoiding mixed-type inference.
df_bos = pd.read_csv(bos_file_raw_path, low_memory=False)
# Run the Boston cleaning routine on the raw frame.
df_bos = boston_cleaner(df_bos, SPLITS_KEYS, BOS_COLS_TO_DROP, COLS_ORDER)
# Persist only a frame that passes validation, and report the other case
# explicitly: the imputation cell re-reads `bos_file_cln_path` from disk, so a
# silently skipped save would let it pick up a stale file from an earlier run.
if valid_df(df_bos):
    save_df(df_bos, bos_file_cln_path)
    print(f"** File has been saved in: `{bos_file_cln_path}`")
else:
    print(f"** Validation failed; file has NOT been saved in: `{bos_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 30743 || New rows count: 27487 || Dropped Rows: 3256
** Dropping rows with splits that only contain time: Finished: 0 || Started: 0
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 27421 || New rows count: 27421 || Dropped rows based on   age_cat  : 0
Original rows count: 27421 || New rows count: 27421 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Boston/Boston2016/Boston2016_clean.csv`


#### Imputation

In [92]:
# Reload the cleaned Boston 2016 file from disk, passing DTYPE_DICT as the column dtypes.
df_bos = pd.read_csv(
    bos_file_cln_path,
    dtype=DTYPE_DICT,
)
# Computed once here and passed to both preprocess_impute_fill calls below.
missing_indices = get_indices_of_rows_missing_data(
    df_bos,
    SPLITS_KEYS,
)

In [93]:
# Fill pipeline driven by `knn_imputer`; the result is saved to
# `bos_file_knn_imp_path` in a later cell.
df_knn = preprocess_impute_fill(
    df_bos,
    missing_indices,
    knn_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 23 || First 3 Indices: [ 526 1310 4996]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 7 || First 3 Indices: [ 8083 11459 12598]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 43 || First 3 Indices: [1621 1991 2092]
--------------------------------------------------
Invalid split time diff: k_25_time (non-cumulative) < (k_25_time - k_half_time - 5) OR k_25_time (non-cumulative) > (k_25_time - k_half_time + 5

In [94]:
# Same call as the KNN cell above, but with `iter_imputer`; the result is saved
# to `bos_file_iter_imp_path` in the next cell.
df_iter = preprocess_impute_fill(
    df_bos,
    missing_indices,
    iter_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 23 || First 3 Indices: [ 526 1310 4996]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 7 || First 3 Indices: [ 8083 11459 12598]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 39 || First 3 Indices: [1621 1991 2092]
--------------------------------------------------
Invalid split time diff: k_25_time (non-cumulative) < (k_25_time - k_half_time - 5) OR k_25_time (non-cumulative) > (k_25_time - k_half_time + 5



In [95]:
# Hand each imputed Boston 2016 frame to save_df with its matching output path
# (KNN result first, then the iterative-imputer result).
save_df(df_knn, bos_file_knn_imp_path)
save_df(df_iter, bos_file_iter_imp_path)

In [96]:
# Remove the Boston 2016 path strings and DataFrames from the kernel namespace
# before the next year reuses the same names.
del (
    bos_file_raw_path,
    bos_file_cln_path,
    bos_file_knn_imp_path,
    bos_file_iter_imp_path,
    df_bos,
    df_knn,
    df_iter,
)

### 2017

In [117]:
# Boston 2017: raw input and cleaned output locations.
bos_file_raw_path, bos_file_cln_path = (
    f"{BOS_RAW_PATH}/{BOS}{YEAR_17}/{BOS}{YEAR_17}_full.csv",
    f"{BOS_CLN_PATH}/{BOS}{YEAR_17}/{BOS}{YEAR_17}_clean.csv",
)
# One imputed output per imputer (KNN and iterative).
bos_file_knn_imp_path, bos_file_iter_imp_path = (
    f"{BOS_IMP_PATH}/{BOS}{YEAR_17}/{BOS}{YEAR_17}_knn_impute.csv",
    f"{BOS_IMP_PATH}/{BOS}{YEAR_17}/{BOS}{YEAR_17}_iter_impute.csv",
)

#### Cleaning

In [17]:
# Load the raw Boston 2017 results; low_memory=False makes pandas parse the
# file in one pass rather than in chunks, avoiding mixed-type inference.
df_bos = pd.read_csv(bos_file_raw_path, low_memory=False)
# Run the Boston cleaning routine on the raw frame.
df_bos = boston_cleaner(df_bos, SPLITS_KEYS, BOS_COLS_TO_DROP, COLS_ORDER)
# Persist only a frame that passes validation, and report the other case
# explicitly: the imputation cell re-reads `bos_file_cln_path` from disk, so a
# silently skipped save would let it pick up a stale file from an earlier run.
if valid_df(df_bos):
    save_df(df_bos, bos_file_cln_path)
    print(f"** File has been saved in: `{bos_file_cln_path}`")
else:
    print(f"** Validation failed; file has NOT been saved in: `{bos_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 30074 || New rows count: 27220 || Dropped Rows: 2854
** Dropping rows with splits that only contain time: Finished: 0 || Started: 0
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 27189 || New rows count: 27189 || Dropped rows based on   age_cat  : 0
Original rows count: 27189 || New rows count: 27189 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Boston/Boston2017/Boston2017_clean.csv`


#### Imputation

In [118]:
# Reload the cleaned Boston 2017 file from disk, passing DTYPE_DICT as the column dtypes.
df_bos = pd.read_csv(
    bos_file_cln_path,
    dtype=DTYPE_DICT,
)
# Computed once here and passed to both preprocess_impute_fill calls below.
missing_indices = get_indices_of_rows_missing_data(
    df_bos,
    SPLITS_KEYS,
)

In [99]:
# Fill pipeline driven by `knn_imputer`; the result is saved to
# `bos_file_knn_imp_path` in a later cell.
df_knn = preprocess_impute_fill(
    df_bos,
    missing_indices,
    knn_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 44 || First 3 Indices: [ 786  986 1026]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 10 || First 3 Indices: [ 675 6135 6296]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 41 || First 3 Indices: [   8 1499 1962]
--------------------------------------------------
Invalid split time diff: k_25_time (non-cumulative) < (k_25_time - k_half_time - 5) OR k_25_time (non-cumulative) > (k_25_time - k_half_time + 5)


In [100]:
# Same call as the KNN cell above, but with `iter_imputer`; the result is saved
# to `bos_file_iter_imp_path` in the next cell.
df_iter = preprocess_impute_fill(
    df_bos,
    missing_indices,
    iter_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 44 || First 3 Indices: [ 786  986 1026]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 11 || First 3 Indices: [ 675 6135 6296]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 41 || First 3 Indices: [   8 1499 1962]
Invalid split time: k_20_time > k_half_time
Total Invalid: 1 || First 3 Indices: [13326]
--------------------------------------------------
Invalid split time diff: k_25_time (non-cumulative) < (k_



In [101]:
# Hand each imputed Boston 2017 frame to save_df with its matching output path
# (KNN result first, then the iterative-imputer result).
save_df(df_knn, bos_file_knn_imp_path)
save_df(df_iter, bos_file_iter_imp_path)

In [102]:
# Remove the Boston 2017 path strings and DataFrames from the kernel namespace
# before the next year reuses the same names.
del (
    bos_file_raw_path,
    bos_file_cln_path,
    bos_file_knn_imp_path,
    bos_file_iter_imp_path,
    df_bos,
    df_knn,
    df_iter,
)

### 2018 (ISSUE WITH IMPUTATION)

In [103]:
# Boston 2018: raw input and cleaned output locations.
bos_file_raw_path, bos_file_cln_path = (
    f"{BOS_RAW_PATH}/{BOS}{YEAR_18}/{BOS}{YEAR_18}_full.csv",
    f"{BOS_CLN_PATH}/{BOS}{YEAR_18}/{BOS}{YEAR_18}_clean.csv",
)
# One imputed output per imputer (KNN and iterative).
bos_file_knn_imp_path, bos_file_iter_imp_path = (
    f"{BOS_IMP_PATH}/{BOS}{YEAR_18}/{BOS}{YEAR_18}_knn_impute.csv",
    f"{BOS_IMP_PATH}/{BOS}{YEAR_18}/{BOS}{YEAR_18}_iter_impute.csv",
)

#### Cleaning

In [20]:
# Load the raw Boston 2018 results; low_memory=False makes pandas parse the
# file in one pass rather than in chunks, avoiding mixed-type inference.
df_bos = pd.read_csv(bos_file_raw_path, low_memory=False)
# Run the Boston cleaning routine on the raw frame.
df_bos = boston_cleaner(df_bos, SPLITS_KEYS, BOS_COLS_TO_DROP, COLS_ORDER)
# Persist only a frame that passes validation, and report the other case
# explicitly: the imputation cell re-reads `bos_file_cln_path` from disk, so a
# silently skipped save would let it pick up a stale file from an earlier run.
if valid_df(df_bos):
    save_df(df_bos, bos_file_cln_path)
    print(f"** File has been saved in: `{bos_file_cln_path}`")
else:
    print(f"** Validation failed; file has NOT been saved in: `{bos_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 29978 || New rows count: 26919 || Dropped Rows: 3059
** Dropping rows with splits that only contain time: Finished: 9 || Started: 0
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 26910 || New rows count: 26910 || Dropped rows based on   age_cat  : 0
Original rows count: 26910 || New rows count: 26910 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Boston/Boston2018/Boston2018_clean.csv`


#### Imputation

In [104]:
# Reload the cleaned Boston 2018 file from disk, passing DTYPE_DICT as the column dtypes.
df_bos = pd.read_csv(
    bos_file_cln_path,
    dtype=DTYPE_DICT,
)
# Computed once here and passed to both preprocess_impute_fill calls below.
missing_indices = get_indices_of_rows_missing_data(
    df_bos,
    SPLITS_KEYS,
)

In [105]:
# Fill pipeline driven by `knn_imputer`; the result is saved to
# `bos_file_knn_imp_path` in a later cell.
df_knn = preprocess_impute_fill(
    df_bos,
    missing_indices,
    knn_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 519 || First 3 Indices: [  7 127 201]
Invalid split time: k_10_time > k_15_time
Total Invalid: 17 || First 3 Indices: [ 201  569 1486]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 1623 || First 3 Indices: [ 4 17 35]
Invalid split time: k_15_time > k_20_time
Total Invalid: 23 || First 3 Indices: [ 388  934 1268]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 78 || First 3 Indices: [421 691 812]
Invalid spli

In [None]:
# Same call as the KNN cell above, but with `iter_imputer`; the result is saved
# to `bos_file_iter_imp_path` in the next cell.
df_iter = preprocess_impute_fill(
    df_bos,
    missing_indices,
    iter_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

In [None]:
# Hand each imputed Boston 2018 frame to save_df with its matching output path
# (KNN result first, then the iterative-imputer result).
save_df(df_knn, bos_file_knn_imp_path)
save_df(df_iter, bos_file_iter_imp_path)

In [None]:
# Remove the Boston 2018 path strings and DataFrames from the kernel namespace
# before the next year reuses the same names.
del (
    bos_file_raw_path,
    bos_file_cln_path,
    bos_file_knn_imp_path,
    bos_file_iter_imp_path,
    df_bos,
    df_knn,
    df_iter,
)

### 2019

In [120]:
# Boston 2019: raw input and cleaned output locations.
bos_file_raw_path, bos_file_cln_path = (
    f"{BOS_RAW_PATH}/{BOS}{YEAR_19}/{BOS}{YEAR_19}_full.csv",
    f"{BOS_CLN_PATH}/{BOS}{YEAR_19}/{BOS}{YEAR_19}_clean.csv",
)
# One imputed output per imputer (KNN and iterative).
bos_file_knn_imp_path, bos_file_iter_imp_path = (
    f"{BOS_IMP_PATH}/{BOS}{YEAR_19}/{BOS}{YEAR_19}_knn_impute.csv",
    f"{BOS_IMP_PATH}/{BOS}{YEAR_19}/{BOS}{YEAR_19}_iter_impute.csv",
)

#### Cleaning

In [23]:
# Load the raw Boston 2019 results; low_memory=False makes pandas parse the
# file in one pass rather than in chunks, avoiding mixed-type inference.
df_bos = pd.read_csv(bos_file_raw_path, low_memory=False)
# Run the Boston cleaning routine on the raw frame.
df_bos = boston_cleaner(df_bos, SPLITS_KEYS, BOS_COLS_TO_DROP, COLS_ORDER)
# Persist only a frame that passes validation, and report the other case
# explicitly: the imputation cell re-reads `bos_file_cln_path` from disk, so a
# silently skipped save would let it pick up a stale file from an earlier run.
if valid_df(df_bos):
    save_df(df_bos, bos_file_cln_path)
    print(f"** File has been saved in: `{bos_file_cln_path}`")
else:
    print(f"** Validation failed; file has NOT been saved in: `{bos_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 30234 || New rows count: 27337 || Dropped Rows: 2897
** Dropping rows with splits that only contain time: Finished: 37 || Started: 1
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 27299 || New rows count: 27299 || Dropped rows based on   age_cat  : 0
Original rows count: 27299 || New rows count: 27299 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Boston/Boston2019/Boston2019_clean.csv`


#### Imputation

In [121]:
# Reload the cleaned CSV with the notebook's dtype mapping, then collect the row
# indices returned by `get_indices_of_rows_missing_data` for `SPLITS_KEYS`.
df_bos = pd.read_csv(
    bos_file_cln_path,
    dtype=DTYPE_DICT,
)
missing_indices = get_indices_of_rows_missing_data(
    df_bos,
    SPLITS_KEYS,
)

In [None]:
# KNN variant: impute via `preprocess_impute_fill` with `knn_imputer`.
# How invalid splits are handled under drop_invalid_splits=True is defined in
# utils.data_processing_utils.
df_knn = preprocess_impute_fill(
    df_bos,
    missing_indices,
    knn_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

In [None]:
# Iterative variant: impute via `preprocess_impute_fill` with `iter_imputer`.
# NOTE(review): `df_bos` and `mms` are also passed to the KNN call -
# assumes the helper does not modify them in a way that affects this run; confirm.
df_iter = preprocess_impute_fill(
    df_bos,
    missing_indices,
    iter_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

In [None]:
# Write both imputed Boston 2019 frames to their per-method output paths.
save_df(df_knn, bos_file_knn_imp_path)
save_df(df_iter, bos_file_iter_imp_path)

In [None]:
# Unbind the Boston 2019 names: input paths, imputation output paths, then the frames.
del bos_file_raw_path, bos_file_cln_path
del bos_file_knn_imp_path, bos_file_iter_imp_path
del df_bos, df_knn, df_iter

### 2021

In [25]:
# Boston 2021 file locations: raw input, cleaned output, and one output file
# per imputation variant (`knn` / `iter`).
bos_file_raw_path = f"{BOS_RAW_PATH}/{BOS}{YEAR_21}/{BOS}{YEAR_21}_full.csv"
bos_file_cln_path = f"{BOS_CLN_PATH}/{BOS}{YEAR_21}/{BOS}{YEAR_21}_clean.csv"
bos_file_knn_imp_path = f"{BOS_IMP_PATH}/{BOS}{YEAR_21}/{BOS}{YEAR_21}_knn_impute.csv"
bos_file_iter_imp_path = f"{BOS_IMP_PATH}/{BOS}{YEAR_21}/{BOS}{YEAR_21}_iter_impute.csv"

#### Cleaning

In [26]:
# Read the raw Boston results and hand them straight to `boston_cleaner`.
df_bos = boston_cleaner(
    pd.read_csv(bos_file_raw_path, low_memory=False),
    SPLITS_KEYS,
    BOS_COLS_TO_DROP,
    COLS_ORDER,
)
# Save the cleaned frame only if `valid_df` accepts it.
# NOTE(review): when the check fails this cell neither saves nor prints anything.
if valid_df(df_bos):
    save_df(df_bos, bos_file_cln_path)
    print(f"** File has been saved in: `{bos_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 18074 || New rows count: 15645 || Dropped Rows: 2429
** Dropping rows with splits that only contain time: Finished: 1 || Started: 0
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 15644 || New rows count: 15644 || Dropped rows based on   age_cat  : 0
Original rows count: 15644 || New rows count: 15644 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Boston/Boston2021/Boston2021_clean.csv`


#### Imputation

In [27]:
# Unbind the Boston 2021 path names and the cleaned frame.
# This year's imputation output paths are `bos_file_knn_imp_path` and
# `bos_file_iter_imp_path`; the section never assigns `bos_file_imp_path`, so
# deleting that name raised NameError partway through and left the rest bound.
# NOTE(review): no imputation is run for this year in this section - confirm
# whether that is intentional.
del bos_file_raw_path, bos_file_cln_path
del bos_file_knn_imp_path, bos_file_iter_imp_path
del df_bos

### 2022

In [28]:
# Boston 2022 file locations: raw input, cleaned output, and one output file
# per imputation variant (`knn` / `iter`).
bos_file_raw_path = f"{BOS_RAW_PATH}/{BOS}{YEAR_22}/{BOS}{YEAR_22}_full.csv"
bos_file_cln_path = f"{BOS_CLN_PATH}/{BOS}{YEAR_22}/{BOS}{YEAR_22}_clean.csv"
bos_file_knn_imp_path = f"{BOS_IMP_PATH}/{BOS}{YEAR_22}/{BOS}{YEAR_22}_knn_impute.csv"
bos_file_iter_imp_path = f"{BOS_IMP_PATH}/{BOS}{YEAR_22}/{BOS}{YEAR_22}_iter_impute.csv"

#### Cleaning

In [29]:
# Read the raw Boston results and hand them straight to `boston_cleaner`.
df_bos = boston_cleaner(
    pd.read_csv(bos_file_raw_path, low_memory=False),
    SPLITS_KEYS,
    BOS_COLS_TO_DROP,
    COLS_ORDER,
)
# Save the cleaned frame only if `valid_df` accepts it.
# NOTE(review): when the check fails this cell neither saves nor prints anything.
if valid_df(df_bos):
    save_df(df_bos, bos_file_cln_path)
    print(f"** File has been saved in: `{bos_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 28500 || New rows count: 25217 || Dropped Rows: 3283
** Dropping rows with splits that only contain time: Finished: 4 || Started: 0
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 25213 || New rows count: 25213 || Dropped rows based on   age_cat  : 0
Original rows count: 25213 || New rows count: 25213 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Boston/Boston2022/Boston2022_clean.csv`


#### Imputation

In [30]:
# Unbind the Boston 2022 path names and the cleaned frame.
# This year's imputation output paths are `bos_file_knn_imp_path` and
# `bos_file_iter_imp_path`; the section never assigns `bos_file_imp_path`, so
# deleting that name raised NameError partway through and left the rest bound.
# NOTE(review): no imputation is run for this year in this section - confirm
# whether that is intentional.
del bos_file_raw_path, bos_file_cln_path
del bos_file_knn_imp_path, bos_file_iter_imp_path
del df_bos

### 2023

In [31]:
# Boston 2023 file locations: raw input, cleaned output, and one output file
# per imputation variant (`knn` / `iter`).
bos_file_raw_path = f"{BOS_RAW_PATH}/{BOS}{YEAR_23}/{BOS}{YEAR_23}_full.csv"
bos_file_cln_path = f"{BOS_CLN_PATH}/{BOS}{YEAR_23}/{BOS}{YEAR_23}_clean.csv"
bos_file_knn_imp_path = f"{BOS_IMP_PATH}/{BOS}{YEAR_23}/{BOS}{YEAR_23}_knn_impute.csv"
bos_file_iter_imp_path = f"{BOS_IMP_PATH}/{BOS}{YEAR_23}/{BOS}{YEAR_23}_iter_impute.csv"

#### Cleaning

In [32]:
# Read the raw Boston results and hand them straight to `boston_cleaner`.
df_bos = boston_cleaner(
    pd.read_csv(bos_file_raw_path, low_memory=False),
    SPLITS_KEYS,
    BOS_COLS_TO_DROP,
    COLS_ORDER,
)
# Save the cleaned frame only if `valid_df` accepts it.
# NOTE(review): when the check fails this cell neither saves nor prints anything.
if valid_df(df_bos):
    save_df(df_bos, bos_file_cln_path)
    print(f"** File has been saved in: `{bos_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 30105 || New rows count: 27058 || Dropped Rows: 3047
** Dropping rows with splits that only contain time: Finished: 3 || Started: 0
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 27055 || New rows count: 27055 || Dropped rows based on   age_cat  : 0
Original rows count: 27055 || New rows count: 27055 || Dropped rows based on    gender  : 0
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Boston/Boston2023/Boston2023_clean.csv`


#### Imputation

In [33]:
# Unbind the Boston 2023 path names and the cleaned frame.
# This year's imputation output paths are `bos_file_knn_imp_path` and
# `bos_file_iter_imp_path`; the section never assigns `bos_file_imp_path`, so
# deleting that name raised NameError partway through and left the rest bound.
# NOTE(review): no imputation is run for this year in this section - confirm
# whether that is intentional.
del bos_file_raw_path, bos_file_cln_path
del bos_file_knn_imp_path, bos_file_iter_imp_path
del df_bos

## Chicago

In [4]:
# Column names handed to `chicago_cleaner` as its columns-to-drop argument.
CHI_COLS_TO_DROP = [
    "idp",
    "finish",
    "run_no",
]

### 2014

In [5]:
# Chicago 2014 file locations: raw input, cleaned output, and one output file
# per imputation variant (`knn` / `iter`).
chi_file_raw_path = f"{CHI_RAW_PATH}/{CHI}{YEAR_14}/{CHI}{YEAR_14}_full.csv"
chi_file_cln_path = f"{CHI_CLN_PATH}/{CHI}{YEAR_14}/{CHI}{YEAR_14}_clean.csv"
chi_file_knn_imp_path = f"{CHI_IMP_PATH}/{CHI}{YEAR_14}/{CHI}{YEAR_14}_knn_impute.csv"
chi_file_iter_imp_path = f"{CHI_IMP_PATH}/{CHI}{YEAR_14}/{CHI}{YEAR_14}_iter_impute.csv"

#### Cleaning

In [6]:
# Read the raw Chicago results and hand them straight to `chicago_cleaner`.
df_chi = chicago_cleaner(
    pd.read_csv(chi_file_raw_path),
    SPLITS_KEYS,
    CHI_COLS_TO_DROP,
    SPLIT_NAME_DICT,
    COLS_ORDER,
)
# Save the cleaned frame only if `valid_df` accepts it.
# NOTE(review): when the check fails this cell neither saves nor prints anything.
if valid_df(df_chi):
    save_df(df_chi, chi_file_cln_path)
    print(f"** File has been saved in: `{chi_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 50216 || New rows count: 41715 || Dropped Rows: 8501
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 41715 || New rows count: 41715 || Dropped rows based on   age_cat  : 0
Original rows count: 41715 || New rows count: 41715 || Dropped rows based on    gender  : 0
** Dropping rows with splits that only contain time: Finished: 27 || Started: 0
** Dropping rows with invalid age categories [W-15, M-15, 19 and under]:
Original rows count: 41688 || New rows count: 41323 || Dropped rows: 365
** Replacing these age categories '20-24', '25-29', '30-34', and '35-39' by '18-39'
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Chicago/Chicago2014/Chicago2014_clean.csv`


#### Imputation

In [6]:
# Reload the cleaned CSV with the notebook's dtype mapping, then collect the row
# indices returned by `get_indices_of_rows_missing_data` for `SPLITS_KEYS`.
df_chi = pd.read_csv(
    chi_file_cln_path,
    dtype=DTYPE_DICT,
)
missing_indices = get_indices_of_rows_missing_data(
    df_chi,
    SPLITS_KEYS,
)

In [7]:
# KNN variant: impute via `preprocess_impute_fill` with `knn_imputer`.
# How invalid splits are handled under drop_invalid_splits=True is defined in
# utils.data_processing_utils.
df_knn = preprocess_impute_fill(
    df_chi,
    missing_indices,
    knn_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 100 || First 3 Indices: [  64  941 3306]
Invalid split time: k_10_time > k_15_time
Total Invalid: 11 || First 3 Indices: [  64  941 3306]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 59 || First 3 Indices: [ 810  976 1347]
Invalid split time: k_15_time > k_20_time
Total Invalid: 7 || First 3 Indices: [ 7928 19273 20284]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 64 || First 3 Indices: [171 348 704]
Inv

In [8]:
# Iterative variant: impute via `preprocess_impute_fill` with `iter_imputer`.
# NOTE(review): `df_chi` and `mms` are also passed to the KNN call -
# assumes the helper does not modify them in a way that affects this run; confirm.
df_iter = preprocess_impute_fill(
    df_chi,
    missing_indices,
    iter_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 99 || First 3 Indices: [  64  941 3306]
Invalid split time: k_10_time > k_15_time
Total Invalid: 1 || First 3 Indices: [17613]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 60 || First 3 Indices: [ 810  976 1347]
Invalid split time: k_15_time > k_20_time
Total Invalid: 1 || First 3 Indices: [26238]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 64 || First 3 Indices: [171 348 704]
Invalid split time: k_20_t



In [9]:
# Write both imputed Chicago 2014 frames to their per-method output paths.
save_df(df_knn, chi_file_knn_imp_path)
save_df(df_iter, chi_file_iter_imp_path)

In [10]:
# Unbind the Chicago 2014 names: input paths, imputation output paths, then the frames.
del chi_file_raw_path, chi_file_cln_path
del chi_file_knn_imp_path, chi_file_iter_imp_path
del df_chi, df_knn, df_iter

### 2015

In [11]:
# Chicago 2015 file locations: raw input, cleaned output, and one output file
# per imputation variant (`knn` / `iter`).
chi_file_raw_path = f"{CHI_RAW_PATH}/{CHI}{YEAR_15}/{CHI}{YEAR_15}_full.csv"
chi_file_cln_path = f"{CHI_CLN_PATH}/{CHI}{YEAR_15}/{CHI}{YEAR_15}_clean.csv"
chi_file_knn_imp_path = f"{CHI_IMP_PATH}/{CHI}{YEAR_15}/{CHI}{YEAR_15}_knn_impute.csv"
chi_file_iter_imp_path = f"{CHI_IMP_PATH}/{CHI}{YEAR_15}/{CHI}{YEAR_15}_iter_impute.csv"

#### Cleaning

In [9]:
# Read the raw Chicago results and hand them straight to `chicago_cleaner`.
df_chi = chicago_cleaner(
    pd.read_csv(chi_file_raw_path),
    SPLITS_KEYS,
    CHI_COLS_TO_DROP,
    SPLIT_NAME_DICT,
    COLS_ORDER,
)
# Save the cleaned frame only if `valid_df` accepts it.
# NOTE(review): when the check fails this cell neither saves nor prints anything.
if valid_df(df_chi):
    save_df(df_chi, chi_file_cln_path)
    print(f"** File has been saved in: `{chi_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 46032 || New rows count: 39219 || Dropped Rows: 6813
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 39219 || New rows count: 39219 || Dropped rows based on   age_cat  : 0
Original rows count: 39219 || New rows count: 39219 || Dropped rows based on    gender  : 0
** Dropping rows with splits that only contain time: Finished: 18 || Started: 1
** Dropping rows with invalid age categories [W-15, M-15, 19 and under]:
Original rows count: 39200 || New rows count: 38868 || Dropped rows: 332
** Replacing these age categories '20-24', '25-29', '30-34', and '35-39' by '18-39'
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Chicago/Chicago2015/Chicago2015_clean.csv`


#### Imputation

In [12]:
# Reload the cleaned CSV with the notebook's dtype mapping, then collect the row
# indices returned by `get_indices_of_rows_missing_data` for `SPLITS_KEYS`.
df_chi = pd.read_csv(
    chi_file_cln_path,
    dtype=DTYPE_DICT,
)
missing_indices = get_indices_of_rows_missing_data(
    df_chi,
    SPLITS_KEYS,
)

In [13]:
# KNN variant: impute via `preprocess_impute_fill` with `knn_imputer`.
# How invalid splits are handled under drop_invalid_splits=True is defined in
# utils.data_processing_utils.
df_knn = preprocess_impute_fill(
    df_chi,
    missing_indices,
    knn_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 45 || First 3 Indices: [ 77 575 986]
Invalid split time: k_10_time > k_15_time
Total Invalid: 13 || First 3 Indices: [  77  986 1402]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 51 || First 3 Indices: [ 836 1098 2554]
Invalid split time: k_15_time > k_20_time
Total Invalid: 7 || First 3 Indices: [ 8797 16389 16950]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 35 || First 3 Indices: [ 196 2410 2443]
Inva

In [14]:
# Iterative variant: impute via `preprocess_impute_fill` with `iter_imputer`.
# NOTE(review): `df_chi` and `mms` are also passed to the KNN call -
# assumes the helper does not modify them in a way that affects this run; confirm.
df_iter = preprocess_impute_fill(
    df_chi,
    missing_indices,
    iter_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 44 || First 3 Indices: [ 77 575 986]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 52 || First 3 Indices: [ 836 1098 1378]
Invalid split time: k_15_time > k_20_time
Total Invalid: 2 || First 3 Indices: [16389 16950]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 36 || First 3 Indices: [ 196 2410 2443]
Invalid split time: k_20_time > k_half_time
Total Invalid: 6 || First 3 Indices: [ 2443 21267 36406]
------



In [15]:
# Write both imputed Chicago 2015 frames to their per-method output paths.
save_df(df_knn, chi_file_knn_imp_path)
save_df(df_iter, chi_file_iter_imp_path)

In [16]:
# Unbind the Chicago 2015 names: input paths, imputation output paths, then the frames.
del chi_file_raw_path, chi_file_cln_path
del chi_file_knn_imp_path, chi_file_iter_imp_path
del df_chi, df_knn, df_iter

### 2016

In [17]:
# Chicago 2016 file locations: raw input, cleaned output, and one output file
# per imputation variant (`knn` / `iter`).
chi_file_raw_path = f"{CHI_RAW_PATH}/{CHI}{YEAR_16}/{CHI}{YEAR_16}_full.csv"
chi_file_cln_path = f"{CHI_CLN_PATH}/{CHI}{YEAR_16}/{CHI}{YEAR_16}_clean.csv"
chi_file_knn_imp_path = f"{CHI_IMP_PATH}/{CHI}{YEAR_16}/{CHI}{YEAR_16}_knn_impute.csv"
chi_file_iter_imp_path = f"{CHI_IMP_PATH}/{CHI}{YEAR_16}/{CHI}{YEAR_16}_iter_impute.csv"

#### Cleaning

In [12]:
# Read the raw Chicago results and hand them straight to `chicago_cleaner`.
df_chi = chicago_cleaner(
    pd.read_csv(chi_file_raw_path),
    SPLITS_KEYS,
    CHI_COLS_TO_DROP,
    SPLIT_NAME_DICT,
    COLS_ORDER,
)
# Save the cleaned frame only if `valid_df` accepts it.
# NOTE(review): when the check fails this cell neither saves nor prints anything.
if valid_df(df_chi):
    save_df(df_chi, chi_file_cln_path)
    print(f"** File has been saved in: `{chi_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 49067 || New rows count: 41469 || Dropped Rows: 7598
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 41469 || New rows count: 41468 || Dropped rows based on   age_cat  : 1
Original rows count: 41468 || New rows count: 41468 || Dropped rows based on    gender  : 0
** Dropping rows with splits that only contain time: Finished: 22 || Started: 0
** Dropping rows with invalid age categories [W-15, M-15, 19 and under]:
Original rows count: 41446 || New rows count: 41141 || Dropped rows: 305
** Replacing these age categories '20-24', '25-29', '30-34', and '35-39' by '18-39'
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Chicago/Chicago2016/Chicago2016_clean.csv`


#### Imputation

In [18]:
# Reload the cleaned CSV with the notebook's dtype mapping, then collect the row
# indices returned by `get_indices_of_rows_missing_data` for `SPLITS_KEYS`.
df_chi = pd.read_csv(
    chi_file_cln_path,
    dtype=DTYPE_DICT,
)
missing_indices = get_indices_of_rows_missing_data(
    df_chi,
    SPLITS_KEYS,
)

In [19]:
# KNN variant: impute via `preprocess_impute_fill` with `knn_imputer`.
# How invalid splits are handled under drop_invalid_splits=True is defined in
# utils.data_processing_utils.
df_knn = preprocess_impute_fill(
    df_chi,
    missing_indices,
    knn_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 67 || First 3 Indices: [1945 2366 2908]
Invalid split time: k_10_time > k_15_time
Total Invalid: 5 || First 3 Indices: [ 4411  7962 15862]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 78 || First 3 Indices: [ 776 1528 1538]
Invalid split time: k_15_time > k_20_time
Total Invalid: 9 || First 3 Indices: [ 1538 11723 13998]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 62 || First 3 Indices: [ 73 703 938]
In

In [20]:
# Iterative variant: impute via `preprocess_impute_fill` with `iter_imputer`.
# NOTE(review): `df_chi` and `mms` are also passed to the KNN call -
# assumes the helper does not modify them in a way that affects this run; confirm.
df_iter = preprocess_impute_fill(
    df_chi,
    missing_indices,
    iter_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 67 || First 3 Indices: [1945 2366 2908]
Invalid split time: k_10_time > k_15_time
Total Invalid: 1 || First 3 Indices: [39943]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 82 || First 3 Indices: [ 776 1528 1538]
Invalid split time: k_15_time > k_20_time
Total Invalid: 2 || First 3 Indices: [24467 39564]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 62 || First 3 Indices: [ 73 703 938]
Invalid split time: 



In [21]:
# Write both imputed Chicago 2016 frames to their per-method output paths.
save_df(df_knn, chi_file_knn_imp_path)
save_df(df_iter, chi_file_iter_imp_path)

In [22]:
# Unbind the Chicago 2016 names: input paths, imputation output paths, then the frames.
del chi_file_raw_path, chi_file_cln_path
del chi_file_knn_imp_path, chi_file_iter_imp_path
del df_chi, df_knn, df_iter

### 2017

In [25]:
# Chicago 2017 file locations: raw input, cleaned output, and one output file
# per imputation variant (`knn` / `iter`).
chi_file_raw_path = f"{CHI_RAW_PATH}/{CHI}{YEAR_17}/{CHI}{YEAR_17}_full.csv"
chi_file_cln_path = f"{CHI_CLN_PATH}/{CHI}{YEAR_17}/{CHI}{YEAR_17}_clean.csv"
chi_file_knn_imp_path = f"{CHI_IMP_PATH}/{CHI}{YEAR_17}/{CHI}{YEAR_17}_knn_impute.csv"
chi_file_iter_imp_path = f"{CHI_IMP_PATH}/{CHI}{YEAR_17}/{CHI}{YEAR_17}_iter_impute.csv"

#### Cleaning

In [15]:
# Read the raw Chicago results and hand them straight to `chicago_cleaner`.
df_chi = chicago_cleaner(
    pd.read_csv(chi_file_raw_path),
    SPLITS_KEYS,
    CHI_COLS_TO_DROP,
    SPLIT_NAME_DICT,
    COLS_ORDER,
)
# Save the cleaned frame only if `valid_df` accepts it.
# NOTE(review): when the check fails this cell neither saves nor prints anything.
if valid_df(df_chi):
    save_df(df_chi, chi_file_cln_path)
    print(f"** File has been saved in: `{chi_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 54326 || New rows count: 45565 || Dropped Rows: 8761
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 45565 || New rows count: 45565 || Dropped rows based on   age_cat  : 0
Original rows count: 45565 || New rows count: 45565 || Dropped rows based on    gender  : 0
** Dropping rows with splits that only contain time: Finished: 34 || Started: 3
** Dropping rows with invalid age categories [W-15, M-15, 19 and under]:
Original rows count: 45528 || New rows count: 45289 || Dropped rows: 239
** Replacing these age categories '20-24', '25-29', '30-34', and '35-39' by '18-39'
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Chicago/Chicago2017/Chicago2017_clean.csv`


#### Imputation

In [26]:
# Reload the cleaned CSV with the notebook's dtype mapping, then collect the row
# indices returned by `get_indices_of_rows_missing_data` for `SPLITS_KEYS`.
df_chi = pd.read_csv(
    chi_file_cln_path,
    dtype=DTYPE_DICT,
)
missing_indices = get_indices_of_rows_missing_data(
    df_chi,
    SPLITS_KEYS,
)

In [27]:
# KNN variant: impute via `preprocess_impute_fill` with `knn_imputer`.
# How invalid splits are handled under drop_invalid_splits=True is defined in
# utils.data_processing_utils.
df_knn = preprocess_impute_fill(
    df_chi,
    missing_indices,
    knn_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 122 || First 3 Indices: [ 96 456 471]
Invalid split time: k_10_time > k_15_time
Total Invalid: 12 || First 3 Indices: [  96  728 3212]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 113 || First 3 Indices: [ 43 268 496]
Invalid split time: k_15_time > k_20_time
Total Invalid: 10 || First 3 Indices: [ 749 3525 4957]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 175 || First 3 Indices: [ 30  96 164]
Invalid s

In [28]:
# Iterative variant: impute via `preprocess_impute_fill` with `iter_imputer`.
# NOTE(review): `df_chi` and `mms` are also passed to the KNN call -
# assumes the helper does not modify them in a way that affects this run; confirm.
df_iter = preprocess_impute_fill(
    df_chi,
    missing_indices,
    iter_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 123 || First 3 Indices: [ 96 456 471]
Invalid split time: k_10_time > k_15_time
Total Invalid: 1 || First 3 Indices: [40551]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 115 || First 3 Indices: [ 43 268 496]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 178 || First 3 Indices: [ 96 164 275]
Invalid split time: k_20_time > k_half_time
Total Invalid: 14 || First 3 Indices: [ 164 1494 1636]
-----------------



In [29]:
# Write both imputed Chicago 2017 frames to their per-method output paths.
save_df(df_knn, chi_file_knn_imp_path)
save_df(df_iter, chi_file_iter_imp_path)

In [30]:
# Unbind the Chicago 2017 names: input paths, imputation output paths, then the frames.
del chi_file_raw_path, chi_file_cln_path
del chi_file_knn_imp_path, chi_file_iter_imp_path
del df_chi, df_knn, df_iter

### 2018

In [32]:
# Chicago 2018 file locations: raw input, cleaned output, and one output file
# per imputation variant (`knn` / `iter`).
chi_file_raw_path = f"{CHI_RAW_PATH}/{CHI}{YEAR_18}/{CHI}{YEAR_18}_full.csv"
chi_file_cln_path = f"{CHI_CLN_PATH}/{CHI}{YEAR_18}/{CHI}{YEAR_18}_clean.csv"
chi_file_knn_imp_path = f"{CHI_IMP_PATH}/{CHI}{YEAR_18}/{CHI}{YEAR_18}_knn_impute.csv"
chi_file_iter_imp_path = f"{CHI_IMP_PATH}/{CHI}{YEAR_18}/{CHI}{YEAR_18}_iter_impute.csv"

#### Cleaning

In [18]:
# Read the raw Chicago results and hand them straight to `chicago_cleaner`.
df_chi = chicago_cleaner(
    pd.read_csv(chi_file_raw_path),
    SPLITS_KEYS,
    CHI_COLS_TO_DROP,
    SPLIT_NAME_DICT,
    COLS_ORDER,
)
# Save the cleaned frame only if `valid_df` accepts it.
# NOTE(review): when the check fails this cell neither saves nor prints anything.
if valid_df(df_chi):
    save_df(df_chi, chi_file_cln_path)
    print(f"** File has been saved in: `{chi_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 55621 || New rows count: 45380 || Dropped Rows: 10241
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 45380 || New rows count: 45380 || Dropped rows based on   age_cat  : 0
Original rows count: 45380 || New rows count: 45380 || Dropped rows based on    gender  : 0
** Dropping rows with splits that only contain time: Finished: 25 || Started: 1
** Dropping rows with invalid age categories [W-15, M-15, 19 and under]:
Original rows count: 45354 || New rows count: 45119 || Dropped rows: 235
** Replacing these age categories '20-24', '25-29', '30-34', and '35-39' by '18-39'
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Chicago/Chicago2018/Chicago2018_clean.csv`


#### Imputation

In [33]:
# Reload the cleaned CSV with the notebook's dtype mapping, then collect the row
# indices returned by `get_indices_of_rows_missing_data` for `SPLITS_KEYS`.
df_chi = pd.read_csv(
    chi_file_cln_path,
    dtype=DTYPE_DICT,
)
missing_indices = get_indices_of_rows_missing_data(
    df_chi,
    SPLITS_KEYS,
)

In [34]:
# KNN variant: impute via `preprocess_impute_fill` with `knn_imputer`.
# How invalid splits are handled under drop_invalid_splits=True is defined in
# utils.data_processing_utils.
df_knn = preprocess_impute_fill(
    df_chi,
    missing_indices,
    knn_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 227 || First 3 Indices: [313 334 337]
Invalid split time: k_10_time > k_15_time
Total Invalid: 27 || First 3 Indices: [1092 1979 2936]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 232 || First 3 Indices: [ 471 1356 2378]
Invalid split time: k_15_time > k_20_time
Total Invalid: 10 || First 3 Indices: [3477 6530 9082]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 166 || First 3 Indices: [ 125  206 1374]
Inv

In [35]:
# Iterative variant: impute via `preprocess_impute_fill` with `iter_imputer`.
# NOTE(review): `df_chi` and `mms` are also passed to the KNN call -
# assumes the helper does not modify them in a way that affects this run; confirm.
df_iter = preprocess_impute_fill(
    df_chi,
    missing_indices,
    iter_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 220 || First 3 Indices: [313 334 337]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 235 || First 3 Indices: [ 471 1356 2378]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 167 || First 3 Indices: [ 125  206 1374]
Invalid split time: k_20_time > k_half_time
Total Invalid: 6 || First 3 Indices: [ 8241 16869 29787]
--------------------------------------------------
Invalid split time diff: k_25_time (non-cumul



In [36]:
# Write both imputed Chicago 2018 frames to their per-method output paths.
save_df(df_knn, chi_file_knn_imp_path)
save_df(df_iter, chi_file_iter_imp_path)

In [37]:
# Unbind the Chicago 2018 names: input paths, imputation output paths, then the frames.
del chi_file_raw_path, chi_file_cln_path
del chi_file_knn_imp_path, chi_file_iter_imp_path
del df_chi, df_knn, df_iter

### 2019

In [38]:
# Chicago 2019 file locations: raw input, cleaned output, and one output file
# per imputation variant (`knn` / `iter`).
chi_file_raw_path = f"{CHI_RAW_PATH}/{CHI}{YEAR_19}/{CHI}{YEAR_19}_full.csv"
chi_file_cln_path = f"{CHI_CLN_PATH}/{CHI}{YEAR_19}/{CHI}{YEAR_19}_clean.csv"
chi_file_knn_imp_path = f"{CHI_IMP_PATH}/{CHI}{YEAR_19}/{CHI}{YEAR_19}_knn_impute.csv"
chi_file_iter_imp_path = f"{CHI_IMP_PATH}/{CHI}{YEAR_19}/{CHI}{YEAR_19}_iter_impute.csv"

#### Cleaning

In [21]:
# Read the raw Chicago results and hand them straight to `chicago_cleaner`.
df_chi = chicago_cleaner(
    pd.read_csv(chi_file_raw_path),
    SPLITS_KEYS,
    CHI_COLS_TO_DROP,
    SPLIT_NAME_DICT,
    COLS_ORDER,
)
# Save the cleaned frame only if `valid_df` accepts it.
# NOTE(review): when the check fails this cell neither saves nor prints anything.
if valid_df(df_chi):
    save_df(df_chi, chi_file_cln_path)
    print(f"** File has been saved in: `{chi_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 55395 || New rows count: 46513 || Dropped Rows: 8882
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 46513 || New rows count: 46512 || Dropped rows based on   age_cat  : 1
Original rows count: 46512 || New rows count: 46512 || Dropped rows based on    gender  : 0
** Dropping rows with splits that only contain time: Finished: 4 || Started: 0
** Dropping rows with invalid age categories [W-15, M-15, 19 and under]:
Original rows count: 46508 || New rows count: 46265 || Dropped rows: 243
** Replacing these age categories '20-24', '25-29', '30-34', and '35-39' by '18-39'
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Chicago/Chicago2019/Chicago2019_clean.csv`


#### Imputation

In [39]:
# Reload the cleaned CSV with the notebook's dtype mapping, then collect the row
# indices returned by `get_indices_of_rows_missing_data` for `SPLITS_KEYS`.
df_chi = pd.read_csv(
    chi_file_cln_path,
    dtype=DTYPE_DICT,
)
missing_indices = get_indices_of_rows_missing_data(
    df_chi,
    SPLITS_KEYS,
)

In [40]:
# KNN variant: impute via `preprocess_impute_fill` with `knn_imputer`.
# How invalid splits are handled under drop_invalid_splits=True is defined in
# utils.data_processing_utils.
df_knn = preprocess_impute_fill(
    df_chi,
    missing_indices,
    knn_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 134 || First 3 Indices: [119 531 533]
Invalid split time: k_10_time > k_15_time
Total Invalid: 22 || First 3 Indices: [ 119  531 2102]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 26 || First 3 Indices: [ 323 2782 3318]
Invalid split time: k_15_time > k_20_time
Total Invalid: 5 || First 3 Indices: [ 3318  5314 13262]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 140 || First 3 Indices: [ 267  735 1054]
In

In [41]:
# Iterative variant: impute via `preprocess_impute_fill` with `iter_imputer`.
# NOTE(review): `df_chi` and `mms` are also passed to the KNN call -
# assumes the helper does not modify them in a way that affects this run; confirm.
df_iter = preprocess_impute_fill(
    df_chi,
    missing_indices,
    iter_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 131 || First 3 Indices: [119 531 533]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 26 || First 3 Indices: [ 323 2782 3318]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 142 || First 3 Indices: [ 267  735 1054]
Invalid split time: k_20_time > k_half_time
Total Invalid: 5 || First 3 Indices: [  267  9769 19082]
--------------------------------------------------
Invalid split time diff: k_25_time (non-cumula



In [42]:
# Write both imputed Chicago 2019 frames to their per-method output paths.
save_df(df_knn, chi_file_knn_imp_path)
save_df(df_iter, chi_file_iter_imp_path)

In [43]:
# Unbind the Chicago 2019 names: input paths, imputation output paths, then the frames.
del chi_file_raw_path, chi_file_cln_path
del chi_file_knn_imp_path, chi_file_iter_imp_path
del df_chi, df_knn, df_iter

### 2021

In [44]:
# Chicago 2021 file locations: raw input, cleaned output, and one output file
# per imputation variant (`knn` / `iter`).
chi_file_raw_path = f"{CHI_RAW_PATH}/{CHI}{YEAR_21}/{CHI}{YEAR_21}_full.csv"
chi_file_cln_path = f"{CHI_CLN_PATH}/{CHI}{YEAR_21}/{CHI}{YEAR_21}_clean.csv"
chi_file_knn_imp_path = f"{CHI_IMP_PATH}/{CHI}{YEAR_21}/{CHI}{YEAR_21}_knn_impute.csv"
chi_file_iter_imp_path = f"{CHI_IMP_PATH}/{CHI}{YEAR_21}/{CHI}{YEAR_21}_iter_impute.csv"

#### Cleaning

In [24]:
# Read the raw Chicago results and hand them straight to `chicago_cleaner`.
df_chi = chicago_cleaner(
    pd.read_csv(chi_file_raw_path),
    SPLITS_KEYS,
    CHI_COLS_TO_DROP,
    SPLIT_NAME_DICT,
    COLS_ORDER,
)
# Save the cleaned frame only if `valid_df` accepts it.
# NOTE(review): when the check fails this cell neither saves nor prints anything.
if valid_df(df_chi):
    save_df(df_chi, chi_file_cln_path)
    print(f"** File has been saved in: `{chi_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 33502 || New rows count: 26864 || Dropped Rows: 6638
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 26864 || New rows count: 26864 || Dropped rows based on   age_cat  : 0
Original rows count: 26864 || New rows count: 26864 || Dropped rows based on    gender  : 0
** Dropping rows with splits that only contain time: Finished: 19 || Started: 0
** Dropping rows with invalid age categories [W-15, M-15, 19 and under]:
Original rows count: 26845 || New rows count: 26730 || Dropped rows: 115
** Replacing these age categories '20-24', '25-29', '30-34', and '35-39' by '18-39'
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Chicago/Chicago2021/Chicago2021_clean.csv`


#### Imputation

In [45]:
# Reload the cleaned CSV with the notebook's dtype mapping, then collect the row
# indices returned by `get_indices_of_rows_missing_data` for `SPLITS_KEYS`.
df_chi = pd.read_csv(
    chi_file_cln_path,
    dtype=DTYPE_DICT,
)
missing_indices = get_indices_of_rows_missing_data(
    df_chi,
    SPLITS_KEYS,
)

In [46]:
# KNN variant: impute via `preprocess_impute_fill` with `knn_imputer`.
# How invalid splits are handled under drop_invalid_splits=True is defined in
# utils.data_processing_utils.
df_knn = preprocess_impute_fill(
    df_chi,
    missing_indices,
    knn_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 83 || First 3 Indices: [692 844 959]
Invalid split time: k_10_time > k_15_time
Total Invalid: 4 || First 3 Indices: [ 959 1283 7778]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 108 || First 3 Indices: [ 82 103 213]
Invalid split time: k_15_time > k_20_time
Total Invalid: 8 || First 3 Indices: [3589 6589 7200]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 50 || First 3 Indices: [251 341 831]
Invalid split

In [47]:
# Same pipeline as the KNN cell, but driven by `iter_imputer`; it starts from
# the same `df_chi` and `missing_indices`, so the two results are comparable.
df_iter = preprocess_impute_fill(
    df_chi,
    missing_indices,
    iter_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 80 || First 3 Indices: [692 771 844]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 110 || First 3 Indices: [ 82 103 213]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 50 || First 3 Indices: [251 341 831]
Invalid split time: k_20_time > k_half_time
Total Invalid: 6 || First 3 Indices: [ 341 1091 3924]
--------------------------------------------------
Invalid split time diff: k_25_time (non-cumulative) < (k



In [48]:
# Write both imputed results to their own CSV paths under the Impute tree.
# NOTE(review): unlike the cleaning cell, no `valid_df` check guards these
# writes — confirm that is intentional for imputed frames.
save_df(df_knn, chi_file_knn_imp_path)
save_df(df_iter, chi_file_iter_imp_path)

In [49]:
# Unbind the 2021 path strings and data objects so they cannot leak into the
# 2022 cells. `missing_indices` is not removed here; the 2022 imputation cell
# reassigns it.
del chi_file_raw_path, chi_file_cln_path
del chi_file_knn_imp_path, chi_file_iter_imp_path
del df_chi, df_knn, df_iter

### 2022

In [50]:
# File locations for the Chicago 2022 race, assembled from the directory, city
# and year constants defined elsewhere in the notebook.
# Raw input CSV read by the cleaning cell.
chi_file_raw_path = f"{CHI_RAW_PATH}/{CHI}{YEAR_22}/{CHI}{YEAR_22}_full.csv"
# Cleaned CSV written by the cleaning cell and re-read by the imputation cell.
chi_file_cln_path = f"{CHI_CLN_PATH}/{CHI}{YEAR_22}/{CHI}{YEAR_22}_clean.csv"
# Destination of the frame imputed with `knn_imputer`.
chi_file_knn_imp_path = f"{CHI_IMP_PATH}/{CHI}{YEAR_22}/{CHI}{YEAR_22}_knn_impute.csv"
# Destination of the frame imputed with `iter_imputer`.
chi_file_iter_imp_path = f"{CHI_IMP_PATH}/{CHI}{YEAR_22}/{CHI}{YEAR_22}_iter_impute.csv"

#### Cleaning

In [27]:
# Load the raw Chicago results for this year.
df_chi = pd.read_csv(chi_file_raw_path)
# Cleaning the DataFrame.
df_chi = chicago_cleaner(df_chi, SPLITS_KEYS, CHI_COLS_TO_DROP, SPLIT_NAME_DICT, COLS_ORDER)
# Persist the cleaned frame only when it passes validation. Report the failure
# explicitly otherwise: the imputation cell re-reads `chi_file_cln_path`, so a
# silent skip would make it fail later or pick up a stale file from an earlier run.
if valid_df(df_chi):
    save_df(df_chi, chi_file_cln_path)
    print(f"** File has been saved in: `{chi_file_cln_path}`")
else:
    print(f"** DataFrame failed validation; nothing was saved in: `{chi_file_cln_path}`")

** Removing Runners That did not start:
Original rows count: 51087 || New rows count: 39939 || Dropped Rows: 11148
** Dropping rows with null values in `age_cat` and `gender` columns:
Original rows count: 39939 || New rows count: 39938 || Dropped rows based on   age_cat  : 1
Original rows count: 39938 || New rows count: 39938 || Dropped rows based on    gender  : 0
** Dropping rows with splits that only contain time: Finished: 7 || Started: 0
** Dropping rows with invalid age categories [W-15, M-15, 19 and under]:
Original rows count: 39931 || New rows count: 39790 || Dropped rows: 141
** Replacing these age categories '20-24', '25-29', '30-34', and '35-39' by '18-39'
** Replacing these age categories '70-74', '75-79', '80+' by '70+'
** File has been saved in: `Marathons_Data/Clean/Chicago/Chicago2022/Chicago2022_clean.csv`


#### Imputation

In [51]:
# Re-read the cleaned CSV from disk, forcing the column dtypes declared in
# DTYPE_DICT rather than relying on pandas' inference.
df_chi = pd.read_csv(
    filepath_or_buffer=chi_file_cln_path,
    dtype=DTYPE_DICT,
)
# Rows to be imputed, as reported by the project helper for the split columns
# (exact selection rule lives in utils.data_processing_utils — confirm there).
missing_indices = get_indices_of_rows_missing_data(
    df_chi,
    SPLITS_KEYS,
)

In [52]:
# Impute the flagged rows with `knn_imputer`, using the shared scaler `mms`
# (both are created elsewhere in the notebook). The recorded output shows the
# helper logging its invalid split-time checks while it runs.
df_knn = preprocess_impute_fill(
    df_chi,
    missing_indices,
    knn_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 229 || First 3 Indices: [249 296 310]
Invalid split time: k_10_time > k_15_time
Total Invalid: 76 || First 3 Indices: [296 310 389]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 68 || First 3 Indices: [193 277 448]
Invalid split time: k_15_time > k_20_time
Total Invalid: 14 || First 3 Indices: [ 277  478 1081]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 126 || First 3 Indices: [  75  748 1267]
Invalid sp

In [53]:
# Same pipeline as the KNN cell, but driven by `iter_imputer`; it starts from
# the same `df_chi` and `missing_indices`, so the two results are comparable.
df_iter = preprocess_impute_fill(
    df_chi,
    missing_indices,
    iter_imputer,
    mms,
    SPLITS_KEYS,
    drop_invalid_splits=True,
)

--------------------------------------------------
--------------------------------------------------
Invalid split time diff: k_15_time (non-cumulative) < (k_15_time - k_10_time - 5) OR k_15_time (non-cumulative) > (k_15_time - k_10_time + 5)
Total Invalid: 215 || First 3 Indices: [296 310 389]
--------------------------------------------------
Invalid split time diff: k_20_time (non-cumulative) < (k_20_time - k_15_time - 5) OR k_20_time (non-cumulative) > (k_20_time - k_15_time + 5)
Total Invalid: 71 || First 3 Indices: [193 277 448]
Invalid split time: k_15_time > k_20_time
Total Invalid: 1 || First 3 Indices: [30749]
--------------------------------------------------
Invalid split time diff: k_half_time (non-cumulative) < (k_half_time - k_20_time - 5) OR k_half_time (non-cumulative) > (k_half_time - k_20_time + 5)
Total Invalid: 126 || First 3 Indices: [  75  748 1267]
Invalid split time: k_20_time > k_half_time
Total Invalid: 3 || First 3 Indices: [ 9197 34044 34472]
-------------



In [None]:
# Write both imputed results to their own CSV paths under the Impute tree.
# NOTE(review): unlike the cleaning cell, no `valid_df` check guards these
# writes — confirm that is intentional for imputed frames.
save_df(df_knn, chi_file_knn_imp_path)
save_df(df_iter, chi_file_iter_imp_path)

In [None]:
# Unbind the 2022 path strings and data objects now that this section is done.
# `missing_indices` is not part of this cleanup and stays bound.
del chi_file_raw_path, chi_file_cln_path
del chi_file_knn_imp_path, chi_file_iter_imp_path
del df_chi, df_knn, df_iter