**Table of contents**<a id='toc0_'></a>    
- [Prepare the notebook](#toc1_)    
  - [Import necessary libraries](#toc1_1_)    
  - [Import the datasets](#toc1_2_)    
- [Task 2: Data Transformation](#toc2_)    
  - [Feature engineering and/or novel feature definition](#toc2_1_)    
  - [Outlier detection](#toc2_2_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# <a id='toc1_'></a>[Prepare the notebook](#toc0_)

## <a id='toc1_1_'></a>[Import necessary libraries](#toc0_)

In [None]:
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install seaborn

In [82]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

## <a id='toc1_2_'></a>[Import the datasets](#toc0_)

In [83]:
# Read the two CSV files this notebook works on: one describing races, one
# describing cyclists. Paths are relative to the notebook's working directory.
races_csv = 'dataset/races.csv'
cyclists_csv = 'dataset/cyclists.csv'

df_races = pd.read_csv(races_csv)
df_cyclists = pd.read_csv(cyclists_csv)

# <a id='toc2_'></a>[Task 2: Data Transformation](#toc0_)

## <a id='toc2_1_'></a>[Feature engineering and/or novel feature definition](#toc0_)

In [None]:
# Baseline strategy: replace missing 'weight' / 'height' values with the
# column mean, working on a copy so df_cyclists itself stays untouched.
body_columns = ['weight', 'height']
df_imputed = df_cyclists.copy()

# Snapshot the rows that have at least one gap *before* filling them, so the
# same rows can be looked up again afterwards.
has_gap = df_imputed[body_columns].isna().any(axis=1)
missing_before = df_imputed.loc[has_gap]

imputer = SimpleImputer(strategy='mean')
filled_values = imputer.fit_transform(df_imputed[body_columns])
df_imputed[body_columns] = filled_values

# Same row labels as the snapshot, but read from the filled frame.
imputed_rows = df_imputed.loc[missing_before.index]

# Show only the rows that were affected by the imputation.
print(imputed_rows)


In [None]:
# Second strategy: fill 'weight' / 'height' with a 3-neighbour KNN imputer that
# only sees those two columns.
columns_to_impute = ['weight', 'height']

# Rows of the untouched source frame with a gap in either column.
gap_mask = df_cyclists[columns_to_impute].isna().any(axis=1)
missing_before = df_cyclists.loc[gap_mask]

# Impute on a copy so df_cyclists keeps its original missing values.
df_imputed = df_cyclists.copy()
imputer = KNNImputer(n_neighbors=3)
knn_filled = imputer.fit_transform(df_imputed[columns_to_impute])
df_imputed[columns_to_impute] = knn_filled

# Look the previously incomplete rows up in the imputed frame and show them.
imputed_rows = df_imputed.loc[missing_before.index]
print("Imputed Rows:\n", imputed_rows)

In [None]:
# Third strategy: KNN imputation of 'weight' / 'height' that also uses the
# birth year and the nationality as neighbour features. Nationality is one-hot
# encoded so that no artificial ordering between countries enters the distance.
# NOTE(review): birth_year / weight / height are not rescaled, so they live on
# a much larger numeric range than the 0/1 nationality columns — confirm that
# the resulting distance weighting is what is intended.

# Widen pandas' display limits so the printed frame below is not truncated.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

columns_to_impute = ['weight', 'height']

# 1. One-hot encode 'nationality'. `sparse_output` is the current spelling of
#    the dense-output switch; the old `sparse` keyword was deprecated in
#    scikit-learn 1.2 and removed in 1.4.
encoder = OneHotEncoder(sparse_output=False, drop='first')  # drop first level to avoid multicollinearity
nationality_encoded = encoder.fit_transform(df_cyclists[['nationality']])
nationality_encoded_df = pd.DataFrame(
    nationality_encoded,
    columns=encoder.get_feature_names_out(['nationality']),
    index=df_cyclists.index,  # keep rows aligned with df_cyclists in the concat below
)

# 2. Feature matrix handed to the imputer: numeric columns + encoded nationality.
df_impute_data = pd.concat(
    [df_cyclists[['birth_year', 'weight', 'height']], nationality_encoded_df],
    axis=1,
)

# 3. Fit the imputer and rebuild a labelled frame from the returned array.
imputer = KNNImputer(n_neighbors=10)
df_imputed_values = pd.DataFrame(
    imputer.fit_transform(df_impute_data),
    columns=df_impute_data.columns,
    index=df_impute_data.index,  # explicit index so the assignment below aligns row by row
)

# 4. Copy only the imputed weight and height back next to the untouched
#    remaining cyclist columns.
df_final = df_cyclists.copy()
df_final[columns_to_impute] = df_imputed_values[columns_to_impute]

# Rows that had a missing weight or height before the imputation ...
missing_before = df_cyclists[df_cyclists[columns_to_impute].isnull().any(axis=1)]

# ... shown after imputation, with the full cyclist information.
imputed_rows = df_final.loc[missing_before.index]
print("Imputed Rows:\n", imputed_rows)

In [None]:


# Fill gaps in 'birth_year' with a 3-neighbour KNN imputer fitted on that
# single column.
# NOTE(review): with only one feature, a row whose birth_year is missing has no
# observed value to measure neighbour distance with; this presumably reduces to
# a column-mean fill — confirm against the KNNImputer documentation.
columns_to_impute = ['birth_year']

# Rows of the untouched source frame where the birth year is missing.
gap_mask = df_cyclists[columns_to_impute].isna().any(axis=1)
missing_before = df_cyclists.loc[gap_mask]

# Impute on a copy so df_cyclists keeps its original missing values.
df_imputed = df_cyclists.copy()
imputer = KNNImputer(n_neighbors=3)
knn_filled = imputer.fit_transform(df_imputed[columns_to_impute])
df_imputed[columns_to_impute] = knn_filled

# Look the previously incomplete rows up in the imputed frame and show them.
imputed_rows = df_imputed.loc[missing_before.index]
print("Imputed Rows:\n", imputed_rows)

## <a id='toc2_2_'></a>[Outlier detection](#toc0_)

In [None]:
def iqr(df, column, k=1.5):
    """Return the rows of ``df`` whose ``column`` value is an IQR outlier.

    A value is treated as an outlier when it lies below ``Q1 - k * IQR`` or
    above ``Q3 + k * IQR``, where ``Q1`` / ``Q3`` are the 25th / 75th
    percentiles of the column and ``IQR = Q3 - Q1``. Missing values are never
    flagged, because comparisons against NaN evaluate to False.

    NOTE(review): no definition or import of ``iqr`` is visible in this
    notebook, so the calls below would raise NameError on a fresh kernel.
    This helper implements the conventional rule — confirm that it matches
    the helper that was originally used.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan.
    column : str
        Name of the numeric column to test.
    k : float, default 1.5
        Multiplier applied to the IQR to obtain the fences.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` (original index preserved) outside the fences.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1
    lower_fence = q1 - k * spread
    upper_fence = q3 + k * spread
    return df[(values < lower_fence) | (values > upper_fence)]


# Keep one result per column: previously the temperature result was overwritten
# by the length result on the very next line and could never be inspected.
outliers_temperature = iqr(df_races, "average_temperature")

outliers = iqr(df_races, 'length')

# Print the first 10 outliers since we actually HAVE outliers here...
print(outliers.head(10))