### Import libraries

In [20]:
import pandas
import numpy as np
import matplotlib.pyplot as plt
import seaborn
from datasets import load_dataset
import numpy
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder


### Load datasets

In [21]:
# Load the races CSV file with `datasets.load_dataset` (CSV builder)
dataset_races = load_dataset('csv', data_files='dataset/races.csv')

# Convert the 'train' split of the loaded dataset to a pandas DataFrame
df_races = dataset_races['train'].to_pandas()

In [None]:
# Display the first few rows of the races dataset
df_races.head()

### Info races dataset

In [None]:
# Data type of each column of the races dataset
df_races.dtypes

In [None]:
# Summary of the races dataset as printed by DataFrame.info()
df_races.info()

In [None]:
# Descriptive statistics of the races dataset (numerical columns)
df_races.describe()

In [None]:
# Descriptive statistics of the races dataset (object-dtype, i.e. categorical, columns)
df_races.describe(include='object')

In [None]:
# Shape of the races dataset as (number of rows, number of columns)
df_races.shape

In [None]:
# Number of distinct values in each column of the races dataset
df_races.nunique()

In [None]:
# Number of non-missing values in each column of the races dataset
df_races.count()

In [None]:
# For each categorical (object-dtype) column, print how many times each
# distinct value appears in that column
for col in df_races.select_dtypes(include='object').columns:
    print(df_races[col].value_counts())

In [None]:
# For each numerical column, print how many times each distinct value
# appears in that column
for col in df_races.select_dtypes(include='number').columns:
    print(df_races[col].value_counts())

### Filter out duplicate records

In [None]:
# Boolean mask flagging every row that repeats an earlier row of the races dataset
dups = df_races.duplicated()

# Number of rows flagged as duplicates
dups.sum()


In [None]:
# Extract every row that has at least one identical copy
# (keep=False flags all occurrences, including the first one)
dup_df = df_races[df_races.duplicated(keep=False)]

# Display the first few duplicated rows
dup_df.head()

In [34]:
# Drop duplicate rows from the races dataset and keep working on the
# deduplicated frame, so that the analysis cells that read `df_races`
# actually see the filtered data
df_races = df_races.drop_duplicates()

# Backward-compatible alias: some cells still refer to the deduplicated races
# frame under this (copy-pasted) name
df_cyclists = df_races

### Missing values

In [None]:
# Number of missing values in each column of the races dataset
df_races.isnull().sum()

In [None]:
# Extract the rows that contain at least one missing value.
# The boolean mask is computed on the same frame it indexes: a mask taken from
# a frame with a different index (e.g. one with duplicates removed) cannot be
# aligned by pandas and makes the selection fail
df_races[df_races.isnull().any(axis=1)]

### Imputations

In [37]:
# Sturges rule for approximately Normal distributions
def sturges_bin_count(df_cyclists):
    """Calculate the number of histogram bins given by Sturges' rule.

    The rule is ``ceil(log2(n) + 1)`` where ``n`` is the number of
    observations.

    Args:
        df_cyclists: Any sized collection of observations (only its length
            is used).

    Returns:
        The bin count as an ``int``; ``1`` when the collection is empty.
    """
    n_observations = len(df_cyclists)
    if n_observations == 0:
        # log2(0) is -inf and cannot be converted to int; an empty input has
        # nothing to bin, so a single bin is returned
        return 1
    return int(np.ceil(np.log2(n_observations) + 1))
# Freedman-Diaconis Rule is better for data with outliers and skewed distributions
def freedman_diaconis_bin_count(data):
    """Calculate the number of histogram bins given by the Freedman-Diaconis rule.

    The bin width is ``2 * IQR * n ** (-1/3)`` and the bin count is the data
    range divided by that width, rounded up.

    Args:
        data: Array-like of numeric observations (list, NumPy array, pandas
            Series, ...). Missing values must be removed beforehand.

    Returns:
        The bin count as an ``int``. ``1`` is returned for empty input. When
        the interquartile range is zero (e.g. many tied values) the rule is
        undefined, so the count from Sturges' rule is returned instead.

    Raises:
        ValueError: If ``data`` contains NaN values.
    """
    values = np.asarray(data, dtype=float).ravel()
    n_observations = values.size
    if n_observations == 0:
        return 1
    if np.isnan(values).any():
        raise ValueError("data contains NaN values; drop them before computing the bin count")

    upper_quartile, lower_quartile = np.percentile(values, [75, 25])
    bin_width = 2 * (upper_quartile - lower_quartile) * n_observations ** (-1 / 3)
    if bin_width <= 0:
        # Zero IQR would divide by zero; fall back to Sturges' rule
        return int(np.ceil(np.log2(n_observations) + 1))

    return int(np.ceil((values.max() - values.min()) / bin_width))

#bins_1 = sturges_bin_count(df_cyclists["birth_year"].dropna())
#bins_2 = freedman_diaconis_bin_count(df_cyclists["birth_year"].dropna())


In [None]:
# Number of missing values per column in `df_cyclists`, the frame produced by
# dropping duplicate rows from the races dataset
df_cyclists.isnull().sum()

### Correlations and distributions

In [39]:
from typing import Dict

import pandas


def correlations(dataset: pandas.DataFrame) -> pandas.DataFrame:
    """Compute Kendall, Pearson and Spearman correlations of the numeric columns.

    Args:
        dataset: Frame whose numeric columns are correlated pairwise.

    Returns:
        The three correlation matrices stacked vertically (Kendall first, then
        Pearson, then Spearman), each row tagged with its method in the
        ``correlation_type`` column.
    """
    tagged_matrices = []
    for method in ("kendall", "pearson", "spearman"):
        matrix = dataset.corr(numeric_only=True, method=method)
        # Label every row with the method that produced it
        matrix["correlation_type"] = method
        tagged_matrices.append(matrix)

    return pandas.concat(tagged_matrices)

In [40]:
from typing import Tuple, Dict, Any

from sklearn.preprocessing import StandardScaler
import pandas


def __transform_single_features(dataset: pandas.DataFrame, transformation: str) -> Tuple[
    pandas.DataFrame, Dict[str, Any]]:
    """Apply a per-feature transformation to a copy of the dataset.

    Only ``"standard"`` is supported: every column whose dtype is not
    object, category, bool or datetime64 is fitted and transformed with its
    own ``StandardScaler``.

    Args:
        dataset: Input frame; it is copied, not modified.
        transformation: Name of the transformation to apply.

    Returns:
        A tuple of the transformed frame (restricted to the selected columns)
        and a dict mapping each column name to its fitted scaler.

    Raises:
        ValueError: If ``transformation`` is not a known transformation.
    """
    if transformation != "standard":
        raise ValueError(f"Unknown transformation: {transformation}")

    excluded_dtypes = ["object", "category", "bool", "datetime64"]
    scaled_frame = dataset.copy().select_dtypes(exclude=excluded_dtypes)
    fitted_scalers = {}

    for column in scaled_frame.columns:
        scaler = StandardScaler()
        fitted_scalers[column] = scaler
        scaled_values = scaler.fit_transform(scaled_frame[[column]]).squeeze()
        # Cast the column to the scaler's output dtype before writing the
        # scaled values back into it
        scaled_frame = scaled_frame.astype({column: scaled_values.dtype})
        scaled_frame.loc[:, column] = scaled_values

    return scaled_frame, fitted_scalers

def center_and_scale(dataset: pandas.DataFrame) -> Tuple[pandas.DataFrame, Dict[str, Any]]:
    """Standardise the features of the dataset with the "standard" transformation.

    Delegates to ``__transform_single_features``; the input frame is copied,
    not modified.

    Returns:
        A tuple of the transformed copy of the dataset and a dict mapping
        each transformed feature name to the scaler fitted on it.
    """
    scaled_dataset, fitted_scalers = __transform_single_features(dataset, "standard")
    return scaled_dataset, fitted_scalers


def drop_boolean(dataset: pandas.DataFrame) -> pandas.DataFrame:
    """Return the dataset without its boolean-dtype columns."""
    non_boolean_columns = dataset.select_dtypes(exclude=["bool"])
    return non_boolean_columns

In [None]:
# Center and scale the races dataset; also keep the fitted scalers
normalized_dataset, normalization_scalers_dataset = center_and_scale(df_races)

# Calculate the correlations between the columns.
# The result is stored under its own name: assigning it to `correlations`
# would shadow the correlations() function and make this cell fail when re-run
correlations_matrix = correlations(normalized_dataset)
correlations_matrix  # show the correlations

In [None]:
import seaborn
import matplotlib.pyplot as plt

#seaborn.pairplot(df_races) # create a pairplot of the dataset

In [None]:
# Draw one distribution plot per numerical column of the races dataset
for feature in df_races.select_dtypes(include="number").columns: 
    seaborn.displot(df_races, x=feature) # distribution plot of the current numerical feature

In [None]:
# Horizontal boxen plot of the centered and scaled dataset
seaborn.boxenplot(normalized_dataset, orient="h")
# Show the first few rows of the centered and scaled dataset
normalized_dataset.head() 

In [None]:
# Box plot of the centered and scaled dataset
g = seaborn.catplot(normalized_dataset, kind="box")
# Rotate the x-axis tick labels by 90 degrees
g.set_xticklabels(rotation=90)