### Import libraries

In [270]:
import pandas
import numpy as np
import matplotlib.pyplot as plt
import seaborn
from datasets import load_dataset
import numpy
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression


### Load datasets

In [271]:
# Load both CSV files with the `datasets` CSV loader
# (paths are relative to the notebook's working directory)
dataset_cyclists = load_dataset('csv', data_files='dataset/cyclists.csv')
dataset_races = load_dataset('csv', data_files='dataset/races.csv')

# Convert the 'train' split of each loaded dataset to a pandas DataFrame
df_cyclists = dataset_cyclists['train'].to_pandas()
df_races = dataset_races['train'].to_pandas()

In [None]:
# Preview the first rows of the cyclists DataFrame
df_cyclists.head()

In [None]:
# Preview the first rows of the races DataFrame
df_races.head()

### Info cyclist dataset

In [None]:
# Data type of each column in the cyclists DataFrame
df_cyclists.dtypes

In [None]:
# Summary of the cyclists DataFrame (columns, non-null counts, dtypes)
df_cyclists.info()

In [None]:
# Descriptive statistics of the cyclists DataFrame (numerical columns)
df_cyclists.describe()

In [None]:
# Descriptive statistics of the cyclists DataFrame (object-dtype columns)
df_cyclists.describe(include='object')

In [None]:
# Shape of the cyclists DataFrame as (rows, columns)
df_cyclists.shape

In [None]:
# Number of distinct values in each column
df_cyclists.nunique()

In [None]:
# Number of non-missing values in each column
df_cyclists.count()

In [None]:
# Print the frequency table of every object-dtype (categorical) column:
# how many times each distinct value occurs in that column
categorical_columns = df_cyclists.select_dtypes(include='object').columns
for col in categorical_columns:
    frequencies = df_cyclists[col].value_counts()
    print(frequencies)

In [None]:
# Select every record whose name is exactly 'Andrea  Peron' (as an example)
# to see whether several cyclists share the same name.
# NOTE(review): the literal contains two consecutive spaces - presumably this
# matches how names are stored in the CSV; confirm before changing it.
df_cyclists[df_cyclists['name'] == 'Andrea  Peron']

In [None]:
# Print the frequency table of every numeric column:
# how many times each distinct value occurs in that column
numerical_columns = df_cyclists.select_dtypes(include='number').columns
for col in numerical_columns:
    frequencies = df_cyclists[col].value_counts()
    print(frequencies)

### Filter out duplicate records

In [None]:
# Boolean mask over the rows: True where a row is flagged as a duplicate
# (this is a mask, not the duplicate rows themselves)
dups = df_cyclists.duplicated()

# Number of rows flagged as duplicates
dups.sum()


In [None]:
# Extract the duplicated rows; keep=False flags every member of a duplicate
# group, so the first occurrence is included as well
dup_df = df_cyclists[df_cyclists.duplicated(keep=False)]

# Display the first few duplicate rows
dup_df.head()

In [286]:
# Drop duplicate rows and rebind df_cyclists to the de-duplicated frame
df_cyclists = df_cyclists.drop_duplicates()

### Missing values

In [None]:
# Count the missing values in each column
df_cyclists.isnull().sum()

In [None]:
# Extract the rows that have a missing value in at least one column
df_cyclists[df_cyclists.isnull().any(axis=1)]

In [None]:
# Show the records whose birth_year is missing
df_cyclists[df_cyclists['birth_year'].isnull()]

In [None]:
# Show the records whose height is missing
df_cyclists[df_cyclists['height'].isnull()]

In [None]:
# Show the records whose weight is missing
df_cyclists[df_cyclists['weight'].isnull()]

In [None]:
# Show the records whose nationality is missing
df_cyclists[df_cyclists['nationality'].isna()]

In [None]:
# Show the records where height and weight are both missing
df_cyclists[(df_cyclists['height'].isnull()) & (df_cyclists['weight'].isnull())]

In [294]:
# Drop rows with missing values -> not sure this is appropriate; perhaps only for rows with missing values in both height and weight
#df_cyclists = df_cyclists.dropna(subset=['height', 'weight'])

# df_cyclists = df_cyclists.dropna()

### Imputations

In [295]:
# Sturges rule for approximately Normal distributions
def sturges_bin_count(df_cyclists):
    """Return the number of histogram bins suggested by Sturges' rule.

    The rule is ``ceil(log2(n) + 1)``, where ``n`` is the number of
    observations.

    Args:
        df_cyclists: Any sized collection of observations (Series, array,
            list, ...). Only its length is used.

    Returns:
        The suggested bin count as an ``int``. An empty input yields 1.
    """
    n_observations = len(df_cyclists)
    if n_observations == 0:
        # log2(0) is -inf and int(-inf) raises OverflowError; a single bin is
        # the only sensible answer when there is no data.
        return 1
    return int(np.ceil(np.log2(n_observations) + 1))
# Freedman-Diaconis Rule is better for data with outliers and skewed distributions
def freedman_diaconis_bin_count(data):
    """Return the number of histogram bins suggested by the Freedman-Diaconis rule.

    The bin width is ``2 * IQR * n ** (-1/3)`` and the bin count is the data
    range divided by that width, rounded up.

    Args:
        data: One-dimensional numeric observations (Series, array or list).
            NaN entries are ignored.

    Returns:
        The suggested bin count as an ``int``. An empty input (or one made
        only of NaN) yields 1. When the interquartile range is zero the rule
        is undefined, so Sturges' rule ``ceil(log2(n) + 1)`` is used instead.
    """
    values = np.asarray(data, dtype=float)
    # NaN would propagate through the percentiles and make int() fail.
    values = values[~np.isnan(values)]
    n_observations = values.size
    if n_observations == 0:
        return 1

    iqr = np.percentile(values, 75) - np.percentile(values, 25)
    if iqr == 0:
        # A zero IQR gives a zero bin width and a division by zero below;
        # fall back to Sturges' rule, which only depends on the sample size.
        return int(np.ceil(np.log2(n_observations) + 1))

    bin_width = 2 * iqr * n_observations ** (-1/3)
    return int(np.ceil((values.max() - values.min()) / bin_width))

#bins_1 = sturges_bin_count(df_cyclists["birth_year"].dropna())
#bins_2 = freedman_diaconis_bin_count(df_cyclists["birth_year"].dropna())


In [None]:
# Plot a 15-bin histogram of birth year.
# row_order was dropped: it only orders the facets created by `row=`, which
# is not used here, so row_order="desc" had no effect on the figure.
seaborn.displot(df_cyclists,
                x="birth_year",
                kind="hist",
                bins=15)

In [None]:
# Plot a 15-bin histogram of height.
# row_order was dropped: it only orders the facets created by `row=`, which
# is not used here, so row_order="desc" had no effect on the figure.
seaborn.displot(df_cyclists,
                x="height",
                kind="hist",
                bins=15)

In [None]:
# Plot a 15-bin histogram of weight.
# row_order was dropped: it only orders the facets created by `row=`, which
# is not used here, so row_order="desc" had no effect on the figure.
seaborn.displot(df_cyclists,
                x="weight",
                kind="hist",
                bins=15)

In [None]:
# Plot a histogram of nationality, requesting one bin per distinct value.
# row_order was dropped: it only orders the facets created by `row=`, which
# is not used here, so row_order="desc" had no effect on the figure.
g = seaborn.displot(df_cyclists,
                x="nationality",
                kind="hist",
                bins=len(df_cyclists['nationality'].unique()))
# Rotate the tick labels so the nationality names do not overlap
plt.xticks(rotation=90)

### Imputations on height and weight

In [None]:
df_cyclists.isnull().sum() # number of missing values in each column

### Drop rows with missing values in the nationality column

In [None]:
df_cyclists = df_cyclists.dropna(subset=['nationality']) # drop rows whose nationality is missing

df_cyclists.isnull().sum() # number of missing values left in each column

### Imputing weight from height (and vice versa) - group-based imputation

In [None]:
# Group-based imputation of weight: a missing weight is replaced by the mean
# weight of the cyclists whose height falls in the same height bin.

# Keep the rows that have at least one of height/weight. The explicit copy
# makes this an independent frame, so adding columns below does not raise
# SettingWithCopyWarning.
df_cyclists_filtered = df_cyclists.dropna(subset=["height", "weight"], how='all').copy()

# Split the observed height range into 5 equal-width bins
df_cyclists_filtered['height_group'] = pandas.cut(df_cyclists_filtered['height'], bins=5)

# Mean weight of each height bin (observed=False keeps empty bins, as before,
# and avoids the pandas warning about the changing default)
height_groups = df_cyclists_filtered.groupby('height_group', observed=False)['weight']
group_means = height_groups.mean()

# Vectorised fill instead of a row-wise apply: transform('mean') gives every
# row the mean weight of its own bin, and fillna only touches missing weights.
df_cyclists_filtered['weight'] = df_cyclists_filtered['weight'].fillna(
    height_groups.transform('mean')
)

# Write back only the imputed column; the helper 'height_group' column stays
# in df_cyclists_filtered and no other column of df_cyclists is rewritten.
df_cyclists.loc[df_cyclists_filtered.index, 'weight'] = df_cyclists_filtered['weight']


In [None]:
# Group-based imputation of height: a missing height is replaced by the mean
# height of the cyclists whose weight falls in the same weight bin.

# Keep the rows that have at least one of height/weight. The explicit copy
# makes this an independent frame, so adding columns below does not raise
# SettingWithCopyWarning.
df_cyclists_filtered = df_cyclists.dropna(subset=["height", "weight"], how='all').copy()

# Split the observed weight range into 5 equal-width bins
df_cyclists_filtered['weight_group'] = pandas.cut(df_cyclists_filtered['weight'], bins=5)

# Mean height of each weight bin (observed=False keeps empty bins, as before,
# and avoids the pandas warning about the changing default)
weight_groups = df_cyclists_filtered.groupby('weight_group', observed=False)['height']
group_means = weight_groups.mean()

# Vectorised fill instead of a row-wise apply: transform('mean') gives every
# row the mean height of its own bin, and fillna only touches missing heights.
df_cyclists_filtered['height'] = df_cyclists_filtered['height'].fillna(
    weight_groups.transform('mean')
)

# Write back only the imputed column; the helper 'weight_group' column stays
# in df_cyclists_filtered and no other column of df_cyclists is rewritten.
df_cyclists.loc[df_cyclists_filtered.index, 'height'] = df_cyclists_filtered['height']

# Number of missing values left in each column
df_cyclists.isnull().sum()

In [None]:
# 15-bin histogram of weight in df_cyclists.
# row_order was dropped: it only orders the facets created by `row=`, which
# is not used here, so row_order="desc" had no effect on the figure.
seaborn.displot(df_cyclists,
                x="weight",
                kind="hist",
                bins=15)

In [None]:
# 15-bin histogram of height in df_cyclists.
# row_order was dropped: it only orders the facets created by `row=`, which
# is not used here, so row_order="desc" had no effect on the figure.
seaborn.displot(df_cyclists,
                x="height",
                kind="hist",
                bins=15)

In [None]:
# Pearson correlation between height and weight.
# Rows are dropped pairwise: removing NaN from each column independently can
# leave the two series with different lengths (pearsonr then raises) or pair
# the height of one cyclist with the weight of another.
height_weight_pairs = df_cyclists[['height', 'weight']].dropna()
correlation, _ = pearsonr(height_weight_pairs['height'], height_weight_pairs['weight'])
print(f'Correlazione tra altezza e peso: {correlation}')

In [None]:
# Scatter plot of weight (y axis) against height (x axis), one point per row
seaborn.scatterplot(x='height', y='weight', data=df_cyclists)
plt.title('Scatter plot di altezza vs peso')
plt.show()

### Imputation with random values drawn from the observed distribution for records where both weight and height are NaN

In [308]:
# Impute the rows where BOTH height and weight are missing by copying the
# (height, weight) pair of a randomly chosen complete row.
# Height and weight are drawn together from the same donor row: sampling the
# two columns independently would ignore the relationship between them and
# could produce implausible combinations.
RANDOM_SEED = 42  # fixed seed so re-running the notebook gives the same imputation

both_missing = df_cyclists[df_cyclists[['height', 'weight']].isna().all(axis=1)]
complete_pairs = df_cyclists[['height', 'weight']].dropna()

if not both_missing.empty:
    rng = np.random.default_rng(RANDOM_SEED)
    # One donor row position (sampled with replacement) per row to impute
    donor_positions = rng.integers(0, len(complete_pairs), size=len(both_missing))
    random_heights = complete_pairs['height'].to_numpy()[donor_positions]
    random_weights = complete_pairs['weight'].to_numpy()[donor_positions]

    df_cyclists.loc[both_missing.index, 'height'] = random_heights
    df_cyclists.loc[both_missing.index, 'weight'] = random_weights

In [None]:
# 15-bin histogram of weight in df_cyclists.
# row_order was dropped: it only orders the facets created by `row=`, which
# is not used here, so row_order="desc" had no effect on the figure.
seaborn.displot(df_cyclists,
                x="weight",
                kind="hist",
                bins=15)

In [None]:
# 15-bin histogram of height in df_cyclists.
# row_order was dropped: it only orders the facets created by `row=`, which
# is not used here, so row_order="desc" had no effect on the figure.
seaborn.displot(df_cyclists,
                x="height",
                kind="hist",
                bins=15)

In [None]:
# Number of missing values left in each column
df_cyclists.isnull().sum()

### KNN imputation for birth_year column

In [312]:
from sklearn.preprocessing import LabelEncoder

# KNN imputation of birth_year from height, weight and nationality.

# KNNImputer needs numeric input, so nationality is replaced by integer codes.
# NOTE(review): this overwrites the nationality strings in df_cyclists, and
# the codes are arbitrary labels that the imputer treats as distances -
# confirm this is intended.
le = LabelEncoder()
df_cyclists['nationality'] = le.fit_transform(df_cyclists['nationality'].astype(str))

# Selecting the columns already yields a new frame; no full copy is needed first
columns_for_imputation = ['height', 'weight', 'nationality', 'birth_year']
df_impute = df_cyclists[columns_for_imputation]

imputer = KNNImputer(n_neighbors=5)
df_imputed = imputer.fit_transform(df_impute)

# Fill only the rows that were missing, and round the result: the imputed
# value is an average over 5 neighbours, so it is generally not a whole year.
# The column position is looked up by name instead of a hard-coded index.
birth_year_position = columns_for_imputation.index('birth_year')
birth_year_missing = df_cyclists['birth_year'].isna().to_numpy()
df_cyclists.loc[birth_year_missing, 'birth_year'] = np.round(
    df_imputed[birth_year_missing, birth_year_position]
)

In [None]:
# Number of missing values left in each column
df_cyclists.isnull().sum()

In [None]:
# 15-bin histogram of birth year in df_cyclists.
# row_order was dropped: it only orders the facets created by `row=`, which
# is not used here, so row_order="desc" had no effect on the figure.
seaborn.displot(df_cyclists,
                x="birth_year",
                kind="hist",
                bins=15)

### KNN imputations varying K on height and weight

In [None]:
# Load the cyclists CSV again into a separate DataFrame, so this experiment
# starts from the file contents rather than from the modified df_cyclists
dataset_cyclists_2 = load_dataset('csv', data_files='dataset/cyclists.csv')
df_cyclists_2 = dataset_cyclists_2['train'].to_pandas()

# Number of missing values in each column
df_cyclists_2.isnull().sum()


In [316]:
# KNN imputation (K=30) of height, weight and birth_year on the reloaded data.

# Drop rows without a nationality first, as is done for df_cyclists.
# Otherwise astype(str) turns the missing value into the literal string 'nan',
# which LabelEncoder then encodes as if it were a real nationality.
df_cyclists_2 = df_cyclists_2.dropna(subset=['nationality']).copy()
df_cyclists_2['nationality'] = le.fit_transform(df_cyclists_2['nationality'].astype(str))

columns_for_imputation = ['height', 'weight', 'nationality', 'birth_year']
df_impute = df_cyclists_2[columns_for_imputation]

# Remember which birth years are missing before the columns are overwritten
birth_year_missing = df_impute['birth_year'].isna().to_numpy()

imputer = KNNImputer(n_neighbors=30)
df_imputed = imputer.fit_transform(df_impute)

# The output columns follow the order of columns_for_imputation
df_cyclists_2['height'] = df_imputed[:, 0]
df_cyclists_2['weight'] = df_imputed[:, 1]
df_cyclists_2['nationality'] = df_imputed[:, 2]
df_cyclists_2['birth_year'] = df_imputed[:, 3]

# An imputed birth year is an average over 30 neighbours, so round it to a
# whole year; values that were present in the data are left untouched.
df_cyclists_2.loc[birth_year_missing, 'birth_year'] = np.round(
    df_imputed[birth_year_missing, 3]
)

In [None]:
# 15-bin histogram of weight in df_cyclists_2.
# row_order was dropped: it only orders the facets created by `row=`, which
# is not used here, so row_order="desc" had no effect on the figure.
seaborn.displot(df_cyclists_2,
                x="weight",
                kind="hist",
                bins=15)

In [None]:
# 15-bin histogram of height in df_cyclists_2.
# This cell used to plot "weight" a second time (identical to the cell just
# before it), so the imputed height was never displayed.
# row_order was dropped: it only orders the facets created by `row=`, which
# is not used here, so row_order="desc" had no effect on the figure.
seaborn.displot(df_cyclists_2,
                x="height",
                kind="hist",
                bins=15)

In [None]:
# 15-bin histogram of the encoded nationality in df_cyclists_2.
# row_order was dropped: it only orders the facets created by `row=`, which
# is not used here, so row_order="desc" had no effect on the figure.
seaborn.displot(df_cyclists_2,
                x="nationality",
                kind="hist",
                bins=15)

In [None]:
# 15-bin histogram of birth year in df_cyclists_2.
# row_order was dropped: it only orders the facets created by `row=`, which
# is not used here, so row_order="desc" had no effect on the figure.
seaborn.displot(df_cyclists_2,
                x="birth_year",
                kind="hist",
                bins=15)

In [None]:
# Number of missing values left in each column of df_cyclists_2
df_cyclists_2.isnull().sum()