### Import libraries

In [103]:
import pandas
import numpy as np
import matplotlib.pyplot as plt
import seaborn
from datasets import load_dataset




### Load datasets

In [104]:
# Cyclists: read the CSV with the `datasets` CSV loader, then turn its
# 'train' split into a pandas DataFrame
dataset_cyclists = load_dataset(path='csv', data_files='dataset/cyclists.csv')
df_cyclists = dataset_cyclists['train'].to_pandas()

# Races: same two steps
dataset_races = load_dataset(path='csv', data_files='dataset/races.csv')
df_races = dataset_races['train'].to_pandas()

In [None]:
# Preview the cyclists table: first five rows
df_cyclists.head(n=5)

In [None]:
# Preview the races table: first five rows
df_races.head(n=5)

### Info cyclist dataset

In [None]:
# Show the data type (dtype) of each column of the cyclists DataFrame
df_cyclists.dtypes

In [None]:
# Print a concise summary of the cyclists DataFrame
# (index range, per-column non-null counts and dtypes, memory usage)
df_cyclists.info()

In [None]:
# Descriptive statistics of the cyclists dataset; with its default arguments
# `describe()` summarises only the numerical columns of a mixed-type frame
df_cyclists.describe()

In [None]:
# Descriptive statistics restricted to the object-dtype (categorical/text) columns
df_cyclists.describe(include='object')

In [None]:
# Dimensions of the cyclists DataFrame as a (rows, columns) tuple
df_cyclists.shape

In [None]:
# Number of distinct values in each column
# (missing values are not counted with the default arguments)
df_cyclists.nunique()

In [None]:
# Number of non-missing values in each column
df_cyclists.count()

In [None]:
# Frequency table of every object-dtype (categorical) column:
# how many times each distinct value occurs
categorical_columns = df_cyclists.select_dtypes(include='object').columns
for col in categorical_columns:
    frequencies = df_cyclists[col].value_counts()
    print(frequencies)

In [None]:
# Spot-check for homonyms: list every cyclist whose name is exactly 'Andrea  Peron'.
# NOTE(review): the literal has two spaces between first and last name —
# presumably that is how names are stored in the CSV; confirm against the data.
is_andrea_peron = df_cyclists['name'] == 'Andrea  Peron'
df_cyclists[is_andrea_peron]

In [None]:
# Frequency table of every numerical column:
# how many times each distinct value occurs
numerical_columns = df_cyclists.select_dtypes(include='number').columns
for col in numerical_columns:
    frequencies = df_cyclists[col].value_counts()
    print(frequencies)

### Filter out duplicate records

In [None]:
# Boolean mask: True for every row that repeats an earlier, fully identical row
# (the first occurrence of each group stays False)
dups = df_cyclists.duplicated(keep='first')

# Summing the mask gives the number of redundant rows
dups.sum()


In [None]:
# Select every row that belongs to a group of identical rows
# (keep=False flags all members of the group, including the first one)
all_duplicates_mask = df_cyclists.duplicated(keep=False)
dup_df = df_cyclists[all_duplicates_mask]

# Preview the first five of them
dup_df.head(n=5)

In [119]:
# Remove fully identical rows, keeping the first occurrence of each
df_cyclists = df_cyclists.drop_duplicates(keep='first')

### Missing values

In [None]:
# Count the missing entries in every column
# (`isna` and `isnull` are aliases of the same method)
df_cyclists.isna().sum()

In [None]:
# Rows that have at least one missing value in any column
has_missing_value = df_cyclists.isna().any(axis='columns')
df_cyclists[has_missing_value]

In [None]:
# Rows of the cyclists dataset where birth_year is missing
missing_birth_year = df_cyclists['birth_year'].isna()
df_cyclists[missing_birth_year]

In [None]:
# Rows of the cyclists dataset where height is missing
missing_height = df_cyclists['height'].isna()
df_cyclists[missing_height]

In [None]:
# Rows of the cyclists dataset where weight is missing
missing_weight = df_cyclists['weight'].isna()
df_cyclists[missing_weight]

In [None]:
# Rows of the cyclists dataset where nationality is missing
missing_nationality = df_cyclists['nationality'].isnull()
df_cyclists[missing_nationality]

In [None]:
# Rows where height and weight are missing at the same time
height_and_weight_missing = df_cyclists[['height', 'weight']].isna().all(axis=1)
df_cyclists[height_and_weight_missing]

In [127]:
# drop rows with missing values -> not sure this is appropriate; perhaps only for rows with missing values in height and weight
#df_cyclists = df_cyclists.dropna(subset=['height', 'weight'])

# df_cyclists = df_cyclists.dropna()

### Imputations

In [None]:
# Histogram of birth year, split into 20 bins.
# No `row_order` is passed: that option only orders the facet rows created
# with `row=`, and this plot does not facet by any variable.
seaborn.displot(df_cyclists,
                x="birth_year",
                kind="hist",
                bins=20)

In [None]:
# Histogram of height, split into 20 bins.
# No `row_order` is passed: that option only orders the facet rows created
# with `row=`, and this plot does not facet by any variable.
seaborn.displot(df_cyclists,
                x="height",
                kind="hist",
                bins=20)

In [None]:
# Histogram of weight, split into 20 bins.
# No `row_order` is passed: that option only orders the facet rows created
# with `row=`, and this plot does not facet by any variable.
seaborn.displot(df_cyclists,
                x="weight",
                kind="hist",
                bins=20)

In [None]:
# Bar-style histogram of nationality: one bar per distinct nationality.
# `discrete=True` centres a unit-width bar on each category, so no `bins`
# count is needed (a count based on `unique()` would also include NaN).
# No `row_order` is passed: it only orders facet rows created with `row=`.
g = seaborn.displot(df_cyclists,
                    x="nationality",
                    kind="hist",
                    discrete=True)
# Nationality labels are long; rotate them so they do not overlap
plt.xticks(rotation=90)