## 1. sklearn.datasets: Caricamento e Generazione di Dataset

### Esercizio 1.1 (Base): Esplorazione Dataset Integrati
Carica il dataset Wine e analizza le sue caratteristiche principali.

In [None]:
# Esercizio 1.1: Carica il dataset Wine e rispondi alle seguenti domande:
# 1. Quanti campioni e feature contiene?
# 2. Quante classi ci sono?
# 3. Quali sono i nomi delle feature?
# 4. Visualizza le prime 3 righe dei dati


In [64]:
from sklearn.datasets import load_wine
import pandas as pd

# Load the bundled Wine dataset; the returned object exposes its arrays and
# metadata as fields (data, target, feature_names, ...)
wine = load_wine()


In [4]:
# 1. How many samples and features does the dataset contain?
n_samples = wine.data.shape[0]   # number of rows
n_features = wine.data.shape[1]  # number of columns
print(f"Campioni: {n_samples}, Feature: {n_features}")


Campioni: 178, Feature: 13


In [54]:
# List the fields available on the loaded dataset object
print(wine.keys())


dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])


In [5]:
# 2. How many classes are there?
class_labels = set(wine.target)  # distinct values found in the target vector
n_classi = len(class_labels)
print("Numero di classi: " + str(n_classi))


Numero di classi: 3


In [6]:
# 3. What are the feature names?
feature_names = wine["feature_names"]
print("nomi feature: {}".format(feature_names))


nomi feature: ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


In [12]:
# 4. Show the first 3 rows of the data
df = pd.DataFrame(data=wine.data, columns=feature_names)
print("prime 3 righe:", df.head(3), sep="\n")


prime 3 righe:
   alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0    14.23        1.71  2.43               15.6      127.0           2.80   
1    13.20        1.78  2.14               11.2      100.0           2.65   
2    13.16        2.36  2.67               18.6      101.0           2.80   

   flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
0        3.06                  0.28             2.29             5.64  1.04   
1        2.76                  0.26             1.28             4.38  1.05   
2        3.24                  0.30             2.81             5.68  1.03   

   od280/od315_of_diluted_wines  proline  
0                          3.92   1065.0  
1                          3.40   1050.0  
2                          3.17   1185.0  


### Esercizio 1.2 (Intermedio): Generazione Dataset Personalizzato
Crea un dataset sintetico per un problema di classificazione con specifiche caratteristiche.

In [None]:
# Esercizio 1.2: Genera un dataset di classificazione con:
# - 500 campioni
# - 8 feature totali (4 informative, 2 ridondanti)
# - 3 classi
# - 2 cluster per classe
# Poi visualizza le statistiche di base del dataset creato

In [None]:
from sklearn.datasets import make_classification
import pandas as pd


In [58]:
# Generate the synthetic classification dataset requested by the exercise
X, y = make_classification(
    random_state=42,          # fixed seed
    n_samples=500,            # 500 samples
    n_classes=3,              # 3 classes
    n_clusters_per_class=2,   # 2 clusters per class
    n_features=8,             # 8 features in total, of which...
    n_informative=4,          # ...4 informative
    n_redundant=2,            # ...2 redundant
)


In [59]:
# Label the columns Feature1..FeatureN and append the class label as the last column
column_labels = [f'Feature{idx}' for idx in range(1, X.shape[1] + 1)]
df = pd.DataFrame(data=X, columns=column_labels)
df['Classe'] = y

# Summary statistics for every column (the class label column included)
print(df.describe())


         Feature1    Feature2    Feature3    Feature4    Feature5    Feature6  \
count  500.000000  500.000000  500.000000  500.000000  500.000000  500.000000   
mean     0.027704    0.063435    0.334700    0.092092   -0.040661   -0.305071   
std      1.648191    1.016804    1.567857    1.530861    1.457437    1.515622   
min     -4.966780   -3.386814   -3.715971   -3.970294   -5.170028   -4.756125   
25%     -1.151050   -0.628710   -0.715418   -0.942849   -1.112372   -1.324587   
50%      0.079064    0.061462    0.486138    0.146235   -0.082120   -0.216098   
75%      1.276206    0.757136    1.403241    1.157614    0.994071    0.807073   
max      5.024299    2.562528    5.521261    5.435939    4.058305    4.392493   

         Feature7    Feature8      Classe  
count  500.000000  500.000000  500.000000  
mean     0.388791    0.050137    0.996000  
std      1.694436    0.994684    0.815668  
min     -5.132732   -2.924925    0.000000  
25%     -0.893219   -0.638717    0.000000  
50%   

### Esercizio 1.3 (Avanzato): Confronto Dataset Reali vs Sintetici
Confronta le caratteristiche del dataset Diabetes con un dataset sintetico di regressione.

In [None]:
# Esercizio 1.3: 
# 1. Carica il dataset Diabetes
# 2. Crea un dataset sintetico di regressione con le stesse dimensioni
# 3. Confronta le statistiche (media, std, range) delle feature

In [None]:
from sklearn.datasets import load_diabetes, make_regression
import pandas as pd


In [25]:
# 1. Load the Diabetes dataset and keep its matrix, target and column names at hand
diabetes = load_diabetes()
X_real, y_real = diabetes.data, diabetes.target
feature_names = diabetes.feature_names


In [None]:
# 2. Create a synthetic regression dataset with the same dimensions as the real one
X_sint, y_sint = make_regression(
    random_state=42,
    noise=10,
    n_features=X_real.shape[1],  # same number of columns as the real matrix
    n_samples=len(X_real),       # same number of rows as the real matrix
)


In [28]:
# 3. Compare the statistics: mean, std, min, max
# The synthetic frame reuses the real dataset's column labels
df_real, df_sint = (
    pd.DataFrame(matrix, columns=feature_names) for matrix in (X_real, X_sint)
)


In [32]:
# Descriptive statistics: keep only the rows of describe() that are compared
summary_rows = ['mean', 'std', 'min', 'max']
stats_real = df_real.describe().loc[summary_rows]
stats_sint = df_sint.describe().loc[summary_rows]

# One print call; sep puts each heading and each table on its own line
print('Stat Reali', stats_real, '\nStat Sintetici', stats_sint, sep='\n')


Stat Reali
               age           sex           bmi            bp            s1  \
mean -2.511817e-19  1.230790e-17 -2.245564e-16 -4.797570e-17 -1.381499e-17   
std   4.761905e-02  4.761905e-02  4.761905e-02  4.761905e-02  4.761905e-02   
min  -1.072256e-01 -4.464164e-02 -9.027530e-02 -1.123988e-01 -1.267807e-01   
max   1.107267e-01  5.068012e-02  1.705552e-01  1.320436e-01  1.539137e-01   

                s2            s3            s4            s5            s6  
mean  3.918434e-17 -5.777179e-18 -9.042540e-18  9.293722e-17  1.130318e-17  
std   4.761905e-02  4.761905e-02  4.761905e-02  4.761905e-02  4.761905e-02  
min  -1.156131e-01 -1.023071e-01 -7.639450e-02 -1.260971e-01 -1.377672e-01  
max   1.987880e-01  1.811791e-01  1.852344e-01  1.335973e-01  1.356118e-01  

Stat Sintetici
           age       sex       bmi        bp        s1        s2        s3  \
mean -0.005560  0.085763  0.007673 -0.008875  0.058464  0.018350  0.014846   
std   1.024488  0.953936  1.035117  1.027

## 2. sklearn.preprocessing: Preprocessing dei Dati

### Esercizio 2.1 (Base): Scalatura Dati
Applica diverse tecniche di scalatura sui dati del Wine dataset.

In [None]:
# Esercizio 2.1: Applica StandardScaler e MinMaxScaler al dataset Wine
# Confronta i risultati mostrando media e deviazione standard prima e dopo

In [33]:
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd


In [34]:
# Reload Wine and keep the raw feature matrix together with its column names
wine = load_wine()
X, feature_names = wine.data, wine.feature_names

# Unscaled data, used as the baseline for the comparison
df_original = pd.DataFrame(data=X, columns=feature_names)


In [35]:
# Apply StandardScaler: learn the scaling parameters from X, then transform X
scaler_std = StandardScaler().fit(X)
X_std = scaler_std.transform(X)
df_std = pd.DataFrame(data=X_std, columns=feature_names)



In [36]:
# Apply MinMaxScaler: learn the scaling parameters from X, then transform X
scaler_minmax = MinMaxScaler().fit(X)
X_minmax = scaler_minmax.transform(X)
df_minmax = pd.DataFrame(data=X_minmax, columns=feature_names)


In [53]:
# Helper that prints the per-column mean and standard deviation
def show_stats(title, df, ddof=1):
    """Print a table with the mean and standard deviation of every column.

    Args:
        title: Heading printed on its own line before the table.
        df: DataFrame whose columns are summarised.
        ddof: Delta degrees of freedom forwarded to ``DataFrame.std``.
            The default of 1 (sample standard deviation, the pandas
            default) keeps the output identical to the previous version.
            Pass 0 for the population standard deviation, which is the
            convention StandardScaler uses when scaling to unit variance;
            with ``ddof=1`` a standardised column shows a value slightly
            above 1 instead of exactly 1.

    Returns:
        None. The table is written to standard output only.
    """
    print(f"{title}")
    stats = pd.DataFrame({
        "Media": df.mean().round(3),
        "Deviazione Std": df.std(ddof=ddof).round(3)
    })
    print(stats)

# Print the same summary for the raw data and for each scaled version
for table_title, table_frame in (
    ("Dati originali", df_original),
    ("StandardScaler", df_std),
    ("MinMaxScaler", df_minmax),
):
    show_stats(table_title, table_frame)


Dati originali
                                Media  Deviazione Std
alcohol                        13.001           0.812
malic_acid                      2.336           1.117
ash                             2.367           0.274
alcalinity_of_ash              19.495           3.340
magnesium                      99.742          14.282
total_phenols                   2.295           0.626
flavanoids                      2.029           0.999
nonflavanoid_phenols            0.362           0.124
proanthocyanins                 1.591           0.572
color_intensity                 5.058           2.318
hue                             0.957           0.229
od280/od315_of_diluted_wines    2.612           0.710
proline                       746.893         314.907
StandardScaler
                              Media  Deviazione Std
alcohol                         0.0           1.003
malic_acid                      0.0           1.003
ash                            -0.0           1.003
alcali

### Esercizio 2.2 (Intermedio): Gestione Dati Mancanti
Crea e gestisci un dataset con valori mancanti usando diverse strategie di imputazione.

In [None]:
# Esercizio 2.2: 
# 1. Crea un array con valori mancanti (NaN) in posizioni casuali
# 2. Applica SimpleImputer con strategie: 'mean', 'median', 'most_frequent'
# 3. Confronta i risultati

In [43]:
import numpy as np
from sklearn.impute import SimpleImputer
import pandas as pd


In [44]:
# 1. Build a small float matrix with two missing entries (NaN)
data_nan = np.array(
    (
        (1.0, 2.0, np.nan),
        (4.0, np.nan, 6.0),
        (7.0, 8.0, 9.0),
    ),
    dtype=float,
)


In [45]:
# 2. Apply the three imputation strategies, storing one imputed frame per strategy
strategies = ['mean', 'median', 'most_frequent']
column_labels = ["Col1", "Col2", "Col3"]
results = {}

for strategy in strategies:
    # Learn the fill values from the data, then fill the gaps in that same data
    imputer = SimpleImputer(strategy=strategy).fit(data_nan)
    results[strategy] = pd.DataFrame(imputer.transform(data_nan), columns=column_labels)


In [47]:
# Show the matrix with the gaps first, then one imputed table per strategy
print("Dati con NaN:")
print(pd.DataFrame(data_nan, columns=["Col1", "Col2", "Col3"]))

# results keeps insertion order, so the tables appear in the order they were computed
for strategy, imputed_frame in results.items():
    print(f"\nStrategia: {strategy.upper()}")
    print(imputed_frame)


Dati con NaN:
   Col1  Col2  Col3
0   1.0   2.0   NaN
1   4.0   NaN   6.0
2   7.0   8.0   9.0

Strategia: MEAN
   Col1  Col2  Col3
0   1.0   2.0   7.5
1   4.0   5.0   6.0
2   7.0   8.0   9.0

Strategia: MEDIAN
   Col1  Col2  Col3
0   1.0   2.0   7.5
1   4.0   5.0   6.0
2   7.0   8.0   9.0

Strategia: MOST_FREQUENT
   Col1  Col2  Col3
0   1.0   2.0   6.0
1   4.0   2.0   6.0
2   7.0   8.0   9.0
