<a href="https://colab.research.google.com/github/EnFiore/ai-machine-learning-modelli-e-algoritmi/blob/main/1%20-%20L'algoritmo%20Gradient%20Descent/gradient_descent_exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mini Batch Gradient Descent e Online Learning
L'ospedale San Giuseppe del Santissimo Cuore ti ha incaricato di addestrare un modello in grado di riconoscere tumori al seno maligni, in modo da poter aiutare i propri medici nelle diagnosi. Per farlo ti ha fornito [questi dati](https://github.com/ProfAI/machine-learning-modelli-e-algoritmi/blob/main/datasets/breast_cancer.csv).
<br>
Nel farlo utilizza il mini-batch gradient descent, testando diversi batch size: 8, 16, 32, 64, 128.
<br>
Seleziona il modello con le metriche migliori sul test set.
<br>
L'ospedale utilizza il modello da te realizzato per eseguire delle classificazioni e ti fornisce questi [nuovi dati](https://github.com/ProfAI/machine-learning-modelli-e-algoritmi/blob/main/datasets/breast_cancer_update.csv), sfruttali per migliorare il modello.

## Soluzione

### Dipendenze

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss
from sklearn.utils import shuffle
from sklearn.metrics import classification_report, accuracy_score, log_loss

### Costanti

In [None]:
# Seed for the randomized steps of the notebook (used by the train/test split).
RANDOM_SEED = 0
# Root URL of the raw dataset files; a CSV file name is appended to it.
BASE_URL = (
    "https://raw.githubusercontent.com/ProfAI/"
    "machine-learning-modelli-e-algoritmi/main/datasets/"
)

### Importiamo il dataset

In [None]:
# Download the initial training dataset and preview its first rows.
df = pd.read_csv(f"{BASE_URL}breast_cancer.csv")
df.head()

Unnamed: 0,ID number,diagnosis,radius mean,texture mean,perimeter mean,area mean,smoothness mean,compactness mean,concavity mean,concave points mean,...,radius worst,texture worst,perimeter worst,area worst,smoothness worstse,compactness worst,concavity worst,concave points worst,symmetry worst,fractal dimension worst
0,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
1,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
2,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
3,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
4,843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244


### Preprocessing dei dati

In [None]:
# Encode the target as integers ("M"/"B" are presumably malignant/benign).
map_dict = {"M": 1, "B": 0}
# Indexing the dict (instead of handing the dict itself to ``map``) makes an
# unexpected label raise KeyError rather than silently become NaN.
df["diagnosis"] = df["diagnosis"].map(map_dict.__getitem__)
df.head()

Unnamed: 0,ID number,diagnosis,radius mean,texture mean,perimeter mean,area mean,smoothness mean,compactness mean,concavity mean,concave points mean,...,radius worst,texture worst,perimeter worst,area worst,smoothness worstse,compactness worst,concavity worst,concave points worst,symmetry worst,fractal dimension worst
0,842517,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
1,84300903,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
2,84348301,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
3,84358402,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
4,843786,1,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244


In [None]:
# Features are every column except the target and the "ID number" column.
# NOTE(review): the preview lists an "Unnamed: 0" column - confirm whether it
# is a real column that ends up among the features.
X = df.drop(columns=["diagnosis", "ID number"]).values
y = df["diagnosis"].values

# Hold out 30% of the rows as test set; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=RANDOM_SEED,
)

In [None]:
# Fit the scaler on the training split only, then apply the same
# transformation to both splits so test-set statistics never reach training.
ss = StandardScaler().fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

### Addestriamo il modello

Mini-batch gradient descent

classes --> si recuperano le classi per essere sicuri che a ogni chiamata di `partial_fit` siano noti tutti i valori della variabile target, anche quando un batch non li contiene tutti

shuffle --> si rimescolano i dati a ogni epoca, prima di suddividerli in batch

partial_fit --> funzione di addestramento

log_loss --> metrica



In [None]:
def fit(model, X, y, batch_size=1, epochs=100, verbose=True,
        X_val=None, y_val=None, random_state=None):
    """Train ``model`` with mini-batch gradient descent through ``partial_fit``.

    At every epoch the samples are reshuffled and split into consecutive
    batches of ``batch_size`` rows (the last batch may be smaller); each batch
    is passed to ``model.partial_fit`` together with the full list of classes.

    Args:
        model: estimator exposing ``partial_fit(X, y, classes=...)`` and, when
            ``verbose`` is true, ``predict_proba(X)``.
        X: 2-D array of samples (rows) and features (columns).
        y: 1-D array-like of targets, one per row of ``X``.
        batch_size: number of samples per batch; must be >= 1.
        epochs: number of full passes over the data.
        verbose: when true, print the log loss at the end of every epoch.
        X_val, y_val: optional data used for the printed loss. When either is
            omitted the loss is computed on ``(X, y)`` itself, so the function
            no longer depends on notebook-level globals.
        random_state: seed (or ``None``) for the per-epoch shuffling.

    Returns:
        The same ``model`` instance, updated in place.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    batch_size = int(batch_size)
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    y = np.asarray(y)
    n_samples = X.shape[0]
    # Every partial_fit call must know all classes, even if a batch misses one.
    classes = np.unique(y)
    # A single generator advanced across epochs gives a different but
    # reproducible permutation at every epoch.
    rng = np.random.RandomState(random_state)

    if X_val is None or y_val is None:
        X_val, y_val = X, y

    for epoch in range(epochs):
        order = rng.permutation(n_samples)
        X_shuffled, y_shuffled = X[order], y[order]

        # Stepping by batch_size never yields an empty trailing batch, which
        # the previous "n // batch_size + 1" count produced whenever
        # n_samples was an exact multiple of batch_size.
        for start in range(0, n_samples, batch_size):
            stop = start + batch_size
            model.partial_fit(
                X_shuffled[start:stop],
                y_shuffled[start:stop],
                classes=classes,
            )

        # The loss is only needed for reporting, so it is computed once per
        # epoch and only when it will actually be printed.
        if verbose:
            loss = log_loss(y_val, model.predict_proba(X_val), labels=classes)
            print("Loss all'epoca %d = %.4f" % (epoch+1, loss))

    return model

In [None]:
def evaluate(model, X, y, label=None):
    """Print and return the accuracy and log loss of ``model`` on ``(X, y)``.

    Args:
        model: fitted classifier exposing ``predict`` and ``predict_proba``.
        X: samples to evaluate on.
        y: true targets for ``X``.
        label: optional heading printed before the metrics.

    Returns:
        Tuple ``(accuracy, loss)``.
    """
    y_pred = model.predict(X)
    y_proba = model.predict_proba(X)

    if label is not None:
        print(label)

    accuracy = accuracy_score(y, y_pred)
    # Passing the classes the model was trained on keeps the probability
    # columns aligned with the labels and avoids a failure when ``y`` happens
    # to contain only one class.
    loss = log_loss(y, y_proba, labels=getattr(model, "classes_", None))
    # The original format string ended with a stray ")".
    print(f"Accuracy={accuracy} Log Loss={loss:.3f}")
    return accuracy, loss

In [None]:
# Candidate mini-batch sizes to compare.
batchs_sizes = [8, 16, 32, 64, 128]

# Track the model with the lowest test-set log loss seen so far.
best_model = None
best_loss = float("inf")

for batch_size in batchs_sizes:
    print(f"BATCH SIZE = {batch_size}")
    # "log_loss" is the current name of the logistic loss: the old "log" alias
    # was deprecated in scikit-learn 1.1 and removed in 1.3. Seeding the
    # estimator makes its internal randomness reproducible.
    sgd = SGDClassifier(loss="log_loss", random_state=RANDOM_SEED)
    sgd = fit(sgd, X_train, y_train, batch_size=batch_size, epochs=200, verbose=False)
    evaluate(sgd, X_train, y_train, label="TRAIN SET")
    accuracy, loss = evaluate(sgd, X_test, y_test, label="TEST SET")

    if best_model is None or loss < best_loss:
        best_model = sgd
        best_loss = loss

BATCH SIZE = 8
TRAIN SET
Accuracy=0.9714285714285714 Log Loss=0.053)
TEST SET
Accuracy=0.9733333333333334 Log Loss=0.053)
+
BATCH SIZE = 16
TRAIN SET
Accuracy=0.9828571428571429 Log Loss=0.039)
TEST SET
Accuracy=0.98 Log Loss=0.063)
BATCH SIZE = 32
TRAIN SET
Accuracy=0.9771428571428571 Log Loss=0.051)
TEST SET
Accuracy=0.9866666666666667 Log Loss=0.064)
BATCH SIZE = 64
TRAIN SET
Accuracy=0.9857142857142858 Log Loss=0.040)
TEST SET
Accuracy=0.98 Log Loss=0.065)
BATCH SIZE = 128
TRAIN SET
Accuracy=0.98 Log Loss=0.041)
TEST SET
Accuracy=0.98 Log Loss=0.053)
+


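Accuracy molto alta con tutti i batch size; la miglior log loss sul test set (0.053) si ottiene con batch size 8 e 128, mentre batch size 16 ottiene la miglior log loss sul train set (0.039).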

### Valutiamo il Modello
sul set di addestramento e sul set di test

In [None]:
# Per-class precision/recall/F1 of the selected model on the training split,
# as a baseline to compare against the test-set report.
print(classification_report(y_train, best_model.predict(X_train)))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98       211
           1       0.98      0.97      0.97       139

    accuracy                           0.98       350
   macro avg       0.98      0.98      0.98       350
weighted avg       0.98      0.98      0.98       350



In [None]:
# Per-class precision/recall/F1 of the selected model on the held-out test split.
print(classification_report(y_test, best_model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98        94
           1       0.96      0.98      0.97        56

    accuracy                           0.98       150
   macro avg       0.98      0.98      0.98       150
weighted avg       0.98      0.98      0.98       150



### Miglioriamo il Modello

In [None]:
# Download the additional data used to update the model and preview it.
df_update = pd.read_csv(f"{BASE_URL}breast_cancer_update.csv")
df_update.head()

Unnamed: 0,ID number,diagnosis,radius mean,texture mean,perimeter mean,area mean,smoothness mean,compactness mean,concavity mean,concave points mean,...,radius worst,texture worst,perimeter worst,area worst,smoothness worstse,compactness worst,concavity worst,concave points worst,symmetry worst,fractal dimension worst
0,91544001,B,12.22,20.04,79.47,453.1,0.1096,0.1152,0.08175,0.02166,...,13.16,24.17,85.13,515.3,0.1402,0.2315,0.3535,0.08088,0.2709,0.08839
1,91544002,B,11.06,17.12,71.25,366.5,0.1194,0.1071,0.04063,0.04268,...,11.69,20.74,76.08,411.1,0.1662,0.2031,0.1256,0.09514,0.278,0.1168
2,915452,B,16.3,15.7,104.7,819.8,0.09427,0.06712,0.05526,0.04563,...,17.32,17.76,109.8,928.2,0.1354,0.1361,0.1947,0.1357,0.23,0.0723
3,915460,M,15.46,23.95,103.8,731.3,0.1183,0.187,0.203,0.0852,...,17.11,36.33,117.7,909.4,0.1732,0.4967,0.5911,0.2163,0.3013,0.1067
4,91550,B,11.74,14.69,76.31,426.0,0.08099,0.09661,0.06726,0.02639,...,12.45,17.6,81.25,473.8,0.1073,0.2793,0.269,0.1056,0.2604,0.09879


In [None]:
# Encode the target of the new data with the same mapping used for the
# original dataset; an unexpected label raises KeyError.
df_update["diagnosis"] = df_update["diagnosis"].map(map_dict.__getitem__)
df_update.head()

Unnamed: 0,ID number,diagnosis,radius mean,texture mean,perimeter mean,area mean,smoothness mean,compactness mean,concavity mean,concave points mean,...,radius worst,texture worst,perimeter worst,area worst,smoothness worstse,compactness worst,concavity worst,concave points worst,symmetry worst,fractal dimension worst
0,91544001,0,12.22,20.04,79.47,453.1,0.1096,0.1152,0.08175,0.02166,...,13.16,24.17,85.13,515.3,0.1402,0.2315,0.3535,0.08088,0.2709,0.08839
1,91544002,0,11.06,17.12,71.25,366.5,0.1194,0.1071,0.04063,0.04268,...,11.69,20.74,76.08,411.1,0.1662,0.2031,0.1256,0.09514,0.278,0.1168
2,915452,0,16.3,15.7,104.7,819.8,0.09427,0.06712,0.05526,0.04563,...,17.32,17.76,109.8,928.2,0.1354,0.1361,0.1947,0.1357,0.23,0.0723
3,915460,1,15.46,23.95,103.8,731.3,0.1183,0.187,0.203,0.0852,...,17.11,36.33,117.7,909.4,0.1732,0.4967,0.5911,0.2163,0.3013,0.1067
4,91550,0,11.74,14.69,76.31,426.0,0.08099,0.09661,0.06726,0.02639,...,12.45,17.6,81.25,473.8,0.1073,0.2793,0.269,0.1056,0.2604,0.09879


In [None]:
# Build features and target from the *update* dataset. The original cell read
# from ``df`` by mistake, which re-used the initial data (test rows included)
# and ignored the new samples entirely.
X_update = df_update.drop(["diagnosis", "ID number"], axis=1).values
y_update = df_update["diagnosis"].values

In [None]:
# Re-use the scaler fitted on the training split (no re-fit) so the new
# samples are expressed on the scale the model was trained on.
X_update = ss.transform(X_update)

In [None]:
# Online learning step: update the already-trained model in place with the
# new data. ``classes`` is omitted here because fit() already supplied it on
# the earlier partial_fit calls.
best_model.partial_fit(X_update, y_update)

SGDClassifier(loss='log')

In [None]:
# Re-evaluate on the same held-out test split to see the effect of the update.
print(classification_report(y_test, best_model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98        94
           1       0.96      0.98      0.97        56

    accuracy                           0.98       150
   macro avg       0.98      0.98      0.98       150
weighted avg       0.98      0.98      0.98       150

