In [1]:
# Importazione delle librerie necessarie
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
import os

In [2]:
def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

In [3]:
base_path = "../input/cifrar-10"

# Lista dei file di training
train_files = [os.path.join(base_path, f"data_batch_{i}") for i in range(1, 6)]
test_file = os.path.join(base_path, "test_batch")


In [4]:
# Carica i dati di training
train_images = []
train_labels = []

for file in train_files:
    batch = unpickle(file)
    train_images.append(batch[b'data'])
    train_labels.append(batch[b'labels'])

In [5]:
# Concateno i batch di training
train_images = np.concatenate(train_images).reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)
train_labels = np.concatenate(train_labels)

# Carico e sistemo il test set
test_batch = unpickle(os.path.join(base_path, 'test_batch'))
test_images = test_batch[b'data'].reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)
test_labels = np.array(test_batch[b'labels'])

# Mostro informazioni sui dataset
print(f"Train set: {train_images.shape[0]} immagini, {train_images.shape[1:]} dimensioni")
print(f"Test set: {test_images.shape[0]} immagini, {test_images.shape[1:]} dimensioni")

Train set: 50000 immagini, (32, 32, 3) dimensioni
Test set: 10000 immagini, (32, 32, 3) dimensioni


In [8]:
#sottocampionamento
sottocampionamento = 1000
index = np.random.choice(len(train_images), sottocampionamento, replace = False)
new_train_images = train_images[index]
new_train_labels = train_labels[index]

In [9]:
X = new_train_images
y = new_train_labels

N_campioni = 1000
X = X[:N_campioni]
y = y[:N_campioni]

# Usare le seguenti proporzioni per il train, validation e test
train_fraction = 0.7  
validation_fraction = 0.1  
test_fraction = 0.2

# Calcola le lunghezze
num_train = int(train_fraction * X.shape[0])
num_validation = int(validation_fraction * X.shape[0])

# Suddivisione
X_train = X[:num_train]
y_train  = y[:num_train]

X_validation  = X[num_train:num_train + num_validation]
y_validation  = y[num_train:num_train + num_validation]

X_test  = X[num_train + num_validation:]
y_test  = y[num_train + num_validation:]

# Controllo 
print("Train:", X_train.shape)
print("Validation:", X_validation.shape)
print("Test:", X_test.shape)

# scaler lavora con array 2D, le immagini devono essere appiattite
X_train_app = X_train.reshape(X_train.shape[0], -1)
X_validation_app = X_validation.reshape(X_validation.shape[0], -1)
X_test_app = X_test.reshape(X_test.shape[0], -1)

scaler = StandardScaler()
trainX_STD = scaler.fit_transform(X_train_app)
validationX_STD = scaler.transform(X_validation_app)
testX_STD = scaler.transform(X_test_app)

Train: (700, 32, 32, 3)
Validation: (100, 32, 32, 3)
Test: (200, 32, 32, 3)


In [11]:
#KNN
#Creazione del modello 
model_knn = KNeighborsClassifier(n_neighbors=5, weights='uniform', metric='euclidean')

In [12]:
#Alleniamo il modello
model_knn.fit(trainX_STD, y_train)

In [13]:
#Calcolo delle predizioni
predictions_validation_knn = model_knn.predict(validationX_STD)
predictions_test_knn = model_knn.predict(testX_STD)

print(predictions_validation_knn.shape)
print(predictions_test_knn.shape)

(100,)
(200,)


In [14]:
#Calcolare l'accuracy con accuracy_score
accuracy_validation_knn = accuracy_score(y_validation, predictions_validation_knn)
accuracy_test_knn = accuracy_score(y_test, predictions_test_knn)

print(f"Validation accuracy: {accuracy_validation_knn:.2f}")
print(f"Test accuracy: {accuracy_test_knn:.2f}")

Validation accuracy: 0.17
Test accuracy: 0.21


In [15]:
#Calcolare la matrice di confusione
matrixValidation_knn = confusion_matrix (y_validation, predictions_validation_knn) 
matrixTest_knn = confusion_matrix (y_test, predictions_test_knn)

print("Matrix Validation:", matrixValidation_knn)
print("Matrix Test:", matrixTest_knn)

Matrix Validation: [[6 0 2 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 2 0 1 0]
 [3 0 4 0 2 0 3 0 2 0]
 [1 0 3 1 4 1 2 0 1 0]
 [2 0 2 0 1 0 1 0 0 0]
 [2 0 6 3 2 1 0 0 0 0]
 [2 0 6 0 2 0 0 0 0 0]
 [3 0 6 0 4 0 0 1 1 0]
 [7 1 1 0 0 0 0 0 2 0]
 [3 0 1 0 0 0 0 0 1 0]]
Matrix Test: [[12  0  2  0  1  0  2  0  3  0]
 [ 6  2  6  0  2  0  3  0  1  0]
 [ 6  0  9  1  4  0  0  1  0  0]
 [ 3  0  5  4  2  0  8  2  0  0]
 [ 6  0 10  0  4  0  1  1  0  0]
 [ 5  0  3  4  5  3  3  2  0  0]
 [ 2  0  3  0  1  1  3  0  0  0]
 [ 6  0  5  1  3  0  1  2  1  0]
 [11  0  3  0  1  1  1  0  3  0]
 [ 5  0  6  0  2  0  0  2  3  1]]


In [16]:
#Regressione logistica
#Creazione del modello
model_logic = LogisticRegression(max_iter=100, solver='liblinear',C=1.0)

In [17]:
#Alleniamo il modello
model_logic.fit(trainX_STD, y_train)

In [18]:
#Calcolo delle predizioni
predictionsValidation_logic = model_logic.predict(validationX_STD)
predictionsTest_logic = model_logic.predict(testX_STD)

print("Predictions Validation:", predictionsValidation_logic.shape)
print("Predictions Test:", predictionsTest_logic.shape)

Predictions Validation: (100,)
Predictions Test: (200,)


In [19]:
#Calcolare l'accuracy con accuracy_score
accuracy_validation_logic = accuracy_score(y_validation, predictionsValidation_logic)
accuracy_test_logic = accuracy_score(y_test, predictionsTest_logic)

print(f"Validation accuracy: {accuracy_validation_logic:.2f}")
print(f"Test accuracy: {accuracy_test_logic:.2f}")

Validation accuracy: 0.25
Test accuracy: 0.26


In [20]:
#Calcolare la matrice di confusione
matrixValidation_logic = confusion_matrix (y_validation, predictionsValidation_logic) 
matrixTest_logic = confusion_matrix (y_test, predictionsTest_logic)

print("Matrix Validation:", matrixValidation_logic)
print("Matrix Test:", matrixTest_logic)

Matrix Validation: [[3 1 0 0 0 2 1 1 0 0]
 [0 1 1 1 0 0 1 0 0 0]
 [3 1 6 0 2 0 1 0 1 0]
 [1 0 2 4 1 1 3 1 0 0]
 [2 1 0 0 0 2 0 0 0 1]
 [2 1 1 1 3 4 1 1 0 0]
 [0 0 1 0 1 3 2 2 1 0]
 [2 3 3 2 2 0 1 2 0 0]
 [3 1 2 0 0 0 1 2 2 0]
 [1 1 2 0 0 0 0 0 0 1]]
Matrix Test: [[8 0 5 0 1 0 0 3 2 1]
 [1 2 2 4 3 2 1 1 2 2]
 [2 0 4 3 5 1 2 1 1 2]
 [2 0 6 3 4 3 4 1 0 1]
 [2 0 4 1 8 1 1 2 3 0]
 [2 1 1 4 5 4 4 3 1 0]
 [0 1 3 1 2 0 2 0 1 0]
 [4 1 2 1 2 2 0 7 0 0]
 [2 3 2 2 2 0 0 1 8 0]
 [2 2 0 2 0 2 1 2 3 5]]


In [21]:
#SVM
#Creazione del modello
model_SVM = SVC(kernel='linear', C=0.01)

In [22]:
#Alleniamo il modello

model_SVM.fit(trainX_STD, y_train)

In [23]:
#Calcolo delle predizioni
predictionsValidation_SVM = model_SVM.predict(validationX_STD)
predictionsTest_SVM = model_SVM.predict(testX_STD)

print("Predictions Validation:", predictionsValidation_SVM.shape)
print("Predictions Test:", predictionsTest_SVM.shape)

Predictions Validation: (100,)
Predictions Test: (200,)


In [24]:
#Calcolare l'accuracy con accuracy_score
accuracyValidation_SVM = accuracy_score(y_validation, predictionsValidation_SVM)
accuracyTest_SVM = accuracy_score(y_test, predictionsTest_SVM)

print(f"Validation accuracy: {accuracyValidation_SVM:.2f}")
print(f"Test accuracy: {accuracyTest_SVM:.2f}")

Validation accuracy: 0.29
Test accuracy: 0.28


In [25]:
#Calcolare la matrice di confusione
matrixValidation_SVM = confusion_matrix (y_validation, predictionsValidation_SVM) 
matrixTest_SVM = confusion_matrix (y_test, predictionsTest_SVM)

print("Matrix Validation:", matrixValidation_SVM)
print("Matrix Test:", matrixTest_SVM)

Matrix Validation: [[4 1 1 0 1 0 0 1 0 0]
 [0 1 1 1 0 0 1 0 0 0]
 [4 0 6 0 1 2 0 0 1 0]
 [0 1 3 5 2 0 1 1 0 0]
 [0 1 2 1 0 2 0 0 0 0]
 [2 1 0 4 2 5 0 0 0 0]
 [0 2 3 0 2 0 1 2 0 0]
 [1 2 2 3 3 1 0 3 0 0]
 [3 3 1 2 0 0 0 0 1 1]
 [1 1 0 0 0 0 0 0 0 3]]
Matrix Test: [[9 0 3 1 1 1 0 3 1 1]
 [2 2 4 4 2 2 1 1 1 1]
 [4 0 6 3 5 0 1 1 0 1]
 [2 0 1 5 3 3 5 2 1 2]
 [2 0 4 2 8 3 2 1 0 0]
 [1 0 1 4 3 3 4 3 6 0]
 [0 1 0 1 1 1 6 0 0 0]
 [4 1 2 0 2 3 0 6 0 1]
 [5 3 1 1 2 3 0 0 4 1]
 [0 3 0 1 1 1 3 3 0 7]]


In [26]:
#Decision Tree
#Creazione del modello
TreeModel = DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_split=10, random_state=42)

In [27]:
#Alleniamo il modello
TreeModel.fit(trainX_STD, y_train)

In [28]:
#Calcoliamo le predizioni
predictions_validation_Tree = TreeModel.predict(validationX_STD)
predictions_test_Tree = TreeModel.predict(testX_STD)

print(predictions_validation_Tree.shape)
print(predictions_test_Tree.shape)

(100,)
(200,)


In [29]:
#Calcolare l'accuracy con accuracy_score
accuracy_validation_Tree = accuracy_score(y_validation, predictions_validation_Tree)
accuracy_test_Tree = accuracy_score(y_test, predictions_test_Tree)

print(f"Validation accuracy: {accuracy_validation_Tree:.2f}")
print(f"Test accuracy: {accuracy_test_Tree:.2f}")

Validation accuracy: 0.22
Test accuracy: 0.24


In [30]:
#Calcolare la matrice di confusione
matrixValidation_Tree = confusion_matrix (y_validation, predictions_validation_Tree) 
matrixTest_Tree = confusion_matrix (y_test, predictions_test_Tree)

print("Matrix Validation:", matrixValidation_Tree)
print("Matrix Test:", matrixTest_Tree)

Matrix Validation: [[4 0 0 1 1 0 0 2 0 0]
 [0 2 0 0 0 0 1 0 0 1]
 [3 0 0 3 1 0 2 2 1 2]
 [0 0 1 6 2 2 2 0 0 0]
 [1 0 1 1 0 0 3 0 0 0]
 [1 1 3 0 2 4 1 0 1 1]
 [2 2 2 0 1 0 1 0 1 1]
 [4 1 2 3 2 1 0 1 0 1]
 [4 1 2 0 0 0 0 0 1 3]
 [0 0 0 0 0 0 0 1 1 3]]
Matrix Test: [[9 1 1 2 2 0 0 1 3 1]
 [3 5 1 5 0 1 1 0 2 2]
 [4 1 2 3 9 0 0 1 1 0]
 [2 2 0 3 1 2 3 7 1 3]
 [3 0 1 6 5 2 2 2 0 1]
 [2 1 1 4 2 6 1 4 2 2]
 [0 0 0 3 2 0 3 2 0 0]
 [4 2 0 5 2 1 1 4 0 0]
 [4 3 0 3 1 1 1 1 6 0]
 [1 1 1 3 0 0 1 6 0 6]]


In [31]:
print("\nCROSS-VALIDATION ACCURACY\n")

# Logistic Regression
cv_logic = cross_val_score(model_logic, trainX_STD, y_train, cv=5, scoring="accuracy")
print("Logistic Regression CV Accuracy:", cv_logic.mean())

# k-NN
cv_knn = cross_val_score(model_knn, trainX_STD, y_train, cv=5, scoring="accuracy")
print("k-NN CV Accuracy:", cv_knn.mean())

# SVM
cv_svm = cross_val_score(model_SVM, trainX_STD, y_train, cv=5, scoring="accuracy")
print("SVM (Linear Kernel) CV Accuracy:", cv_svm.mean())

# Decision Tree
cv_tree = cross_val_score(TreeModel, trainX_STD, y_train, cv=5, scoring="accuracy")
print("Decision Tree CV Accuracy:", cv_tree.mean())


CROSS-VALIDATION ACCURACY

Logistic Regression CV Accuracy: 0.23571428571428568
k-NN CV Accuracy: 0.22285714285714286
SVM (Linear Kernel) CV Accuracy: 0.2842857142857143
Decision Tree CV Accuracy: 0.21285714285714286


In [32]:
# Definisci i valori di C da esplorare
param_grid = {'C': [0.01, 0.1, 1, 10, 100, 1000]}

# Imposta la GridSearch
grid_search = GridSearchCV(model_SVM, param_grid, cv=5, scoring='accuracy')

# Esegui la ricerca
grid_search.fit(trainX_STD, y_train)

# Miglior modello trovato
best_svm_model = grid_search.best_estimator_

# Predizioni con il miglior modello
predictionsValidation_SVM = best_svm_model.predict(validationX_STD)
predictionsTest_SVM = best_svm_model.predict(testX_STD)

# Accuratezza
accuracyValidation_SVM = accuracy_score(y_validation, predictionsValidation_SVM)
accuracyTest_SVM = accuracy_score(y_test, predictionsTest_SVM)

print("Migliori parametri SVM con kernel lineare:", grid_search.best_params_)
print("Validation Accuracy:", accuracyValidation_SVM)
print("Test Accuracy:", accuracyTest_SVM)

Migliori parametri SVM con kernel lineare: {'C': 0.1}
Validation Accuracy: 0.31
Test Accuracy: 0.28


In [33]:
#Visualizzazione dei report di classificazione per ciascun modello
models = {
    "Logistic Regression": predictionsTest_logic,
    "k-NN": predictions_test_knn,
    "SVM": predictionsTest_SVM,
    "Decision Tree": predictions_test_Tree
}

for model_name, predictions in models.items():
    print(f"\n{model_name} - Classification Report:")
    print(classification_report(y_test, predictions))


Logistic Regression - Classification Report:
              precision    recall  f1-score   support

           0       0.32      0.40      0.36        20
           1       0.20      0.10      0.13        20
           2       0.14      0.19      0.16        21
           3       0.14      0.12      0.13        24
           4       0.25      0.36      0.30        22
           5       0.27      0.16      0.20        25
           6       0.13      0.20      0.16        10
           7       0.33      0.37      0.35        19
           8       0.38      0.40      0.39        20
           9       0.45      0.26      0.33        19

    accuracy                           0.26       200
   macro avg       0.26      0.26      0.25       200
weighted avg       0.26      0.26      0.25       200


k-NN - Classification Report:
              precision    recall  f1-score   support

           0       0.19      0.60      0.29        20
           1       1.00      0.10      0.18        20
 