In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Location of the CSV file read into `df` below.
url = "https://proai-datasets.s3.eu-west-3.amazonaws.com/fruits.csv"
df = pd.read_csv(filepath_or_buffer=url)

# First rows for a quick visual check, then the per-column summary printed by info().
display(df.head(n=5))
df.info()
# Kept as the last expression of the cell so the notebook renders the statistics table.
df.describe()

In [None]:
# Basic data-quality checks: missing values per column and count of duplicated rows.
missing_per_column = df.isna().sum()
duplicate_rows = df.duplicated().sum()

print("Valori mancanti per colonna:\n", missing_per_column)
print("Duplicati:", duplicate_rows)

In [None]:
# Bar chart of how many rows belong to each value of the 'Frutto' column.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Frutto', ax=ax)
ax.set_title('Distribuzione delle classi')
plt.show()

In [None]:
# Every column except the 'Frutto' label; shared by both plots below.
feature_frame = df.drop(columns='Frutto')

# One box plot per feature, each on its own subplot with an independent x axis.
feature_frame.plot(
    kind='box',
    subplots=True,
    layout=(2, 3),
    figsize=(12, 8),
    sharex=False,
)
plt.tight_layout()
plt.show()

# Annotated heatmap of the pairwise correlations between the features.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(feature_frame.corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Matrice di correlazione')
plt.show()

In [None]:
# Separate the features from the 'Frutto' label column.
X = df.drop(columns='Frutto')
y = df['Frutto']

# Stratified 80/20 hold-out split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Candidate numbers of neighbours to compare with 5-fold cross-validation.
k_range = range(1, 21)
cv_scores = []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    # Scaling is part of the cross-validated estimator, so the scaler is re-fitted
    # on the training portion of every fold. Cross-validating on data that was
    # already scaled with statistics from the whole training set would let each
    # validation fold leak into the preprocessing step.
    cv_model = make_pipeline(StandardScaler(), knn)
    scores = cross_val_score(cv_model, X_train, y_train, cv=5, scoring='accuracy')
    cv_scores.append(scores.mean())

plt.plot(k_range, cv_scores, marker='o')
# k is an integer hyper-parameter: show one tick per candidate instead of
# matplotlib's automatically chosen (possibly fractional) ticks.
plt.xticks(list(k_range))
plt.xlabel('Numero di vicini (k)')
plt.ylabel('Accuracy media (CV)')
plt.title('Scelta del miglior k')
plt.show()

# np.argmax returns the first maximum, so ties are resolved towards the smallest k.
best_k = k_range[int(np.argmax(cv_scores))]
print("Miglior k:", best_k)

In [None]:
# Train the final classifier with the selected k on the scaled training data
# and evaluate it on the held-out test data.
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)

print("Accuracy test:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Pin the row/column order of the matrix to knn.classes_. Without `labels`,
# confusion_matrix derives the classes from y_test/y_pred, so a class missing
# from both would shrink the matrix and misalign it with the tick labels below.
cm = confusion_matrix(y_test, y_pred, labels=knn.classes_)

plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=knn.classes_, yticklabels=knn.classes_)
plt.xlabel('Predetto')
plt.ylabel('Reale')
plt.title('Matrice di confusione')
plt.show()