In [3]:
# Detect whether this notebook is running inside Google Colab by probing for
# the `google.colab` package. Only a failed import means "not Colab"; any
# other exception (e.g. KeyboardInterrupt) is allowed to propagate instead of
# being silently swallowed by a bare `except:`.
try:
    import google.colab  # noqa: F401 -- imported only as an environment probe
except ImportError:
    print("💻 Running locally")
    IN_COLAB = False
    # Uncomment below if packages missing:
    # !pip install pandas numpy scikit-learn matplotlib seaborn
else:
    print("🚀 Running in Google Colab - all packages pre-installed!")
    IN_COLAB = True

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
warnings.filterwarnings('ignore')

# LOAD AND EXPLORE DATA
print("=== WINE CLASSIFICATION PROJECT ===\n")

# Load scikit-learn's bundled Wine Recognition dataset.
from sklearn.datasets import load_wine

wine = load_wine()
y = wine.target  # class label per sample
X = pd.DataFrame(data=wine.data, columns=wine.feature_names)

print("✅ Dataset loaded successfully!")
print(f"Dataset Shape: {X.shape}")
print(f"Features: {list(X.columns)}")
print(f"Target Classes: {wine.target_names}")
print("Problem: Classify wine cultivar (3 classes)")

# Combined frame: a copy of the features with the labels appended as the
# last column, named 'target'.
df = X.assign(target=y)

print(f"Dataset Shape: {df.shape}")
# Every column except the trailing 'target' column is a feature.
print(f"Features: {df.columns[:-1].tolist()}")
print("Target: Wine cultivar classification")

# EXPLORATORY DATA ANALYSIS
print("\n=== EXPLORATORY DATA ANALYSIS ===")

# Compute each summary once, then report them in a fixed order.
missing_per_feature = X.isnull().sum()                     # null count per column
class_counts = pd.Series(y).value_counts().sort_index()    # samples per class label
summary_stats = X.describe()                               # per-feature descriptive statistics

print(f"Missing values:\n{missing_per_feature}")
print(f"\nTarget distribution:\n{class_counts}")
print(f"Class names: {wine.target_names}")
print(f"Dataset info:\n{summary_stats}")

# DATA PREPROCESSING
print("\n=== DATA PREPROCESSING ===")

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
# np.unique already returns the labels sorted; .tolist() converts them to
# plain Python ints so the list prints as [0, 1, 2] on every NumPy version
# (a list of NumPy scalars prints as [np.int64(0), ...] under NumPy >= 2).
print(f"Wine classes: {np.unique(y).tolist()} - {wine.target_names}")

# Hold out 20% of the samples for testing; stratify=y keeps the class
# proportions the same in both splits, and random_state makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# Scaling the features (IMPORTANT for both SVM and KNN).
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("✅ Features scaled using StandardScaler")

# IMPLEMENT SVC (SUPPORT VECTOR CLASSIFIER)
print("\n=== SVC IMPLEMENTATION ===")

# Baseline: SVC with default hyperparameters, scored on the held-out test set.
svc_basic = SVC(random_state=42).fit(X_train_scaled, y_train)
svc_pred = svc_basic.predict(X_test_scaled)
svc_accuracy = accuracy_score(y_test, svc_pred)

print(f"SVC Basic Accuracy: {svc_accuracy:.4f}")

# Tuned: exhaustive grid search with 5-fold cross-validation on the training set.
print("\n🔧 Tuning SVC hyperparameters...")
svc_param_grid = dict(
    C=[0.1, 1, 10],
    kernel=['rbf', 'linear'],
    gamma=['scale', 'auto'],
)

svc_grid = GridSearchCV(
    estimator=SVC(random_state=42),
    param_grid=svc_param_grid,
    scoring='accuracy',
    cv=5,
)
svc_grid.fit(X_train_scaled, y_train)

# Best parameter combination found by the search, as a fitted estimator.
svc_best = svc_grid.best_estimator_
svc_best_pred = svc_best.predict(X_test_scaled)
svc_best_accuracy = accuracy_score(y_test, svc_best_pred)

print(f"SVC Best Parameters: {svc_grid.best_params_}")
print(f"SVC Best Accuracy: {svc_best_accuracy:.4f}")

# IMPLEMENT KNN
print("\n=== KNN IMPLEMENTATION ===")

# Mean 5-fold cross-validation score on the training set for each K in 1..20.
k_values = range(1, 21)
knn_scores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=k), X_train_scaled, y_train, cv=5
    ).mean()
    for k in k_values
]

# Pick the K with the highest mean CV score (argmax returns the first
# maximum, so ties go to the smallest K).
best_k = k_values[np.argmax(knn_scores)]
print(f"Best K value: {best_k}")

# Refit on the full training set with the chosen K and score on the test set.
knn_best = KNeighborsClassifier(n_neighbors=best_k).fit(X_train_scaled, y_train)
knn_pred = knn_best.predict(X_test_scaled)
knn_accuracy = accuracy_score(y_test, knn_pred)

print(f"KNN Best Accuracy: {knn_accuracy:.4f}")

# COMPARE MODELS
print("\n=== MODEL COMPARISON ===")
results = dict([
    ('SVC Basic', svc_accuracy),
    ('SVC Tuned', svc_best_accuracy),
    ('KNN', knn_accuracy),
])

print("Model Performance:")
for model, score in results.items():
    print(f"{model}: {score:.4f}")

# Winner = highest test accuracy; on a tie the earliest entry in `results` wins.
best_model = max(results, key=lambda name: results[name])
print(f"\n🏆 Best Model: {best_model} with {results[best_model]:.4f} accuracy")

# DETAILED EVALUATION
print(f"\n=== DETAILED EVALUATION FOR {best_model} ===")

# Map the winning label to its (test predictions, fitted estimator) pair;
# any label not listed here falls back to the basic SVC.
_winner_lookup = {
    'SVC Tuned': (svc_best_pred, svc_best),
    'KNN': (knn_pred, knn_best),
}
final_pred, final_model = _winner_lookup.get(best_model, (svc_pred, svc_basic))

print("Classification Report:")
print(classification_report(y_test, final_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, final_pred))

=== WINE CLASSIFICATION PROJECT ===

✅ Dataset loaded successfully!
Dataset Shape: (178, 13)
Features: ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']
Target Classes: ['class_0' 'class_1' 'class_2']
Problem: Classify wine cultivar (3 classes)
Dataset Shape: (178, 14)
Features: ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']
Target: Wine cultivar classification

=== EXPLORATORY DATA ANALYSIS ===
Missing values:
alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols    