In [3]:
# Model Evaluation and Hyperparameter Tuning (Wine Dataset)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Load the Wine dataset into a DataFrame: one column per feature, plus the
# class label stored in the 'target' column.
data = load_wine()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# Basic EDA
print("\n--- Dataset Info ---")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print("\n--- Class Distribution ---")
print(df['target'].value_counts())

# Separate the predictors from the label, then hold out 20% of the rows for
# testing (fixed seed so the split is repeatable).
y = df['target']
X = df.drop(columns='target')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise features. The scaler is fitted on the training rows only and the
# same fitted transform is then applied to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Unfitted estimators compared later in the script.
log_reg = LogisticRegression(max_iter=1000)
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)

# Function to evaluate model
def evaluate_model(model, X_test, y_test):
    """Print the confusion matrix and classification report for a model.

    Args:
        model: Estimator exposing ``predict``; it must already be fitted.
        X_test: Feature matrix passed to ``model.predict``.
        y_test: True labels the predictions are compared against.

    Returns:
        None. Both reports are written to stdout.
    """
    predictions = model.predict(X_test)
    # Same order as before: confusion matrix first, then the per-class report.
    for build_report in (confusion_matrix, classification_report):
        print(build_report(y_test, predictions))

# Fit each baseline model and report its performance on the held-out split.
# Logistic regression is trained on the standardised features; the two
# tree-based models are trained on the unscaled features.
baseline_runs = [
    ("\n--- Logistic Regression ---", log_reg, X_train_scaled, X_test_scaled),
    ("\n--- Decision Tree ---", dt, X_train, X_test),
    ("\n--- Random Forest ---", rf, X_train, X_test),
]
for header, estimator, train_features, test_features in baseline_runs:
    print(header)
    estimator.fit(train_features, y_train)
    evaluate_model(estimator, test_features, y_test)

# GridSearchCV on Random Forest
print("\n--- GridSearchCV on Random Forest ---")
# Exhaustive search over 3 x 3 x 3 = 27 hyperparameter combinations, each
# scored with 5-fold cross-validated macro-averaged F1 on the training split.
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='f1_macro',
)
grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)
# Report how the best configuration found performs on the held-out test split.
evaluate_model(grid_search.best_estimator_, X_test, y_test)

# RandomizedSearchCV on Decision Tree
print("\n--- RandomizedSearchCV on Decision Tree ---")
# Candidate values are plain Python ints (same values as before) so that
# best_params_ prints cleanly instead of showing NumPy scalar wrappers.
param_dist = {
    'max_depth': [None, *range(5, 30, 5)],
    'min_samples_split': list(range(2, 10)),
    'min_samples_leaf': list(range(1, 10)),
}
# random_state seeds the candidate sampling. Without it, the 10 sampled
# configurations (and therefore the reported best parameters and test metrics)
# change from run to run, unlike every other seeded step in this script.
random_search = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_dist,
    n_iter=10,
    cv=5,
    scoring='f1_macro',
    random_state=42,
)
random_search.fit(X_train, y_train)
print("Best parameters:", random_search.best_params_)
# Report how the best sampled configuration performs on the held-out split.
evaluate_model(random_search.best_estimator_, X_test, y_test)

print("\n--- DONE ---")



--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       178 non-null    float64
 1   malic_acid                    178 non-null    float64
 2   ash                           178 non-null    float64
 3   alcalinity_of_ash             178 non-null    float64
 4   magnesium                     178 non-null    float64
 5   total_phenols                 178 non-null    float64
 6   flavanoids                    178 non-null    float64
 7   nonflavanoid_phenols          178 non-null    float64
 8   proanthocyanins               178 non-null    float64
 9   color_intensity               178 non-null    float64
 10  hue                           178 non-null    float64
 11  od280/od315_of_diluted_wines  178 non-null    float64
 12  proline                       178 non-null