# Machine Learning Classification Assignment

**Student Information:**
- BITS ID: 2025AB05021
- Name: Bhavani Mallem
- Email: 2025ab05021@wilp.bits-pilani.ac.in
- Date: 6 Feb 2026

## Assignment Overview
This notebook implements 6 different classification models and evaluates them on a chosen dataset:
1. Logistic Regression
2. Decision Tree Classifier
3. K-Nearest Neighbor Classifier
4. Naive Bayes Classifier
5. Random Forest Classifier
6. XGBoost Classifier

Each model will be evaluated using:
- Accuracy
- AUC Score
- Precision
- Recall
- F1 Score

## 1. Import Required Libraries

In [1]:
# Data handling
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# Evaluation metrics
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, classification_report,
    confusion_matrix, roc_curve, auc
)

# Utilities
import warnings
import os
import json
from datetime import datetime
warnings.filterwarnings('ignore')

# Set style
# Apply the matplotlib style sheet first, then seaborn's default palette.
# NOTE(review): the 'seaborn-v0_8' style name presumably requires a recent
# matplotlib release (older releases named it 'seaborn') - confirm against
# the installed version.
plt.style.use('seaborn-v0_8')
# Use the "husl" palette as seaborn's default colour cycle for later plots.
sns.set_palette("husl")

# Confirmation message so the notebook output shows the import cell ran.
print("Libraries imported successfully!")

Libraries imported successfully!


## 2. Dataset Selection and Loading

For this assignment, I'll use the **Breast Cancer Wisconsin (Diagnostic) Dataset**, which is a binary classification problem based on measurements of cell nuclei.

**Dataset Details:**
- **Source**: UCI Machine Learning Repository (loaded via `sklearn.datasets.load_breast_cancer`)
- **Features**: 30 numeric features (mean, standard error, and worst values of radius, texture, perimeter, area, etc.)
- **Instances**: 569 instances
- **Target**: Binary classification (0 = malignant, 1 = benign)
- **Problem Type**: Binary Classification

In [None]:
# Fetch the Breast Cancer Wisconsin data bundled with scikit-learn and wrap it
# in a DataFrame whose last column ('target') holds the class label.
from sklearn.datasets import load_breast_cancer

cancer_data = load_breast_cancer()
df = pd.DataFrame(
    cancer_data.data, columns=cancer_data.feature_names
).assign(target=cancer_data.target)

# Size summary: every column except 'target' is a feature.
n_samples, n_columns = df.shape
print(f"Dataset shape: {df.shape}")
print(f"Features: {n_columns - 1}")
print(f"Samples: {n_samples}")

# Class names, indexed by the integer label they correspond to.
class_names = cancer_data.target_names
print(f"\nTarget classes: {class_names}")
print(f"Target mapping: 0 = {class_names[0]}, 1 = {class_names[1]}")

# Column dtypes / non-null counts
print("\nDataset Info:")
df.info()

# Preview; as the last expression of the cell it is rendered by the notebook.
print("\nFirst 5 rows:")
df.head()

In [None]:
# Quick exploratory summary: descriptive statistics, class balance,
# missing-value count and column dtypes.
print("Dataset Description:")
print(df.describe())

# Absolute and relative class frequencies of the label column.
target_column = df['target']
class_counts = target_column.value_counts(normalize=True)

print("\nTarget Variable Distribution:")
print(target_column.value_counts())
print("\nClass Balance:")
print(f"Malignant (0): {class_counts[0]:.3f}")
print(f"Benign (1): {class_counts[1]:.3f}")

# Total number of missing cells across the whole frame.
total_missing = df.isnull().sum().sum()
print("\nMissing Values:")
print(total_missing, "total missing values")

# How many columns there are of each dtype.
dtype_counts = df.dtypes.value_counts()
print("\nData Types:")
print("All features are numeric - no encoding needed!")
print(f"Feature data types: {dtype_counts}")

## 3. Data Preprocessing

In [None]:
# Work on a copy so the raw frame `df` stays untouched.
data = df.copy()

# Object-typed columns are treated as categorical; the label column is
# never encoded as a feature.
categorical_columns = [
    name
    for name in data.select_dtypes(include=['object']).columns
    if name != 'target'
]

print(f"Categorical columns: {categorical_columns}")

# One fitted LabelEncoder per categorical column (stays empty when every
# feature is already numeric).
label_encoders = {}
if not categorical_columns:
    print("No categorical variables found - all features are numeric!")
for col in categorical_columns:
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col])
    label_encoders[col] = encoder
    print(f"Encoded {col}: {encoder.classes_}")

# Show the state of the frame after (optional) encoding.
print("\nDataset after preprocessing:")
print(data.head())
print(f"Shape: {data.shape}")
print(f"Data types: {data.dtypes.value_counts()}")

In [None]:
# Label vector and feature matrix
y = data['target']
X = data.drop(columns='target')

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Number of features: {X.shape[1]}")
feature_preview = X.columns.tolist()[:5]  # only the first 5 names are shown
print(f"Feature names: {feature_preview}...")

# 80/20 hold-out split, stratified on the label so both parts keep the
# class proportions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"\nTraining set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Training target distribution: {y_train.value_counts().to_dict()}")
print(f"Test target distribution: {y_test.value_counts().to_dict()}")

# Standardise features: the scaler is fitted on the training part only and
# then applied to both parts, so no test-set statistics are used in fitting.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nFeatures scaled successfully!")
print(f"Feature scaling - Original range example: {X_train.iloc[0, 0]:.2f}")
print(f"Feature scaling - Scaled range example: {X_train_scaled[0, 0]:.2f}")

## 4. Model Implementation and Evaluation

In [None]:
# Function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit ``model`` on the training data and score it on the test data.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; ``predict_proba``
            is used for the AUC score when the estimator provides it.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        model_name: Display name used in the printed report and stored under
            the ``'Model'`` key of the result.

    Returns:
        A ``(results, model)`` tuple: ``results`` is a dict with the keys
        ``'Model'``, ``'Accuracy'``, ``'AUC Score'``, ``'Precision'``,
        ``'Recall'`` and ``'F1 Score'``; ``model`` is the same estimator
        object after fitting. ``'AUC Score'`` is the string ``"N/A"`` when
        the estimator has no ``predict_proba``.
    """
    model.fit(X_train, y_train)

    # Hard predictions, plus the second probability column when available.
    predictions = model.predict(X_test)
    if hasattr(model, 'predict_proba'):
        positive_proba = model.predict_proba(X_test)[:, 1]
    else:
        positive_proba = None

    # Label-based metrics, all computed from the hard predictions.
    accuracy, precision, recall, f1 = (
        metric(y_test, predictions)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )

    # AUC needs scores rather than labels; fall back to a sentinel string.
    if positive_proba is None:
        auc_value = "N/A"
        auc_text = auc_value
    else:
        auc_value = roc_auc_score(y_test, positive_proba)
        auc_text = f"{auc_value:.4f}"

    # Key order determines the column order of the results table.
    results = {
        'Model': model_name,
        'Accuracy': accuracy,
        'AUC Score': auc_value,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

    print(f"\n{model_name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"AUC Score: {auc_text}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    return results, model

### 4.1 Logistic Regression

In [None]:
# Model 1: logistic regression, trained and scored on the standardised features.
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_results, lr_fitted = evaluate_model(
    model=lr_model,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
    model_name="Logistic Regression",
)

### 4.2 Decision Tree Classifier

In [None]:
# Model 2: depth-limited decision tree, trained and scored on the unscaled features.
dt_model = DecisionTreeClassifier(max_depth=10, random_state=42)
dt_results, dt_fitted = evaluate_model(
    model=dt_model,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    model_name="Decision Tree",
)

### 4.3 K-Nearest Neighbors Classifier

In [None]:
# Model 3: 5-nearest-neighbours, trained and scored on the standardised features.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_results, knn_fitted = evaluate_model(
    model=knn_model,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
    model_name="K-Nearest Neighbors",
)

### 4.4 Naive Bayes Classifier

In [None]:
# Model 4: Gaussian naive Bayes, trained and scored on the standardised features.
nb_model = GaussianNB()
nb_results, nb_fitted = evaluate_model(
    model=nb_model,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
    model_name="Naive Bayes (Gaussian)",
)

### 4.5 Random Forest Classifier

In [None]:
# Model 5: 100-tree random forest, trained and scored on the unscaled features.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_results, rf_fitted = evaluate_model(
    model=rf_model,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    model_name="Random Forest",
)

### 4.6 XGBoost Classifier

In [None]:
# Model 6: XGBoost with log-loss as its evaluation metric, trained and scored
# on the unscaled features.
xgb_model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
xgb_results, xgb_fitted = evaluate_model(
    model=xgb_model,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    model_name="XGBoost",
)

## 5. Results Summary

In [None]:
# Compile all results (one metrics dict per model, in evaluation order)
all_results = [lr_results, dt_results, knn_results, nb_results, rf_results, xgb_results]

# Create results DataFrame: one row per model, metrics rounded for display
results_df = pd.DataFrame(all_results)
results_df = results_df.round(4)

print("Model Performance Comparison:")
print("=" * 80)
print(results_df.to_string(index=False))

# Identify best models
print(f"\nBest Models:")
print(f"Highest Accuracy: {results_df.loc[results_df['Accuracy'].idxmax(), 'Model']} ({results_df['Accuracy'].max():.4f})")

# The metric column is named 'AUC Score' (not 'AUC'). It may also hold the
# "N/A" sentinel for models without predict_proba, so coerce it to numeric
# (sentinel -> NaN) before looking for the maximum.
auc_values = pd.to_numeric(results_df['AUC Score'], errors='coerce')
if auc_values.notna().any():
    print(f"Highest AUC: {results_df.loc[auc_values.idxmax(), 'Model']} ({auc_values.max():.4f})")
else:
    print("Highest AUC: N/A")

print(f"Highest F1: {results_df.loc[results_df['F1 Score'].idxmax(), 'Model']} ({results_df['F1 Score'].max():.4f})")

# Last expression of the cell, so the notebook renders the table.
results_df

## 6. Visualization

In [None]:
# Bar-chart comparison: one panel per metric on a 2x3 grid (the sixth panel
# is removed because there are only five metrics).
metrics = ['Accuracy', 'AUC Score', 'Precision', 'Recall', 'F1 Score']

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()
fig.suptitle('Model Performance Comparison', fontsize=16, fontweight='bold')

for ax, metric in zip(axes, metrics):
    plot_data = results_df.copy()
    if metric == 'AUC Score':
        # Rows carrying the "N/A" sentinel cannot be plotted; drop them and
        # make sure the remaining values are floats.
        plot_data = plot_data[plot_data[metric] != 'N/A'].copy()
        plot_data[metric] = plot_data[metric].astype(float)

    if plot_data.empty:
        continue

    # One Set3 colour per bar
    bar_colors = plt.cm.Set3(np.linspace(0, 1, len(plot_data)))
    bars = ax.bar(plot_data['Model'], plot_data[metric], color=bar_colors)
    ax.set_title(f'{metric} by Model', fontweight='bold')
    ax.set_ylabel(metric)
    ax.tick_params(axis='x', rotation=45)

    # Annotate each bar with its value, centred just above the bar top.
    for bar in bars:
        bar_top = bar.get_height()
        bar_center = bar.get_x() + bar.get_width()/2.
        ax.text(bar_center, bar_top, f'{bar_top:.3f}',
                ha='center', va='bottom', fontweight='bold')

# Drop the unused sixth panel
fig.delaxes(axes[5])

plt.tight_layout()
plt.show()

# Persist the metrics table so the Streamlit app can load it
results_df.to_csv('model_results.csv', index=False)
print("\nResults saved to model_results.csv")

## 7. Save Models and Preprocessors

In [None]:
# Save models and preprocessors for the Streamlit app
import joblib
import pickle

# Fitted estimators keyed by the prefix used for their file names
models_dict = {
    'logistic_regression': lr_fitted,
    'decision_tree': dt_fitted,
    'knn': knn_fitted,
    'naive_bayes': nb_fitted,
    'random_forest': rf_fitted,
    'xgboost': xgb_fitted
}

# Save each model as '<name>_model.pkl' in the working directory
for name, model in models_dict.items():
    joblib.dump(model, f'{name}_model.pkl')
    print(f"Saved {name} model")

# Save preprocessors (the scaler fitted on the training split and the
# per-column label encoders, which may be an empty dict)
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(label_encoders, 'label_encoders.pkl')

# Save feature names and other metadata
metadata = {
    'feature_names': X.columns.tolist(),
    'categorical_columns': categorical_columns,
    # Class names are taken from the dataset the models were trained on, so
    # position i is the name of encoded label i. The previous hard-coded
    # ['No Diabetes', 'Diabetes'] did not match the breast cancer data.
    'target_classes': [str(class_name) for class_name in cancer_data.target_names]
}

with open('metadata.pkl', 'wb') as f:
    pickle.dump(metadata, f)

print("\nAll models and preprocessors saved successfully!")
print("Ready to build Streamlit app.")