# Level One :

**Task One : Data preprocessing for Machine Learning**

  * **Description**: Preprocess a raw dataset to make it ready for machine learning.

**Objective**:
  * Handle missing data (e.g., filling with mean/median,dropping).
  * Encode categorical variables (e.g. using one-hot encoding or label encoding).
  * Normalize or standardize numerical feaures.
  * Split the dataset into training and testing sets.
  * Tools: Python, pandas, sckikit-learn.



## **0.1 Connect to google colab**

In [454]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [455]:
import os
os.chdir('/content/drive/MyDrive/Codveda Technologies')

## **0.2 Setup the device**

In [456]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

## **0.3 Import Dependencies**

In [457]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from rich.console import Console
from rich.table import Table
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.box import ROUNDED
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from rich.console import Console
from rich.table import Table
import pickle
from pathlib import Path
console = Console()


## **1.1 Load dataset**

In [458]:
# Load Iris dataset
file_path_iris = "/content/drive/MyDrive/Codveda Technologies/data/1) iris.csv"
df_iris = pd.read_csv(file_path_iris)
# Load House dataset
file_path_house = "/content/drive/MyDrive/Codveda Technologies/data/4) house Prediction Data Set.csv"
df_house = pd.read_csv(file_path_house)
# Load Stock dataset
file_path_stock = "/content/drive/MyDrive/Codveda Technologies/data/2) Stock Prices Data Set.csv"
df_stock = pd.read_csv(file_path_stock)
# Load Sentiment dataset
file_path_sentiment = "/content/drive/MyDrive/Codveda Technologies/data/3) Sentiment dataset.csv"
df_sentiment = pd.read_csv(file_path_sentiment)

file_path_churn_20 = "/content/drive/MyDrive/Codveda Technologies/data/Churn Prdiction Data/churn-bigml-20.csv"
df_churn_20 = pd.read_csv(file_path_churn_20)

file_path_churn_80 = "/content/drive/MyDrive/Codveda Technologies/data/Churn Prdiction Data/churn-bigml-80.csv"
df_churn_80 = pd.read_csv(file_path_churn_80)

## **1.2 Dataset Overview**

In [459]:
def dataset_report(df: pd.DataFrame, file_path: str):
    """Generate a stunning dataset report using Rich tables and panels."""
    file_name = os.path.basename(file_path)
    file_size = os.path.getsize(file_path) / 1024  # KB

    # --- Header ---
    console.print(Panel.fit(
        f"[bold cyan]📊 Dataset Report[/bold cyan]\n"
        f"[white]📂 {file_name}[/white]  |  [green]{df.shape[0]} rows[/green] × [green]{df.shape[1]} cols[/green]  |  [yellow]{file_size:.2f} KB[/yellow]",
        border_style="cyan", box=ROUNDED))

    # --- Columns Overview ---
    t_cols = Table(title="🧾 Columns Overview", box=ROUNDED, show_lines=True)
    t_cols.add_column("Column", style="bold magenta")
    t_cols.add_column("Type")
    t_cols.add_column("Non-Null Count")
    t_cols.add_column("Missing Values")
    for col in df.columns:
        t_cols.add_row(
            col,
            str(df[col].dtype),
            str(df[col].notna().sum()),
            str(df[col].isna().sum())
        )
    console.print(t_cols)

    # --- Column Type Summary ---
    num_cols = df.select_dtypes(include=[np.number]).columns
    cat_cols = df.select_dtypes(exclude=[np.number]).columns
    console.print(Panel.fit(
        f"[cyan]🔢 Numerical:[/cyan] {len(num_cols)} → {', '.join(num_cols[:10])}{' ...' if len(num_cols) > 10 else ''}\n"
        f"[yellow]🔤 Categorical:[/yellow] {len(cat_cols)} → {', '.join(cat_cols[:10])}{' ...' if len(cat_cols) > 10 else ''}",
        title="Column Types", border_style="magenta", box=ROUNDED))

    # --- Missing Values ---
    miss = df.isna().sum()
    if miss.sum() > 0:
        t_miss = Table(title="⚠ Missing Values", box=ROUNDED, show_lines=True)
        t_miss.add_column("Column", style="bold cyan")
        t_miss.add_column("Missing Count")
        for c, v in miss[miss > 0].sort_values(ascending=False).items():
            t_miss.add_row(c, str(int(v)))
        console.print(t_miss)
    else:
        console.print(Panel.fit("[green]✓ No Missing Values[/green]", border_style="green", box=ROUNDED))

    # --- Duplicates ---
    dups = df.duplicated().sum()
    if dups > 0:
        console.print(Panel.fit(f"[red]✗ Duplicated Rows:[/red] {dups}", border_style="red", box=ROUNDED))
    else:
        console.print(Panel.fit("[green]✓ No Duplicated Rows[/green]", border_style="green", box=ROUNDED))

    # --- Descriptive Stats ---
    desc = df.describe(include="all", percentiles=[.25, .5, .75]).T.fillna("")
    t_desc = Table(title="📈 Descriptive Statistics", box=ROUNDED, show_lines=True)
    t_desc.add_column("Column", style="bold magenta")
    for col in desc.columns:
        t_desc.add_column(str(col))
    for idx, row in desc.iterrows():
        t_desc.add_row(str(idx), *[str(val) for val in row.values])
    console.print(t_desc)

    # --- First 5 Examples ---
    t_head = Table(title="📝 First 5 Rows", box=ROUNDED, show_lines=True)
    for col in df.columns:
        t_head.add_column(str(col), overflow="fold", style="cyan")
    for _, row in df.head(5).iterrows():
        t_head.add_row(*[str(val) for val in row.values])
    console.print(t_head)


In [460]:
dataset_report(df_iris, file_path_iris)


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [461]:
dataset_report(df_house, file_path_house)


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [462]:
dataset_report(df_stock, file_path_stock)


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [463]:
dataset_report(df_sentiment, file_path_sentiment)


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [464]:
dataset_report(df_churn_20, file_path_churn_20)


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [465]:
dataset_report(df_churn_80, file_path_churn_80)


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



## **1.3 Dataset Observations & Insights**

---
**🌸 Iris Dataset (1)**
* The dataset contains `150 rows` × `5 columns` with `4 numerical` features (sepal & petal dimensions) and `1 categorical target` (species).
* **No missing values**, but **3 duplicated rows exist**, which may need removal before model training.
* All features are cleanly separated into numeric and categorical, making it ideal for classification tasks.
* Well-structured and balanced dataset — a great candidate for supervised ML experiments.

---
**🏠 House Prediction Dataset (4)**
* The dataset shows `505 rows` × `1 column`, but the entire row is stored as a single object column.
* This indicates the dataset is not properly delimited (possibly missing commas or separators in the CSV).
* Currently, no missing values or duplicates, but parsing is required to split the data into meaningful columns (e.g., price, rooms, location, etc.).
* Needs preprocessing before it can be used in ML task

---
**📈 Stock Prices Dataset (2)**
* Extremely large dataset with `497,472 rows` × `7 columns` (symbols, date, OHLC prices, and volume).
* Contains `11 missing` values in open, 8 in `high`, and 8 in `low` → very small relative to dataset size but should be imputed or dropped.
* No duplicate rows detected.
* A mix of categorical (`symbol`, `date`) and numerical (`prices`, `volume`) data makes it suitable for time-series forecasting & financial analysis.

---
**💬 Sentiment Dataset (3)**
* `732 rows` × `15 columns`, covering text, sentiment, user metadata, and engagement metrics (likes, retweets).
* **No missing values** and **no duplicates**, dataset is clean.
* Features include a mix of textual data (`Text`, `Hashtags`), categorical metadata (`User`, `Platform`, `Country`), and numerical fields (`Retweets`, `Likes`, `Year`, etc.).
* Well-suited for **NLP sentiment classification** with potential for feature engineering from metadata.
---
**📞 Churn Dataset (Churn-bigml-80)**
* `2666 rows` × `20 columns`, covering customer demographics, service usage, and churn label.
* Balanced data types: 16 numerical columns (call minutes, charges, service usage) and 4 categorical columns (state, plans, churn).
* **No missing or duplicate values** — dataset is well-structured and ready for ML.
* Directly applicable for classification tasks (Churn prediction) and can be used for building explainable ML models.
---
**📞 Churn Dataset (Churn-bigml-20)**
* Contains `667 rows` × `20 columns`, making it a relatively small dataset, ideal for exploratory analysis and model prototyping.
* Numerical Features (16): Includes usage metrics such as total day minutes, total eve calls, total intl charge, customer service calls, etc.
* Categorical Features (4): State, International plan, Voice mail plan, and the target variable → Churn.
* No Missing Values: Ensures data completeness.
* No Duplicated Rows: Data is clean and unique across entries.

## **1.4 Preprocess Datasets**

In [466]:
def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """
    Checks and removes duplicate rows in a DataFrame.

    Args:
        df (pd.DataFrame): Input dataset

    Returns:
        pd.DataFrame: DataFrame with duplicates removed
    """
    # Before
    rows_before = len(df)
    duplicates_before = df.duplicated().sum()

    # Drop duplicates
    df_cleaned = df.drop_duplicates()

    # After
    rows_after = len(df_cleaned)
    duplicates_after = df_cleaned.duplicated().sum()

    # Pretty summary table
    table = Table(title="🔍 Duplicate Check Summary", show_lines=True)
    table.add_column("Metric", style="cyan", justify="left")
    table.add_column("Before", style="yellow", justify="center")
    table.add_column("After", style="green", justify="center")

    table.add_row("Rows", str(rows_before), str(rows_after))
    table.add_row("Duplicate Rows", str(duplicates_before), str(duplicates_after))

    console.print(table)

    return df_cleaned

In [467]:
def handle_missing_data(df, strategy="mean"):
    console.rule("[bold yellow]Handling Missing Data")

    missing_before = df.isnull().sum().sum()
    console.print(f"[cyan]Total Missing Values Before: [bold red]{missing_before}")

    if strategy == "mean":
        df = df.fillna(df.mean(numeric_only=True))
    elif strategy == "median":
        df = df.fillna(df.median(numeric_only=True))
    elif strategy == "drop":
        df = df.dropna()
    else:
        console.print("[red]Invalid strategy. Choose 'mean', 'median', or 'drop'.")
        return df

    missing_after = df.isnull().sum().sum()
    console.print(f"[cyan]Total Missing Values After: [bold green]{missing_after}")
    return df


In [468]:
def encode_categorical(df, method="onehot", sample_size=5):
    """
    Encode categorical variables in a DataFrame using One-Hot or Label Encoding.

    Parameters:
        df (pd.DataFrame): Input DataFrame
        method (str): Encoding method ("onehot" or "label")
        sample_size (int): Number of rows to show in before/after preview

    Returns:
        pd.DataFrame: Encoded DataFrame
    """
    console.rule("[bold yellow]🔤 Encoding Categorical Variables")

    # Identify categorical columns
    cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    if not cat_cols:
        console.print("[red]No categorical columns found.[/red]")
        return df

    # Print summary before encoding
    console.print(f"[cyan]Categorical Columns Detected:[/cyan] {cat_cols}")
    console.print(f"[magenta]Shape Before:[/magenta] {df.shape[0]} rows × {df.shape[1]} columns")

    # Show a sample before encoding
    table_before = Table(title="📊 Sample Data (Before Encoding)", show_lines=True)
    for col in df.columns:
        table_before.add_column(col, style="cyan", overflow="fold")
    for _, row in df.head(sample_size).iterrows():
        table_before.add_row(*[str(x) for x in row.values])
    console.print(table_before)

    # Apply encoding
    if method == "onehot":
        df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)
    elif method == "label":
        df_encoded = df.copy()
        le = LabelEncoder()
        for col in cat_cols:
            df_encoded[col] = le.fit_transform(df_encoded[col].astype(str))
    else:
        console.print("[red]❌ Invalid method. Choose 'onehot' or 'label'.")
        return df

    # Print summary after encoding
    console.print(f"[green]✅ Encoding done using: [bold]{method.upper()}[/bold]")
    console.print(f"[magenta]Shape After:[/magenta] {df_encoded.shape[0]} rows × {df_encoded.shape[1]} columns")

    # Show a sample after encoding
    table_after = Table(title="📊 Sample Data (After Encoding)", show_lines=True)
    for col in df_encoded.columns:
        table_after.add_column(col, style="green", overflow="fold")
    for _, row in df_encoded.head(sample_size).iterrows():
        table_after.add_row(*[str(x) for x in row.values])
    console.print(table_after)

    return df_encoded

In [469]:
def split_data(df, target, test_size=0.2, val_size=0.1):
    console.rule("[bold yellow]Splitting Dataset")

    X = df.drop(columns=[target])
    y = df[target]

    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=test_size+val_size, random_state=42)
    relative_val_size = val_size / (test_size + val_size)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=relative_val_size, random_state=42)

    console.print(f"[cyan]Train Size: {X_train.shape[0]}")
    console.print(f"[cyan]Validation Size: {X_val.shape[0]}")
    console.print(f"[cyan]Test Size: {X_test.shape[0]}")

    return X_train, X_val, X_test, y_train, y_val, y_test

In [470]:
def split_dataset(X, y, test_size=0.2, val_size=0.1, random_state=42):
    """
    Splits dataset into train, validation, and test sets with stratification.

    Args:
        X (pd.DataFrame): Features
        y (pd.Series): Target labels
        test_size (float): Proportion for test set
        val_size (float): Proportion for validation set (from remaining after test)
        random_state (int): Random state for reproducibility

    Returns:
        dict: Dictionary containing X_train, X_val, X_test, y_train, y_val, y_test
    """

    console.rule("[bold green] Data Splitting")

    # First split into Train+Val and Test
    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    # Adjust val_size relative to remaining set
    val_adjusted = val_size / (1 - test_size)

    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=val_adjusted, stratify=y_temp, random_state=random_state
    )

    # Print dataset summary in rich table
    table = Table(title="Dataset Split Summary", show_header=True, header_style="bold magenta")
    table.add_column("Set", justify="center")
    table.add_column("Shape (X)", justify="center")
    table.add_column("Shape (y)", justify="center")
    table.add_column("Class Distribution", justify="center")

    for name, X_part, y_part in [
        ("Train", X_train, y_train),
        ("Validation", X_val, y_val),
        ("Test", X_test, y_test)
    ]:
        class_dist = y_part.value_counts().to_dict()
        table.add_row(name, str(X_part.shape), str(y_part.shape), str(class_dist))

    console.print(table)

    # Show sample rows for each split
    console.rule("[bold yellow]Sample Rows")
    for name, X_part, y_part in [
        ("Train", X_train, y_train),
        ("Validation", X_val, y_val),
        ("Test", X_test, y_test)
    ]:
        console.print(f"[cyan]{name} Sample[/cyan]:")
        console.print(pd.concat([X_part.head(3), y_part.head(3)], axis=1))
        console.print("\n")

    return {
        "X_train": X_train, "X_val": X_val, "X_test": X_test,
        "y_train": y_train, "y_val": y_val, "y_test": y_test
    }

In [471]:
def standardize_numeric(X_train, X_val, X_test, scaler_path="models/scaler.pkl"):
    console.rule("[bold yellow]Standardizing Numerical Features")

    num_cols = X_train.select_dtypes(include=np.number).columns.tolist()
    console.print(f"[cyan]Numerical Columns: {num_cols}")

    console.print("[red]Before Standardization (Train Set):")
    console.print(X_train[num_cols].agg(['mean', 'std']))

    scaler = StandardScaler()
    X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_val[num_cols] = scaler.transform(X_val[num_cols])
    X_test[num_cols] = scaler.transform(X_test[num_cols])

    console.print("[green]After Standardization (Train Set):")
    console.print(X_train[num_cols].agg(['mean', 'std']))
    # Save scaler
    os.makedirs(os.path.dirname(scaler_path), exist_ok=True)
    joblib.dump((scaler, num_cols), scaler_path)
    console.print(f"[green]✅ Scaler saved at {scaler_path}[/green]")


    return X_train, X_val, X_test

In [472]:
def save_preprocessed(data: dict, filename: str):
    """
    Save preprocessed dataset dictionary to a pickle file.

    Parameters
    ----------
    data : dict
        Dictionary containing train/val/test splits.
    filename : str
        Name of the file to save (e.g., 'preprocessed.pkl').
    """
    filepath = Path(filename)
    with open(filepath, "wb") as f:
        pickle.dump(data, f)
    print(f"✅ Preprocessed dataset saved to {filepath.resolve()}")


In [473]:
def load_preprocessed(filename: str) -> dict:
    """
    Load preprocessed dataset dictionary from a pickle file.

    Parameters
    ----------
    filename : str
        Path to the saved pickle file.

    Returns
    -------
    dict
        Dictionary containing train/val/test splits.
    """
    filepath = Path(filename)
    if not filepath.exists():
        raise FileNotFoundError(f"❌ File not found: {filepath.resolve()}")

    with open(filepath, "rb") as f:
        data = pickle.load(f)

    print(f"✅ Preprocessed dataset loaded from {filepath.resolve()}")
    return data

### **1.4.1 Preprocess Iris dataset**

In [474]:
# Step 1: Remove Duplicates
df_clean = remove_duplicates(df_iris)

In [475]:
# Print number of rows
console.print(f"[cyan]Total Rows: {df_clean.shape[0]}")

In [476]:
# Step 2: Handle missing
df_clean = handle_missing_data(df_clean, strategy="median")

In [477]:
# Separate features (X) and target (y)
X = df_clean.drop("species", axis=1)
y = df_clean["species"]

In [478]:
result = split_dataset(X, y, test_size=0.2, val_size=0.1, random_state=42)

In [479]:
X_train_std, X_val_std, X_test_std = standardize_numeric(
    result["X_train"].copy(),
    result["X_val"].copy(),
    result["X_test"].copy()
)

In [480]:
result["X_train"] = X_train_std
result["X_val"] = X_val_std
result["X_test"] = X_test_std

In [481]:
save_preprocessed(result, "data/preprocessed/preprocessed_iris.pkl")
loaded_splits = load_preprocessed("data/preprocessed/preprocessed_iris.pkl")

✅ Preprocessed dataset saved to /content/drive/MyDrive/Codveda Technologies/data/preprocessed/preprocessed_iris.pkl
✅ Preprocessed dataset loaded from /content/drive/MyDrive/Codveda Technologies/data/preprocessed/preprocessed_iris.pkl


In [377]:
X_train, y_train = loaded_splits["X_train"], loaded_splits["y_train"]
X_val, y_val = loaded_splits["X_val"], loaded_splits["y_val"]
X_test, y_test = loaded_splits["X_test"], loaded_splits["y_test"]

# Print train/validate and test to make sure that we loaded them sucessfully
console.print(f"[cyan]Train Size: {X_train.shape[0]}")
console.print(f"[cyan]Validation Size: {X_val.shape[0]}")
console.print(f"[cyan]Test Size: {X_test.shape[0]}")

### **1.4.2 Preprocess Stock dataset**

In [378]:
# Step 1: Remove Duplicates
df_clean = remove_duplicates(df_stock)

In [379]:
# Step 2: Handle missing
df_clean = handle_missing_data(df_clean, strategy="median")

In [380]:
# drop symbol column
df_clean = df_clean.drop("symbol", axis=1)

In [381]:
X_train, X_val, X_test, y_train, y_val, y_test = split_data(df_clean, target = "close")

In [382]:
X_train_std, X_val_std, X_test_std = standardize_numeric(
    X_train.copy(),
    X_val.copy(),
    X_test.copy()
)

In [383]:
# Convert splits into dictionary
preprocessed_data = {
    "X_train": X_train_std,
    "X_val": X_val_std,
    "X_test": X_test_std,
    "y_train": y_train,
    "y_val": y_val,
    "y_test": y_test
}

In [384]:
save_preprocessed(preprocessed_data, "data/preprocessed/preprocessed_stock.pkl")
loaded_splits = load_preprocessed("data/preprocessed/preprocessed_stock.pkl")

✅ Preprocessed dataset saved to /content/drive/MyDrive/Codveda Technologies/data/preprocessed/preprocessed_stock.pkl
✅ Preprocessed dataset loaded from /content/drive/MyDrive/Codveda Technologies/data/preprocessed/preprocessed_stock.pkl


In [385]:
X_train, y_train = loaded_splits["X_train"], loaded_splits["y_train"]
X_val, y_val = loaded_splits["X_val"], loaded_splits["y_val"]
X_test, y_test = loaded_splits["X_test"], loaded_splits["y_test"]

# Print train/validate and test to make sure that we loaded them sucessfully
console.print(f"[cyan]Train Size: {X_train.shape[0]}")
console.print(f"[cyan]Validation Size: {X_val.shape[0]}")
console.print(f"[cyan]Test Size: {X_test.shape[0]}")

### **1.4.3 Preprocess house dataset**

In [386]:
# Load properly (handles spaces/tabs as separators)
df_house = pd.read_csv(file_path_house, delim_whitespace=True, header=None)

print(df_house.shape)
print(df_house.head())

(506, 14)
        0     1     2   3      4      5     6       7   8      9     10  \
0  0.00632  18.0  2.31   0  0.538  6.575  65.2  4.0900   1  296.0  15.3   
1  0.02731   0.0  7.07   0  0.469  6.421  78.9  4.9671   2  242.0  17.8   
2  0.02729   0.0  7.07   0  0.469  7.185  61.1  4.9671   2  242.0  17.8   
3  0.03237   0.0  2.18   0  0.458  6.998  45.8  6.0622   3  222.0  18.7   
4  0.06905   0.0  2.18   0  0.458  7.147  54.2  6.0622   3  222.0  18.7   

       11    12    13  
0  396.90  4.98  24.0  
1  396.90  9.14  21.6  
2  392.83  4.03  34.7  
3  394.63  2.94  33.4  
4  396.90  5.33  36.2  



The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\s+'`` instead



In [387]:
# Assume we will use hte columns dataset from kaggle dataset
df_house.columns = [
    "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS",
    "RAD", "TAX", "PTRATIO", "B", "LSTAT", "MEDV"
]

1.  `CRIM` – Per capita crime rate by town.
2.  `ZN` – Proportion of residential land zoned for lots over 25,000 sq. ft.
3.  `INDUS` – Proportion of non-retail business acres per town.
4.  `CHAS` – Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
5.  `NOX` – Nitric oxides concentration (parts per 10 million).
6.  `RM` – Average number of rooms per dwelling.
7.  `AGE` – Proportion of owner-occupied units built prior to 1940.
8.  `DIS` – Weighted distances to five Boston employment centres.
9.  `RAD` – Index of accessibility to radial highways.
10. `TAX` – Full-value property-tax rate per $10,000.
11. `PTRATIO` – Pupil-teacher ratio by town.
12. `B` – 1000(Bk − 0.63)² where Bk is the proportion of Black residents by town.
13. `LSTAT` – % lower status of the population.

In [388]:
# Step 1: Remove Duplicates
df_clean = remove_duplicates(df_house)

In [389]:
# Step 2: Handle missing
df_clean = handle_missing_data(df_clean, strategy="median")

In [390]:
X_train, X_val, X_test, y_train, y_val, y_test = split_data(df_clean, target = "MEDV")

In [391]:
X_train_std, X_val_std, X_test_std = standardize_numeric(
    X_train.copy(),
    X_val.copy(),
    X_test.copy()
)

In [392]:
result["X_train"] = X_train_std
result["X_val"] = X_val_std
result["X_test"] = X_test_std

In [393]:
# Convert splits into dictionary
preprocessed_data = {
    "X_train": X_train_std,
    "X_val": X_val_std,
    "X_test": X_test_std,
    "y_train": y_train,
    "y_val": y_val,
    "y_test": y_test
}

In [394]:
save_preprocessed(preprocessed_data, "data/preprocessed/preprocessed_house.pkl")
loaded_splits = load_preprocessed("data/preprocessed/preprocessed_house.pkl")

✅ Preprocessed dataset saved to /content/drive/MyDrive/Codveda Technologies/data/preprocessed/preprocessed_house.pkl
✅ Preprocessed dataset loaded from /content/drive/MyDrive/Codveda Technologies/data/preprocessed/preprocessed_house.pkl


In [395]:
X_train, y_train = loaded_splits["X_train"], loaded_splits["y_train"]
X_val, y_val = loaded_splits["X_val"], loaded_splits["y_val"]
X_test, y_test = loaded_splits["X_test"], loaded_splits["y_test"]

# Print train/validate and test to make sure that we loaded them sucessfully
console.print(f"[cyan]Train Size: {X_train.shape[0]}")
console.print(f"[cyan]Validation Size: {X_val.shape[0]}")
console.print(f"[cyan]Test Size: {X_test.shape[0]}")

### **1.4.4 Preprocess churn-bigml-20 dataset**

In [396]:
# Step 1: Remove Duplicates
df_clean = remove_duplicates(df_churn_20)

In [397]:
# Step 2: Handle missing
df_clean = handle_missing_data(df_clean, strategy="median")

In [398]:
# Print cols names
df_clean.columns

Index(['State', 'Account length', 'Area code', 'International plan',
       'Voice mail plan', 'Number vmail messages', 'Total day minutes',
       'Total day calls', 'Total day charge', 'Total eve minutes',
       'Total eve calls', 'Total eve charge', 'Total night minutes',
       'Total night calls', 'Total night charge', 'Total intl minutes',
       'Total intl calls', 'Total intl charge', 'Customer service calls',
       'Churn'],
      dtype='object')

In [399]:
# Print cols types
df_clean.dtypes

Unnamed: 0,0
State,object
Account length,int64
Area code,int64
International plan,object
Voice mail plan,object
Number vmail messages,int64
Total day minutes,float64
Total day calls,int64
Total day charge,float64
Total eve minutes,float64


In [400]:
df_clean.head()

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,LA,117,408,No,No,0,184.5,97,31.37,351.6,80,29.89,215.8,90,9.71,8.7,4,2.35,1,False
1,IN,65,415,No,No,0,129.1,137,21.95,228.5,83,19.42,208.8,111,9.4,12.7,6,3.43,4,True
2,NY,161,415,No,No,0,332.9,67,56.59,317.8,97,27.01,160.6,128,7.23,5.4,9,1.46,4,True
3,SC,111,415,No,No,0,110.4,103,18.77,137.3,102,11.67,189.6,105,8.53,7.7,6,2.08,2,False
4,HI,49,510,No,No,0,119.3,117,20.28,215.1,109,18.28,178.7,90,8.04,11.1,1,3.0,1,False


In [401]:
# Find unique values for objects
for col in df_clean.columns:
    if df_clean[col].dtype == 'object':
        unique_values = df_clean[col].unique()
        print(f"Unique values for column '{col}':")
        print(unique_values)

Unique values for column 'State':
['LA' 'IN' 'NY' 'SC' 'HI' 'AK' 'MI' 'ID' 'VA' 'WI' 'MN' 'VT' 'MT' 'MA'
 'KY' 'CO' 'AZ' 'CA' 'WA' 'NE' 'OH' 'MO' 'AL' 'NH' 'NM' 'OR' 'TX' 'MS'
 'WY' 'FL' 'KS' 'NC' 'SD' 'OK' 'CT' 'RI' 'DE' 'UT' 'NV' 'DC' 'ME' 'IL'
 'NJ' 'MD' 'WV' 'PA' 'ND' 'AR' 'TN' 'IA' 'GA']
Unique values for column 'International plan':
['No' 'Yes']
Unique values for column 'Voice mail plan':
['No' 'Yes']


In [402]:
# # Define features and target
X = df_clean.drop(columns=["Churn"])   # Features (everything except target)
y = df_clean["Churn"]                  # Target

In [403]:
df_encoded = encode_categorical(X, method="onehot")

In [404]:
# combine X and y into df_encoded to split them
df_encoded["Churn"] = y

In [405]:
X_train, X_val, X_test, y_train, y_val, y_test = split_data(
      df_encoded, target="Churn"
  )

In [406]:
X_train_std, X_val_std, X_test_std = standardize_numeric(
    X_train.copy(),
    X_val.copy(),
    X_test.copy()
)

In [407]:
result["X_train"] = X_train_std
result["X_val"] = X_val_std
result["X_test"] = X_test_std

In [408]:
# Convert splits into dictionary
preprocessed_data = {
    "X_train": X_train_std,
    "X_val": X_val_std,
    "X_test": X_test_std,
    "y_train": y_train,
    "y_val": y_val,
    "y_test": y_test
}

In [409]:
save_preprocessed(preprocessed_data, "data/preprocessed/preprocessed_churn_20.pkl")
loaded_splits = load_preprocessed("data/preprocessed/preprocessed_churn_20.pkl")

✅ Preprocessed dataset saved to /content/drive/MyDrive/Codveda Technologies/data/preprocessed/preprocessed_churn_20.pkl
✅ Preprocessed dataset loaded from /content/drive/MyDrive/Codveda Technologies/data/preprocessed/preprocessed_churn_20.pkl


In [410]:
X_train, y_train = loaded_splits["X_train"], loaded_splits["y_train"]
X_val, y_val = loaded_splits["X_val"], loaded_splits["y_val"]
X_test, y_test = loaded_splits["X_test"], loaded_splits["y_test"]

# Print train/validate and test to make sure that we loaded them sucessfully
console.print(f"[cyan]Train Size: {X_train.shape[0]}")
console.print(f"[cyan]Validation Size: {X_val.shape[0]}")
console.print(f"[cyan]Test Size: {X_test.shape[0]}")

### **1.4.5 Preprocess churn-bigml-80 dataset**

In [411]:
# Step 1: Remove Duplicates
df_clean = remove_duplicates(df_churn_80)

In [412]:
# Step 2: Handle missing
df_clean = handle_missing_data(df_clean, strategy="median")

In [413]:
# Print cols names
df_clean.columns

Index(['State', 'Account length', 'Area code', 'International plan',
       'Voice mail plan', 'Number vmail messages', 'Total day minutes',
       'Total day calls', 'Total day charge', 'Total eve minutes',
       'Total eve calls', 'Total eve charge', 'Total night minutes',
       'Total night calls', 'Total night charge', 'Total intl minutes',
       'Total intl calls', 'Total intl charge', 'Customer service calls',
       'Churn'],
      dtype='object')

In [414]:
# Print cols types
df_clean.dtypes

Unnamed: 0,0
State,object
Account length,int64
Area code,int64
International plan,object
Voice mail plan,object
Number vmail messages,int64
Total day minutes,float64
Total day calls,int64
Total day charge,float64
Total eve minutes,float64


In [415]:
df_clean.head()

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [416]:
# Find unique values for objects
for col in df_clean.columns:
    if df_clean[col].dtype == 'object':
        unique_values = df_clean[col].unique()
        print(f"Unique values for column '{col}':")
        print(unique_values)

Unique values for column 'State':
['KS' 'OH' 'NJ' 'OK' 'AL' 'MA' 'MO' 'WV' 'RI' 'IA' 'MT' 'ID' 'VT' 'VA'
 'TX' 'FL' 'CO' 'AZ' 'NE' 'WY' 'IL' 'NH' 'LA' 'GA' 'AK' 'MD' 'AR' 'WI'
 'OR' 'DE' 'IN' 'UT' 'CA' 'SD' 'NC' 'WA' 'MN' 'NM' 'NV' 'DC' 'NY' 'KY'
 'ME' 'MS' 'MI' 'SC' 'TN' 'PA' 'HI' 'ND' 'CT']
Unique values for column 'International plan':
['No' 'Yes']
Unique values for column 'Voice mail plan':
['Yes' 'No']


In [417]:
# # Define features and target
X = df_clean.drop(columns=["Churn"])   # Features (everything except target)
y = df_clean["Churn"]                  # Target

In [418]:
df_encoded = encode_categorical(X, method="onehot")

In [419]:
# combine X and y into df_encoded to split them
df_encoded["Churn"] = y

In [420]:
X_train, X_val, X_test, y_train, y_val, y_test = split_data(
      df_encoded, target="Churn"
  )

In [421]:
X_train_std, X_val_std, X_test_std = standardize_numeric(
    X_train.copy(),
    X_val.copy(),
    X_test.copy()
)

In [422]:
result["X_train"] = X_train_std
result["X_val"] = X_val_std
result["X_test"] = X_test_std

In [423]:
# Convert splits into dictionary
preprocessed_data = {
    "X_train": X_train_std,
    "X_val": X_val_std,
    "X_test": X_test_std,
    "y_train": y_train,
    "y_val": y_val,
    "y_test": y_test
}

In [424]:
save_preprocessed(preprocessed_data, "data/preprocessed/preprocessed_churn_80.pkl")
loaded_splits = load_preprocessed("data/preprocessed/preprocessed_churn_80.pkl")

✅ Preprocessed dataset saved to /content/drive/MyDrive/Codveda Technologies/data/preprocessed/preprocessed_churn_80.pkl
✅ Preprocessed dataset loaded from /content/drive/MyDrive/Codveda Technologies/data/preprocessed/preprocessed_churn_80.pkl


In [425]:
X_train, y_train = loaded_splits["X_train"], loaded_splits["y_train"]
X_val, y_val = loaded_splits["X_val"], loaded_splits["y_val"]
X_test, y_test = loaded_splits["X_test"], loaded_splits["y_test"]

# Print train/validate and test to make sure that we loaded them sucessfully
console.print(f"[cyan]Train Size: {X_train.shape[0]}")
console.print(f"[cyan]Validation Size: {X_val.shape[0]}")
console.print(f"[cyan]Test Size: {X_test.shape[0]}")

# **Task Two : Build a Simple Linear Regression Model**

  * **Description**: Build a linear regression model to predicta continuous variable (e.g., house prices).

  
**Objective**:
  * Load a dataset and preprocess it.
  * Train a linear regression model using scikit-learn.
  * Interpret he model coefficients
  * Evaluate the model using R-Squared and mean squared error (MSE)
  * Tools: Python, Pandas, scikit-learn.



In [426]:
import pandas as pd
import numpy as np
import pickle
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from rich.console import Console
from rich.table import Table
from rich.box import ROUNDED
import plotly.express as px
import plotly.graph_objects as go
import joblib

## **2.1 Load Preprocessed Splits**

In [427]:
loaded_splits = load_preprocessed("data/preprocessed/preprocessed_house.pkl")
X_train, y_train = loaded_splits["X_train"], loaded_splits["y_train"]
X_val, y_val     = loaded_splits["X_val"], loaded_splits["y_val"]
X_test, y_test   = loaded_splits["X_test"], loaded_splits["y_test"]

console.print(f"[cyan]Train Size: {X_train.shape[0]}")
console.print(f"[cyan]Validation Size: {X_val.shape[0]}")
console.print(f"[cyan]Test Size: {X_test.shape[0]}")

✅ Preprocessed dataset loaded from /content/drive/MyDrive/Codveda Technologies/data/preprocessed/preprocessed_house.pkl


## **2.2 Define the models**

In [428]:
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.01),
    "Decision Tree": DecisionTreeRegressor(max_depth=6, random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42)
}

## **2.3 Train & Evaluate Models**

In [429]:
def evaluate_regression_model(name, model, X_train, y_train, X_val, y_val, X_test, y_test):
    results = {"Model": name}

    # Fit model
    model.fit(X_train, y_train)

    # Predictions
    y_train_pred = model.predict(X_train)
    y_val_pred   = model.predict(X_val)
    y_test_pred  = model.predict(X_test)

    # Train metrics
    results["Train R²"]   = r2_score(y_train, y_train_pred)
    results["Train MSE"]  = mean_squared_error(y_train, y_train_pred)
    results["Train RMSE"] = np.sqrt(results["Train MSE"])
    results["Train MAE"]  = mean_absolute_error(y_train, y_train_pred)

    # Validation metrics
    results["Val R²"]   = r2_score(y_val, y_val_pred)
    results["Val MSE"]  = mean_squared_error(y_val, y_val_pred)
    results["Val RMSE"] = np.sqrt(results["Val MSE"])
    results["Val MAE"]  = mean_absolute_error(y_val, y_val_pred)

    # Test metrics
    results["Test R²"]   = r2_score(y_test, y_test_pred)
    results["Test MSE"]  = mean_squared_error(y_test, y_test_pred)
    results["Test RMSE"] = np.sqrt(results["Test MSE"])
    results["Test MAE"]  = mean_absolute_error(y_test, y_test_pred)

    # Pretty print with Rich
    table = Table(title=f"{name} — Performance", show_lines=True, box=ROUNDED)
    table.add_column("Dataset", style="bold cyan")
    table.add_column("R²", style="bold green")
    table.add_column("MSE", style="bold yellow")
    table.add_column("RMSE", style="bold magenta")
    table.add_column("MAE", style="bold red")

    for split in ["Train", "Val", "Test"]:
        table.add_row(
            split,
            f"{results[f'{split} R²']:.4f}",
            f"{results[f'{split} MSE']:.2f}",
            f"{results[f'{split} RMSE']:.2f}",
            f"{results[f'{split} MAE']:.2f}"
        )

    console.print(table)

    return results

In [430]:
def visualize_results(results_list, metric="Val R²"):
    """
    results_list: list of dicts from evaluate_regression_model
    metric: str → which metric to visualize (e.g., 'Val R²', 'Test RMSE')
    """
    df = pd.DataFrame(results_list)
    fig = px.bar(df, x="Model", y=metric, text=metric,
                 title=f"Model Comparison: {metric}",
                 color="Model")
    fig.update_traces(texttemplate="%{text:.3f}", textposition="outside")
    fig.update_layout(title_x=0.5, uniformtext_minsize=8, uniformtext_mode="hide")
    fig.show()

In [431]:
all_results = []
for name, model in models.items():
    res = evaluate_regression_model(name, model, X_train, y_train, X_val, y_val, X_test, y_test)
    all_results.append(res)

In [432]:
visualize_results(all_results, metric="Val R²")
visualize_results(all_results, metric="Test RMSE")

In [433]:
def analyze_best_model(results_list, models_dict,
                       X_train, y_train, X_val, y_val, X_test, y_test,
                       criterion="Val R²"):
    """
    Selects the best model based on criterion (default Val R²),
    prints coefficients/feature importances,
    and plots predicted vs actual + residuals.
    """
    # Convert results to DataFrame
    df = pd.DataFrame(results_list)
    best_row = df.sort_values(by=criterion, ascending=False).iloc[0]
    best_model_name = best_row["Model"]
    best_model = models_dict[best_model_name]

    console.print(f"[bold green]🏆 Best Model Detected: {best_model_name}[/bold green]")

    # Retrain best model on Train + Val for full capacity
    X_full = pd.concat([X_train, X_val])
    y_full = pd.concat([y_train, y_val])
    best_model.fit(X_full, y_full)

    # Predict on Test
    y_pred_test = best_model.predict(X_test)

    # ===============================
    # 1) Coefficients / Feature Importance
    # ===============================
    if hasattr(best_model, "coef_"):
        coefs = best_model.coef_
        features = X_train.columns if hasattr(X_train, "columns") else [f"f{i}" for i in range(len(coefs))]

        coef_table = Table(title=f"{best_model_name} — Top Coefficients", show_lines=True, box=ROUNDED)
        coef_table.add_column("Feature", style="bold cyan")
        coef_table.add_column("Coefficient", style="bold yellow")

        top_idx = np.argsort(np.abs(coefs))[::-1][:15]  # top 15
        for i in top_idx:
            coef_table.add_row(features[i], f"{coefs[i]:.4f}")
        console.print(coef_table)

    elif hasattr(best_model, "feature_importances_"):
        imps = best_model.feature_importances_
        features = X_train.columns if hasattr(X_train, "columns") else [f"f{i}" for i in range(len(imps))]

        imp_table = Table(title=f"{best_model_name} — Feature Importances", show_lines=True, box=ROUNDED)
        imp_table.add_column("Feature", style="bold cyan")
        imp_table.add_column("Importance", style="bold yellow")

        top_idx = np.argsort(imps)[::-1][:15]
        for i in top_idx:
            imp_table.add_row(features[i], f"{imps[i]:.4f}")
        console.print(imp_table)

    else:
        console.print("[yellow]⚠ No coefficients or feature importances available for this model[/yellow]")

    # ===============================
    # 2) Predicted vs Actual Plot
    # ===============================
    fig1 = go.Figure()
    fig1.add_trace(go.Scatter(x=y_test, y=y_pred_test, mode="markers",
                              name="Predicted vs Actual",
                              marker=dict(color="cyan", size=7, opacity=0.7)))
    fig1.add_trace(go.Scatter(x=y_test, y=y_test, mode="lines",
                              name="Ideal Fit", line=dict(color="red", dash="dash")))

    fig1.update_layout(
        title=f"{best_model_name} — Predicted vs Actual (Test Set)",
        title_x=0.5,
        xaxis_title="Actual Values",
        yaxis_title="Predicted Values",
        legend=dict(x=0.02, y=0.98, bgcolor="rgba(0,0,0,0)")
    )
    fig1.show()

    # ===============================
    # 3) Residual Plot
    # ===============================
    residuals = y_test - y_pred_test
    fig2 = go.Figure()
    fig2.add_trace(go.Scatter(x=y_pred_test, y=residuals, mode="markers",
                              name="Residuals", marker=dict(color="orange", size=7, opacity=0.7)))
    fig2.add_hline(y=0, line_dash="dash", line_color="red", annotation_text="Zero Line")

    fig2.update_layout(
        title=f"{best_model_name} — Residuals vs Predicted (Test Set)",
        title_x=0.5,
        xaxis_title="Predicted Values",
        yaxis_title="Residuals",
        legend=dict(x=0.02, y=0.98, bgcolor="rgba(0,0,0,0)")
    )
    fig2.show()
    return best_model

In [434]:
best_model = analyze_best_model(all_results, models,
                   X_train, y_train, X_val, y_val, X_test, y_test,
                   criterion="Val R²")

In [435]:
def save_model(model, filepath: str):
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    joblib.dump(model, filepath)
    console.print(f"[green]✅ Model saved at:[/green] {filepath}")

In [436]:
def load_model(filepath: str):
    if not os.path.exists(filepath):
        console.print(f"[red]❌ Model not found at:[/red] {filepath}")
        return None
    model = joblib.load(filepath)
    console.print(f"[green]✅ Model loaded from:[/green] {filepath}")
    return model

In [437]:
save_model(best_model, "models/house_price_best_model.pkl")

In [438]:
loaded_model = load_model("models/house_price_best_model.pkl")

In [439]:
y_pred_loaded = loaded_model.predict(X_test)
console.print(f"[cyan]Prediction sample:[/cyan] {y_pred_loaded[:5]}")

# Level One :

**Task Three : Implement K-Nearest Neighbors (KNN) Classifier**

  * **Description**: Build a KNN to classify data points into categories.

  
**Objective**:
  * Train a KNN model on a labeled dataset.
  * Evaluate the performance using accuracy, confusion matrix and precision/recall.
  * Use different values of K and compare the results.
  * Evaluate the model using R-Squared and mean squared error (MSE)
  * Tools: Python, Pandas, scikit-learn.



In [440]:
!pip install optuna



In [441]:
from sklearn.neighbors import KNeighborsClassifier
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, r2_score, mean_squared_error
import plotly.figure_factory as ff

In [442]:
# load preprocessed dataset
loaded_splits = load_preprocessed("data/preprocessed/preprocessed_iris.pkl")
X_train, y_train = loaded_splits["X_train"], loaded_splits["y_train"]
X_val, y_val     = loaded_splits["X_val"], loaded_splits["y_val"]
X_test, y_test   = loaded_splits["X_test"], loaded_splits["y_test"]

console.print(f"[cyan]Train Size: {X_train.shape[0]}")
console.print(f"[cyan]Validation Size: {X_val.shape[0]}")
console.print(f"[cyan]Test Size: {X_test.shape[0]}")

✅ Preprocessed dataset loaded from /content/drive/MyDrive/Codveda Technologies/data/preprocessed/preprocessed_iris.pkl


In [443]:
def objective_knn(trial, X_train, y_train, X_val, y_val):
    k = trial.suggest_int("n_neighbors", 3, 20)
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    return accuracy_score(y_val, y_pred)

def objective_logreg(trial, X_train, y_train, X_val, y_val):
    C = trial.suggest_float("C", 0.01, 10.0, log=True)
    solver = trial.suggest_categorical("solver", ["liblinear", "lbfgs"])
    model = LogisticRegression(C=C, solver=solver, max_iter=200)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    return accuracy_score(y_val, y_pred)

def objective_rf(trial, X_train, y_train, X_val, y_val):
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    max_depth = trial.suggest_int("max_depth", 2, 20)
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    return accuracy_score(y_val, y_pred)

def objective_svc(trial, X_train, y_train, X_val, y_val):
    C = trial.suggest_float("C", 0.01, 10.0, log=True)
    kernel = trial.suggest_categorical("kernel", ["linear", "rbf", "poly"])
    model = SVC(C=C, kernel=kernel, probability=True)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    return accuracy_score(y_val, y_pred)

In [444]:
def tune_model(model_name, objective_fn, X_train, y_train, X_val, y_val, n_trials=30):
    study = optuna.create_study(direction="maximize")
    study.optimize(lambda trial: objective_fn(trial, X_train, y_train, X_val, y_val), n_trials=n_trials)

    console.print(f"[green]✅ Best {model_name} Params:[/green] {study.best_params}")
    console.print(f"[cyan]Best Validation Accuracy:[/cyan] {study.best_value:.4f}")

    # Re-train best model with best params
    if model_name == "KNN":
        best_model = KNeighborsClassifier(**study.best_params)
    elif model_name == "LogisticRegression":
        best_model = LogisticRegression(**study.best_params, max_iter=200)
    elif model_name == "RandomForest":
        best_model = RandomForestClassifier(**study.best_params, random_state=42)
    elif model_name == "SVC":
        best_model = SVC(**study.best_params, probability=True)

    best_model.fit(pd.concat([X_train, X_val]), pd.concat([y_train, y_val]))

    return best_model, study.best_params, study.best_value

In [445]:
all_models = {
    "KNN": objective_knn,
    "LogisticRegression": objective_logreg,
    "RandomForest": objective_rf,
    "SVC": objective_svc
}

best_models = {}
results = []

for model_name, obj_fn in all_models.items():
    console.print(f"[bold yellow]🔎 Tuning {model_name}...[/bold yellow]")
    model, params, score = tune_model(model_name, obj_fn, X_train, y_train, X_val, y_val, n_trials=20)
    best_models[model_name] = model
    results.append({"Model": model_name, "Val Accuracy": score, "Params": params})

results_df = pd.DataFrame(results)
console.print(results_df)

[I 2025-08-25 18:22:48,638] A new study created in memory with name: no-name-56b8b50f-01b9-4338-9dc2-32cd9b3d1e2c
[I 2025-08-25 18:22:48,647] Trial 0 finished with value: 1.0 and parameters: {'n_neighbors': 15}. Best is trial 0 with value: 1.0.
[I 2025-08-25 18:22:48,653] Trial 1 finished with value: 1.0 and parameters: {'n_neighbors': 6}. Best is trial 0 with value: 1.0.
[I 2025-08-25 18:22:48,661] Trial 2 finished with value: 1.0 and parameters: {'n_neighbors': 13}. Best is trial 0 with value: 1.0.
[I 2025-08-25 18:22:48,667] Trial 3 finished with value: 1.0 and parameters: {'n_neighbors': 19}. Best is trial 0 with value: 1.0.
[I 2025-08-25 18:22:48,674] Trial 4 finished with value: 1.0 and parameters: {'n_neighbors': 6}. Best is trial 0 with value: 1.0.
[I 2025-08-25 18:22:48,681] Trial 5 finished with value: 1.0 and parameters: {'n_neighbors': 10}. Best is trial 0 with value: 1.0.
[I 2025-08-25 18:22:48,687] Trial 6 finished with value: 1.0 and parameters: {'n_neighbors': 20}. Best

[I 2025-08-25 18:22:48,826] A new study created in memory with name: no-name-646adccc-d895-405b-bc18-34a374af46eb
[I 2025-08-25 18:22:48,832] Trial 0 finished with value: 1.0 and parameters: {'C': 0.5454085408151761, 'solver': 'liblinear'}. Best is trial 0 with value: 1.0.
[I 2025-08-25 18:22:48,844] Trial 1 finished with value: 1.0 and parameters: {'C': 0.16879797870170113, 'solver': 'lbfgs'}. Best is trial 0 with value: 1.0.
[I 2025-08-25 18:22:48,851] Trial 2 finished with value: 1.0 and parameters: {'C': 1.7510618761915027, 'solver': 'liblinear'}. Best is trial 0 with value: 1.0.
[I 2025-08-25 18:22:48,859] Trial 3 finished with value: 1.0 and parameters: {'C': 0.1090103147293033, 'solver': 'lbfgs'}. Best is trial 0 with value: 1.0.
[I 2025-08-25 18:22:48,865] Trial 4 finished with value: 0.9333333333333333 and parameters: {'C': 0.16366834948268552, 'solver': 'liblinear'}. Best is trial 0 with value: 1.0.
[I 2025-08-25 18:22:48,874] Trial 5 finished with value: 1.0 and parameters: 

[I 2025-08-25 18:22:49,102] A new study created in memory with name: no-name-09a558a4-23c3-4d0f-a53c-50585b2f8834
[I 2025-08-25 18:22:49,601] Trial 0 finished with value: 1.0 and parameters: {'n_estimators': 240, 'max_depth': 2}. Best is trial 0 with value: 1.0.
[I 2025-08-25 18:22:50,212] Trial 1 finished with value: 1.0 and parameters: {'n_estimators': 269, 'max_depth': 9}. Best is trial 0 with value: 1.0.
[I 2025-08-25 18:22:50,597] Trial 2 finished with value: 1.0 and parameters: {'n_estimators': 120, 'max_depth': 12}. Best is trial 0 with value: 1.0.
[I 2025-08-25 18:22:50,900] Trial 3 finished with value: 1.0 and parameters: {'n_estimators': 208, 'max_depth': 9}. Best is trial 0 with value: 1.0.
[I 2025-08-25 18:22:51,260] Trial 4 finished with value: 1.0 and parameters: {'n_estimators': 281, 'max_depth': 11}. Best is trial 0 with value: 1.0.
[I 2025-08-25 18:22:51,476] Trial 5 finished with value: 1.0 and parameters: {'n_estimators': 158, 'max_depth': 13}. Best is trial 0 with v

[I 2025-08-25 18:23:05,615] A new study created in memory with name: no-name-3b407d18-6ffa-47e2-ad8a-88c8218cb7b1
[I 2025-08-25 18:23:05,639] Trial 0 finished with value: 1.0 and parameters: {'C': 0.47161208129672033, 'kernel': 'rbf'}. Best is trial 0 with value: 1.0.
[I 2025-08-25 18:23:05,662] Trial 1 finished with value: 0.9333333333333333 and parameters: {'C': 0.12253025289946524, 'kernel': 'poly'}. Best is trial 0 with value: 1.0.
[I 2025-08-25 18:23:05,676] Trial 2 finished with value: 1.0 and parameters: {'C': 0.27865671820171517, 'kernel': 'rbf'}. Best is trial 0 with value: 1.0.
[I 2025-08-25 18:23:05,708] Trial 3 finished with value: 1.0 and parameters: {'C': 0.4882611042377666, 'kernel': 'rbf'}. Best is trial 0 with value: 1.0.
[I 2025-08-25 18:23:05,733] Trial 4 finished with value: 0.6666666666666666 and parameters: {'C': 0.033624266107475674, 'kernel': 'rbf'}. Best is trial 0 with value: 1.0.
[I 2025-08-25 18:23:05,765] Trial 5 finished with value: 1.0 and parameters: {'C

In [446]:
results_df = pd.DataFrame(results)
console.print(results_df)

In [447]:
best_row = results_df.sort_values(by="Val Accuracy", ascending=False).iloc[0]
best_model_name = best_row["Model"]
best_model = best_models[best_model_name]

console.print(f"[bold green]🏆 Best Model: {best_model_name} with Val Accuracy {best_row['Val Accuracy']:.4f}[/bold green]")

save_model(best_model, f"models/{best_model_name}_best.pkl")

In [448]:
def train_best_model(model_name, best_params, X_train, y_train, X_val, y_val):
    """
    Train the best model with Optuna-selected hyperparameters.
    Uses Train + Validation data together for final training.
    """
    # Merge train + validation
    X_train_full = pd.concat([X_train, X_val])
    y_train_full = pd.concat([y_train, y_val])

    if model_name == "KNN":
        model = KNeighborsClassifier(**best_params)
    elif model_name == "LogisticRegression":
        model = LogisticRegression(**best_params, max_iter=200)
    elif model_name == "RandomForest":
        model = RandomForestClassifier(**best_params, random_state=42)
    elif model_name == "SVC":
        model = SVC(**best_params, probability=True)
    else:
        raise ValueError(f"❌ Unsupported model type: {model_name}")

    model.fit(X_train_full, y_train_full)
    console.print(f"[green]✅ Final {model_name} trained with best parameters[/green]")
    return model

In [449]:
def evaluate_model(model, X_train, y_train, X_val, y_val, X_test, y_test, class_names=None):
    """Evaluate model on train, validation, and test sets with metrics + confusion matrix"""
    results = {}
    splits = {"Train": (X_train, y_train), "Validation": (X_val, y_val), "Test": (X_test, y_test)}

    table = Table(title="KNN Model Evaluation", show_lines=True, box=ROUNDED)
    table.add_column("Dataset", style="bold cyan")
    table.add_column("Accuracy", style="bold green")
    table.add_column("Precision", style="yellow")
    table.add_column("Recall", style="magenta")
    table.add_column("F1", style="red")
    for split_name, (X_split, y_split) in splits.items():
        y_pred = model.predict(X_split)

        acc = accuracy_score(y_split, y_pred)
        prec = precision_score(y_split, y_pred, average="weighted")
        rec = recall_score(y_split, y_pred, average="weighted")
        f1 = f1_score(y_split, y_pred, average="weighted")


        results[split_name] = {
            "Accuracy": acc,
            "Precision": prec,
            "Recall": rec,
            "F1": f1,
        }

        table.add_row(
            split_name,
            f"{acc:.3f}", f"{prec:.3f}", f"{rec:.3f}", f"{f1:.3f}",
        )

    console.print(table)

    # Confusion Matrix (Test Set)
    y_test_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_test_pred)
    if class_names is None:
        class_names = [str(i) for i in np.unique(y_test)]

    fig_cm = ff.create_annotated_heatmap(z=cm, x=class_names, y=class_names,
                                         colorscale="Blues", showscale=True)
    fig_cm.update_layout(title="Confusion Matrix (Test Set)", title_x=0.5)
    fig_cm.show()

    return results

In [450]:
def save_model(model, filepath="model/knn_iris.pkl"):
    """Save trained model"""
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    joblib.dump(model, filepath)
    console.print(f"[green]💾 Model saved at:[/green] {filepath}")


In [451]:
def load_model(filepath="model/knn_iris.pkl"):
    """Load model"""
    if not os.path.exists(filepath):
        console.print(f"[red]❌ Model not found at {filepath}[/red]")
        return None
    model = joblib.load(filepath)
    console.print(f"[green]✅ Model loaded from:[/green] {filepath}")
    return model

In [452]:
# Get the best model from Optuna
best_model_row = results_df.sort_values(by="Val Accuracy", ascending=False).iloc[0]
best_model_name = best_model_row["Model"]
best_params = best_model_row["Params"]

# Train final model
final_model = train_best_model(best_model_name, best_params, X_train, y_train, X_val, y_val)

# Evaluate on Test Set
evaluate_model(final_model, X_train, y_train, X_val, y_val, X_test, y_test,
               class_names=["Setosa", "Versicolor", "Virginica"])

# Save the trained model
save_model(final_model, f"models/{best_model_name}_best_Iris.pkl")

In [453]:
# Load model later
loaded_knn = load_model("models/KNN_best_Iris.pkl")