# Level 2 
## Task 2: Classification

Welcome to this notebook for Level 2 Task 2: **Classification**.  
In this task, we'll explore how to:

- Load and preview cleaned datasets
- Configure a classification pipeline
- Train and evaluate two models: Logistic Regression and Random Forest
- Analyze their performance with accuracy, precision, recall, F1 score, and confusion matrices

This notebook is interactive and modular—you're encouraged to experiment with your own dataset as well!

In [142]:
# -------------------- Imports --------------------
import os
import pandas as pd
import numpy as np
import io
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
from IPython.display import display, Markdown, clear_output
from ipywidgets import Output, VBox

In [143]:
# -------------------- Paths & Defaults --------------------
# root_dir is the parent of the current working directory; cleaned CSVs live in <root>/data/cleaned.
root_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
cleaned_dir = os.path.join(root_dir, "data", "cleaned")

default_file = "iris_cleaned.csv"  # or any available cleaned dataset
# os.listdir returns entries in arbitrary order; sort so the dropdown order is
# stable across runs and machines.
available_csvs = sorted(f for f in os.listdir(cleaned_dir) if f.endswith(".csv"))


In [144]:
# -------------------- Widgets --------------------
# Dropdown raises a TraitError if `value` is not one of `options`, so only
# preselect default_file when it is actually present; otherwise fall back to
# the first available CSV (or nothing when the folder holds no CSVs).
if default_file in available_csvs:
    _initial_file = default_file
elif available_csvs:
    _initial_file = available_csvs[0]
else:
    _initial_file = None

file_selector = widgets.Dropdown(options=available_csvs, value=_initial_file, description="File:")
file_upload = widgets.FileUpload(accept=".csv", multiple=False)
# Target/feature options are filled in by render_dataset_ui once a dataset is loaded.
target_selector = widgets.Dropdown(options=[], description="Target:")
feature_selector = widgets.SelectMultiple(options=[], description="Features")
run_button = widgets.Button(description="Run Classification", button_style='success')

# UI containers
dataset_ui = Output()   # dataset preview + target/feature configuration
output_area = Output()  # load errors and classification results

In [145]:
def load_dataset(file_path=None, uploaded=None):
    """Read a CSV into a DataFrame from an upload payload or a file path.

    Args:
        file_path: Path of a CSV file on disk. Used only when ``uploaded`` is falsy.
        uploaded: Mapping with a ``'content'`` entry holding the raw CSV bytes.
            Takes precedence over ``file_path`` when truthy.

    Returns:
        The parsed DataFrame, or ``None`` when no source was given or reading
        failed. On failure the error message is printed into ``output_area``.
    """
    try:
        # Resolve the source first, then parse once.
        if uploaded:
            source = io.BytesIO(uploaded['content'])
        elif file_path:
            source = file_path
        else:
            source = None

        if source is not None:
            return pd.read_csv(source)
    except Exception as e:
        # Best-effort loader: report the problem in the results area instead of raising.
        with output_area:
            clear_output()
            print(f"❌ Failed to load dataset: {e}")
    return None

# -------------------- Update full dataset section --------------------
def render_dataset_ui(df):
    """Redraw the dataset preview and the target/feature configuration widgets.

    Stores ``df`` in the module-level ``current_df``, clears ``dataset_ui`` and
    renders the preview plus the selectors inside it. The last column is
    preselected as the target, and every *numeric* column other than the target
    is preselected as a feature.

    Args:
        df: The DataFrame that was just loaded.
    """
    global current_df
    current_df = df  # Set current dataset

    dataset_ui.clear_output()

    with dataset_ui:
        display(Markdown("## Dataset Preview"))
        display(Markdown("The loaded dataset is displayed below. This helps you understand the available columns before choosing your target and features."))
        display(df.head())

        # Update selector options
        columns = df.columns.tolist()
        target_selector.options = columns
        feature_selector.options = columns
        target_selector.value = columns[-1]

        # Only numeric columns are accepted as features by the training step,
        # so do not preselect columns that would make the run fail immediately.
        numeric_columns = set(df.select_dtypes(include="number").columns)
        feature_selector.value = tuple(
            col for col in columns
            if col != target_selector.value and col in numeric_columns
        )

        display(Markdown("## Configure Target & Features"))
        display(Markdown("Choose which column to use as the **target** (classification label),  \n"
                         "and select one or more **numeric** columns to use as features."))
        display(target_selector, feature_selector, run_button)

# -------------------- Handle Dataset Change --------------------
def on_file_selected(change=None):
    """Load the CSV chosen in ``file_selector`` and redraw the dataset section.

    Args:
        change: Trait-change payload passed by ``observe``; unused, so the
            function can also be called directly with no argument.
    """
    selected_df = load_dataset(file_path=os.path.join(cleaned_dir, file_selector.value))
    if selected_df is None:
        # load_dataset has already reported the failure.
        return
    render_dataset_ui(selected_df)

def on_file_uploaded(change):
    """Load the file uploaded through ``file_upload`` and redraw the dataset section.

    Handles both shapes of ``FileUpload.value``: a dict keyed by filename
    (ipywidgets 7) and a tuple of per-file dicts (ipywidgets 8+). Does nothing
    when the value is empty, e.g. after the widget has been cleared.

    Args:
        change: Trait-change payload passed by ``observe``; unused.
    """
    value = file_upload.value
    if not value:
        return

    files = list(value.values()) if isinstance(value, dict) else list(value)
    uploaded = files[0]  # multiple=False, so there is a single file

    df = load_dataset(uploaded=uploaded)
    if df is not None:
        render_dataset_ui(df)  # fully redraw preview + config

# Redraw the preview/config section whenever a dataset is uploaded or picked.
file_upload.observe(on_file_uploaded, names='value')
file_selector.observe(on_file_selected, names='value')

# Display UI blocks: heading, the two dataset sources, then the container
# that will show the preview & config. (output_area is displayed further
# down, in the "Classification Output" section.)
display(Markdown("## Choose or Upload a Dataset"))
display(file_selector)
display(file_upload)
display(dataset_ui)

## Choose or Upload a Dataset

Dropdown(description='File:', index=2, options=('churn-bigml-20_cleaned.csv', 'house_prediction_cleaned.csv', …

FileUpload(value=(), accept='.csv', description='Upload')

Output()

In [146]:
# Auto-load the default dataset so the preview is populated without any user action
default_df_path = os.path.join(cleaned_dir, default_file)
df_default = load_dataset(file_path=default_df_path)

if df_default is None:
    # Surface the failure where the preview would otherwise appear.
    with dataset_ui:
        print("❌ Could not load default dataset.")
else:
    render_dataset_ui(df_default)


## Train & Evaluate Classification Models

We'll train two models and evaluate their performance using:
- Accuracy
- Precision
- Recall
- F1 Score
- Confusion Matrix

In [147]:
def is_classification_target(y):
    """Decide whether a target column looks like a set of class labels.

    Non-numeric columns (object, string, categorical) and boolean columns are
    always treated as class labels. Numeric columns qualify only when they hold
    at most 20 distinct values and every value is a whole number.

    Args:
        y: pandas Series holding the candidate target column.

    Returns:
        bool: True when ``y`` is suitable as a classification target.
    """
    # Anything that is not a number is a label by construction; checking this
    # first also keeps np.mod away from dtypes it cannot handle (e.g. category).
    if pd.api.types.is_bool_dtype(y) or not pd.api.types.is_numeric_dtype(y):
        return True
    return bool(y.nunique() <= 20 and np.all(np.equal(np.mod(y, 1), 0)))

def plot_confusion(cm, labels, title):
    """Draw a confusion matrix as an annotated heatmap and show it.

    Args:
        cm: Confusion matrix of integer counts (rows = actual, columns = predicted).
        labels: Class labels used for both axes' tick labels.
        title: Title placed above the heatmap.
    """
    fig, ax = plt.subplots(figsize=(4, 3))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        xticklabels=labels,
        yticklabels=labels,
        cmap="Blues",
        ax=ax,
    )
    ax.set_title(title)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    plt.show()

In [148]:
# --------------------  Run Classification --------------------
def train_and_evaluate(df, target_col, feature_cols):
    """Train Logistic Regression and Random Forest on ``df`` and render their results.

    Everything (validation messages, metrics, confusion-matrix plots and
    classification reports) is written into ``output_area``, which is cleared
    first. The data is split 80/20 with a fixed seed and the features are
    standardized using statistics from the training split only.

    Args:
        df: DataFrame holding both the feature columns and the target column.
        target_col: Name of the column to predict.
        feature_cols: Iterable of feature column names. The target column is
            dropped from this list if present, so the label never predicts itself.

    Returns:
        None. Returns early, after printing a message, when no feature is
        selected, the target is not suitable for classification, or any
        selected feature is non-numeric.
    """
    with output_area:
        clear_output()

        selected = list(feature_cols or ())
        if target_col in selected:
            # Using the label as a feature would leak the answer into the model.
            print(f"Note: ignoring target column '{target_col}' in the feature list.")
            selected = [col for col in selected if col != target_col]

        if not selected:
            print("Please select at least one feature.!")
            return

        X = df[selected]
        y = df[target_col]

        if not is_classification_target(y):
            print("Target is not suitable for classification.!!!")
            return

        if X.select_dtypes(exclude=["number"]).shape[1] > 0:
            print("Only numeric features are supported.!!!")
            return

        # Encode target. `label_ids` are the values the models see; `class_names`
        # are their string forms for plots and reports (classification_report
        # requires string names).
        if not pd.api.types.is_numeric_dtype(y):
            le = LabelEncoder()
            y = le.fit_transform(y)
            class_names = [str(c) for c in le.classes_]
            label_ids = list(range(len(class_names)))
        else:
            label_ids = sorted(y.unique())
            class_names = [str(c) for c in label_ids]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        models = {
            "Logistic Regression": LogisticRegression(max_iter=200),
            # Seeded so repeated runs on the same data give the same numbers.
            "Random Forest": RandomForestClassifier(random_state=42)
        }

        for name, model in models.items():
            model.fit(X_train, y_train)
            preds = model.predict(X_test)

            display(Markdown(f"### {name}"))
            display(Markdown(f"**Accuracy:** {accuracy_score(y_test, preds):.2f}"))
            display(Markdown(f"**Precision:** {precision_score(y_test, preds, average='macro', zero_division=0):.2f}"))
            display(Markdown(f"**Recall:** {recall_score(y_test, preds, average='macro', zero_division=0):.2f}"))
            display(Markdown(f"**F1 Score:** {f1_score(y_test, preds, average='macro', zero_division=0):.2f}"))

            # Pass the full label set explicitly so rows/columns line up with
            # class_names even when a class is absent from the test split.
            cm = confusion_matrix(y_test, preds, labels=label_ids)
            plot_confusion(cm, class_names, f"{name} – Confusion Matrix")

            # One report per model (inside the loop), not just for the last one.
            print("\n📋 Classification Report:")
            print(classification_report(
                y_test, preds,
                labels=label_ids,
                target_names=class_names,
                zero_division=0,
            ))


In [149]:
def on_run_clicked(b):
    """Train and evaluate on the dataset currently shown in the preview.

    Uses the DataFrame stored in the module-level ``current_df`` by
    ``render_dataset_ui``, so an uploaded file is trained on with the
    target/feature choices made for it. Falls back to loading the file selected
    in ``file_selector`` when no dataset has been rendered yet.

    Args:
        b: The clicked Button (or None when called directly); unused.
    """
    df = globals().get("current_df")
    if df is None:
        path = os.path.join(cleaned_dir, file_selector.value)
        df = load_dataset(file_path=path)
    if df is not None:
        train_and_evaluate(df, target_selector.value, feature_selector.value)

run_button.on_click(on_run_clicked)

In [150]:
# Results container goes last: classification results (and load errors) render here
display(Markdown("## Classification Output"), output_area)


## Classification Output

Output()

In [151]:
# Render the dataset currently selected in the dropdown (iris by default), then run the classifiers once
on_file_selected(change=None)
on_run_clicked(b=None)


## ✅ Summary

In this notebook, we:

- Interactively loaded datasets
- Configured classification features and targets
- Trained and evaluated models using multiple metrics
- Visualized performance using confusion matrices

Feel free to explore other datasets or upload your own. Happy modeling! 🎉
