# In [None]:
"""
====================================================
Classical Machine Learning Pipeline for Video Actions
====================================================

This script implements and compares three classical
machine learning algorithms using extracted video
features:

1. Support Vector Machine (Linear + RBF)
2. Random Forest Classifier
3. k-Nearest Neighbors (k-NN)

Features are extracted using existing project modules:
- data_loader.py
- feature_extraction.py

Author: Student_2024AB05275
"""

# =========================
# Standard Library Imports
# =========================
from typing import Dict, Tuple
import warnings

# =========================
# Third-Party Imports
# =========================
import numpy as np
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
)

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# =========================
# Project Imports
# =========================
from data_loader import load_split_data
from feature_extraction import extract_video_features

# =========================
# Global Configurations
# =========================
# Installs a blanket "ignore" filter: warnings of every category raised
# after this point are suppressed rather than printed.
warnings.filterwarnings("ignore")
# Seed handed to RandomForestClassifier so its results are repeatable.
RANDOM_STATE = 42
# Number of cross-validation folds passed as `cv` to each GridSearchCV.
CV_FOLDS = 5


# ==================================================
# Utility Functions
# ==================================================
def evaluate_model(
    model,
    X_test: np.ndarray,
    y_test: np.ndarray,
) -> Dict[str, float]:
    """
    Score a fitted classifier on held-out data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix passed to ``model.predict``.
        y_test: Ground-truth labels matching ``X_test``.

    Returns:
        Dictionary with the keys ``accuracy``, ``precision``, ``recall`` and
        ``f1_score`` (in that order). The last three are macro-averaged.
    """
    predictions = model.predict(X_test)

    # Accuracy takes no averaging argument, so it is handled on its own.
    scores = {"accuracy": accuracy_score(y_test, predictions)}

    # The remaining metrics share one call signature; the tuple order fixes
    # the key order of the returned dictionary.
    macro_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1_score", f1_score),
    )
    for name, scorer in macro_scorers:
        scores[name] = scorer(y_test, predictions, average="macro")

    return scores


def plot_confusion_matrix(model, X_test, y_test, title: str) -> None:
    """
    Plot the confusion matrix of a trained classifier on a test set.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix passed to ``model.predict``.
        y_test: Ground-truth labels matching ``X_test``.
        title: Title drawn above the plot.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    y_pred = model.predict(X_test)

    # Sorted union of the true and predicted labels. This is the same label
    # set and order confusion_matrix infers by default; it is made explicit
    # so the axes can be ticked with the real class labels instead of the
    # positional indices 0..n-1.
    labels = np.unique(np.concatenate([np.ravel(y_test), np.ravel(y_pred)]))

    cm = confusion_matrix(y_test, y_pred, labels=labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot()
    plt.title(title)
    plt.show()


# ==================================================
# Data Loading & Feature Extraction
# ==================================================
# Loads the train / val / test splits through the project's data_loader
# module and turns each split's videos into a feature matrix through
# feature_extraction.
# NOTE(review): the run output recorded at the end of this file reports that
# `load_split_data` could not be imported from data_loader — confirm the
# loader's actual function name before relying on this section.
print("\n📥 Loading dataset splits...")

train_videos, y_train = load_split_data(split="train")
val_videos, y_val = load_split_data(split="val")
test_videos, y_test = load_split_data(split="test")

print("🎯 Extracting features...")

X_train = extract_video_features(train_videos)
# NOTE(review): X_val / y_val are not referenced again in the visible part of
# this file (model selection below uses cross-validation on the training
# split) — confirm whether the validation split is still needed.
X_val = extract_video_features(val_videos)
X_test = extract_video_features(test_videos)

# Assumes extract_video_features returns an array-like with `.shape`
# (e.g. a NumPy array) — TODO confirm.
print(f"Train shape: {X_train.shape}")
print(f"Test shape : {X_test.shape}")


# ==================================================
# 1️⃣ Support Vector Machine (Linear + RBF)
# ==================================================
# Tunes an SVC behind a StandardScaler with cross-validated grid search on
# the training split, then scores the selected pipeline on the test split.
print("\n🚀 Training Support Vector Machine...")

# Scaler and classifier are searched as one estimator; the "svm__" prefix in
# the grid below addresses parameters of the "svm" step.
svm_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("svm", SVC()),
    ]
)

# Two sub-grids: gamma is searched only together with the RBF kernel, while
# the linear sub-grid varies C alone.
svm_param_grid = [
    {"svm__kernel": ["linear"], "svm__C": [0.1, 1, 10]},
    {
        "svm__kernel": ["rbf"],
        "svm__C": [0.1, 1, 10],
        "svm__gamma": [0.01, 0.1, 1],
    },
]

svm_grid = GridSearchCV(
    svm_pipeline,
    svm_param_grid,
    cv=CV_FOLDS,
    scoring="accuracy",
    n_jobs=-1,  # use all available CPU cores for the search
)

svm_grid.fit(X_train, y_train)
# Pipeline with the best-scoring parameter combination.
best_svm = svm_grid.best_estimator_

svm_metrics = evaluate_model(best_svm, X_test, y_test)
plot_confusion_matrix(best_svm, X_test, y_test, "SVM Confusion Matrix")


# ==================================================
# 2️⃣ Random Forest Classifier
# ==================================================
# Tunes a RandomForestClassifier with cross-validated grid search on the
# training split, then scores the selected forest on the test split.
print("\n🌲 Training Random Forest...")

# Fixed seed so repeated runs build the same forests. No scaler is used in
# this section, unlike the SVM and k-NN pipelines.
rf = RandomForestClassifier(random_state=RANDOM_STATE)

rf_param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [10, 20, None],  # None leaves tree depth unrestricted
    "min_samples_split": [2, 5],
}

rf_grid = GridSearchCV(
    rf,
    rf_param_grid,
    cv=CV_FOLDS,
    scoring="accuracy",
    n_jobs=-1,  # use all available CPU cores for the search
)

rf_grid.fit(X_train, y_train)
# Forest with the best-scoring parameter combination.
best_rf = rf_grid.best_estimator_

rf_metrics = evaluate_model(best_rf, X_test, y_test)
plot_confusion_matrix(best_rf, X_test, y_test, "Random Forest Confusion Matrix")


# ==================================================
# 3️⃣ k-Nearest Neighbors
# ==================================================
# Tunes a KNeighborsClassifier behind a StandardScaler with cross-validated
# grid search on the training split, then scores the selected pipeline on
# the test split.
print("\n📏 Training k-Nearest Neighbors...")

# Scaler and classifier are searched as one estimator; the "knn__" prefix in
# the grid below addresses parameters of the "knn" step.
knn_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("knn", KNeighborsClassifier()),
    ]
)

knn_param_grid = {
    "knn__n_neighbors": [3, 5, 7, 9],
    "knn__metric": ["euclidean", "manhattan"],
}

knn_grid = GridSearchCV(
    knn_pipeline,
    knn_param_grid,
    cv=CV_FOLDS,
    scoring="accuracy",
    n_jobs=-1,  # use all available CPU cores for the search
)

knn_grid.fit(X_train, y_train)
# Pipeline with the best-scoring parameter combination.
best_knn = knn_grid.best_estimator_

knn_metrics = evaluate_model(best_knn, X_test, y_test)
plot_confusion_matrix(best_knn, X_test, y_test, "k-NN Confusion Matrix")


# ==================================================
# 📊 Comparative Analysis
# ==================================================
# Collects the per-model metric dictionaries and draws them as a grouped bar
# chart: one group per metric, one bar per model.
print("\n📊 Comparative Model Analysis")

results = {
    "SVM": svm_metrics,
    "Random Forest": rf_metrics,
    "k-NN": knn_metrics,
}

# Convert to matrix for plotting.
# Metric order comes from the first model's dictionary; every model is then
# indexed by those same keys, so all rows share one column order.
metrics_names = list(next(iter(results.values())))
model_names = list(results)

# Rows are models, columns are metrics.
metrics_matrix = np.array(
    [[results[m][metric] for metric in metrics_names] for m in model_names]
)

# =========================
# Bar Plot Comparison
# =========================
x = np.arange(len(metrics_names))
# 0.25 per bar for up to three models; shrinks automatically if more models
# are added so a group never spills into the neighbouring metric's slot.
width = min(0.25, 0.8 / len(model_names))

plt.figure(figsize=(10, 6))
for i, model in enumerate(model_names):
    # Shift each model's bars by one bar width within the group.
    plt.bar(
        x + i * width,
        metrics_matrix[i],
        width,
        label=model,
    )

# Centre each tick under its group for any number of models (for three
# models this is x + width, as before).
plt.xticks(x + width * (len(model_names) - 1) / 2, metrics_names)
plt.ylabel("Score")
plt.title("Model Performance Comparison")
plt.legend()
plt.grid(axis="y")
plt.show()


# ==================================================
# 📌 Final Dynamic Summary
# ==================================================
# Prints every model's metrics and names the model with the highest
# macro F1-score.
# best_model is a (model name, metrics dict) pair; on a tie, max() keeps the
# first entry in `results` order.
best_model = max(
    results.items(),
    key=lambda item: item[1]["f1_score"],
)

print("\n🏆 Final Summary")
for model, metrics in results.items():
    print(f"\n{model}")
    for k, v in metrics.items():
        # Left-align the metric name in 10 columns; show 4 decimal places.
        print(f"  {k:<10}: {v:.4f}")

print(f"\n✅ Best overall model based on F1-score: {best_model[0]}")


# [cell output] ImportError: cannot import name 'load_split_data' from 'data_loader' (/Users/chocalingamlakshmanan/Desktop/Video-analytics-assignment/Student_2024ab05275_Video_Classification/code/data_loader.py)