# Lab 7: Logistic Regression and SVM
## Notebook 2: Scaled Data
### CS201L - Artificial Intelligence Laboratory

In [None]:
# importing all the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import (
    confusion_matrix, accuracy_score,
    precision_score, recall_score,
    f1_score, classification_report
)

import warnings
# NOTE(review): with no category/module filter this hides every warning for
# the rest of the session, so any deprecation or solver warnings raised by
# the models trained below will not be displayed.
warnings.filterwarnings('ignore')

print("All libraries imported successfully!")

## Loading the Scaled Dataset

In [None]:
# Read the standardized (scaled) train / validation / test splits, in that
# order, from CSV files resolved relative to the working directory.
train_data, val_data, test_data = [
    pd.read_csv(csv_path)
    for csv_path in (
        'activity_scaled_train.csv',
        'activity_scaled_validation.csv',
        'activity_scaled_test.csv',
    )
]

# Report the (rows, columns) shape of every split.
for caption, frame in (
    ("Train shape:", train_data),
    ("Validation shape:", val_data),
    ("Test shape:", test_data),
):
    print(caption, frame.shape)

In [None]:
# Split every frame into its feature columns and its 'Activity' label column.
def _features_and_labels(frame):
    """Return ``(features, labels)`` for a frame that has an 'Activity' column.

    ``features`` is the frame without the 'Activity' column; ``labels`` is
    the 'Activity' column itself.
    """
    return frame.drop(columns=['Activity']), frame['Activity']


X_train, y_train = _features_and_labels(train_data)
X_val, y_val = _features_and_labels(val_data)
X_test, y_test = _features_and_labels(test_data)

for caption, features in (
    ("X_train shape:", X_train),
    ("X_val shape:", X_val),
    ("X_test shape:", X_test),
):
    print(caption, features.shape)
print("\nClasses:", y_train.unique())

In [None]:
# helper function to print all evaluation metrics nicely
def evaluate_model(y_true, y_pred, dataset_name="Validation"):
    """Print and plot evaluation metrics for one set of predictions.

    Prints accuracy plus weighted-average precision, recall and F1, prints
    the confusion matrix, shows it as a heatmap, and prints sklearn's
    classification report.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned element-wise with ``y_true``.
        dataset_name: Text used in the printed header and the plot title.

    Returns:
        The accuracy value computed by ``accuracy_score(y_true, y_pred)``.
    """
    print(f"\n--- {dataset_name} Results ---")

    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average='weighted')
    rec = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')

    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1-Score:  {f1:.4f}")

    # The confusion matrix has one row/column for every label that occurs in
    # y_true OR y_pred. Build that sorted union once and hand it to both the
    # matrix and the heatmap so the tick labels always line up with the cells,
    # even when the model predicts a class that is absent from y_true.
    labels = np.unique(
        np.concatenate([np.asarray(y_true).ravel(), np.asarray(y_pred).ravel()])
    )

    print(f"\nConfusion Matrix:")
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    print(cm)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels,
                yticklabels=labels)
    plt.title(f'Confusion Matrix - {dataset_name}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

    return acc

print("Helper function defined!")

## Task 2.1: Logistic Regression

In [None]:
# Task 2.1: fit a liblinear logistic-regression model on the scaled training
# split, then predict labels for the validation and test splits.
print("Training Logistic Regression on scaled data...")

logistic_reg = LogisticRegression(solver='liblinear', max_iter=1000).fit(X_train, y_train)

print("Training done!")

y_val_pred_lr, y_test_pred_lr = (
    logistic_reg.predict(X_val),
    logistic_reg.predict(X_test),
)

In [None]:
# Print the Task 2.1 banner, then evaluate both splits; the returned
# accuracies are kept for the comparison cells further down.
print("=" * 50, "TASK 2.1: LOGISTIC REGRESSION - SCALED DATA", "=" * 50, sep="\n")

acc_lr_val = evaluate_model(y_true=y_val, y_pred=y_val_pred_lr, dataset_name="Validation")
acc_lr_test = evaluate_model(y_true=y_test, y_pred=y_test_pred_lr, dataset_name="Test")

## Task 2.2: SVM with Linear Kernel

In [None]:
# Task 2.2: fit a linear-kernel SVC (C=1.0) on the scaled training split and
# predict labels for the validation and test splits.
print("Training SVM with Linear Kernel on scaled data... (may take a few minutes)")

linear_svm = SVC(kernel='linear', C=1.0).fit(X_train, y_train)

print("Done!")

y_val_pred_lin, y_test_pred_lin = (
    linear_svm.predict(X_val),
    linear_svm.predict(X_test),
)

In [None]:
# Print the Task 2.2 banner, then evaluate both splits; the returned
# accuracies are kept for the comparison cells further down.
banner = "=" * 50
print(banner)
print("TASK 2.2: SVM (LINEAR KERNEL) - SCALED DATA")
print(banner)

acc_lin_val = evaluate_model(y_true=y_val, y_pred=y_val_pred_lin, dataset_name="Validation")
acc_lin_test = evaluate_model(y_true=y_test, y_pred=y_test_pred_lin, dataset_name="Test")

## Task 2.3: SVM with Polynomial Kernel

In [None]:
# Task 2.3 (model selection): train one polynomial-kernel SVC per candidate
# degree and record its accuracy on the validation split.
print("Training Polynomial SVM for degrees 2, 3, 4, 5 on scaled data...")
print("(Please wait, this takes a while)\n")

# Maps degree -> validation accuracy, filled in the order the degrees are tried.
degree_val_accuracies = {}

for degree in [2, 3, 4, 5]:
    poly_svm = SVC(kernel='poly', degree=degree, C=1.0, gamma='scale')
    poly_svm.fit(X_train, y_train)

    val_acc = poly_svm.score(X_val, y_val)
    degree_val_accuracies[degree] = val_acc
    print(f"Degree={degree}, Validation Accuracy={val_acc:.4f}")

# Choose the winner after the sweep. max() returns the first maximal key, so
# ties still go to the lowest degree, and a degree is always selected -- a
# running "val_acc > 0" comparison would leave best_degree as None whenever no
# degree scored above 0.0, breaking the cells that reuse best_degree.
best_degree = max(degree_val_accuracies, key=degree_val_accuracies.get)
best_val_acc = degree_val_accuracies[best_degree]

print(f"\nBest Degree: {best_degree} with Validation Accuracy: {best_val_acc:.4f}")

In [None]:
# Line chart of validation accuracy against polynomial degree, drawn through
# the Axes object API.
degrees_tried = list(degree_val_accuracies.keys())
accuracies_seen = list(degree_val_accuracies.values())

fig, ax = plt.subplots(figsize=(7, 4))
ax.plot(degrees_tried, accuracies_seen, marker='o', color='orange')
ax.set_title('Polynomial SVM: Validation Accuracy vs Degree (Scaled Data)')
ax.set_xlabel('Degree')
ax.set_ylabel('Validation Accuracy')
ax.set_xticks([2, 3, 4, 5])
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Refit a polynomial-kernel SVC using the selected degree and predict labels
# for the validation and test splits.
print(f"Training final Polynomial SVM with best degree = {best_degree}...")

best_poly_svm = SVC(kernel='poly', degree=best_degree, C=1.0, gamma='scale').fit(X_train, y_train)

y_val_pred_poly, y_test_pred_poly = (
    best_poly_svm.predict(X_val),
    best_poly_svm.predict(X_test),
)

print("Done!")

In [None]:
# Print the Task 2.3 banner, list the validation accuracy of every degree
# (flagging the selected one), then evaluate the refitted best model.
banner = "=" * 55
print(banner)
print(f"TASK 2.3: SVM (POLY KERNEL, DEGREE={best_degree}) - SCALED DATA")
print(banner)

print("\nValidation Accuracies for all degrees:")
for d, acc in degree_val_accuracies.items():
    if d == best_degree:
        marker = " <-- best"
    else:
        marker = ""
    print(f"  Degree {d}: {acc:.4f}{marker}")

acc_poly_val = evaluate_model(
    y_true=y_val, y_pred=y_val_pred_poly, dataset_name="Validation (Best Degree)"
)
acc_poly_test = evaluate_model(
    y_true=y_test, y_pred=y_test_pred_poly, dataset_name="Test (Best Degree)"
)

## Task 2.4: SVM with Gaussian (RBF) Kernel

In [None]:
# Task 2.4: fit an RBF-kernel SVC (C=1.0, gamma='scale') on the scaled
# training split and predict labels for the validation and test splits.
print("Training SVM with RBF Kernel on scaled data...")

rbf_svm = SVC(kernel='rbf', C=1.0, gamma='scale').fit(X_train, y_train)

print("Done!")

y_val_pred_rbf, y_test_pred_rbf = (
    rbf_svm.predict(X_val),
    rbf_svm.predict(X_test),
)

In [None]:
# Print the Task 2.4 banner, then evaluate both splits; the returned
# accuracies are kept for the comparison cells further down.
print("=" * 50, "TASK 2.4: SVM (RBF KERNEL) - SCALED DATA", "=" * 50, sep="\n")

acc_rbf_val = evaluate_model(y_true=y_val, y_pred=y_val_pred_rbf, dataset_name="Validation")
acc_rbf_test = evaluate_model(y_true=y_test, y_pred=y_test_pred_rbf, dataset_name="Test")

## Task 2.5: Comparison on Scaled Data

In [None]:
# Task 2.5: tabulate validation and test accuracy for the four classifiers.
# Both tables share the same row labels, in the same order as the accuracies.
classifier_names = [
    'Logistic Regression',
    'SVM Linear',
    f'SVM Poly (d={best_degree})',
    'SVM RBF',
]

comparison_val = {
    'Classifier': list(classifier_names),
    'Validation Accuracy': [acc_lr_val, acc_lin_val, acc_poly_val, acc_rbf_val],
}
comparison_test = {
    'Classifier': list(classifier_names),
    'Test Accuracy': [acc_lr_test, acc_lin_test, acc_poly_test, acc_rbf_test],
}

df_val = pd.DataFrame(comparison_val)
df_test = pd.DataFrame(comparison_test)

for heading, table in (
    ("Validation Accuracy Comparison (Scaled Data):", df_val),
    ("\nTest Accuracy Comparison (Scaled Data):", df_test),
):
    print(heading)
    print(table.to_string(index=False))

In [None]:
# Grouped bar chart: one validation bar and one test bar per classifier,
# each annotated with its accuracy, followed by a short printed discussion.
classifiers = ['Logistic\nRegression', 'SVM\nLinear', f'SVM Poly\n(d={best_degree})', 'SVM\nRBF']
val_accs = [acc_lr_val, acc_lin_val, acc_poly_val, acc_rbf_val]
test_accs = [acc_lr_test, acc_lin_test, acc_poly_test, acc_rbf_test]

x = np.arange(len(classifiers))
width = 0.35
half_width = width / 2

fig, ax = plt.subplots(figsize=(9, 5))
# Offset each series half a bar width either side of the group centre.
bars1 = ax.bar(x - half_width, val_accs, width, label='Validation')
bars2 = ax.bar(x + half_width, test_accs, width, label='Test')

ax.set_xticks(x)
ax.set_xticklabels(classifiers)
ax.set_ylim(0.5, 1.05)
ax.set_ylabel('Accuracy')
ax.set_title('Classifier Accuracy Comparison - Scaled Data')
ax.legend()

# Write each bar's height just above its top edge, validation bars first.
for bar in (*bars1, *bars2):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2, height + 0.002,
            f'{height:.3f}', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

# Tick labels contain line breaks for the chart; flatten them for printing.
best_val_idx = np.argmax(val_accs)
best_test_idx = np.argmax(test_accs)
best_val_clf = classifiers[best_val_idx].replace('\n', ' ')
best_test_clf = classifiers[best_test_idx].replace('\n', ' ')
print(f"\nDiscussion:")
print(f"Best classifier on Validation: {best_val_clf} ({max(val_accs):.4f})")
print(f"Best classifier on Test:       {best_test_clf} ({max(test_accs):.4f})")
print("Scaling the data often helps SVMs perform better, especially RBF kernel,")
print("because it makes the features comparable and the kernel distances more meaningful.")

In [None]:
# Summary of every validation/test accuracy, emitted as one block of text.
summary_lines = [
    "Accuracy values to record for final comparison (Notebook 4):",
    f"  LR  Val: {acc_lr_val:.4f}   | LR  Test: {acc_lr_test:.4f}",
    f"  Lin Val: {acc_lin_val:.4f}   | Lin Test: {acc_lin_test:.4f}",
    f"  Poly Val: {acc_poly_val:.4f} | Poly Test: {acc_poly_test:.4f}",
    f"  RBF Val: {acc_rbf_val:.4f}   | RBF Test: {acc_rbf_test:.4f}",
]
print("\n".join(summary_lines))