# Lab 7: Logistic Regression and SVM
## Notebook 4: PCA Transformed Data (99% Variance) + Overall Comparison
### CS201L - Artificial Intelligence Laboratory

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import (
    confusion_matrix, accuracy_score,
    precision_score, recall_score,
    f1_score, classification_report
)

import warnings
# Install an "ignore" filter with no category/message restriction so that cell
# output stays compact.
# NOTE(review): because the filter is unrestricted it silences every warning
# raised after this point, including any emitted by the libraries imported
# above — narrow it if those diagnostics are needed.
warnings.filterwarnings('ignore')

# Simple confirmation that the import cell ran to completion.
print("All libraries imported!")

## Loading the PCA (99% Variance) Dataset

In [None]:
# Read the train / validation / test splits of the PCA (99% variance) dataset.
# Only the components needed to explain 99% of the variance are kept, so fewer
# feature columns are expected than in the original data (561 per the lab notes).
train_data, val_data, test_data = map(
    pd.read_csv,
    (
        'activity_pca99_train.csv',
        'activity_pca99_validation.csv',
        'activity_pca99_test.csv',
    ),
)

# Report (rows, columns) for each split as a quick sanity check.
for shape_label, frame in (
    ("Train shape:", train_data),
    ("Validation shape:", val_data),
    ("Test shape:", test_data),
):
    print(shape_label, frame.shape)

In [None]:
# For every split, the 'Activity' column is the target and all remaining
# columns are the feature matrix.
X_train, y_train = train_data.drop(columns=['Activity']), train_data['Activity']
X_val, y_val = val_data.drop(columns=['Activity']), val_data['Activity']
X_test, y_test = test_data.drop(columns=['Activity']), test_data['Activity']

# The number of feature columns equals the number of retained PCA components.
print(f"Number of PCA components (99% variance): {len(X_train.columns)}")
print("Classes:", y_train.unique())

In [None]:
def evaluate_model(y_true, y_pred, dataset_name="Validation"):
    """Print and plot classification metrics for one set of predictions.

    Prints accuracy and support-weighted precision, recall and F1, prints the
    confusion matrix, shows it as a heatmap, and prints the per-class
    classification report.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.
        dataset_name: Name used in the printed header and the plot title.

    Returns:
        The accuracy computed by ``accuracy_score(y_true, y_pred)``.
    """
    print(f"\n--- {dataset_name} Results ---")

    acc  = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average='weighted')
    rec  = recall_score(y_true, y_pred, average='weighted')
    f1   = f1_score(y_true, y_pred, average='weighted')

    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1-Score:  {f1:.4f}")

    # The matrix must be indexed by every label seen in either vector. Using
    # only the labels of y_true for the tick labels would misalign (or
    # mis-size) the axes whenever a class is predicted but absent from y_true.
    labels = np.unique(
        np.concatenate([np.asarray(y_true).ravel(), np.asarray(y_pred).ravel()])
    )

    print(f"\nConfusion Matrix:")
    # Passing labels explicitly pins the row/column order to the tick labels.
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    print(cm)

    plt.figure(figsize=(8,6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Purples',
                xticklabels=labels,
                yticklabels=labels)
    plt.title(f'Confusion Matrix - {dataset_name}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

    return acc

print("Helper function defined!")

## Task 4.1: Logistic Regression

In [None]:
# Fit logistic regression on the PCA-99% training split, then predict both
# held-out splits.
# NOTE(review): liblinear is documented as a one-vs-rest solver; if 'Activity'
# has more than two classes, confirm the installed scikit-learn still accepts
# it for multiclass targets.
print("Training Logistic Regression on PCA 99% variance data...")

# fit() returns the estimator itself, so construction and fitting can be chained.
logistic_reg = LogisticRegression(solver='liblinear', max_iter=1000).fit(X_train, y_train)

print("Done!")

y_val_pred_lr, y_test_pred_lr = (
    logistic_reg.predict(features) for features in (X_val, X_test)
)

In [None]:
# Section banner followed by the full metric report for both held-out splits.
for banner_line in ("=" * 55,
                    "TASK 4.1: LOGISTIC REGRESSION - PCA 99% VARIANCE",
                    "=" * 55):
    print(banner_line)

# evaluate_model returns the accuracy; keep it for the comparison tables.
acc_lr_val = evaluate_model(y_val, y_val_pred_lr, dataset_name="Validation")
acc_lr_test = evaluate_model(y_test, y_test_pred_lr, dataset_name="Test")

## Task 4.2: SVM with Linear Kernel

In [None]:
# Fit a linear-kernel SVM on the training split and predict both held-out splits.
print("Training SVM Linear on PCA 99% variance data... (may take a few minutes)")

# fit() returns the estimator itself, so construction and fitting can be chained.
linear_svm = SVC(kernel='linear', C=1.0).fit(X_train, y_train)

print("Done!")

y_val_pred_lin, y_test_pred_lin = (
    linear_svm.predict(features) for features in (X_val, X_test)
)

In [None]:
# Section banner followed by the full metric report for both held-out splits.
for banner_line in ("=" * 55,
                    "TASK 4.2: SVM (LINEAR KERNEL) - PCA 99% VARIANCE",
                    "=" * 55):
    print(banner_line)

# evaluate_model returns the accuracy; keep it for the comparison tables.
acc_lin_val = evaluate_model(y_val, y_val_pred_lin, dataset_name="Validation")
acc_lin_test = evaluate_model(y_test, y_test_pred_lin, dataset_name="Test")

## Task 4.3: SVM with Polynomial Kernel

In [None]:
# Model selection for the polynomial-kernel SVM: fit one model per candidate
# degree on the training split and score each on the validation split.
print("Training Polynomial SVM for degrees 2, 3, 4, 5 on PCA 99% variance data...")
print("(Please wait)\n")

# Maps each candidate degree to its validation accuracy (insertion-ordered).
degree_val_accuracies = {}

for degree in [2, 3, 4, 5]:
    poly_svm = SVC(kernel='poly', degree=degree, C=1.0, gamma='scale')
    poly_svm.fit(X_train, y_train)

    val_acc = poly_svm.score(X_val, y_val)
    degree_val_accuracies[degree] = val_acc
    print(f"Degree={degree}, Validation Accuracy={val_acc:.4f}")

# Pick the winner from the recorded scores instead of tracking a running best
# that starts at 0: that approach leaves best_degree as None when no degree
# scores strictly above 0. max() returns the first key with the highest score,
# so ties still resolve to the lowest degree, as before.
best_degree = max(degree_val_accuracies, key=degree_val_accuracies.get)
best_val_acc = degree_val_accuracies[best_degree]

print(f"\nBest Degree: {best_degree} with Validation Accuracy: {best_val_acc:.4f}")

In [None]:
# Line chart of validation accuracy against polynomial degree.
poly_degrees = list(degree_val_accuracies)
poly_scores = [degree_val_accuracies[deg] for deg in poly_degrees]

plt.figure(figsize=(7, 4))
plt.plot(poly_degrees, poly_scores, marker='D', color='purple')
plt.title('Polynomial SVM: Validation Accuracy vs Degree (PCA 99%)')
plt.xlabel('Degree')
plt.ylabel('Validation Accuracy')
# Force integer ticks at the candidate degrees.
plt.xticks([2, 3, 4, 5])
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Refit a polynomial SVM using the degree selected on the validation split,
# then predict both held-out splits.
print(f"Training final model with best degree = {best_degree}...")

# fit() returns the estimator itself, so construction and fitting can be chained.
best_poly_svm = SVC(
    kernel='poly',
    degree=best_degree,
    C=1.0,
    gamma='scale',
).fit(X_train, y_train)

y_val_pred_poly, y_test_pred_poly = (
    best_poly_svm.predict(features) for features in (X_val, X_test)
)

print("Done!")

In [None]:
# Section banner, a recap of every degree's validation accuracy, then the full
# metric report for the selected model.
rule = "=" * 60
print(rule)
print(f"TASK 4.3: SVM (POLY KERNEL, DEGREE={best_degree}) - PCA 99% VARIANCE")
print(rule)

print("\nValidation Accuracies for all degrees:")
for deg, deg_acc in degree_val_accuracies.items():
    line = f"  Degree {deg}: {deg_acc:.4f}"
    # Flag the degree that was chosen for the final model.
    if deg == best_degree:
        line += " <-- best"
    print(line)

# evaluate_model returns the accuracy; keep it for the comparison tables.
acc_poly_val = evaluate_model(y_val, y_val_pred_poly, dataset_name="Validation (Best Degree)")
acc_poly_test = evaluate_model(y_test, y_test_pred_poly, dataset_name="Test (Best Degree)")

## Task 4.4: SVM with Gaussian (RBF) Kernel

In [None]:
# Fit an RBF-kernel SVM on the training split and predict both held-out splits.
print("Training SVM with RBF Kernel on PCA 99% variance data...")

# fit() returns the estimator itself, so construction and fitting can be chained.
rbf_svm = SVC(kernel='rbf', C=1.0, gamma='scale').fit(X_train, y_train)

print("Done!")

y_val_pred_rbf, y_test_pred_rbf = (
    rbf_svm.predict(features) for features in (X_val, X_test)
)

In [None]:
# Section banner followed by the full metric report for both held-out splits.
for banner_line in ("=" * 55,
                    "TASK 4.4: SVM (RBF KERNEL) - PCA 99% VARIANCE",
                    "=" * 55):
    print(banner_line)

# evaluate_model returns the accuracy; keep it for the comparison tables.
acc_rbf_val = evaluate_model(y_val, y_val_pred_rbf, dataset_name="Validation")
acc_rbf_test = evaluate_model(y_test, y_test_pred_rbf, dataset_name="Test")

## Task 4.5: Comparison on PCA 99% Variance Data

In [None]:
# Side-by-side accuracy tables for the four classifiers trained in this notebook.
# Row labels are shared by both tables; the polynomial entry names the chosen degree.
clf_names = ['Logistic Regression', 'SVM Linear', f'SVM Poly (d={best_degree})', 'SVM RBF']

comparison_val = {
    'Classifier': list(clf_names),
    'Validation Accuracy': [acc_lr_val, acc_lin_val, acc_poly_val, acc_rbf_val],
}
comparison_test = {
    'Classifier': list(clf_names),
    'Test Accuracy': [acc_lr_test, acc_lin_test, acc_poly_test, acc_rbf_test],
}

df_val, df_test = pd.DataFrame(comparison_val), pd.DataFrame(comparison_test)

# to_string(index=False) prints the table without the integer row index.
for heading, table in (
    ("Validation Accuracy Comparison (PCA 99% Variance):", df_val),
    ("\nTest Accuracy Comparison (PCA 99% Variance):", df_test),
):
    print(heading)
    print(table.to_string(index=False))

In [None]:
# Grouped bar chart: validation vs. test accuracy for each classifier, followed
# by a short printed summary naming the best classifier on each split.
classifiers = ['Logistic\nRegression', 'SVM\nLinear', f'SVM Poly\n(d={best_degree})', 'SVM\nRBF']
val_accs  = [acc_lr_val, acc_lin_val, acc_poly_val, acc_rbf_val]
test_accs = [acc_lr_test, acc_lin_test, acc_poly_test, acc_rbf_test]

# One group per classifier; the two bars of a group sit half a bar-width
# either side of the group's centre.
x = np.arange(len(classifiers))
width = 0.35

fig, ax = plt.subplots(figsize=(9, 5))
bars1 = ax.bar(x - width/2, val_accs,  width, label='Validation', color='mediumpurple')
bars2 = ax.bar(x + width/2, test_accs, width, label='Test', color='plum')

ax.set_title('Classifier Accuracy Comparison - PCA 99% Variance')
ax.set_ylabel('Accuracy')
ax.set_xticks(x)
ax.set_xticklabels(classifiers)
ax.legend()
ax.set_ylim(0.5, 1.05)

# Write each bar's height just above it (validation bars first, then test bars).
for bar in (*bars1, *bars2):
    bar_top = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2, bar_top + 0.002,
            f'{bar_top:.3f}', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

# First classifier with the highest accuracy on each split; the tick labels
# contain newlines, so flatten them for printing.
best_val_clf  = classifiers[val_accs.index(max(val_accs))].replace('\n', ' ')
best_test_clf = classifiers[test_accs.index(max(test_accs))].replace('\n', ' ')
print("\nDiscussion:")
print(f"Best classifier on Validation: {best_val_clf} ({max(val_accs):.4f})")
print(f"Best classifier on Test:       {best_test_clf} ({max(test_accs):.4f})")
# The two statements below are fixed text, not derived from measurements in this notebook.
print("PCA with 99% variance reduces dimensionality slightly while preserving most information.")
print("Training should be slightly faster than full PCA or original data.")

---
## Task 4.5 (Overall): Cross-Dataset Comparison

**Note:** Before running this section, manually copy the accuracy values from Notebooks 1, 2, and 3 into the variables below.

In [None]:
# ============================================================
# IMPORTANT: Fill in values from the other notebooks below!
# You can find these printed at the bottom of each notebook.
# ============================================================

# Marker for "not filled in yet". NaN is used instead of 0.0 so that an
# unfilled entry shows up as missing (NaN in the tables, a blank heatmap cell,
# a gap in the line plot) rather than being reported as a real 0% accuracy.
NOT_FILLED = np.nan

# --- Notebook 1: Original Data ---
acc_lr_orig_val    = NOT_FILLED  # replace with actual value from Notebook 1
acc_lin_orig_val   = NOT_FILLED
acc_poly_orig_val  = NOT_FILLED
acc_rbf_orig_val   = NOT_FILLED

acc_lr_orig_test   = NOT_FILLED
acc_lin_orig_test  = NOT_FILLED
acc_poly_orig_test = NOT_FILLED
acc_rbf_orig_test  = NOT_FILLED

# --- Notebook 2: Scaled Data ---
acc_lr_scaled_val    = NOT_FILLED  # replace with actual value from Notebook 2
acc_lin_scaled_val   = NOT_FILLED
acc_poly_scaled_val  = NOT_FILLED
acc_rbf_scaled_val   = NOT_FILLED

acc_lr_scaled_test   = NOT_FILLED
acc_lin_scaled_test  = NOT_FILLED
acc_poly_scaled_test = NOT_FILLED
acc_rbf_scaled_test  = NOT_FILLED

# --- Notebook 3: PCA All Components ---
acc_lr_pcaall_val    = NOT_FILLED  # replace with actual value from Notebook 3
acc_lin_pcaall_val   = NOT_FILLED
acc_poly_pcaall_val  = NOT_FILLED
acc_rbf_pcaall_val   = NOT_FILLED

acc_lr_pcaall_test   = NOT_FILLED
acc_lin_pcaall_test  = NOT_FILLED
acc_poly_pcaall_test = NOT_FILLED
acc_rbf_pcaall_test  = NOT_FILLED

# --- This Notebook (Notebook 4): PCA 99% Variance ---
# These are already computed above - no need to change these
acc_lr_pca99_val    = acc_lr_val
acc_lin_pca99_val   = acc_lin_val
acc_poly_pca99_val  = acc_poly_val
acc_rbf_pca99_val   = acc_rbf_val

acc_lr_pca99_test   = acc_lr_test
acc_lin_pca99_test  = acc_lin_test
acc_poly_pca99_test = acc_poly_test
acc_rbf_pca99_test  = acc_rbf_test

print("Values loaded! Remember to replace the NaN placeholders with actual values from other notebooks.")

In [None]:
# Cross-dataset validation accuracy: one row per dataset variant, one column
# per classifier.
data_val = {
    "Dataset": ["Original", "Scaled", "PCA All", "PCA 99"],
    "Logistic Regression": [
        acc_lr_orig_val, acc_lr_scaled_val, acc_lr_pcaall_val, acc_lr_pca99_val,
    ],
    "SVM Linear": [
        acc_lin_orig_val, acc_lin_scaled_val, acc_lin_pcaall_val, acc_lin_pca99_val,
    ],
    "SVM Polynomial": [
        acc_poly_orig_val, acc_poly_scaled_val, acc_poly_pcaall_val, acc_poly_pca99_val,
    ],
    "SVM Gaussian": [
        acc_rbf_orig_val, acc_rbf_scaled_val, acc_rbf_pcaall_val, acc_rbf_pca99_val,
    ],
}

# Use the dataset name as the row index so it labels the rows in later plots.
df_overall_val = pd.DataFrame(data_val).set_index("Dataset")

print("Validation Accuracy Comparison (All Datasets)")
print(df_overall_val.round(4))

In [None]:
# Cross-dataset test accuracy: one row per dataset variant, one column per
# classifier.
data_test = {
    "Dataset": ["Original", "Scaled", "PCA All", "PCA 99"],
    "Logistic Regression": [
        acc_lr_orig_test, acc_lr_scaled_test, acc_lr_pcaall_test, acc_lr_pca99_test,
    ],
    "SVM Linear": [
        acc_lin_orig_test, acc_lin_scaled_test, acc_lin_pcaall_test, acc_lin_pca99_test,
    ],
    "SVM Polynomial": [
        acc_poly_orig_test, acc_poly_scaled_test, acc_poly_pcaall_test, acc_poly_pca99_test,
    ],
    "SVM Gaussian": [
        acc_rbf_orig_test, acc_rbf_scaled_test, acc_rbf_pcaall_test, acc_rbf_pca99_test,
    ],
}

# Use the dataset name as the row index so it labels the rows in later plots.
df_overall_test = pd.DataFrame(data_test).set_index("Dataset")

print("Test Accuracy Comparison (All Datasets)")
print(df_overall_test.round(4))

In [None]:
# Two side-by-side heatmaps (validation on the left, test on the right) of
# accuracy across datasets and classifiers, sharing a fixed 0.5-1.0 colour scale.
fig, axes = plt.subplots(1, 2, figsize=(16, 4))

heatmap_panels = (
    (axes[0], df_overall_val, 'YlGnBu', 'Validation Accuracy Across Datasets & Classifiers'),
    (axes[1], df_overall_test, 'YlOrRd', 'Test Accuracy Across Datasets & Classifiers'),
)
for panel_ax, table, palette, heading in heatmap_panels:
    sns.heatmap(table, annot=True, fmt='.4f', cmap=palette,
                ax=panel_ax, vmin=0.5, vmax=1.0)
    panel_ax.set_title(heading)
    panel_ax.set_ylabel('Dataset')

plt.tight_layout()
plt.show()

In [None]:
# One line per classifier showing how its test accuracy changes across the
# four dataset variants.
datasets   = ['Original', 'Scaled', 'PCA All', 'PCA 99%']
lr_vals    = [acc_lr_orig_test,   acc_lr_scaled_test,   acc_lr_pcaall_test,   acc_lr_pca99_test]
lin_vals   = [acc_lin_orig_test,  acc_lin_scaled_test,  acc_lin_pcaall_test,  acc_lin_pca99_test]
poly_vals  = [acc_poly_orig_test, acc_poly_scaled_test, acc_poly_pcaall_test, acc_poly_pca99_test]
rbf_vals   = [acc_rbf_orig_test,  acc_rbf_scaled_test,  acc_rbf_pcaall_test,  acc_rbf_pca99_test]

plt.figure(figsize=(9, 5))
# (series, marker symbol, legend label), drawn in this order.
for series, symbol, legend_name in (
    (lr_vals,   'o', 'Logistic Regression'),
    (lin_vals,  's', 'SVM Linear'),
    (poly_vals, '^', 'SVM Polynomial'),
    (rbf_vals,  'D', 'SVM RBF'),
):
    plt.plot(datasets, series, marker=symbol, label=legend_name)

plt.title('Test Accuracy of Classifiers Across Different Datasets')
plt.xlabel('Dataset')
plt.ylabel('Test Accuracy')
plt.legend()
# Same lower bound as the bar chart so the two figures are comparable.
plt.ylim(0.5, 1.05)
plt.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()

In [None]:
# Final overall discussion: prints a banner and a fixed block of written
# conclusions.
# NOTE(review): the text below is a static string literal; nothing in it is
# computed from the accuracy tables, so check each statement against the
# actual numbers before submitting.
print("="*65)
print("OVERALL DISCUSSION - CROSS-DATASET COMPARISON")
print("="*65)
print("""
Key Observations:

1. Feature Scaling:
   - SVMs (especially RBF and Linear) generally perform better on scaled data
     because scaling ensures all features contribute equally to the kernel 
     distance computation.
   - Logistic Regression also benefits from scaling for faster convergence.

2. PCA All Components:
   - When PCA is applied with all components, the data is rotated to the 
     principal component space. Performance is similar to scaled data since
     no variance is discarded.

3. PCA 99% Variance:
   - By keeping only enough components for 99% variance, we reduce
     dimensionality slightly while preserving most of the information.
   - This can make training a bit faster without losing much accuracy.

4. Best Overall Classifier:
   - SVM with RBF kernel generally performs best on scaled/PCA data.
   - SVM with Linear kernel also performs very well on this dataset.
   - Logistic Regression is competitive and much faster to train.
   - Polynomial SVM is the most sensitive to the degree chosen and is 
     generally slower to train.

Conclusion:
   For the HAR dataset, SVM with RBF kernel on scaled or PCA-transformed 
   data tends to give the best results. However, if speed is a concern, 
   Logistic Regression is a strong and efficient baseline.
""")