In [1]:

!pip install xgboost -q
!pip install scipy -q

import zipfile

from google.colab import files

# Ask the user for the dataset archive; the result is keyed by uploaded file name.
uploaded = files.upload()

# Fail with an explicit message instead of an opaque IndexError when nothing
# was uploaded (e.g. the dialog was dismissed).
if not uploaded:
    raise RuntimeError(
        "No file was uploaded; please upload the UCI HAR Dataset zip archive."
    )

# Only the first uploaded file is used.
zip_filename = next(iter(uploaded))
print("Uploaded file:", zip_filename)

# Unpack the archive under /content/, where the loading code expects it.
with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
    zip_ref.extractall("/content/")

print("Dataset extracted successfully!")

import pandas as pd
import numpy as np

# Feature list: whitespace-separated "<index> <feature name>" rows.
features = pd.read_csv(
    "/content/UCI HAR Dataset/features.txt",
    sep=r"\s+",
    header=None,
    names=["index", "feature"],
)

# Append each row's position to its name (presumably so that repeated names
# become unique column labels -- confirm against features.txt).
features["feature"] = (
    features["feature"].astype(str) + "_" + features.index.astype(str)
)
feature_names = features["feature"].values

# Training split: feature matrix and activity labels.
X_train_raw = pd.read_csv(
    "/content/UCI HAR Dataset/train/X_train.txt",
    sep=r"\s+",
    header=None,
    names=feature_names,
)
y_train = pd.read_csv(
    "/content/UCI HAR Dataset/train/y_train.txt",
    header=None,
    names=["Activity"],
)

# Test split, read with the same layout as the training split.
X_test_raw = pd.read_csv(
    "/content/UCI HAR Dataset/test/X_test.txt",
    sep=r"\s+",
    header=None,
    names=feature_names,
)
y_test = pd.read_csv(
    "/content/UCI HAR Dataset/test/y_test.txt",
    header=None,
    names=["Activity"],
)

print("Raw training shape:", X_train_raw.shape)
print("Raw test shape:", X_test_raw.shape)

# =============================
# FEATURE ENGINEERING
# =============================
from scipy.stats import skew, kurtosis

def extract_simple_features(X_df):
    """Summarise every row of ``X_df`` with nine scalar statistics.

    Each row is treated as one flat numeric vector, in column order.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Numeric table with one sample per row and at least one column.

    Returns
    -------
    pandas.DataFrame
        One row per input row, on a fresh 0..n-1 index (the index of
        ``X_df`` is not carried over), with these columns:

        - ``feat_mean``, ``feat_std`` (NumPy default, ddof=0), ``feat_min``,
          ``feat_max``, ``feat_median``
        - ``feat_skew``, ``feat_kurt`` (SciPy defaults)
        - ``feat_energy``: mean of the squared values
        - ``feat_zcr``: fraction of adjacent column pairs whose sign differs
          (an exact zero counts as its own sign); 0.0 when there is only one
          column, since there are no adjacent pairs to compare.
    """
    X_np = X_df.to_numpy()
    n_features = X_np.shape[1]

    # True wherever two neighbouring columns have different signs.
    sign_changes = np.diff(np.sign(X_np), axis=1) != 0

    # With a single column there are no neighbouring pairs; clamp the
    # denominator so the rate is 0.0 instead of 0/0 = NaN.
    n_pairs = max(n_features - 1, 1)

    return pd.DataFrame({
        "feat_mean": X_np.mean(axis=1),
        "feat_std": X_np.std(axis=1),
        "feat_min": X_np.min(axis=1),
        "feat_max": X_np.max(axis=1),
        "feat_median": np.median(X_np, axis=1),
        "feat_skew": skew(X_np, axis=1),
        "feat_kurt": kurtosis(X_np, axis=1),
        "feat_energy": (X_np ** 2).sum(axis=1) / n_features,
        "feat_zcr": sign_changes.sum(axis=1) / n_pairs,
    })

# Run the same per-row summary on the training split first, then the test split.
X_train_feat, X_test_feat = map(
    extract_simple_features, (X_train_raw, X_test_raw)
)

print("Engineered train shape:", X_train_feat.shape)
print("Engineered test shape:", X_test_feat.shape)

# Preview the first ten engineered training rows.
print("\nSample of Engineered Features (Train Set):")
display(X_train_feat.iloc[:10])

# ======================
#  SCALE FEATURES
# ======================
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply
# the same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train_feat)
X_train = scaler.transform(X_train_feat)
X_test = scaler.transform(X_test_feat)

# Fix for the XGBoost label error: shift the activity ids from 1-6 to 0-5.
y_train = y_train["Activity"].to_numpy() - 1
y_test = y_test["Activity"].to_numpy() - 1

# =======================
# TRAIN FOUR MODELS
# =======================
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Candidate classifiers, keyed by the display name used in the printed report.
models = {
    # With the lbfgs solver a multiclass target is already fitted as a
    # multinomial model; the explicit `multi_class` argument is deprecated in
    # recent scikit-learn releases, so it is left out.
    "Logistic Regression": LogisticRegression(
        solver="lbfgs", max_iter=1000
    ),
    # probability=True enables predict_proba. The internal probability
    # calibration involves random shuffling, so it is seeded like the other
    # models to keep the reported probabilities reproducible.
    "SVM (RBF Kernel)": SVC(
        kernel="rbf", probability=True, random_state=42
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=200, random_state=42
    ),
    "XGBoost": XGBClassifier(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=4,
        subsample=0.9,
        colsample_bytree=0.9,
        objective="multi:softprob",
        num_class=6,
        eval_metric="mlogloss",
        random_state=42
    )
}

# ================
#  EVALUATION
# ================
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score
)
from sklearn.preprocessing import label_binarize

# Distinct class ids seen in training. They fix the row/column order of the
# confusion matrix and the columns of the indicator matrix used for ROC-AUC.
class_labels = np.unique(y_train)
y_test_bin = label_binarize(y_test, classes=class_labels)

# One summary record per model, collected for the final comparison table.
results = []

for name, model in models.items():
    print(
        "\n==============================",
        f"Training model: {name}",
        "==============================",
        sep="\n",
    )

    # Fit on the scaled training split, then predict the held-out test split.
    model.fit(X_train, y_train)
    y_pred, y_proba = model.predict(X_test), model.predict_proba(X_test)

    # Weighted-average metrics; zero_division=0 scores a class with no
    # predictions as 0.
    acc = accuracy_score(y_test, y_pred)
    prec, rec, f1 = (
        scorer(y_test, y_pred, average="weighted", zero_division=0)
        for scorer in (precision_score, recall_score, f1_score)
    )
    roc = roc_auc_score(y_test_bin, y_proba, average="macro", multi_class="ovr")

    results.append(dict(zip(
        ("Model", "Accuracy", "Precision", "Recall", "F1-score", "ROC-AUC"),
        (name, acc, prec, rec, f1, roc),
    )))

    print(f"Accuracy   : {acc:.4f}")
    print(f"Precision  : {prec:.4f}")
    print(f"Recall     : {rec:.4f}")
    print(f"F1-score   : {f1:.4f}")
    print(f"ROC-AUC    : {roc:.4f}")

    # Confusion matrix, labelled with class ids on both axes.
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    print("\nConfusion Matrix:")
    print(pd.DataFrame(cm, index=class_labels, columns=class_labels))

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))

# Side-by-side comparison of all models.
results_df = pd.DataFrame(results)
print("\n\n=== MODEL COMPARISON SUMMARY ===")
print(results_df)

# Attach the labels to the (unscaled) engineered features of each split and
# stack train above test under one continuous index.
features_df = pd.concat(
    [
        split.assign(Activity=labels)
        for split, labels in ((X_train_feat, y_train), (X_test_feat, y_test))
    ],
    ignore_index=True,
)

# Write the combined table to disk without the row index.
features_df.to_csv("features.csv", index=False)

print("features.csv created successfully!")
print("Shape:", features_df.shape)

# Offer the CSV as a browser download from Colab.
from google.colab import files
files.download("features.csv")



Saving UCI HAR Dataset.zip to UCI HAR Dataset.zip
Uploaded file: UCI HAR Dataset.zip
Dataset extracted successfully!
Raw training shape: (7352, 561)
Raw test shape: (2947, 561)
Engineered train shape: (7352, 9)
Engineered test shape: (2947, 9)

Sample of Engineered Features (Train Set):


Unnamed: 0,feat_mean,feat_std,feat_min,feat_max,feat_median,feat_skew,feat_kurt,feat_energy,feat_zcr
0,-0.616973,0.588073,-1.0,1.0,-0.98518,1.360807,0.533957,0.726486,0.208929
1,-0.647453,0.535441,-1.0,0.997122,-0.986869,1.358073,0.727841,0.705892,0.1875
2,-0.652026,0.529502,-1.0,0.997122,-0.986831,1.359777,0.737754,0.70551,0.183929
3,-0.657535,0.531969,-1.0,1.0,-0.989413,1.412143,0.897555,0.715344,0.180357
4,-0.654058,0.533182,-1.0,0.997453,-0.990915,1.388527,0.79377,0.712075,0.1875
5,-0.659554,0.531774,-1.0,0.997453,-0.992264,1.399139,0.790504,0.717795,0.194643
6,-0.645117,0.535179,-1.0,0.996577,-0.987466,1.3357,0.657702,0.702592,0.175
7,-0.645325,0.534973,-1.0,0.996577,-0.986829,1.31063,0.551684,0.702641,0.191071
8,-0.641482,0.53948,-1.0,0.996723,-0.989371,1.294952,0.484073,0.702538,0.205357
9,-0.647291,0.523289,-1.0,0.994433,-0.98537,1.313783,0.632616,0.692817,0.183929



Training model: Logistic Regression




Accuracy   : 0.5541
Precision  : 0.5531
Recall     : 0.5541
F1-score   : 0.5492
ROC-AUC    : 0.8837

Confusion Matrix:
     0    1    2    3    4    5
0  370   64   62    0    0    0
1  141  289   41    0    0    0
2  108   90  222    0    0    0
3    0    0    0  150  194  147
4    1    2    0  108  295  126
5    0    0    0  120  110  307

Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.75      0.66       496
           1       0.65      0.61      0.63       471
           2       0.68      0.53      0.60       420
           3       0.40      0.31      0.35       491
           4       0.49      0.55      0.52       532
           5       0.53      0.57      0.55       537

    accuracy                           0.55      2947
   macro avg       0.56      0.55      0.55      2947
weighted avg       0.55      0.55      0.55      2947


Training model: SVM (RBF Kernel)
Accuracy   : 0.5507
Precision  : 0.5516
Recall     : 0.5

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>