## Dataset Summary:

- **Dataset Name:** Human Activity Recognition Using Smartphones (UCI HAR)
- **Source:** UCI Machine Learning Repository  
  https://archive.ics.uci.edu/dataset/240/human+activity+recognition+using+smartphones
- **Samples:** 10,299 total observations  
- **Classes:** 6 human activity labels  
- **Participants:** 30 subjects  
- **Sensors Used:** Tri-axial Accelerometer and Tri-axial Gyroscope  
- **Features Extracted by Dataset:** 561 time-domain and frequency-domain variables  
- **Problem Type:** Multiclass Classification


In [None]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# turn off warnings 
import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv("motioniq_cleaned_dataset.csv")



df.head()


In [None]:
sensor_cols = df.select_dtypes(include=[np.number]).columns
sensor_cols = [c for c in sensor_cols if c not in ['subject', 'activity_id']]

len(sensor_cols), sensor_cols[:10]

In [19]:


#Feature Engineering
stat_features = pd.DataFrame()

# Basc stats
stat_features['feat_mean']     = df[sensor_cols].mean(axis=1)
stat_features['feat_median']   = df[sensor_cols].median(axis=1)
stat_features['feat_variance'] = df[sensor_cols].var(axis=1)
stat_features['feat_std']      = df[sensor_cols].std(axis=1)
stat_features['feat_min']      = df[sensor_cols].min(axis=1)
stat_features['feat_max']      = df[sensor_cols].max(axis=1)

# Advanced stats 
stat_features['feat_skew']     = df[sensor_cols].skew(axis=1)
stat_features['feat_kurtosis'] = df[sensor_cols].kurtosis(axis=1)

stat_features.head()


Unnamed: 0,feat_mean,feat_median,feat_variance,feat_std,feat_min,feat_max,feat_skew,feat_kurtosis
0,-0.615876,-0.985106,0.346507,0.5886484,-1.0,1.0,1.35811,0.533846
1,35586.542312,-0.986854,711743800000.0,843649.1,-1.0,20000000.0,23.706539,562.0
2,71173.726358,-0.986793,2846975000000.0,1687298.0,-1.0,40000000.0,23.706539,562.0
3,106760.909471,-0.989358,6405694000000.0,2530947.0,-1.0,60000000.0,23.706539,562.0
4,142348.101554,-0.9908,11387900000000.0,3374596.0,-1.0,80000000.0,23.706539,562.0


In [18]:
#Temporal Features
temporal_features = pd.DataFrame()

# Simple Moving Average (SMA) per sample
temporal_features['feat_sma'] = df[sensor_cols].abs().sum(axis=1) / len(sensor_cols)

# Energy
temporal_features['feat_energy'] = (df[sensor_cols]**2).sum(axis=1) / len(sensor_cols)

# Peak-to-Peak amplitude
temporal_features['feat_ptp'] = df[sensor_cols].max(axis=1) - df[sensor_cols].min(axis=1)

# Zero Crossing Rate (approx)
temporal_features['feat_zcr'] = ((df[sensor_cols].diff() * df[sensor_cols].shift(-1)) < 0).sum(axis=1) / len(sensor_cols)

temporal_features.head()


Unnamed: 0,feat_sma,feat_energy,feat_ptp,feat_zcr
0,0.787174,0.7251932,2.0,0.0
1,35587.948923,711743800000.0,20000001.0,0.418149
2,71175.137229,2846975000000.0,40000001.0,0.512456
3,106762.334835,6405694000000.0,60000001.0,0.302491
4,142349.525162,11387900000000.0,80000001.0,0.398577


In [17]:
from scipy.fft import fft
from scipy.stats import entropy

# Frequency Features
freq_features = pd.DataFrame()

for col in sensor_cols:
    x = df[col].values
    # FFT
    spectrum = np.abs(fft(x))
    spectrum_norm = spectrum / np.sum(spectrum + 1e-12)  # normalize
    # Spectral Entropy
    freq_features.loc[:, f'{col}_spec_entropy'] = [entropy(spectrum_norm)]
    # Spectral Centroid
    freq_features.loc[:, f'{col}_spec_centroid'] = [np.sum(np.arange(len(spectrum))*spectrum)/np.sum(spectrum + 1e-12)]

# transposing to match row-wise format
freq_features = freq_features.transpose().reset_index(drop=True).T
freq_features.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1114,1115,1116,1117,1118,1119,1120,1121,1122,1123
0,8.865698,4923.70401,9.026558,5120.769652,8.973748,5031.456356,8.22482,4956.233819,8.367969,5016.748614,...,9.07573,5148.386894,8.310693,5035.413065,8.351498,5123.784386,8.36169,5125.769688,6.407661,4409.92685


In [16]:
# Combine all features
X = pd.concat([stat_features, temporal_features, freq_features], axis=1)

# Add target col
y = df['activity']

print("Shape of final feature matrix:", X.shape)
print("Sample target labels:")
y.value_counts()


Shape of final feature matrix: (10299, 1136)
Sample target labels:


activity
LAYING                1944
STANDING              1906
SITTING               1777
WALKING               1722
WALKING_UPSTAIRS      1544
WALKING_DOWNSTAIRS    1406
Name: count, dtype: int64

In [15]:
# Ml Models
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

df = pd.read_csv("motioniq_cleaned_dataset.csv")  # your cleaned dataset

df = df.dropna()

# fts and target
X = df.drop(columns=['activity', 'subject', 'activity_id', 'timestamp', 'datetime'])
y = df['activity']

# Encode target labels to int(for XGBoost & sklearn models)
le = LabelEncoder()
y = le.fit_transform(y)

#train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features for LR & SVM

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
 #initialize models
models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "SVM": SVC(probability=True),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
}

#training and evaluation
for name, model in models.items():
    print(f"--- {name} ---")
    
    # Scale only for LR & SVM
    if name in ["Logistic Regression", "SVM"]:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        y_prob = model.predict_proba(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)
    
    # Metrics
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted')
    rec = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    cm = confusion_matrix(y_test, y_pred)
    
    print("Accuracy:", acc)
    print("Precision:", prec)
    print("Recall:", rec)
    print("F1-score:", f1)
    print("Confusion Matrix:\n", cm)
    
    # ROC-AUC (multi-class)
    try:
        roc_auc = roc_auc_score(pd.get_dummies(y_test), y_prob, multi_class='ovr')
        print("ROC-AUC:", roc_auc)
    except:
        print("ROC-AUC: Skipped (multi-class handling issue)")
    
    print("\n")


--- Logistic Regression ---
Accuracy: 0.9854368932038835
Precision: 0.9854349572903074
Recall: 0.9854368932038835
F1-score: 0.9854140010883592
Confusion Matrix:
 [[389   0   0   0   0   0]
 [  1 340  14   0   0   1]
 [  0  10 371   0   0   0]
 [  0   0   0 344   0   0]
 [  0   0   0   1 280   0]
 [  0   0   0   2   1 306]]
ROC-AUC: 0.9994552117657127


--- SVM ---
Accuracy: 0.9825242718446602
Precision: 0.9825815394286026
Recall: 0.9825242718446602
F1-score: 0.982522619483553
Confusion Matrix:
 [[389   0   0   0   0   0]
 [  0 343  12   0   0   1]
 [  0  19 362   0   0   0]
 [  0   0   0 342   2   0]
 [  0   0   0   0 280   1]
 [  0   0   0   0   1 308]]
ROC-AUC: 0.9995255323982984


--- Random Forest ---
Accuracy: 0.9771844660194174
Precision: 0.977379706230731
Recall: 0.9771844660194174
F1-score: 0.9772087024767577
Confusion Matrix:
 [[389   0   0   0   0   0]
 [  0 341  14   0   0   1]
 [  0  12 369   0   0   0]
 [  0   0   0 336   4   4]
 [  0   0   0   0 272   9]
 [  0   0   0   0