In [2]:
# Import necessary libraries
import numpy as np  # For numerical operations and handling arrays
import pandas as pd  # For data manipulation and analysis
import matplotlib.pyplot as plt  # For data visualization using plots
import seaborn as sns  # For advanced data visualization

# Import preprocessing and machine learning libraries
from sklearn.preprocessing import StandardScaler  # Standardizes features by scaling
from sklearn.tree import DecisionTreeClassifier  # Decision Tree algorithm for classification
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier  # Random Forest and AdaBoost ensemble classifiers
from sklearn.linear_model import LogisticRegression  # Logistic Regression classifier
from sklearn.model_selection import KFold, LeaveOneGroupOut  # For cross-validation strategies
from sklearn.metrics import (  # For model performance evaluation
    accuracy_score,  # To calculate model accuracy
    precision_recall_fscore_support,  # For precision, recall, and F1-score metrics
    classification_report  # To generate a detailed performance report
)

# Code functionality:
# 1. Importing necessary libraries for data manipulation, visualization, preprocessing, and machine learning.
# 2. Provides tools for building classifiers, evaluating model performance, and performing cross-validation.

# Next steps would involve loading the dataset, preprocessing the data, training models,
# and evaluating their performance using metrics and visualizations.



In [3]:
# Load the training data (features) from a text file
X_train = np.loadtxt("UCI HAR Dataset/train/X_train.txt")  
# X_train contains the feature values for the training set.

# Load the training labels (target/output) from a text file and convert them to integers
y_train = np.loadtxt("UCI HAR Dataset/train/y_train.txt").astype(int)  
# y_train contains the activity labels corresponding to X_train.

# Load the test data (features) from a text file
X_test = np.loadtxt("UCI HAR Dataset/test/X_test.txt")  
# X_test contains the feature values for the test set.

# Load the test labels (target/output) from a text file and convert them to integers
y_test = np.loadtxt("UCI HAR Dataset/test/y_test.txt").astype(int)  
# y_test contains the activity labels corresponding to X_test.

# Load the subject identifiers (who performed the activities) for the training data
subjects = np.loadtxt("UCI HAR Dataset/train/subject_train.txt").astype(int)  
# subjects contains the IDs of individuals performing the activities in the training set.

# Overall:
# - The data comes from the UCI Human Activity Recognition (HAR) dataset.
# - `X_train` and `X_test` contain the features for training and testing, respectively.
# - `y_train` and `y_test` contain the corresponding activity labels (target classes).
# - `subjects` gives information about the subjects involved in the training data.


In [4]:
# Print the shape (dimensions) of the training features and labels
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")  
# X_train.shape gives the number of samples (rows) and features (columns) in the training dataset.
# y_train.shape gives the number of labels (rows) in the training dataset.

# Print the shape (dimensions) of the test features and labels
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")  
# X_test.shape gives the number of samples (rows) and features (columns) in the test dataset.
# y_test.shape gives the number of labels (rows) in the test dataset.

# Verification step:
# - Check if the data has been loaded correctly and examine the dimensions of the datasets.
# - Ensure that the number of labels matches the number of samples in both training and test sets.


X_train shape: (7352, 561), y_train shape: (7352,)
X_test shape: (2947, 561), y_test shape: (2947,)


In [5]:
# Initialize the StandardScaler to normalize the data
scaler = StandardScaler()

# Normalize the training data using fit_transform
X_train_scaled = scaler.fit_transform(X_train)

# Normalize the test data using transform (without fitting again)
X_test_scaled = scaler.transform(X_test)


In [6]:
# Lists to store training and testing accuracy
training_acc = []
testing_acc = []


In [7]:
X_train_df = pd.DataFrame(X_train)
correlation_matrix = X_train_df.corr().abs()

upper_triangle = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))

to_drop = [column for column in upper_triangle.columns if any(upper_triangle[column] > 0.9)]

X_train_reduced = X_train_df.drop(columns=to_drop)
X_test_reduced = pd.DataFrame(X_test).drop(columns=to_drop)

print(f"Original features: {X_train.shape[1]}, Reduced features: {X_train_reduced.shape[1]}")

Original features: 561, Reduced features: 202


In [8]:
correlation_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,551,552,553,554,555,556,557,558,559,560
0,1.000000,0.148061,0.256952,0.000619,0.021903,0.044617,0.006290,0.022754,0.047558,0.044062,...,0.030681,0.017557,0.015613,0.544320,0.012173,0.037444,0.028844,0.035257,0.034371,0.028242
1,0.148061,1.000000,0.078769,0.045160,0.044920,0.049746,0.044180,0.045049,0.050402,0.038108,...,0.022395,0.001587,0.004459,0.070559,0.013541,0.017967,0.075679,0.005309,0.001053,0.013903
2,0.256952,0.078769,1.000000,0.020217,0.016641,0.008410,0.018747,0.015203,0.001988,0.037197,...,0.020481,0.020091,0.019127,0.052841,0.039836,0.063609,0.034037,0.008587,0.015288,0.022643
3,0.000619,0.045160,0.020217,1.000000,0.927461,0.851668,0.998632,0.920888,0.846392,0.980844,...,0.065987,0.148034,0.115565,0.035011,0.021633,0.018985,0.024810,0.371653,0.471065,0.394825
4,0.021903,0.044920,0.016641,0.927461,1.000000,0.895510,0.922803,0.997347,0.894509,0.917366,...,0.105621,0.206227,0.176946,0.020379,0.012505,0.008507,0.014592,0.380531,0.523600,0.433169
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
556,0.037444,0.017967,0.063609,0.018985,0.008507,0.018429,0.019389,0.012546,0.023525,0.025066,...,0.026615,0.034514,0.024553,0.006269,0.009141,1.000000,0.116001,0.005853,0.012313,0.019903
557,0.028844,0.075679,0.034037,0.024810,0.014592,0.006471,0.024951,0.012341,0.007231,0.028871,...,0.000102,0.017937,0.014865,0.020823,0.035263,0.116001,1.000000,0.023995,0.005869,0.005656
558,0.035257,0.005309,0.008587,0.371653,0.380531,0.345011,0.368191,0.377025,0.347389,0.384192,...,0.087332,0.086006,0.079751,0.011880,0.023246,0.005853,0.023995,1.000000,0.783848,0.643655
559,0.034371,0.001053,0.015288,0.471065,0.523600,0.476006,0.466424,0.525081,0.477607,0.480229,...,0.100125,0.086993,0.078079,0.001540,0.012990,0.012313,0.005869,0.783848,1.000000,0.594885


In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Step 1: Normalize the features
scaler = StandardScaler()

# Assuming X_train and X_test are NumPy arrays; Normalize them
X_train_normalized = scaler.fit_transform(X_train)  # Fit and transform training data
X_test_normalized = scaler.transform(X_test)  # Transform test data

# Step 2: Convert to pandas DataFrame to use iloc and calculate correlation
X_train_df = pd.DataFrame(X_train_normalized)
X_test_df = pd.DataFrame(X_test_normalized)

# Step 3: Calculate the correlation matrix
correlation_matrix = X_train_df.corr().abs()

# Step 4: Get the upper triangle of the correlation matrix
upper_triangle = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))

# Step 5: Identify columns with correlation greater than 0.9
to_drop = [column for column in upper_triangle.columns if any(upper_triangle[column] > 0.9)]

# Step 6: Create reduced datasets by dropping highly correlated features
X_train_reduced = X_train_df.drop(columns=to_drop)
X_test_reduced = X_test_df.drop(columns=to_drop)

# Print the number of original and reduced features
print(f"Original features: {X_train.shape[1]}, Reduced features: {X_train_reduced.shape[1]}")

# Step 7: List of models to evaluate
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
}

# Step 8: Initialize K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Step 9: Store results for each model
results = {model_name: {"accuracy": [], "precision": [], "recall": [], "f1": []} for model_name in models.keys()}

# Step 10: Evaluate each model using K-Fold Cross-Validation
for model_name, model in models.items():
    for train_index, test_index in kf.split(X_train_reduced):
        # Split data using NumPy indexing
        X_train_fold, X_test_fold = X_train_reduced.iloc[train_index], X_train_reduced.iloc[test_index]
        
        # Use NumPy indexing for y_train (No .iloc method available)
        y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]

        # Train the model
        model.fit(X_train_fold, y_train_fold)

        # Predict
        y_pred_fold = model.predict(X_test_fold)

        # Compute performance metrics for each fold
        results[model_name]["accuracy"].append(accuracy_score(y_test_fold, y_pred_fold))
        results[model_name]["precision"].append(precision_score(y_test_fold, y_pred_fold, average='weighted'))
        results[model_name]["recall"].append(recall_score(y_test_fold, y_pred_fold, average='weighted'))
        results[model_name]["f1"].append(f1_score(y_test_fold, y_pred_fold, average='weighted'))

# Step 11: Compute average metrics for each model
for model_name in results.keys():
    for metric in results[model_name].keys():
        results[model_name][metric] = np.mean(results[model_name][metric])

# Step 12: Print the results
for model_name, metrics in results.items():
    print(f"{model_name}:")
    for metric, value in metrics.items():
        print(f"  {metric.capitalize()}: {value:.4f}")


Original features: 561, Reduced features: 202


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Random Forest:
  Accuracy: 0.9780
  Precision: 0.9780
  Recall: 0.9780
  F1: 0.9780
Decision Tree:
  Accuracy: 0.9358
  Precision: 0.9361
  Recall: 0.9358
  F1: 0.9358
Logistic Regression:
  Accuracy: 0.9740
  Precision: 0.9741
  Recall: 0.9740
  F1: 0.9740
AdaBoost:
  Accuracy: 0.3890
  Precision: 0.5334
  Recall: 0.3890
  F1: 0.2430


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
