# Importing Libraries and Loading Data

In [11]:
import requests
from bs4 import BeautifulSoup
import zipfile
import io
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import time

def loadData():
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.zip'
    response = requests.get(url)

    if response.status_code == 200:
        with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
            with zip_ref.open('UCI HAR Dataset/train/X_train.txt') as myfile:
                df = pd.read_csv(myfile, delim_whitespace=True, header=None)
            with zip_ref.open('UCI HAR Dataset/train/y_train.txt') as myfile_y:
                y = pd.read_csv(myfile_y, delim_whitespace=True, header=None)
    else:
        raise Exception("Failed to download or parse the dataset.")

    return df, y


In [12]:
df, y = loadData()

labelEncoder = LabelEncoder()
yEncoded = labelEncoder.fit_transform(y.values.ravel())

  df = pd.read_csv(myfile, delim_whitespace=True, header=None)
  y = pd.read_csv(myfile_y, delim_whitespace=True, header=None)


In [13]:
standardizer = StandardScaler()
dfStandardized = standardizer.fit_transform(df)

# print(df[:20])

X_train_full, X_test_full, y_train, y_test = train_test_split(dfStandardized, yEncoded, test_size=0.2, random_state=42)

# Baseline model with all features using Naive Bayes

In [15]:
from sklearn.naive_bayes import GaussianNB

startTime = time.time()
classifierPipelineComplete = Pipeline([
    ('classifier', GaussianNB())
])
classifierPipelineComplete.fit(X_train_full, y_train)
y_pred_full = classifierPipelineComplete.predict(X_test_full)
endTime = time.time()
fullFeaturesTime = endTime - startTime
accuracyFull = accuracy_score(y_test, y_pred_full)

# K-Means for dimensionality reduction

In [17]:
from sklearn.cluster import KMeans

noOfClusters = 50
kMeans = KMeans(n_clusters=noOfClusters, random_state=42, n_init=10)
kMeans.fit(dfStandardized.T)
selectedFeatures = [np.random.choice(np.where(kMeans.labels_ == i)[0]) for i in range(noOfClusters)]
dfReduced = dfStandardized[:, selectedFeatures]

# Split the data with reduced features

In [19]:
X_train_reduced, X_test_reduced = train_test_split(dfReduced, test_size=0.2, random_state=42)

# Model with reduced features using Naive Bayes

In [20]:
from sklearn.naive_bayes import GaussianNB

startTime = time.time()
classifierPipelineReduced = Pipeline([
    ('classifier', GaussianNB())
])
classifierPipelineReduced.fit(X_train_reduced, y_train)
y_pred_reduced = classifierPipelineReduced.predict(X_train_reduced)
endTime = time.time()
reducedFeaturesTime = endTime - startTime
accuracyReduced = accuracy_score(y_test, y_pred_full)

# Print comparison results

In [21]:
print("Baseline Model (All Features):")
print("Accuracy:", accuracyFull)
print("Training Time:", fullFeaturesTime, "seconds")
print("Number of Features:", X_train_full.shape[1])

print("\nModel with Reduced Features (K-Means):")
print("Accuracy:", accuracyReduced)
print("Training Time:", reducedFeaturesTime, "seconds")
print("Number of Features:", noOfClusters)


Baseline Model (All Features):
Accuracy: 0.7314751869476547
Training Time: 0.1039283275604248 seconds
Number of Features: 561

Model with Reduced Features (K-Means):
Accuracy: 0.7314751869476547
Training Time: 0.009704113006591797 seconds
Number of Features: 50
