In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import requests
import zipfile
import io

In [None]:
# --- Load the UCI HAR dataset -------------------------------------------------
# Download the archive into memory; nothing is written to disk.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.zip'
r = requests.get(url, allow_redirects=True, timeout=60)
# Fail loudly on an HTTP error instead of handing an error page to zipfile.
r.raise_for_status()
z = zipfile.ZipFile(io.BytesIO(r.content))

# Locations of the files used below, relative to the archive root.
path = 'UCI HAR Dataset/'
features_path = path + 'features.txt'
activity_labels_path = path + 'activity_labels.txt'
x_train_path = path + 'train/X_train.txt'
y_train_path = path + 'train/y_train.txt'
x_test_path = path + 'test/X_test.txt'
y_test_path = path + 'test/y_test.txt'


def _read_member(archive, member, **read_csv_kwargs):
    """Parse one archive member with ``pd.read_csv`` and close its handle afterwards."""
    with archive.open(member) as handle:
        return pd.read_csv(handle, **read_csv_kwargs)


features = _read_member(z, features_path, sep=' ', header=None, names=['id', 'name'])
# read_csv refuses duplicated entries in `names`, so any repeated feature name
# gets a numeric suffix (the first occurrence keeps its original spelling).
# When every name is already unique this leaves the list unchanged.
repeat_rank = features.groupby('name').cumcount()
feature_names = features['name'].where(
    repeat_rank == 0, features['name'] + '_' + repeat_rank.astype(str)
).tolist()
activity_labels = _read_member(
    z, activity_labels_path, sep=' ', header=None, names=['id', 'activity']
)

# sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
X_train = _read_member(z, x_train_path, sep=r'\s+', header=None, names=feature_names)
y_train = _read_member(z, y_train_path, header=None, names=['activity_id'])

X_test = _read_member(z, x_test_path, sep=r'\s+', header=None, names=feature_names)
y_test = _read_member(z, y_test_path, header=None, names=['activity_id'])

# Stack train and test; ignore_index gives both tables the same clean 0..n-1 index.
X = pd.concat([X_train, X_test], axis=0, ignore_index=True)
y = pd.concat([y_train, y_test], axis=0, ignore_index=True)

# Translate numeric ids to activity names with an order-preserving lookup.
# (An inner merge may regroup rows by key on older pandas releases, which would
# silently misalign the labels with the rows of X.)
id_to_activity = activity_labels.set_index('id')['activity']
y = y['activity_id'].map(id_to_activity).rename('activity')

if y.isna().any():
    raise ValueError("Some activity ids have no entry in activity_labels.txt")
if len(X) != len(y):
    raise ValueError(f"Feature/label row count mismatch: {len(X)} vs {len(y)}")

print("Dataset Shape:")
print(X.shape)

print("\nClass Distribution:")
print(y.value_counts())

In [None]:
# Standardise the features before PCA: learn the per-column statistics from X,
# then apply them to the same data. `scaler` is kept so the fitted statistics
# stay available to later cells.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Fit PCA with every component retained so the whole variance spectrum is available.
pca_full = PCA().fit(X_scaled)

# Running total of the explained-variance ratios, one entry per component.
cumulative_variance = pca_full.explained_variance_ratio_.cumsum()

# Smallest number of components whose cumulative variance reaches 90% / 95%.
# flatnonzero yields 0-based positions, hence the +1 to turn them into counts.
reaches_90 = np.flatnonzero(cumulative_variance >= 0.90)
reaches_95 = np.flatnonzero(cumulative_variance >= 0.95)
n_components_90 = reaches_90[0] + 1
n_components_95 = reaches_95[0] + 1

for percent, needed in ((90, n_components_90), (95, n_components_95)):
    print(f"Components to retain {percent}% variance: {needed}")

In [None]:
# Cumulative explained variance against component count, with the 90% and 95%
# thresholds drawn as dashed reference lines.
fig, ax = plt.subplots(figsize=(12, 8))

component_counts = np.arange(1, cumulative_variance.size + 1)
ax.plot(component_counts, cumulative_variance, marker='.', linestyle='-')

for level, colour, caption in (
    (0.90, 'r', '90% Variance Threshold'),
    (0.95, 'g', '95% Variance Threshold'),
):
    ax.axhline(y=level, color=colour, linestyle='--', label=caption)

ax.set_title('Cumulative Explained Variance by Principal Components')
ax.set_xlabel('Number of Principal Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.grid()
ax.legend()
plt.show()

In [None]:
# Project the standardised data onto the first two principal components.
# random_state is pinned because svd_solver='auto' can select the randomized
# solver, whose output would otherwise vary from run to run.
pca_2d = PCA(n_components=2, random_state=42)
X_pca_2d = pca_2d.fit_transform(X_scaled)
pca_2d_df = pd.DataFrame(data=X_pca_2d, columns=['PC1', 'PC2'])
# Labels are attached by position, so y must be in the same row order as X_scaled.
pca_2d_df['activity'] = y.values

fig, ax = plt.subplots(figsize=(12, 9))
# Iterate in activity_labels order so legend order and colours are stable.
targets = activity_labels['activity'].tolist()
for activity in targets:
    subset = pca_2d_df[pca_2d_df['activity'] == activity]
    ax.scatter(subset['PC1'], subset['PC2'], label=activity, alpha=0.7)

ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('2D PCA of HAR Dataset')
# Anchor the legend outside the axes so it does not cover the points.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid()
plt.show()

In [None]:
# Project the standardised data onto the first three principal components.
# random_state is pinned because svd_solver='auto' can select the randomized
# solver, whose output would otherwise vary from run to run.
pca_3d = PCA(n_components=3, random_state=42)
X_pca_3d = pca_3d.fit_transform(X_scaled)
pca_3d_df = pd.DataFrame(data=X_pca_3d, columns=['PC1', 'PC2', 'PC3'])
# Labels are attached by position, so y must be in the same row order as X_scaled.
pca_3d_df['activity'] = y.values

fig = plt.figure(figsize=(12, 10))
ax = fig.add_subplot(111, projection='3d')

# Iterate in activity_labels order so legend order and colours are stable.
targets = activity_labels['activity'].tolist()
for activity in targets:
    subset = pca_3d_df[pca_3d_df['activity'] == activity]
    ax.scatter(subset['PC1'], subset['PC2'], subset['PC3'], label=activity)

ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_zlabel('Principal Component 3')
ax.set_title('3D PCA of HAR Dataset')
# Anchor the legend outside the axes so it does not cover the points.
ax.legend(bbox_to_anchor=(1.1, 1), loc='upper left')
plt.show()