In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # plotting; needed by cells that run before the later re-imports
import seaborn as sns # statistical plots (barplot, heatmap)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Root folder of the training split; every sub-directory is treated as one activity.
# NOTE: this cell uses `plt` and `sns`, which must be imported before it runs
# (they are part of the notebook's top import cell).
base_path = '/kaggle/working/Combined/Train'

# Activity names = immediate sub-directories, sorted for a stable plotting order.
activities = sorted(
    entry for entry in os.listdir(base_path)
    if os.path.isdir(os.path.join(base_path, entry))
)

# Number of .csv files found in each activity folder.
activity_counts = {
    activity: sum(
        name.endswith('.csv')
        for name in os.listdir(os.path.join(base_path, activity))
    )
    for activity in activities
}

# Plot: bar chart of how many sample files each activity has.
plt.figure(figsize=(8, 4))
sns.barplot(x=list(activity_counts.keys()), y=list(activity_counts.values()))
plt.xticks(rotation=45)
plt.title("Number of Samples per Activity (Train Set)")
plt.xlabel("Activity")
plt.ylabel("Sample Count")
plt.tight_layout()
plt.show()


In [None]:
# Preview one WALKING sample file.
# os.listdir returns entries in arbitrary order, so sort to pick the same file on every run.
walking_dir = os.path.join(base_path, 'WALKING')
sample_file = sorted(os.listdir(walking_dir))[0]
df_sample = pd.read_csv(os.path.join(walking_dir, sample_file), header=None)

print(f"Sample shape: {df_sample.shape}")
df_sample.head()

In [None]:
from glob import glob

# One panel per activity showing the acceleration magnitude of a single sample file.
plt.figure(figsize=(15, 8))

for i, activity in enumerate(activities):
    # glob's result order depends on the file system; sort so the same sample is
    # plotted on every run.
    sample_path = sorted(glob(os.path.join(base_path, activity, '*.csv')))[0]

    # Coerce every column to numbers and drop rows containing anything unparsable.
    # `df` stays bound to the last activity's cleaned frame; the following cells read it.
    df = pd.read_csv(sample_path, header=None).apply(pd.to_numeric, errors='coerce').dropna()

    # Euclidean norm over all columns of each row.
    total_acc = np.sqrt((df**2).sum(axis=1))

    plt.subplot(2, 3, i+1)  # fixed 2x3 grid: room for at most six activities
    plt.plot(total_acc)
    plt.title(activity)
    plt.xticks([])

plt.suptitle("Total Acceleration per Sample by Activity", y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# `df` is not created in this cell: it is whatever frame the previously executed
# cell left behind. If that frame already went through dropna(), every count
# printed here is zero by construction.
null_counts = df.isna().sum()
print(null_counts)

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of the
# leftover frame `df` (defined by an earlier cell, not here).
axis_corr = df.corr()
sns.heatmap(axis_corr, annot=True, cmap='coolwarm')
plt.title("Correlation between acceleration axes")
plt.show()


In [None]:
# Per-activity summary: mean and standard deviation of the acceleration magnitude,
# computed from one sample file per activity.
for activity in activities:
    path = os.path.join(base_path, activity)
    # os.listdir order is arbitrary; sort so the same file is used on every run.
    file = sorted(os.listdir(path))[0]
    df = pd.read_csv(os.path.join(path, file), header=None)

    # Convert to numeric safely: unparsable cells become NaN and their rows are dropped.
    df = df.apply(pd.to_numeric, errors='coerce')
    df.dropna(inplace=True)

    # Name the columns; the slice keeps the assignment valid for files with fewer than three columns.
    df.columns = ['acc_x', 'acc_y', 'acc_z'][:df.shape[1]]
    df['acc_mag'] = np.sqrt(df['acc_x']**2 + df['acc_y']**2 + df['acc_z']**2)
    print(f"{activity:<20} | Mean Acc Magnitude: {df['acc_mag'].mean():.2f} | Std: {df['acc_mag'].std():.2f}")

In [None]:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import os
import pandas as pd

# One 3D panel per activity tracing the (x, y, z) acceleration path of a single sample file.
fig = plt.figure(figsize=(18, 10))

for i, activity in enumerate(activities):
    path = os.path.join(base_path, activity)
    # os.listdir order is arbitrary; sort so index 5 always refers to the same file.
    file = sorted(os.listdir(path))[5]
    df = pd.read_csv(os.path.join(path, file), header=None)

    # Convert to numeric and clean: rows with unparsable cells are dropped.
    df = df.apply(pd.to_numeric, errors='coerce').dropna()
    df.columns = ['acc_x', 'acc_y', 'acc_z'][:df.shape[1]]

    # Subplot (fixed 2x3 grid: room for at most six activities)
    ax = fig.add_subplot(2, 3, i+1, projection='3d')
    ax.plot(df['acc_x'], df['acc_y'], df['acc_z'], linewidth=0.8)
    ax.set_title(f"{activity}")
    ax.set_xlabel("X")
    ax.set_ylabel("Y")
    ax.set_zlabel("Z")

plt.suptitle("3D Trajectories of Acceleration for Each Activity", y=1.02)
plt.tight_layout()
plt.show()


In [None]:
# One figure per activity: the three acceleration columns of a single sample file
# plotted against the row index.
for activity in activities:
    path = os.path.join(base_path, activity)
    # os.listdir order is arbitrary; sort so the same file is plotted on every run.
    file = sorted(os.listdir(path))[0]
    df = pd.read_csv(os.path.join(path, file), header=None)

    # Coerce to numbers and drop rows containing anything unparsable.
    df = df.apply(pd.to_numeric, errors='coerce').dropna()
    df.columns = ['acc_x', 'acc_y', 'acc_z'][:df.shape[1]]

    plt.figure(figsize=(10, 4))
    for column, axis_label in (('acc_x', 'X'), ('acc_y', 'Y'), ('acc_z', 'Z')):
        plt.plot(df[column], label=axis_label)
    plt.title(f"Waveform for {activity}")
    plt.xlabel("Time")
    plt.ylabel("Acceleration")
    plt.legend()
    plt.tight_layout()
    plt.show()


Across all activities (LAYING, SITTING, STANDING, WALKING, etc.), the mean acceleration magnitude is observed to be close to 1.0, with very low standard deviation for static activities and slightly higher values for dynamic ones.

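An accelerometer at rest under Earth's gravity reads a magnitude of ~1 g: the device itself is not accelerating, but the sensor measures the reaction force that counteracts gravity.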

Thus, during stationary activities such as LAYING and SITTING, the only significant acceleration measured is due to gravity, resulting in a stable magnitude of ~1.0 g. During dynamic activities such as WALKING, additional body motion introduces variability, increasing the standard deviation.

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Build one feature vector per sample file (the mean of each acceleration column),
# then project those vectors onto two principal components.
X = []
y = []

for activity in activities:
    path = os.path.join(base_path, activity)
    # os.listdir order is arbitrary; sort so the [:21] slice selects the same
    # files on every run (at most 21 files per activity are used).
    for file in sorted(os.listdir(path))[:21]:
        df = pd.read_csv(os.path.join(path, file), header=None)
        df = df.apply(pd.to_numeric, errors='coerce').dropna()
        df.columns = ['acc_x', 'acc_y', 'acc_z'][:df.shape[1]]

        # Use mean acceleration vector of the sample (one value per column).
        X.append(df.mean().values)
        y.append(activity)

X = np.array(X)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Plotting: one colour per activity. The label array is built once instead of per iteration.
labels = np.array(y)
plt.figure(figsize=(8, 6))
for activity in activities:
    idx = labels == activity
    plt.scatter(X_pca[idx, 0], X_pca[idx, 1], label=activity, s=40)

plt.title("PCA of Mean Acceleration per Sample")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Project the per-sample mean vectors onto the first principal component only.
pca = PCA(n_components=1)
X_pca = pca.fit_transform(X)

labels = np.array(y)

# Strip plot: x is the activity name, y is the projected value of each sample.
plt.figure(figsize=(10, 5))
for activity in activities:
    idx = labels == activity
    n_points = int(idx.sum())
    plt.scatter([activity] * n_points, X_pca[idx], label=activity)

plt.title("1D PCA Projection of Acceleration Samples")
plt.ylabel("PCA Component 1")
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Re-fit PCA keeping three components and show the samples in a 3D scatter.
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)

labels = np.array(y)

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')

# One scatter call per activity so each gets its own colour and legend entry.
for activity in activities:
    idx = labels == activity
    pc1, pc2, pc3 = X_pca[idx].T
    ax.scatter(pc1, pc2, pc3, label=activity, s=40)

ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_zlabel("PC3")
ax.set_title("3D PCA of Mean Acceleration per Sample")
ax.legend()
plt.tight_layout()
plt.show()

1. The plots reveal which activities are inherently separable (e.g., Walking vs. Laying) and which require finer feature engineering (e.g., Sitting vs. Standing).
2. Laying, Sitting, and Standing appear close together, suggesting they have similar acceleration patterns due to limited movement.
3. Walking, Walking Upstairs, and Walking Downstairs are more spread out, indicating greater variation in movement.
4. Walking Upstairs and Downstairs may overlap somewhat with regular walking but show unique patterns due to the upward or downward motion.

In [None]:
pip install tsfel

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd
import numpy as np
import tsfel
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from tqdm import tqdm

# Load the raw acceleration signal files (each row is a 128-sample window).
# sep=r'\s+' splits on runs of whitespace; it replaces the deprecated delim_whitespace=True.
signals_dir = '/kaggle/input/uci-har-dataset/UCI HAR Dataset/train/Inertial Signals'
acc_x = pd.read_csv(f'{signals_dir}/total_acc_x_train.txt', sep=r'\s+', header=None)
acc_y = pd.read_csv(f'{signals_dir}/total_acc_y_train.txt', sep=r'\s+', header=None)
acc_z = pd.read_csv(f'{signals_dir}/total_acc_z_train.txt', sep=r'\s+', header=None)

# Load activity labels (one integer label per window).
y_train = pd.read_csv('/kaggle/input/uci-har-dataset/UCI HAR Dataset/train/y_train.txt', header=None)[0]

# Configure TSFEL to extract temporal & statistical features (no spectral for speed)
cfg = tsfel.get_features_by_domain(['temporal', 'statistical'])

X_features = []
kept_rows = []  # positions of the windows whose features were extracted successfully

# Loop through each sample (each row is one window) and extract features per axis.
for i in tqdm(range(len(acc_x))):
    try:
        per_axis = [
            tsfel.time_series_features_extractor(cfg, signal.iloc[i], sampling_frequency=50, verbose=0)
            for signal in (acc_x, acc_y, acc_z)
        ]
        X_features.append(pd.concat(per_axis, axis=1).values.flatten())
        kept_rows.append(i)
    except Exception as e:
        # Best effort: report the failing window and skip it. Its position is not
        # recorded in kept_rows, so the labels below stay aligned with X_features.
        print(f"Error at sample {i}: {e}")

# Convert to NumPy array and replace NaNs with 0.
X_features = np.nan_to_num(np.array(X_features))

# Labels for exactly the windows that produced a feature row. Without this, a
# single skipped window would make the boolean masks below the wrong length.
y_features = y_train.iloc[kept_rows].to_numpy()

# PCA to reduce to 2 components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_features)

# Plot the results, one colour per activity label.
plt.figure(figsize=(12, 6))
for label in np.unique(y_features):
    idx = y_features == label
    plt.scatter(X_pca[idx, 0], X_pca[idx, 1], label=f'Activity {label}', alpha=0.5)

plt.title("PCA on TSFEL Features (2D)")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import seaborn as sns

# Pairwise correlation between the TSFEL feature columns.
df_features = pd.DataFrame(X_features)
corr_matrix_tsfel = df_features.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix_tsfel, cmap='coolwarm', center=0, xticklabels=False, yticklabels=False)
plt.title("Correlation Matrix of TSFEL Features")
plt.show()

# Identify redundant features: distinct pairs (i < j) with |correlation| > 0.9.
# Only the strict upper triangle is scanned, so the diagonal and mirrored duplicates
# are excluded structurally. The previous `< 1.0` test also threw away perfectly
# correlated pairs, which are the most redundant ones. NaN correlations (e.g. from
# constant columns) compare False and are never counted.
strong_tsfel = (corr_matrix_tsfel.abs() > 0.9).to_numpy()
high_corr_pairs_tsfel = np.where(np.triu(strong_tsfel, k=1))

# Stored under a TSFEL-specific name; the set for the provided features is built separately.
redundant_tsfel = {(int(i), int(j)) for i, j in zip(*high_corr_pairs_tsfel)}

print(f"TSFEL feature pairs (> 0.9 correlation): {len(redundant_tsfel)}")

In [None]:
# Second name for the same array object (an alias, not a copy).
Tsfel_Features = X_features

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Paths to provided features and labels
X_path = "/kaggle/input/uci-har-dataset/UCI HAR Dataset/train/X_train.txt"       # or original path if different
y_path = "/kaggle/input/uci-har-dataset/UCI HAR Dataset/train/y_train.txt"
label_names_path = "/kaggle/input/uci-har-dataset/UCI HAR Dataset/activity_labels.txt"

# Load features and labels.
# sep=r'\s+' splits on runs of whitespace; it replaces the deprecated delim_whitespace=True.
# Note: this rebinds the notebook-level names X and y.
X = pd.read_csv(X_path, sep=r'\s+', header=None)
y = pd.read_csv(y_path, header=None).values.flatten()

# Map numeric labels to activity names: the label file is indexed by its first
# column, and the first remaining column holds the name. One vectorised lookup
# replaces the per-row .loc call.
label_map = pd.read_csv(label_names_path, sep=r'\s+', header=None, index_col=0)
y_named = label_map.iloc[:, 0].loc[y].tolist()

# Perform PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Plot, one colour per activity name.
names = np.array(y_named)
plt.figure(figsize=(10, 6))
for activity in np.unique(names):
    idx = names == activity
    plt.scatter(X_pca[idx, 0], X_pca[idx, 1], label=activity, alpha=0.6)

plt.title("PCA on Provided HAR Features")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Load provided features.
# sep=r'\s+' splits on runs of whitespace; it replaces the deprecated delim_whitespace=True.
X_provided = pd.read_csv("/kaggle/input/uci-har-dataset/UCI HAR Dataset/train/X_train.txt", sep=r'\s+', header=None)

# Correlation matrix
corr_matrix_provided = X_provided.corr()

# Visualize
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix_provided, cmap='coolwarm', cbar=True)
plt.title("Correlation Matrix of Provided Features")
plt.show()

# Identify redundant features: distinct pairs (i < j) with |correlation| > 0.9.
# Only the strict upper triangle is scanned, so the diagonal and mirrored duplicates
# are excluded structurally. The previous `< 1.0` test also threw away perfectly
# correlated pairs. NaN correlations compare False and are never counted.
strong_provided = (corr_matrix_provided.abs() > 0.9).to_numpy()
high_corr_pairs_provided = np.where(np.triu(strong_provided, k=1))
redundant_provided = {(int(i), int(j)) for i, j in zip(*high_corr_pairs_provided)}

print(f"Provided Redundant feature pairs (> 0.9 correlation): {len(redundant_provided)}")


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load provided features.
# sep=r'\s+' splits on runs of whitespace; it replaces the deprecated delim_whitespace=True.
X_provided = pd.read_csv("/kaggle/input/uci-har-dataset/UCI HAR Dataset/train/X_train.txt", sep=r'\s+', header=None)

# Absolute correlation matrix
corr_matrix = X_provided.corr().abs()

# Select upper triangle of correlation matrix (everything on or below the diagonal becomes NaN),
# so each pair is inspected once and a column is only compared with the columns before it.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find features with correlation greater than 0.9 to at least one earlier column.
# NaN entries compare False, so the masked cells never trigger a drop.
to_drop = upper.columns[(upper > 0.9).any()].tolist()

print(f" Number of highly correlated features to drop (>0.9): {len(to_drop)}")
print(f" Dropping columns: {to_drop[:10]}{'...' if len(to_drop) > 10 else ''}")

# Drop the features
X_reduced = X_provided.drop(columns=to_drop).reset_index(drop=True)


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Switch to the self-collected recordings. Note: this rebinds the notebook-level
# names base_path and activities.
base_path = '/kaggle/input/d/dinesh168/collected-data/Collected data'
activities = ['Laying', 'Sitting', 'Standing', 'Walking', 'Walking_Downstairs', 'Walking_Upstairs']

# Pass 1: load and clean one recording per activity.
recordings = {}
for activity in activities:
    path = os.path.join(base_path, activity)
    # os.listdir order is arbitrary; sort so the same file is used on every run.
    file = sorted(os.listdir(path))[0]
    df = pd.read_csv(os.path.join(path, file), header=None)

    # Coerce to numbers and drop rows containing anything unparsable.
    df = df.apply(pd.to_numeric, errors='coerce').dropna()
    df.columns = ['acc_x', 'acc_y', 'acc_z'][:df.shape[1]]
    recordings[activity] = df

# Global value range over ALL recordings. It has to be known before any panel is
# drawn: updating a running min/max inside the plotting loop gave the earlier
# panels narrower limits than the later ones, so the panels were not comparable.
y_min = min(df.min().min() for df in recordings.values())
y_max = max(df.max().max() for df in recordings.values())

# Pass 2: draw every panel with the same y-limits.
plt.figure(figsize=(18, 8))
for i, (activity, df) in enumerate(recordings.items()):
    plt.subplot(2, 3, i + 1)  # fixed 2x3 grid for the six activities
    plt.plot(df['acc_x'], label='X', alpha=0.7)
    plt.plot(df['acc_y'], label='Y', alpha=0.7)
    plt.plot(df['acc_z'], label='Z', alpha=0.7)
    plt.title(activity)
    plt.xlabel("Time")
    plt.ylabel("Acceleration")
    plt.ylim(y_min, y_max)
    plt.grid(True)
    if i == 0:
        # A single legend on the first panel is enough; colours are the same everywhere.
        plt.legend(loc='upper right')

plt.suptitle("Accelerometer Waveforms for Each Activity", fontsize=16)
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()


In [None]:
# One panel per activity: magnitude of the acceleration vector for a single recording.
plt.figure(figsize=(18, 8))

for i, activity in enumerate(activities):
    path = os.path.join(base_path, activity)
    # os.listdir order is arbitrary; sort so the same file is used on every run.
    file = sorted(os.listdir(path))[0]
    df = pd.read_csv(os.path.join(path, file), header=None)

    # Coerce to numbers and drop rows containing anything unparsable.
    df = df.apply(pd.to_numeric, errors='coerce').dropna()
    df.columns = ['acc_x', 'acc_y', 'acc_z'][:df.shape[1]]

    # Euclidean norm of the three axes, row by row.
    total_acc = np.sqrt(df['acc_x']**2 + df['acc_y']**2 + df['acc_z']**2)

    plt.subplot(2, 3, i + 1)  # fixed 2x3 grid: room for at most six activities
    plt.plot(total_acc, color='darkviolet')
    plt.title(f"{activity}")
    plt.xlabel("Time")
    plt.ylabel("||acc||")
    plt.grid(True)

plt.suptitle("Total Acceleration Magnitude for Each Activity", fontsize=16)
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()


In [None]:
from mpl_toolkits.mplot3d import Axes3D

# One 3D panel per activity tracing the (x, y, z) acceleration path of a single recording.
fig = plt.figure(figsize=(18, 8))

for i, activity in enumerate(activities):
    path = os.path.join(base_path, activity)
    # os.listdir order is arbitrary; sort so the same file is used on every run.
    file = sorted(os.listdir(path))[0]
    df = pd.read_csv(os.path.join(path, file), header=None)

    # Coerce to numbers and drop rows containing anything unparsable.
    df = df.apply(pd.to_numeric, errors='coerce').dropna()
    df.columns = ['acc_x', 'acc_y', 'acc_z'][:df.shape[1]]

    ax = fig.add_subplot(2, 3, i + 1, projection='3d')  # fixed 2x3 grid
    ax.plot(df['acc_x'], df['acc_y'], df['acc_z'])
    ax.set_title(activity)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')

plt.suptitle("3D Trajectories of Accelerometer Data", fontsize=16)
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()
