#Assignment No. 5.1
#Part 4: Data Loading and Preprocessing
##Required libraries

In [6]:
import pandas as pd
import numpy as np
from pathlib import Path
import zipfile
from google.colab import files
import os
from collections import Counter

##Upload and extract dataset




In [7]:

uploaded = files.upload()



Saving UCI HAR Dataset.zip to UCI HAR Dataset.zip


In [8]:
with zipfile.ZipFile('UCI HAR Dataset.zip', 'r') as zip_ref:
    zip_ref.extractall()

# Points to the extracted folder
DATA_DIR = Path('UCI HAR Dataset')

print("Contents of DATA_DIR:", os.listdir(DATA_DIR))


Contents of DATA_DIR: ['.DS_Store', 'activity_labels.txt', 'train', 'features.txt', 'test', 'features_info.txt', 'README.txt']


## Load feature names and activity labels

In [9]:
feat = pd.read_csv(DATA_DIR / 'features.txt', sep=r'\s+', header=None, names=['idx', 'feature'])
feature_names = feat['feature'].tolist()

act_labels = pd.read_csv(DATA_DIR / 'activity_labels.txt', sep=r'\s+', header=None, names=['id', 'activity'])
act_map = dict(zip(act_labels.id, act_labels.activity))


In [10]:
# Make feature names unique
counts = Counter()
unique_feature_names = []
for name in feature_names:
    counts[name] += 1
    if counts[name] > 1:
        unique_feature_names.append(f"{name}_{counts[name]}")
    else:
        unique_feature_names.append(name)

##Load train/test splits

In [13]:
def load_split(split='train'):
    # Load sensor features (561 columns)
    X = pd.read_csv(DATA_DIR / split / f'X_{split}.txt', sep=r'\s+', header=None, names=unique_feature_names)

    # Load activity IDs
    y = pd.read_csv(DATA_DIR / split / f'y_{split}.txt', header=None, names=['activity_id'])

    # Load subject IDs
    subj = pd.read_csv(DATA_DIR / split / f'subject_{split}.txt', header=None, names=['subject'])

    # Combine into a single dataframe
    df = pd.concat([subj, y, X], axis=1)

    df['activity'] = df['activity_id'].map(act_map)

    return df


##Combine train + test

In [14]:
train = load_split('train')
test = load_split('test')

df = pd.concat([train, test], ignore_index=True)
print("Dataset loaded and combined successfully!")


Dataset loaded and combined successfully!


##Parsing timestamps
####Correctly converting timestamps into datetime format is essential because it allows Python to understand the true sequence of events. Without proper parsing, time-based operations such as resampling, windowing, or trend analysis cannot be performed accurately.

In [15]:
sampling_rate = 50  # 50 samples per second
df['timestamp'] = pd.to_timedelta(df.index / sampling_rate, unit='s')

# Arbitrary start datetime
start = pd.to_datetime('2020-01-01 00:00:00')
df['datetime'] = start + df['timestamp']

print("Timestamps simulated and added")
df.head(3)


Timestamps simulated and added


Unnamed: 0,subject,activity_id,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,...,"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",activity,timestamp,datetime
0,1,5,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,...,-0.112754,0.0304,-0.464761,-0.018446,-0.841247,0.179941,-0.058627,STANDING,0 days 00:00:00,2020-01-01 00:00:00.000
1,1,5,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,...,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317,STANDING,0 days 00:00:00.020000,2020-01-01 00:00:00.020
2,1,5,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,...,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118,STANDING,0 days 00:00:00.040000,2020-01-01 00:00:00.040


##Sort chronologically
####Time-series models depend on the natural flow of time. Sorting ensures that earlier events come before later ones, preventing data leakage and making sure the model learns realistic temporal patterns instead of incorrectly ordered data.

In [16]:
df = df.sort_values(by='datetime').reset_index(drop=True)
print("Dataset sorted chronologically")

Dataset sorted chronologically


##Handle missing values
####Missing data can cause errors, disrupt calculations, and lead to misleading patterns. Filling or removing missing values ensures consistent input for the model and preserves the integrity of statistical and ML results.

In [17]:
# Replace any '?' with NaN
df = df.replace('?', np.nan)

# Convert all numeric columns to numeric type
num_cols = df.select_dtypes(include=[np.number]).columns
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce')

# Interpolate missing values linearly
df[num_cols] = df[num_cols].interpolate(method='linear', limit_direction='both')

# Forward-fill and backward-fill
df[num_cols] = df[num_cols].ffill().bfill()

print("Missing values handled")





Missing values handled


##Cleaning
####Tasks like removing duplicates, correcting invalid entries, and standardizing column names help maintain a clean and reliable dataset. This improves model performance, prevents bias, and ensures the dataset is easy to understand and process.

In [19]:
# Remove duplicates
before = len(df)
df = df.drop_duplicates()
after = len(df)
print(f"Removed {before - after} duplicate rows")

# Clean column names
df.columns = [str(c).strip().replace('(', '').replace(')', '').replace('-', '_') for c in df.columns]

print("Column names cleaned")
print("First 5 rows after cleaning:")
display(df.head())



Removed 0 duplicate rows
Column names cleaned
First 5 rows after cleaning:


Unnamed: 0,subject,activity_id,tBodyAcc_mean_X,tBodyAcc_mean_Y,tBodyAcc_mean_Z,tBodyAcc_std_X,tBodyAcc_std_Y,tBodyAcc_std_Z,tBodyAcc_mad_X,tBodyAcc_mad_Y,...,"angletBodyAccMean,gravity","angletBodyAccJerkMean,gravityMean","angletBodyGyroMean,gravityMean","angletBodyGyroJerkMean,gravityMean","angleX,gravityMean","angleY,gravityMean","angleZ,gravityMean",activity,timestamp,datetime
0,1,5,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,...,-0.112754,0.0304,-0.464761,-0.018446,-0.841247,0.179941,-0.058627,STANDING,0,2020-01-01 00:00:00.000
1,1,5,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,...,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317,STANDING,20000000,2020-01-01 00:00:00.020
2,1,5,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,...,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118,STANDING,40000000,2020-01-01 00:00:00.040
3,1,5,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.98275,...,-0.036788,-0.012892,0.640011,-0.485366,-0.848649,0.181935,-0.047663,STANDING,60000000,2020-01-01 00:00:00.060
4,1,5,0.276629,-0.01657,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,...,0.12332,0.122542,0.693578,-0.615971,-0.847865,0.185151,-0.043892,STANDING,80000000,2020-01-01 00:00:00.080


##Final cleaned dataset

In [20]:
df.to_csv('motioniq_cleaned_dataset.csv', index=False)
print("Final cleaned dataset saved as motioniq_cleaned_dataset.csv")


Final cleaned dataset saved as motioniq_cleaned_dataset.csv
