In [1]:
import pandas as pd
import numpy as np
import ast
import re
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the prepared dataset from disk and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
data_path = 'data/model_data_window_300_space_50.pkl'
df = pd.read_pickle(data_path)
df.head()

Unnamed: 0,id,sig_array,activity_name,subject_id,dataset
0,0,"[[-0.1832595, -0.2076941, -0.53145254, 9.77168...",squats,0,mmfit
1,1,"[[1.4453067, -1.6517789, -1.4514153, 1.4441968...",squats,0,mmfit
2,2,"[[1.3023642, -0.0488692, -0.82344604, 8.140672...",squats,0,mmfit
3,3,"[[-0.67683846, 1.067792, 0.6450735, 3.2380664,...",squats,0,mmfit
4,4,"[[0.123394735, 0.28588483, 0.11362089, 12.6337...",squats,0,mmfit


In [3]:
# Sanity check: shape of the array stored in `sig_array` for the row labelled 0
# (label-based lookup via .loc, not positional).
df.loc[0, 'sig_array'].shape

(300, 6)

In [4]:
# Keep only the activities that have at least 500 examples

counts = df['activity_name'].value_counts()
# Boolean-mask the counts; the index (activity names) keeps value_counts order
valid_activities = counts[counts >= 500].index.tolist()
keep_mask = df['activity_name'].isin(valid_activities)
df = df[keep_mask]

print(f"There are {len(df['activity_name'].unique())} unique activities in the dataset")

There are 51 unique activities in the dataset


In [5]:
# Build train / val / test splits at the subject level, so that every row of a
# given subject_id lands in exactly one split.

# Step 1: hold out 20% of the subjects as the test set
all_users = df['subject_id'].unique()
trainval_users, test_users = train_test_split(all_users, train_size=.8, random_state=42)

trainval_data = df[df['subject_id'].isin(trainval_users)]
test_data = df[df['subject_id'].isin(test_users)]

# Step 2: carve 10% of the remaining subjects out as the validation set.
# The subjects are re-read from the frame (not taken from `trainval_users`)
# so the input order to the second split matches row order.
remaining_users = trainval_data['subject_id'].unique()
train_users, val_users = train_test_split(remaining_users, train_size=.9, random_state=42)

val_data = trainval_data[trainval_data['subject_id'].isin(val_users)]
train_data = trainval_data[trainval_data['subject_id'].isin(train_users)]

print("The Sizes of the Train, Test, and Val Sets are:")
print(f"Train Size: {len(train_data)}")
print(f"Test Size: {len(test_data)}")
print(f"Val Size: {len(val_data)}")

The Sizes of the Train, Test, and Val Sets are:
Train Size: 116440
Test Size: 32224
Val Size: 10787


In [6]:
# Separate the X (`sig_array`) and y (`activity_name`) components of each split

splits = (train_data, test_data, val_data)

# One array per split, in the same order as `splits`
X_train, X_test, X_Val = (np.array(split['sig_array']) for split in splits)
y_train, y_test, y_val = (np.array(split['activity_name']) for split in splits)

In [7]:
# Get Data in right format

# Encode labels.
# The splits are made per subject, so an activity can end up only among the
# test/val subjects. Fitting the encoder on the train labels alone would then
# make `transform` raise on that unseen label, so fit on the labels of all
# splits instead. LabelEncoder keeps its classes sorted, so whenever train
# already contains every activity the integer codes are unchanged.
label_encoder = LabelEncoder()
label_encoder.fit(np.concatenate([y_train, y_test, y_val]))
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
y_val_encoded = label_encoder.transform(y_val)

# Surface activities the model will never see during training
missing_from_train = sorted(set(label_encoder.classes_) - set(y_train))
if missing_from_train:
    print(f"Warning: activities with no training examples: {missing_from_train}")

# Stack the per-row arrays into one float32 array per split
X_train = np.stack(X_train).astype(np.float32)
X_test = np.stack(X_test).astype(np.float32)
X_Val = np.stack(X_Val).astype(np.float32)

# Convert X arrays to tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
X_Val_tensor = torch.tensor(X_Val, dtype=torch.float32)

# Convert y arrays to tensors
y_train_tensor = torch.tensor(y_train_encoded, dtype=torch.long)
y_test_tensor = torch.tensor(y_test_encoded, dtype=torch.long)
y_val_tensor = torch.tensor(y_val_encoded, dtype=torch.long)

In [8]:
# Define custom Dataset class
class SensorDataset(Dataset):
    """Map-style dataset that pairs each sample in ``X`` with its label in ``y``.

    Args:
        X: Indexable, sized collection of samples (indexed along its first axis).
        y: Indexable, sized collection of labels, one per sample in ``X``.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of items.
    """

    def __init__(self, X, y):
        # Fail fast: a length mismatch would otherwise surface as an IndexError
        # in the middle of an epoch, or as labels that are silently never used.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at ``idx``."""
        return self.X[idx], self.y[idx]

In [9]:
# Final prep for model

# Create Dataset objects
train_dataset = SensorDataset(X_train_tensor, y_train_tensor)
test_dataset = SensorDataset(X_test_tensor, y_test_tensor)
val_dataset = SensorDataset(X_Val_tensor, y_val_tensor)

# Create DataLoaders
batch_size = 64  # Adjust batch size as needed

# Seed the shuffling so the training batch order is reproducible on
# Restart & Run All (same seed value as the subject split's random_state).
shuffle_generator = torch.Generator().manual_seed(42)
train_loader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, generator=shuffle_generator
)
# Evaluation loaders keep the original row order
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Print dataset sizes
print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")
print(f"Val dataset size: {len(val_dataset)}")

Train dataset size: 116440
Test dataset size: 32224
Val dataset size: 10787
