# Fall Detection using SisFall Dataset
Daniela Dias, nMec 98039

In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Avoiding Subject Bias in Train-Test Split

When working with the SisFall dataset (or any dataset where multiple recordings come from the same individuals), it is essential to avoid data leakage caused by random sample splitting. 

If we randomly split the dataset into training and testing sets, we risk placing samples from the same subject in both sets. This introduces what is known as "subject bias" — the model can inadvertently learn personal characteristics or movement patterns of specific individuals rather than learning to generalize fall detection across new, unseen people. This would artificially inflate evaluation metrics (such as accuracy and F1-score), because the model is partially memorizing rather than generalizing.

To address this, we use a subject-wise splitting strategy:  
- We first extract the list of unique subjects.  
- Then we split these subjects into train and test groups.  
- Finally, we assign samples based on the subject to which they belong.

This ensures that the model is evaluated on entirely unseen individuals, simulating real-world scenarios where fall detection must work for new users. It leads to more honest and generalizable performance metrics.

## Deep Learning Preprocessing

This pipeline prepares raw time-series sensor data for deep learning models such as CNNs or LSTMs.

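1. Raw sensor data is segmented into fixed-length overlapping windows (e.g., 2 seconds = 400 samples at 200 Hz).
    - Each window is treated as a single sample.
    - The overlap between consecutive windows is configurable (25% by default).
    - Windows whose mean per-channel standard deviation falls below a threshold (0.3 by default) are discarded.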
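2. The windows are stacked into a 3D tensor of shape (number of windows, window length, number of channels).
    - For example, with 400-sample windows and 9 channels (3 sensors × 3 axes), the shape is (N, 400, 9); each window is then flattened into a row of 400 × 9 = 3600 values so that the result can be saved as CSV.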
3. Labels are assigned to each window based on the most frequent class label within the window.
4. Input windows are normalized with per-channel z-score standardization, using the mean and standard deviation computed on the training set only.
5. Subject-wise splitting is used to ensure no overlap of subjects between training and testing sets.

In [36]:
# Load the reduced SisFall dataset from a CSV file located one directory above
# the notebook's working directory; the result is a single pandas DataFrame
sisfall_data = pd.read_csv('../reduced_sisfall_dataset.csv')

In [37]:
# Preview the first rows (head() returns five by default) as a quick sanity
# check of the column names and values that were loaded
sisfall_data.head()

Unnamed: 0,accel_adxl345_x,accel_adxl345_y,accel_adxl345_z,gyro_itg3200_x,gyro_itg3200_y,gyro_itg3200_z,accel_mma8451q_x,accel_mma8451q_y,accel_mma8451q_z,label,filename,subject
0,0.066406,-0.699219,-0.386719,-1.098633,-30.761719,-21.484375,0.074219,-0.680664,-0.272461,adl,D01_SA01_R01.txt,SA01
1,0.058594,-0.679688,-0.351562,-3.234863,-34.667969,-18.676758,0.046875,-0.65918,-0.248047,adl,D01_SA01_R01.txt,SA01
2,0.003906,-0.6875,-0.316406,-5.126953,-37.414551,-16.540527,-0.001953,-0.652344,-0.21582,adl,D01_SA01_R01.txt,SA01
3,-0.039062,-0.703125,-0.300781,-6.347656,-39.489746,-13.85498,-0.033203,-0.680664,-0.170898,adl,D01_SA01_R01.txt,SA01
4,-0.082031,-0.746094,-0.246094,-7.8125,-41.19873,-11.657715,-0.072266,-0.723633,-0.129883,adl,D01_SA01_R01.txt,SA01


In [38]:
def segment_data(df, window_size, overlap, variance_threshold=0.3):
    """Slice every recording into fixed-length windows and label each window.

    Rows are grouped by ``filename`` and each group is windowed independently,
    so a window never spans two recordings. Trailing rows of a recording that
    do not fill a whole window are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``label`` and ``filename`` columns. Every column other
        than ``label``, ``filename`` and ``subject`` is treated as a sensor
        channel, regardless of its position in the frame.
    window_size : int
        Number of rows per window; must be at least 1.
    overlap : float
        Fraction of a window shared with the next one; must be below 1.
        The step between window starts is ``window_size * (1 - overlap)``
        truncated to an integer (and never less than 1).
    variance_threshold : float, default 0.3
        Despite the name, this is compared against the mean of the
        per-channel sample standard deviations of the window. Windows whose
        value is strictly below the threshold are discarded.

    Returns
    -------
    features : numpy.ndarray
        Array of shape ``(n_windows, window_size, n_channels)``. When no
        window survives, the shape is ``(0, window_size, n_channels)``.
    labels : numpy.ndarray
        Array of shape ``(n_windows,)`` holding the most frequent ``label``
        value of each window.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1 or ``overlap`` is 1 or greater.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")
    if overlap >= 1:
        raise ValueError(f"overlap must be < 1, got {overlap}")

    # Rounding to 9 decimals before truncating removes floating-point noise:
    # 10 * (1 - 0.9) evaluates to 0.9999999999999998, which would otherwise
    # truncate to a step of 0 and make range() fail.
    step = max(1, int(round(window_size * (1 - overlap), 9)))

    # Select sensor channels by name rather than by position so that the
    # column order of the input frame does not matter
    metadata_cols = ('label', 'filename', 'subject')
    sensor_cols = [col for col in df.columns if col not in metadata_cols]

    features = []
    labels = []

    for _, group in df.groupby('filename'):
        group = group.reset_index(drop=True)
        for start in range(0, len(group) - window_size + 1, step):
            window = group.iloc[start:start + window_size]
            sensors = window[sensor_cols]

            # Average the per-channel sample standard deviations (ddof=1)
            mean_std = np.mean(sensors.std().to_numpy(dtype=float))
            if mean_std < variance_threshold:
                continue  # Skip this window

            features.append(sensors.values)
            labels.append(window['label'].mode()[0])  # Most frequent label in the window

    if not features:
        # Keep the documented 3D layout even when every window was filtered out
        return np.empty((0, window_size, len(sensor_cols))), np.array(labels)

    return np.array(features), np.array(labels)

In [39]:
def preprocess_for_deep_learning(df, window_size=400, overlap=0.25,
                                 test_size=0.2, random_state=42,
                                 variance_threshold=0.3):
    """Build subject-wise train/test splits of normalized, flattened windows.

    The unique values of ``df['subject']`` are split into train and test
    groups, so no subject contributes rows to both sets. Each set is then
    windowed with ``segment_data``, standardized per channel with statistics
    from the training windows only, and flattened to one row per window.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``subject`` column plus whatever ``segment_data``
        requires.
    window_size : int, default 400
        Rows per window, forwarded to ``segment_data``.
    overlap : float, default 0.25
        Fractional overlap between windows, forwarded to ``segment_data``.
    test_size : float or int, default 0.2
        Share (or number) of subjects placed in the test set, forwarded to
        ``train_test_split``.
    random_state : int or None, default 42
        Seed for the subject split, forwarded to ``train_test_split``.
    variance_threshold : float, default 0.3
        Window filter threshold, forwarded to ``segment_data``.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        One row per window with ``window_size * n_channels`` columns.
    y_train, y_test : pandas.DataFrame
        Single-column frames with 0 for ``'adl'`` and 1 for ``'fall'``.

    Raises
    ------
    ValueError
        If either split yields no windows, or if a window label is not
        ``'adl'`` or ``'fall'``.
    """
    # Extract unique subjects
    subjects = df['subject'].unique()

    # Subject-wise split: subjects, not rows, are what gets shuffled
    train_subjects, test_subjects = train_test_split(
        subjects, test_size=test_size, random_state=random_state
    )

    # Assign samples based on subject
    train_data = df[df['subject'].isin(train_subjects)]
    test_data = df[df['subject'].isin(test_subjects)]

    # Segment data into windows
    X_train, y_train = segment_data(
        train_data, window_size, overlap, variance_threshold=variance_threshold
    )
    X_test, y_test = segment_data(
        test_data, window_size, overlap, variance_threshold=variance_threshold
    )

    # Fail early with a clear message instead of a cryptic axis error below
    if len(X_train) == 0 or len(X_test) == 0:
        raise ValueError(
            "No windows were produced for the "
            f"{'training' if len(X_train) == 0 else 'test'} set; "
            "check window_size, overlap and variance_threshold"
        )

    # Normalize using training statistics (one mean/std per channel)
    mean = X_train.mean(axis=(0, 1), keepdims=True)
    std = X_train.std(axis=(0, 1), keepdims=True)
    # A constant channel has std 0; dividing by 1 instead leaves it at 0
    # after centering rather than producing NaN/inf
    std = np.where(std == 0, 1.0, std)
    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std

    # Convert labels to numeric, refusing labels that would silently map to NaN
    label_map = {'adl': 0, 'fall': 1}
    unknown_labels = (set(y_train) | set(y_test)) - set(label_map)
    if unknown_labels:
        raise ValueError(
            f"Unexpected labels: {sorted(str(lbl) for lbl in unknown_labels)}"
        )
    y_train = pd.Series(y_train).map(label_map)
    y_test = pd.Series(y_test).map(label_map)

    # Convert to dataframes; each window is flattened into a single row
    X_train = pd.DataFrame(X_train.reshape(X_train.shape[0], -1))
    X_test = pd.DataFrame(X_test.reshape(X_test.shape[0], -1))
    y_train = pd.DataFrame(y_train)
    y_test = pd.DataFrame(y_test)

    return X_train, X_test, y_train, y_test

In [40]:
# Run the preprocessing pipeline with its default arguments
# (window_size=400, overlap=0.25) and unpack the four resulting DataFrames
X_train, X_test, y_train, y_test = preprocess_for_deep_learning(sisfall_data)

In [41]:
# Check the shape of the training and testing sets; every row of X is one
# flattened window and every row of y is that window's numeric label
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((19614, 3600), (4692, 3600), (19614, 1), (4692, 1))

In [44]:
# Persist each preprocessed split to its own CSV file in the working
# directory, omitting the row index from the output
for csv_name, split_frame in (
    ('X_train.csv', X_train),
    ('X_test.csv', X_test),
    ('y_train.csv', y_train),
    ('y_test.csv', y_test),
):
    split_frame.to_csv(csv_name, index=False)