# Fall Detection using SisFall Dataset
Daniela Dias, nMec 98039

In [53]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
from scipy.fft import fft
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

## Avoiding Subject Bias in Train-Test Split

When working with the SisFall dataset (or any dataset where multiple recordings come from the same individuals), it is essential to avoid data leakage caused by random sample splitting. 

If we randomly split the dataset into training and testing sets, we risk placing samples from the same subject in both sets. This introduces what is known as "subject bias" — the model can inadvertently learn personal characteristics or movement patterns of specific individuals rather than learning to generalize fall detection across new, unseen people. This would artificially inflate evaluation metrics (such as accuracy and F1-score), because the model is partially memorizing rather than generalizing.

To address this, we use a subject-wise splitting strategy:  
- We first extract the list of unique subjects.  
- Then we split these subjects into train and test groups.  
- Finally, we assign samples based on the subject to which they belong.

This ensures that the model is evaluated on entirely unseen individuals, simulating real-world scenarios where fall detection must work for new users. It leads to more honest and generalizable performance metrics.

## Traditional Machine Learning Preprocessing

The preprocessing pipeline for traditional machine learning begins by converting raw sensor data into numerical features that summarize activity windows.

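1. Raw signal data is segmented, separately for each recording file, into overlapping windows of fixed length (e.g., 2 seconds = 400 samples at 200 Hz).
   - Each window is treated as a single sample.
   - The overlap can be adjusted (e.g., 25% overlap).
   - Windows whose average per-channel standard deviation is below a threshold (0.3 by default) are discarded.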
2. For each window, both time-domain and frequency-domain features are extracted:
   - Time-domain: mean, standard deviation, min, max, median, skewness, kurtosis.
   - Frequency-domain: FFT mean, standard deviation, max, and energy.
3. Each window is labeled based on the most frequent label within it.
4. Features are normalized using z-score standardization based on the training set.
5. SMOTE (Synthetic Minority Oversampling Technique) is applied to the training data to address class imbalance between falls and ADLs.
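6. Subject-wise splitting is used to ensure no overlap of subjects between training and testing sets. Although listed last here, in the code this split is performed first, before feature extraction, scaling, and SMOTE.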


In [54]:
# Load the reduced SisFall dataset from CSV (the path is relative to the
# current working directory).
# extract_features() treats the last three columns as label, filename and
# subject, and every column before them as a sensor channel.
sisfall_data = pd.read_csv('../reduced_sisfall_dataset.csv')

In [55]:
# Feature extraction function with time-domain and frequency-domain features
def extract_features(df, window_size=400, overlap=0.25, variance_threshold=0.3):
    """Cut every recording into sliding windows and summarise each window.

    Rows are grouped by ``filename`` so that a window never spans two
    recordings. For each sensor column of a window, seven time-domain
    statistics and four statistics of the FFT magnitude spectrum are
    computed and concatenated into one feature vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Sensor columns first; the last three columns are excluded from the
        sensor set (label, filename, subject). Columns named ``'label'`` and
        ``'filename'`` must be present.
    window_size : int
        Number of rows per window.
    overlap : float
        Fraction of a window shared with the next one. The stride between
        window starts is ``int(window_size * (1 - overlap))``.
    variance_threshold : float
        A window is dropped when the mean of its per-channel standard
        deviations is below this value.

    Returns
    -------
    tuple
        ``(features, labels, feature_names)`` where ``features`` is a 2-D
        array of shape ``(n_windows, len(feature_names))`` (also when no
        window is kept), ``labels`` holds the most frequent label of each
        kept window, and ``feature_names`` lists the column names in the
        same order as the feature columns.

    Raises
    ------
    ValueError
        If ``window_size`` and ``overlap`` produce a stride smaller than 1.
    """
    # Calculate the step size based on the window size and overlap.
    step = int(window_size * (1 - overlap))
    if step < 1:
        # range() would either raise an opaque error (step == 0) or silently
        # produce no windows (step < 0); report the real cause instead.
        raise ValueError(
            f'window_size={window_size} with overlap={overlap} gives a stride of '
            f'{step}; the stride must be at least 1 (use overlap < 1)'
        )

    # Obtain the sensor columns
    sensor_cols = df.columns[:-3]  # Exclude label, filename, subject

    # Feature names, in the same order the values are appended below.
    stat_suffixes = (
        'mean', 'std', 'min', 'max', 'median', 'skew', 'kurt',
        'fft_mean', 'fft_std', 'fft_max', 'fft_energy',
    )
    feature_names = [f'{col}_{suffix}' for col in sensor_cols for suffix in stat_suffixes]

    all_features = []
    all_labels = []

    for _, group in df.groupby('filename'):
        group = group.reset_index(drop=True)

        # Coerce the sensor channels to numbers once per recording instead of
        # once per window and column. Unparseable cells become NaN.
        numeric = group[sensor_cols].apply(pd.to_numeric, errors='coerce')
        # Feature computation works on NaN-free arrays (NaN replaced by 0).
        channels = [numeric[col].fillna(0).to_numpy() for col in sensor_cols]
        labels = group['label']

        for start in range(0, len(group) - window_size + 1, step):
            stop = start + window_size

            # Average (sample) standard deviation across all channels; the
            # coerced-but-unfilled values are used so NaNs are skipped here.
            mean_std = np.mean(numeric.iloc[start:stop].std().to_numpy())
            if mean_std < variance_threshold:
                continue  # Skip this low-variance window

            feature_vector = []
            for channel in channels:
                data = channel[start:stop]

                # Time-domain features
                feature_vector.extend([
                    data.mean(), data.std(), data.min(), data.max(),
                    np.median(data), skew(data), kurtosis(data)
                ])

                # Frequency-domain features on the FFT magnitude spectrum;
                # "energy" is the mean of the squared magnitudes.
                fft_values = np.abs(fft(data))
                feature_vector.extend([
                    fft_values.mean(), fft_values.std(), fft_values.max(),
                    np.sum(fft_values ** 2) / len(fft_values)
                ])

            all_features.append(feature_vector)
            # Window label = most frequent label inside the window.
            all_labels.append(labels.iloc[start:stop].mode().iloc[0])

    if not all_features:
        # Keep the feature matrix 2-D so downstream code sees (0, n_features)
        # rather than a 1-D empty array.
        return np.empty((0, len(feature_names))), np.array(all_labels), feature_names

    return np.array(all_features), np.array(all_labels), feature_names


In [56]:
# Preprocessing function: subject-wise split, feature extraction, scaling,
# SMOTE balancing of the training set, and binary label encoding
def preprocess_data(df):
    """Turn the raw recordings into scaled train/test feature tables.

    Steps, in order:

    1. Split the unique values of ``df['subject']`` 80/20 (fixed seed) so no
       subject contributes rows to both sets.
    2. Extract window features separately for the train and test rows.
    3. Fit a ``StandardScaler`` on the training features only and apply it
       to both sets.
    4. Oversample the scaled training set with SMOTE (fixed seed); the test
       set is left untouched.
    5. Encode the labels as ``'adl' -> 0`` and ``'fall' -> 1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data with a ``'subject'`` column plus whatever
        ``extract_features`` requires.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_balanced, y_train_balanced, X_test_scaled, y_test)``.
        The feature frames use the extracted feature names as columns; the
        label frames have a single ``'label'`` column.

    Raises
    ------
    ValueError
        If a window label other than ``'adl'`` or ``'fall'`` is found.
    """
    label_codes = {'adl': 0, 'fall': 1}

    # Extract unique subjects
    subjects = df['subject'].unique()

    # Subject-wise split
    train_subjects, test_subjects = train_test_split(subjects, test_size=0.2, random_state=42)

    # Assign samples based on subject
    train_data = df[df['subject'].isin(train_subjects)]
    test_data = df[df['subject'].isin(test_subjects)]

    # Extract features from training and testing data
    X_train, y_train, feature_names = extract_features(train_data)
    X_test, y_test, _ = extract_features(test_data)

    # Series.map() turns labels missing from the dict into NaN without any
    # warning, so reject them up front, before the expensive steps run.
    unexpected = (set(y_train.tolist()) | set(y_test.tolist())) - set(label_codes)
    if unexpected:
        raise ValueError(
            f'Unexpected label(s) {sorted(map(str, unexpected))}; '
            f'expected only {sorted(label_codes)}'
        )

    # Scale the features (statistics come from the training set only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Add SMOTE for handling class imbalance (training set only)
    smote = SMOTE(random_state=42)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

    # Convert labels to binary format; naming the Series makes the resulting
    # single-column frame explicit instead of relying on an unnamed Series.
    y_train_balanced = pd.Series(y_train_balanced, name='label').map(label_codes).to_frame()
    y_test = pd.Series(y_test, name='label').map(label_codes).to_frame()

    # Convert to DataFrame for easier handling
    X_train_balanced = pd.DataFrame(X_train_balanced, columns=feature_names)
    X_test_scaled = pd.DataFrame(X_test_scaled, columns=feature_names)

    return X_train_balanced, y_train_balanced, X_test_scaled, y_test

In [57]:
# Preprocess the dataset: subject-wise split, window feature extraction,
# scaling, SMOTE on the training set, and binary label encoding.
# X_train / y_train are the scaled, SMOTE-balanced training set;
# X_test / y_test are scaled but not resampled.
X_train, y_train, X_test, y_test = preprocess_data(sisfall_data)

In [58]:
# Report (rows, columns) for the train features, train labels,
# test features and test labels, in that order
tuple(frame.shape for frame in (X_train, y_train, X_test, y_test))

((10960, 99), (10960, 1), (1870, 99), (1870, 1))

In [59]:
# Write each preprocessed frame to its own CSV file (row index omitted)
for frame, csv_name in (
    (X_train, 'X_train.csv'),
    (y_train, 'y_train.csv'),
    (X_test, 'X_test.csv'),
    (y_test, 'y_test.csv'),
):
    frame.to_csv(csv_name, index=False)