In [1]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [4]:
%pip install pandas

Collecting pandas
  Using cached pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl (11.3 MB)
Downloading pytz-2025.1-py2.py3-none-any.whl (507 kB)
Using cached tzdata-2025.1-py2.py3-none-any.whl (346 kB)
Installing collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.2.3 pytz-2025.1 tzdata-2025.1
Note: you may need to restart the kernel to use updated packages.


In [5]:
# Imports
import os
import sys
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from joblib import parallel_backend


In [6]:
# Dataset Builder

# List each file in folder data_collection
files = os.listdir('data_collection')

print('Files in data_collection:', files)

Files in data_collection: ['FALL_1.csv', 'FALL_2.csv', 'STUMBLE_1.csv', 'WALK_1.csv', 'WALK_2.csv', 'SITTING_1.csv', 'ACTIVITY_1.csv']


In [None]:
data_folder = 'data_collection'

# --- Function to determine binary label based on file name ---
def get_binary_label(filename):
    # Treat any file with "FALL" (case insensitive) as a fall (1); otherwise, non-fall (0)
    return 1 if "FALL" in filename.upper() else 0

# --- Import each file, add labels, and store in a list ---
dataframes = []
for file in files:
    if file.endswith('.csv'):
        file_path = os.path.join(data_folder, file)
        # Read CSV file (assumed to have no header and 500 columns)
        df = pd.read_csv(file_path, header=None)
        
        # Encode the labels:
        # Binary label: 1 for fall events (FALL_1, FALL_2), 0 for others.
        binary_label = get_binary_label(file)
        # Sub-label: the file name without the .csv extension (e.g., "FALL_1")
        sub_label = file.replace('.csv', '')
        
        # Add new columns to the DataFrame (one row per sequence)
        df['binary_label'] = binary_label
        df['sub_label'] = sub_label
        
        dataframes.append(df)

# --- Combine all dataframes into a single DataFrame ---
combined_df = pd.concat(dataframes, ignore_index=True)
print("Combined data (first 5 rows):")
print(combined_df.head())
print("Shape of combined data:", combined_df.shape)

Combined data (first 5 rows):
      0     1     2     3     4     5     6     7     8     9  ...   392  \
0  9.34  9.37  9.36  9.38  9.35  9.36  9.38  9.37  9.34  9.35  ...  9.35   
1  9.36  9.36  9.35  9.37  9.38  9.37  9.37  9.36  9.37  9.37  ...  9.37   
2  9.35  9.38  9.37  9.37  9.37  9.37  9.36  9.36  9.37  9.38  ...  9.36   
3  9.36  9.37  9.37  9.36  9.36  9.38  9.36  9.40  9.37  9.36  ...  9.36   
4  9.37  9.36  9.36  9.38  9.37  9.36  9.41  9.37  9.37  9.37  ...  9.36   

    393   394   395   396   397   398   399  binary_label  sub_label  
0  9.36  9.36  9.37  9.38  9.39  9.37  9.36             1     FALL_1  
1  9.37  9.36  9.36  9.36  9.34  9.39  9.36             1     FALL_1  
2  9.34  9.35  9.37  9.37  9.36  9.36  9.37             1     FALL_1  
3  9.38  9.39  9.37  9.37  9.38  9.38  9.39             1     FALL_1  
4  9.36  9.36  9.37  9.36  9.35  9.35  9.34             1     FALL_1  

[5 rows x 402 columns]
Shape of combined data: (436, 402)


In [8]:
# Print % of fall events with 2 decimal places
num_falls = combined_df['binary_label'].sum()
total_events = combined_df.shape[0]
percent_falls = 100 * num_falls / total_events

print(f"Percentage of fall events: {percent_falls:.2f}%")

Percentage of fall events: 8.26%


In [None]:
# From augmented folder, import the first 5 files with FALL in the name and add to the combined DataFrame
augmented_folder = 'augmented'
augmented_files = os.listdir(augmented_folder)

# --- Import each file, add labels, and store in a list ---
dataframes = []

for file in augmented_files:
    if file.endswith('.csv') and "FALL" in file.upper():
        file_path = os.path.join(augmented_folder, file)
        # Read CSV file (assumed to have no header and 500 columns)
        df = pd.read_csv(file_path, header=None)
        
        # Encode the labels:
        # Binary label: 1 for fall events (FALL_1, FALL_2), 0 for others.
        binary_label = get_binary_label(file)
        # Sub-label: the file name without the .csv extension (e.g., "FALL_1")
        sub_label = file.replace('.csv', '')
        
        # Add new columns to the DataFrame (one row per sequence)
        df['binary_label'] = binary_label
        df['sub_label'] = sub_label
        
        dataframes.append(df)

# --- Combine all dataframes into a single DataFrame ---
augmented = pd.concat(dataframes, ignore_index=True)

# --- Combine the augmented data with the original data ---
combined_df = pd.concat([combined_df, augmented], ignore_index=True)

print("Shape of combined data with augmented data:", combined_df.shape)

Shape of combined data with augmented data: (724, 402)


In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# --- Assume 'combined_df' is your DataFrame from the previous cell ---
# The first 500 columns are your features, and 'binary_label' is the target.

# Select features (columns 0 to 399) and the target
X = combined_df.iloc[:, :500]
y = combined_df['binary_label']

# --- Split the data into train (80%) and test (20%) ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Define a Random Forest classifier ---
rf_clf = RandomForestClassifier(random_state=42)

# --- Set up the parameter grid for optimization ---
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# --- Use GridSearchCV for hyperparameter optimization ---
grid_search = GridSearchCV(estimator=rf_clf,
                           param_grid=param_grid,
                           cv=5,                # 5-fold cross-validation
                           n_jobs=-1,           # use all available cores
                           scoring='accuracy')

with parallel_backend('threading'):
    grid_search.fit(X_train, y_train)

# --- Print the best parameters and best cross-validation accuracy ---
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy: {:.3f}".format(grid_search.best_score_))

# --- Evaluate the best estimator on the test set ---
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

print("\nTest set performance:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Best parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}
Best cross-validation accuracy: 0.860

Test set performance:
Accuracy: 0.8482758620689655
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.87      0.88        89
           1       0.79      0.82      0.81        56

    accuracy                           0.85       145
   macro avg       0.84      0.84      0.84       145
weighted avg       0.85      0.85      0.85       145



In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# --- Assume 'combined_df' is your DataFrame from the previous cell ---
# The first 500 columns are your features and 'binary_label' is the target.

# Select features (columns 0 to 399) and the target
X = combined_df.iloc[:, :500]
y = combined_df['binary_label']

# --- Split the data into train (80%) and test (20%) ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Scale features (important for SVM) ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Define an SVM classifier ---
svm_clf = SVC(random_state=42)

# --- Set up a parameter grid for grid search ---
param_grid = {
    'C': [0.1, 1, 10],          # Regularization parameter
    'gamma': ['scale', 'auto', 0.01, 0.1, 1],  # Kernel coefficient for 'rbf'
    'kernel': ['rbf']           # Using RBF kernel; you can also try 'linear'
}

# --- Set up GridSearchCV for hyperparameter optimization ---
grid_search = GridSearchCV(estimator=svm_clf,
                           param_grid=param_grid,
                           cv=5,              # 5-fold cross-validation
                           n_jobs=-1,         # Use all available cores
                           scoring='accuracy')


with parallel_backend('threading'):
    grid_search.fit(X_train_scaled, y_train)

# --- Print the best parameters and best cross-validation accuracy ---
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy: {:.3f}".format(grid_search.best_score_))

# --- Evaluate the best estimator on the test set ---
best_svm = grid_search.best_estimator_
y_pred = best_svm.predict(X_test_scaled)

print("\nTest set performance:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


Best parameters: {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
Best cross-validation accuracy: 0.815

Test set performance:
Accuracy: 0.8275862068965517
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.84      0.86        89
           1       0.76      0.80      0.78        56

    accuracy                           0.83       145
   macro avg       0.82      0.82      0.82       145
weighted avg       0.83      0.83      0.83       145



In [None]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score

# --- Assume 'combined_df' is your DataFrame from earlier ---
# The first 500 columns are features and 'binary_label' is the target.
X = combined_df.iloc[:, :500].values
y = combined_df['binary_label'].values

# --- Split the data into train (80%) and test (20%) ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Scale the features ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Build a more complex Neural Network Model ---
model = keras.Sequential([
    layers.Input(shape=(500,)),
    
    # First hidden block
    layers.Dense(256, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    
    # Second hidden block
    layers.Dense(128, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    
    # Third hidden block
    layers.Dense(64, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    
    # Fourth hidden block
    layers.Dense(32, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    
    # Output layer for binary classification
    layers.Dense(1, activation='sigmoid')
])

# --- Define the optimizer with a custom learning rate ---
optimizer = keras.optimizers.Adam(learning_rate=0.001)

# --- Compile the model ---
model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])

# --- Set up callbacks ---
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3)

# --- Train the model ---
history = model.fit(X_train_scaled, y_train,
                    epochs=50,
                    batch_size=32,
                    validation_split=0.2,
                    callbacks=[early_stopping, reduce_lr])

# --- Evaluate the model on the test set ---
loss, accuracy = model.evaluate(X_test_scaled, y_test)
print("Test accuracy:", accuracy)

# --- Generate predictions and show classification report ---
y_pred_prob = model.predict(X_test_scaled)
y_pred = (y_pred_prob > 0.5).astype(int)
print("Classification Report:")
print(classification_report(y_test, y_pred))




Epoch 1/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.4681 - loss: 0.9392 - val_accuracy: 0.6638 - val_loss: 0.5919 - learning_rate: 0.0010
Epoch 2/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6347 - loss: 0.6674 - val_accuracy: 0.6724 - val_loss: 0.5395 - learning_rate: 0.0010
Epoch 3/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6660 - loss: 0.6065 - val_accuracy: 0.6552 - val_loss: 0.5111 - learning_rate: 0.0010
Epoch 4/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7425 - loss: 0.5270 - val_accuracy: 0.6810 - val_loss: 0.4859 - learning_rate: 0.0010
Epoch 5/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7969 - loss: 0.4985 - val_accuracy: 0.7414 - val_loss: 0.4579 - learning_rate: 0.0010
Epoch 6/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/