In [1]:
import pandas as pd
from matplotlib import pyplot as plt
from pipeline.acquisition import load_sisfall_data
from pipeline.preprocessing import change_activity_duration
from pipeline.preprocessing import change_activity_sampling
from pipeline.feature_extraction import extract_features
import joblib
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay
from sklearn import preprocessing
from sklearn.feature_selection import SelectFromModel

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression



In [2]:
dataset_folder = "datasets/SisFall_dataset/"
models_output_folder = 'models/'
uma_dataset_folder = "datasets/output_uma"

INCLUDE_GYROSCOPE = False

if INCLUDE_GYROSCOPE:
    sensors =  [0, 1, 2, 3, 4, 5]
else:
    sensors = [0, 1, 2]

ignored_subjects = []
duration = 12000
frequency = 200

should_save_models = True

In [3]:
raw_dataset = load_sisfall_data(dataset_folder, ignored_subjects, sensors)
raw_dataset


Unnamed: 0,subject,activity,trial,data
0,SA01,D01,R01,acc_x acc_y ...
1,SA01,D01,R01,acc_x acc_y ...
2,SA01,D01,R01,acc_x acc_y ...
3,SA01,D01,R01,acc_x acc_y ...
4,SA01,D01,R01,acc_x acc_y ...
...,...,...,...,...
5100,SE15,D17,R01,acc_x acc_y ...
5101,SE15,D17,R02,acc_x acc_y ...
5102,SE15,D17,R03,acc_x acc_y ...
5103,SE15,D17,R04,acc_x acc_y ...


In [4]:
from uma_dataset_utils import  group_sensor_data_by_segments, prepare_uma_dataset, uma_search_csv_files

combined_uma_df = uma_search_csv_files(directory=uma_dataset_folder, activities_of_interest=['Activity1','Activity2','Activity4','Activity5','Activity6','Activity7','Activity8','Activity9','Activity10','Activity11','Activity12','Activity13','Activity14','Activity15'])
uma_dataset = group_sensor_data_by_segments(combined_uma_df, INCLUDE_GYROSCOPE)
prepared_uma_dataset = prepare_uma_dataset(uma_dataset)


Created 627 data segments based on timestamp breaks
Segments per activity:
activity
D13    68
D14    67
D15    63
D8     58
D1     55
D7     48
D4     47
D10    40
D9     39
D11    39
D12    37
D2     28
D5     19
D6     19
Name: count, dtype: int64

Timestamp information:
Average segment duration: 295.9 samples
Average sampling rate: 20.0 Hz
UMA dataset shape: (627, 38)

Activity code distribution:
activity_code
D13    68
D14    67
D15    63
D8     58
D1     55
D7     48
D4     47
D10    40
D9     39
D11    39
D12    37
D2     28
D5     19
D6     19
Name: count, dtype: int64


In [5]:
UMA_ACTIVITY_MAPPING = {
    'D1': 'adl',
    'D2': 'adl',
    'D3': 'adl',
    'D4': 'adl',
    'D5': 'adl',
    'D6': 'adl',
    'D7': 'adl',
    'D8': 'adl',
    'D9': 'adl',
    'D10': 'adl',
    'D11': 'adl',
    'D12': 'adl',
    'D13': 'fall',
    'D14': 'fall',
    'D15': 'fall'
}

activity_codes = prepared_uma_dataset['activity_code'] 
y_labels = activity_codes.map(UMA_ACTIVITY_MAPPING)


# X_uma = prepared_uma_dataset.drop(columns=['activity_code', 'subject'])
# X_uma


In [6]:
uma_dataset['data'][626]

Unnamed: 0,acc_x,acc_y,acc_z
0,-0.948486,-0.334473,-0.210938
1,-0.873535,-0.339111,-0.178223
2,-0.873047,-0.335938,-0.180664
3,-0.870850,-0.316895,-0.174072
4,-0.940186,-0.246826,-0.157959
...,...,...,...
276,-0.016357,0.817139,-0.495850
277,-0.021973,0.814697,-0.490723
278,-0.021484,0.749756,-0.499512
279,-0.020752,0.814209,-0.563965


In [7]:
uma_dataset

Unnamed: 0,subject,activity,trial,group_id,data,timestamps,start_time,end_time,duration,sampling_rate
0,1,D1,1,0,acc_x acc_y acc_z 0 -0.74023...,0 0.782 1 0.831 2 0.879 3 ...,0.782,14.981,285,20.071836
1,1,D1,2,1,acc_x acc_y acc_z 0 -0.76416...,0 0.350 1 0.361 2 0.369 3 ...,0.350,15.002,301,20.543271
2,1,D1,3,2,acc_x acc_y acc_z 0 -0.75732...,0 0.154 1 0.183 2 0.201 3 ...,0.154,14.958,300,20.264793
3,1,D8,1,3,acc_x acc_y acc_z 0 -0.58740...,0 0.117 1 0.127 2 0.143 3 ...,0.117,14.952,300,20.222447
4,1,D8,2,4,acc_x acc_y acc_z 0 -0.81396...,0 0.350 1 0.354 2 0.384 3 ...,0.350,14.970,299,20.451436
...,...,...,...,...,...,...,...,...,...,...
622,19,D15,2,622,acc_x acc_y acc_z 0 -1.02294...,0 0.269 1 0.331 2 0.337 3 ...,0.269,14.979,281,19.102651
623,19,D15,3,623,acc_x acc_y acc_z 0 -1.37255...,0 0.218 1 0.282 2 0.326 3 ...,0.218,14.970,293,19.861714
624,19,D15,4,624,acc_x acc_y acc_z 0 -0.92089...,0 0.222 1 0.306 2 0.396 3 ...,0.222,14.950,296,20.097773
625,19,D15,5,625,acc_x acc_y acc_z 0 -0.90795...,0 0.211 1 0.274 2 0.351 3 ...,0.211,15.016,299,20.195880


In [14]:
from tsfresh import extract_features
from tsfresh.feature_extraction.settings import MinimalFCParameters, ComprehensiveFCParameters

def prepare_main_dataset(raw_dataset, CODE_TO_CLASS, duration, frequency, should_extract_features=True, should_extract_tsfresh_features=False):
    """
    Prepare dataset by filtering activities, extracting features, and organizing data.
    
    Parameters:
    raw_dataset: Raw dataset containing 'activity', 'data', and 'subject' columns
    CODE_TO_CLASS: Dictionary mapping activity codes to class labels
    duration: Duration parameter for activity processing
    frequency: Frequency parameter for activity sampling
    should_extract_features: Boolean flag to determine if features should be extracted
    should_extract_tsfresh_features: Boolean flag to determine if tsfresh features should be extracted
    
    Returns:
    pd.DataFrame: Prepared dataset with features, class labels, subjects, and activity codes
    """
    
    # Filter indices based on activity codes
    filtered_indices = []
    for i in raw_dataset.index:
        activity_code = raw_dataset['activity'][i]
        if activity_code in CODE_TO_CLASS:
            filtered_indices.append(i)

    print(f"Total samples: {len(raw_dataset)}")
    print(f"Filtered samples: {len(filtered_indices)}")

    print("Processing data...")
    feature_list = []
    tsfresh_feature_list = []
    labels = []
    subjects = []
    activity_codes = []

    for i in filtered_indices:
        # Get data and preprocess
        data = raw_dataset['data'][i]
        data = change_activity_duration(data, duration)
        data = change_activity_sampling(data, frequency)
        
        if should_extract_features:
            # Extract features
            features = extract_features(data, True)
            feature_list.append(features)
        
        if should_extract_tsfresh_features:
            print(data)
            # Extract tsfresh features
            tsfresh_features = extract_features(data, column_id="id", column_sort="time", default_fc_parameters=MinimalFCParameters())
            tsfresh_feature_list.append(tsfresh_features)
        
        # Get class label
        activity_code = raw_dataset['activity'][i]
        class_label = CODE_TO_CLASS[activity_code]
        
        # Store results
        labels.append(class_label)
        subjects.append(raw_dataset['subject'][i])
        activity_codes.append(activity_code)

    if should_extract_features:
        # Combine into final dataset
        prepared_dataset = pd.concat(feature_list, ignore_index=True)
    else:
        # Create a simpler dataset without features
        prepared_dataset = pd.DataFrame({'class': labels, 'subject': subjects, 'activity_code': activity_codes})

    if should_extract_tsfresh_features:
        # Combine tsfresh features into the dataset
        tsfresh_features_combined = pd.concat(tsfresh_feature_list, ignore_index=True)
        prepared_dataset = pd.concat([prepared_dataset, tsfresh_features_combined], axis=1)

    prepared_dataset['class'] = labels
    prepared_dataset['subject'] = subjects
    prepared_dataset['activity_code'] = activity_codes

    print(f"Final dataset shape: {prepared_dataset.shape}")
    print("\nClass distribution:")
    print(prepared_dataset['class'].value_counts())
    
    return prepared_dataset


In [15]:
ACTIVITY_CLASSES = {
    'adl': ['D01', 'D02','D03', 'D04', 'D05', 'D06', 'D07', 'D08', 'D09', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'D16', 'D17', 'D18', 'D19'],
    'fall': ['F01', 'F02', 'F03', 'F04', 'F05', 'F06', 'F07', 'F08', 'F09', 'F10', "F11","F12","F13", "F14","F15"],
}

# Create code to class mapping
CODE_TO_CLASS = {}
for class_name, codes in ACTIVITY_CLASSES.items():
    for code in codes:
        CODE_TO_CLASS[code] = class_name

print("Activity classes defined:")
for class_name, codes in ACTIVITY_CLASSES.items():
    print(f"  {class_name}: {codes}")

final_main_dataset = prepare_main_dataset(raw_dataset, CODE_TO_CLASS, duration, frequency, should_extract_features=False,should_extract_tsfresh_features=True)
final_main_dataset


Activity classes defined:
  adl: ['D01', 'D02', 'D03', 'D04', 'D05', 'D06', 'D07', 'D08', 'D09', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'D16', 'D17', 'D18', 'D19']
  fall: ['F01', 'F02', 'F03', 'F04', 'F05', 'F06', 'F07', 'F08', 'F09', 'F10', 'F11', 'F12', 'F13', 'F14', 'F15']
Total samples: 5105
Filtered samples: 5105
Processing data...
                            acc_x     acc_y     acc_z
2000-01-01 00:00:05.000 -0.003906 -0.886719 -0.421875
2000-01-01 00:00:05.005 -0.019531 -0.859375 -0.398438
2000-01-01 00:00:05.010 -0.046875 -0.820312 -0.390625
2000-01-01 00:00:05.015 -0.062500 -0.808594 -0.386719
2000-01-01 00:00:05.020 -0.082031 -0.824219 -0.351562
...                           ...       ...       ...
2000-01-01 00:00:05.975  0.027344 -0.953125  0.046875
2000-01-01 00:00:05.980  0.023438 -0.945312  0.031250
2000-01-01 00:00:05.985  0.011719 -0.945312  0.039062
2000-01-01 00:00:05.990  0.000000 -0.929688  0.054688
2000-01-01 00:00:05.995  0.007812 -0.937500  0.078125

[200 row

ValueError: Column not found: id

In [10]:
print(final_main_dataset)

     class subject activity_code
0      adl    SA01           D01
1      adl    SA01           D01
2      adl    SA01           D01
3      adl    SA01           D01
4      adl    SA01           D01
...    ...     ...           ...
5100   adl    SE15           D17
5101   adl    SE15           D17
5102   adl    SE15           D17
5103   adl    SE15           D17
5104   adl    SE15           D17

[5105 rows x 3 columns]
