In [5]:
# Read the already-cleaned feature table from disk into a DataFrame.
import pandas as pd

# Location of the cleaned feature CSV (absolute path on the local volume).
cleaned_features_csv = "/Volumes/Mac/DatasetSP/preprocessed/cleaned_features.csv"
features_df = pd.read_csv(cleaned_features_csv)

In [11]:
# One entry per distinct (Subject, Position, Test, Trial, Bout) combination;
# the value is the number of feature rows that share that combination.
bout_key_columns = ["Subject", "Position", "Test", "Trial", "Bout"]
grouped_bouts = features_df.groupby(bout_key_columns).size()

# Count how many distinct bouts exist for every (Subject, Position) pair:
# after reset_index each bout is a single row, so the group size is the bout count.
subject_position_counts = (
    grouped_bouts
    .reset_index()
    .groupby(["Subject", "Position"])
    .size()
)

# Show the result
print(subject_position_counts)


Subject  Position
1        0           48
         1           48
         2           48
         3           47
         4           48
                     ..
25       1           38
         2           38
         3           25
         4           23
         5           38
Length: 89, dtype: int64


In [14]:
# Greedy subject-wise train/test split.
# Subjects are moved into the training set one at a time; at every step the subject
# chosen is the one whose addition gives the smallest gap between the most and the
# least represented position (in bouts) in the training set. The subjects that are
# never picked form the test set, so no subject appears in both sets.
train_subjects = []
remaining_subjects = set(subject_position_counts.index.get_level_values("Subject").unique())

# Number of subjects to leave out for the test set
num_test_subjects = 3

# Maximum number of subjects allowed in the training set
max_train_subjects = len(remaining_subjects) - num_test_subjects

# Every position that occurs anywhere in the counts. The balance is always expressed
# over this full index: a position not yet covered by any training subject must count
# as 0 instead of silently dropping out of the max-min comparison.
all_positions = subject_position_counts.index.get_level_values("Position").unique()

# Select subjects for the training set
while len(train_subjects) < max_train_subjects:
    # Per-position bout totals of the training set chosen so far
    if train_subjects:
        current_balance = (
            subject_position_counts.loc[train_subjects]
            .groupby("Position")
            .sum()
            .reindex(all_positions, fill_value=0)
        )
    else:
        current_balance = pd.Series(0, index=all_positions)

    # Candidate that gives the most even training set at this step
    best_subject = None
    best_balance = None
    best_spread = None

    # Sorted iteration makes tie-breaking explicit (lowest subject id wins on equal
    # spread) instead of relying on the iteration order of a set.
    for subject in sorted(remaining_subjects):
        # Balance the training set would have if this subject were added;
        # positions the subject did not record contribute 0.
        subject_counts = subject_position_counts.loc[subject].reindex(all_positions, fill_value=0)
        candidate_balance = current_balance + subject_counts

        # Imbalance measure: gap between the most and the least represented position
        candidate_spread = candidate_balance.max() - candidate_balance.min()
        if best_spread is None or candidate_spread < best_spread:
            best_subject = subject
            best_balance = candidate_balance
            best_spread = candidate_spread

    # Commit the best candidate to the training set
    train_subjects.append(best_subject)
    remaining_subjects.remove(best_subject)

# Subjects never selected make up the test set (sorted for a stable, readable order)
test_subjects = sorted(remaining_subjects)

print(f"Soggetti nel training set: {train_subjects}")
print(f"Soggetti nel test set: {test_subjects}")


Soggetti nel training set: [8, 16, 1, 9, 6, 19, 20, 17, 4, 7, 25, 3]
Soggetti nel test set: [2, 15, 22]


In [15]:
# NOTE(review): this cell repeats the subject selection of the preceding cell and
# overwrites its train_subjects / test_subjects; consider keeping only one of them.
#
# Greedy subject-wise train/test split: at each step the remaining subject whose
# addition minimises (max - min) of the per-position bout totals of the training set
# is moved into the training set. The subjects left over become the test set.
train_subjects = []
remaining_subjects = set(subject_position_counts.index.get_level_values("Subject").unique())

# Number of subjects to leave out for the test set
num_test_subjects = 3

# Maximum number of subjects allowed in the training set
max_train_subjects = len(remaining_subjects) - num_test_subjects

# Full set of positions found in the counts. Balances are kept on this complete
# index so that a position missing from the current training set is counted as 0
# rather than being excluded from the max-min comparison.
all_positions = subject_position_counts.index.get_level_values("Position").unique()

# Select subjects for the training set
while len(train_subjects) < max_train_subjects:
    # Per-position bout totals of the training set chosen so far
    if train_subjects:
        current_balance = (
            subject_position_counts.loc[train_subjects]
            .groupby("Position")
            .sum()
            .reindex(all_positions, fill_value=0)
        )
    else:
        current_balance = pd.Series(0, index=all_positions)

    # Candidate that gives the most even training set at this step
    best_subject = None
    best_balance = None
    best_spread = None

    # Iterate in sorted order so that ties are resolved deterministically
    # (lowest subject id first) rather than by set iteration order.
    for subject in sorted(remaining_subjects):
        # Balance obtained by adding this subject; unrecorded positions add 0
        subject_counts = subject_position_counts.loc[subject].reindex(all_positions, fill_value=0)
        candidate_balance = current_balance + subject_counts

        # Imbalance measure: gap between the most and the least represented position
        candidate_spread = candidate_balance.max() - candidate_balance.min()
        if best_spread is None or candidate_spread < best_spread:
            best_subject = subject
            best_balance = candidate_balance
            best_spread = candidate_spread

    # Commit the best candidate to the training set
    train_subjects.append(best_subject)
    remaining_subjects.remove(best_subject)

# Subjects never selected make up the test set (sorted for a stable, readable order)
test_subjects = sorted(remaining_subjects)

print(f"Soggetti nel training set: {train_subjects}")
print(f"Soggetti nel test set: {test_subjects}")


Soggetti nel training set: [8, 16, 1, 9, 6, 19, 20, 17, 4, 7, 25, 3]
Soggetti nel test set: [2, 15, 22]


In [16]:
# Split the feature rows by subject: rows of training subjects form the
# construction set, rows of the held-out subjects form the test set.
is_train_row = features_df["Subject"].isin(train_subjects)
is_test_row = features_df["Subject"].isin(test_subjects)

costruction_df = features_df.loc[is_train_row]
test_df = features_df.loc[is_test_row]

# Report the size of each partition
print(f"Costruction set: {len(costruction_df)} rows")
print(f"Test set: {len(test_df)} rows")

Costruction set: 126111 rows
Test set: 37624 rows


In [17]:
# Target: the position label of every construction-set row.
y_train = costruction_df["Position"]

# Feature matrix: everything except the identifier columns and the label itself.
non_feature_columns = ["Subject", "Test", "Trial", "Bout", "Position"]
X_train = costruction_df.drop(columns=non_feature_columns)

In [18]:
# Feature selection with tsfresh on the training data only.
from tsfresh import select_features

# select_features is run against the multiclass Position target; only the names of
# the columns it returns are kept, so the filtered frame itself is not retained.
tsfresh_selected_features = select_features(
    X_train,
    y_train,
    multiclass=True,
    fdr_level=0.05,
).columns.tolist()

n_selected_features = len(tsfresh_selected_features)
print(f"Feature selezionate con tsfresh: {n_selected_features}")

Feature selezionate con tsfresh: 746


In [21]:
# Restrict the construction set to the tsfresh-selected feature columns.
# Nothing is written to disk here; the identifier and Position columns are
# dropped from costruction_df by this reassignment.
costruction_df = costruction_df.loc[:, tsfresh_selected_features]
display(costruction_df)

Unnamed: 0,Acc_value__abs_energy,Acc_value__absolute_maximum,Acc_value__absolute_sum_of_changes,"Acc_value__agg_autocorrelation__f_agg_""mean""__maxlag_40","Acc_value__agg_autocorrelation__f_agg_""median""__maxlag_40","Acc_value__agg_autocorrelation__f_agg_""var""__maxlag_40","Acc_value__agg_linear_trend__attr_""intercept""__chunk_len_10__f_agg_""max""","Acc_value__agg_linear_trend__attr_""intercept""__chunk_len_10__f_agg_""mean""","Acc_value__agg_linear_trend__attr_""intercept""__chunk_len_10__f_agg_""min""","Acc_value__agg_linear_trend__attr_""intercept""__chunk_len_10__f_agg_""var""",...,Gyr_value__spkt_welch_density__coeff_5,Gyr_value__spkt_welch_density__coeff_8,Gyr_value__standard_deviation,Gyr_value__sum_values,Gyr_value__symmetry_looking__r_0.05,Gyr_value__symmetry_looking__r_0.1,Gyr_value__symmetry_looking__r_0.15000000000000002,Gyr_value__variance,Gyr_value__variance_larger_than_standard_deviation,Gyr_value__variation_coefficient
0,10878.628209,18.392162,31.909707,0.008228,-0.072114,0.067613,10.590769,9.677254,8.932775,-0.357863,...,563.267440,538.209614,13.006211,2688.431810,0.0,1.0,1.0,169.161513,1.0,0.483784
1,10058.220528,16.162462,37.437539,-0.060050,-0.128442,0.083563,10.588557,9.711219,8.985681,0.359844,...,1547.895769,11563.005284,52.509851,9167.123931,0.0,1.0,1.0,2757.284425,1.0,0.572806
2,9593.887578,11.548936,15.572296,-0.003432,-0.035873,0.117923,10.043115,9.948069,9.781175,0.018508,...,27017.918238,5227.691693,51.459432,7684.184877,0.0,1.0,1.0,2648.073149,1.0,0.669680
3,11357.659222,15.146336,26.615612,0.019490,-0.022361,0.150425,10.797518,10.181398,9.238174,0.007283,...,8688.400693,28.203920,18.802065,6657.286459,1.0,1.0,1.0,353.517638,1.0,0.282428
4,11002.001084,15.689916,30.212005,-0.051458,-0.140149,0.161858,10.805099,9.972403,9.341110,0.243195,...,3856.359722,3084.665888,45.394563,8655.757805,1.0,1.0,1.0,2060.666329,1.0,0.524444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163730,10365.049925,19.237081,58.033558,-0.186897,-0.362984,0.308477,10.605540,9.131196,8.039393,1.582184,...,30319.681535,4768.011639,30.370467,7360.450598,0.0,1.0,1.0,922.365269,1.0,0.412617
163731,13380.615660,26.189281,79.883053,-0.175925,-0.368655,0.232651,16.704983,12.842659,10.012880,8.069530,...,68337.780607,7296.198712,56.429528,8693.673253,0.0,1.0,1.0,3184.291619,1.0,0.649087
163732,13829.601510,21.943039,63.180951,-0.146531,-0.404277,0.225204,13.306709,11.175447,8.829099,2.772015,...,16820.549014,2016.975765,46.096693,8445.549509,1.0,1.0,1.0,2124.905138,1.0,0.545810
163733,12993.299824,20.186223,63.765920,-0.179858,-0.448642,0.289254,11.140629,8.765461,7.021151,2.565619,...,14535.448828,5469.975179,40.067341,7981.217141,0.0,1.0,1.0,1605.391788,1.0,0.502020
