In [3]:
import numpy as np
import pandas as pd
import tsfresh as ts
from sklearn import preprocessing
from tqdm import tqdm
from feature_engine.selection import DropConstantFeatures, DropDuplicateFeatures, DropCorrelatedFeatures, SmartCorrelatedSelection, SelectBySingleFeaturePerformance
from sklearn.svm import SVC
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

In [4]:
fe = []
lablist = []
for sj in range(2, 16):
    csv_file_path = f'./steps_data_new/sj{sj}.csv'
    df = pd.read_csv(csv_file_path)
    df = df[~df['Speed'].str.contains('warm_up')]
    df = df[~df['Speed'].str.contains('cool_down')]
    labels = df.groupby('Step')['Combined'].apply(list)
    labels = [sublist[0] for sublist in labels]
    l2 = preprocessing.LabelEncoder()
    l2.fit(labels)
    labs = l2.transform(labels)
    lablist.append(labs)
    le = preprocessing.LabelEncoder()
    le.fit(df.Step)
    df['Step'] = le.transform(df.Step)
    df = df.drop(columns = ['Unnamed: 0', 'Forefoot', 'Midfoot', 'Heel', 'Total', 'AT_loading', 'Speed', 'Incline', 'Combined'])
    extracted_features = ts.extract_features(df, column_id = 'Step', column_sort = 'Time')
    fe.append(extracted_features)

Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:26<00:00,  1.89it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:28<00:00,  1.73it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 49/49 [00:18<00:00,  2.68it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 49/49 [00:18<00:00,  2.64it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 49/49 [00:12<00:00,  3.91it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:24<00:00,  2.06it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:24<00:00,  2.01it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:27<00:00,  1.84it/s]
Feature Extraction: 100%|███████████████

In [6]:
fe_2 = []

for i in tqdm(range(len(fe))):
    rem = []
    X_ = fe[i]
    for j in range(X_.columns.shape[0]):
        num_class = np.unique(X_.iloc[:,j]).shape[0]
        if num_class <= 2 or np.where(X_.iloc[:,j].isna())[0].shape[0] != 0 or np.sum(np.isinf(X_.iloc[:,j])) > 0:
            rem.append(X_.columns[j])
    new_feat = X_.drop(columns = rem)
    fe_2.append(new_feat)

100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:07<00:00,  1.90it/s]


In [7]:
fe_3 = []
kfold = KFold(n_splits=5, random_state=0, shuffle = True)
for i in tqdm(range(len(fe_2))):
    X = fe_2[i]
    Y = lablist[i]
    tr = []
    tr = SmartCorrelatedSelection()
    X2 = tr.fit_transform(X, Y)
    fe_3.append(X2)

100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:15<00:00,  1.14s/it]


In [8]:
import warnings
warnings.filterwarnings('ignore')
acc_list = []
f_list = []
d_list = []
b_list = []
c_list = []
for i in tqdm(range(len(fe_3))):
    #clf = SVC(random_state = 0)
    clf = RandomForestClassifier(n_estimators = 200, random_state = 0)
    T = SelectBySingleFeaturePerformance(estimator = clf, threshold = 0.01, scoring = 'accuracy', cv = 5)
    x = fe_3[i]
    y = lablist[i]
    model = T.fit(x, y = y)
    dic = model.feature_performance_
    keys = list(dic.values())
    order = np.argsort(np.array(keys))[::-1]
    idx = order[0]
    feat_set = [idx]
    acc = [keys[idx]]
    best = 0
    for i in range(len(keys)):
        feat_set.append(order[i])
        temp = x.iloc[:,feat_set]
        y_pred = cross_val_predict(clf, temp, y, cv = kfold)
        cv_score = accuracy_score(y_pred, y)
        cm_matrix = confusion_matrix(y_pred, y)
        if cv_score > best:
            best_data = temp.copy()
            best_vars = np.array(list(dic))[feat_set]
            best_cm = cm_matrix
            best = cv_score
        acc.append(cv_score)
    acc_list.append(acc)
    b_list.append(best)
    f_list.append(best_vars)
    d_list.append(best_data)
    c_list.append(best_cm)
print(b_list)

100%|██████████████████████████████████████████████████████████████████████████████| 14/14 [7:50:43<00:00, 2017.36s/it]

[0.7333333333333333, 0.7312312312312312, 0.7789203084832905, 0.7204819277108434, 0.5975103734439834, 0.7795992714025501, 0.7160940325497287, 0.7825242718446602, 0.6912972085385879, 0.707089552238806, 0.6994106090373281, 0.7380497131931166, 0.7117117117117117, 0.7136752136752137]





In [17]:
np.mean(b_list), np.median(b_list)

(0.719891433876537, 0.7191991566757923)

In [1]:
import numpy as np
np.std(np.array([0.7333333333333333, 0.7312312312312312, 0.7789203084832905, 0.7204819277108434, 0.5975103734439834, 0.7795992714025501, 0.7160940325497287, 0.7825242718446602, 0.6912972085385879, 0.707089552238806, 0.6994106090373281, 0.7380497131931166, 0.7117117117117117, 0.7136752136752137]))

0.04478020395581429