In [1]:
import numpy as np
import pandas as pd
import tsfresh as ts
from sklearn import preprocessing
from tqdm import tqdm
from feature_engine.selection import DropConstantFeatures, DropDuplicateFeatures, DropCorrelatedFeatures, SmartCorrelatedSelection, SelectBySingleFeaturePerformance
from sklearn.svm import SVC
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

In [2]:
fe = []
lablist = []
for sj in range(2, 16):
    csv_file_path = f'./steps_data_new/sj{sj}.csv'
    df = pd.read_csv(csv_file_path)
    df = df[~df['Speed'].str.contains('warm_up')]
    df = df[~df['Speed'].str.contains('cool_down')]
    labels = df.groupby('Step')['Combined'].apply(list)
    labels = [sublist[0] for sublist in labels]
    l2 = preprocessing.LabelEncoder()
    l2.fit(labels)
    labs = l2.transform(labels)
    lablist.append(labs)
    le = preprocessing.LabelEncoder()
    le.fit(df.Step)
    df['Step'] = le.transform(df.Step)
    df = df.drop(columns = ['Unnamed: 0', 'Forefoot', 'Midfoot', 'Heel', 'Total', 'AT_loading', 'Speed', 'Incline', 'Combined'])
    extracted_features = ts.extract_features(df, column_id = 'Step', column_sort = 'Time')
    fe.append(extracted_features)

Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:12<00:00,  3.88it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:29<00:00,  1.70it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 49/49 [00:21<00:00,  2.28it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 49/49 [00:20<00:00,  2.42it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 49/49 [00:13<00:00,  3.73it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:25<00:00,  1.98it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:28<00:00,  1.76it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:23<00:00,  2.13it/s]
Feature Extraction: 100%|███████████████

In [4]:
fe_2 = []

for i in tqdm(range(len(fe))):
    rem = []
    X_ = fe[i]
    for j in range(X_.columns.shape[0]):
        num_class = np.unique(X_.iloc[:,j]).shape[0]
        if num_class <= 2 or np.where(X_.iloc[:,j].isna())[0].shape[0] != 0 or np.sum(np.isinf(X_.iloc[:,j])) > 0:
            rem.append(X_.columns[j])
    new_feat = X_.drop(columns = rem)
    fe_2.append(new_feat)

100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:08<00:00,  1.66it/s]


In [5]:
fe_3 = []
kfold = KFold(n_splits=5, random_state=0, shuffle = True)
for i in tqdm(range(len(fe_2))):
    X = fe_2[i]
    Y = lablist[i]
    tr = []
    tr = SmartCorrelatedSelection()
    X2 = tr.fit_transform(X, Y)
    fe_3.append(X2)

100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:16<00:00,  1.16s/it]


In [6]:
import warnings
warnings.filterwarnings('ignore')
acc_list = []
f_list = []
d_list = []
b_list = []
c_list = []
for i in tqdm(range(len(fe_3))):
    clf = SVC(random_state = 0)
    #clf = RandomForestClassifier(n_estimators = 200, random_state = 0)
    T = SelectBySingleFeaturePerformance(estimator = clf, threshold = 0.01, scoring = 'accuracy', cv = 5)
    x = fe_3[i]
    y = lablist[i]
    model = T.fit(x, y = y)
    dic = model.feature_performance_
    keys = list(dic.values())
    order = np.argsort(np.array(keys))[::-1]
    idx = order[0]
    feat_set = [idx]
    acc = [keys[idx]]
    best = 0
    for i in range(len(keys)):
        feat_set.append(order[i])
        temp = x.iloc[:,feat_set]
        y_pred = cross_val_predict(clf, temp, y, cv = kfold)
        cv_score = accuracy_score(y_pred, y)
        cm_matrix = confusion_matrix(y_pred, y)
        if cv_score > best:
            best_data = temp.copy()
            best_vars = np.array(list(dic))[feat_set]
            best_cm = cm_matrix
            best = cv_score
        acc.append(cv_score)
    acc_list.append(acc)
    b_list.append(best)
    f_list.append(best_vars)
    d_list.append(best_data)
    c_list.append(best_cm)
print(b_list)

100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [13:17<00:00, 56.96s/it]

[0.3282051282051282, 0.3333333333333333, 0.43958868894601544, 0.39759036144578314, 0.3112033195020747, 0.3861566484517304, 0.36347197106690776, 0.34563106796116505, 0.31527093596059114, 0.30970149253731344, 0.3555992141453831, 0.4072657743785851, 0.36036036036036034, 0.3974358974358974]





In [8]:
np.mean(b_list)

0.36352859350451433

In [2]:
import numpy as np
np.std(np.array([0.3282051282051282, 0.3333333333333333, 0.43958868894601544, 0.39759036144578314, 0.3112033195020747, 0.3861566484517304, 0.36347197106690776, 0.34563106796116505, 0.31527093596059114, 0.30970149253731344, 0.3555992141453831, 0.4072657743785851, 0.36036036036036034, 0.3974358974358974]))

0.038665690811775434