# Main objective: Assessing OLST performance

###### In this script, we are using unsupervied machine learning to assess the performance of the One Leg Balance Test. For input metric, it is a N raws * 128 columns dataframe. The input parameters include different features during One Leg Balance Test trail such as 
* 1.	Lifting forefoot or heel  
* 2.   Moving hip into more than 30 degrees of flexion or abduction  
* 3.	Stepping, stumbling, or falling 
* 4.	Lifting hands off iliac crests


## Step 1: Load and create the dataset

In [2]:
import pandas as pd
import numpy as np
import os
import seaborn as sns

In [3]:
raw_df = pd.read_csv("SL_df.csv")
target_df = pd.read_csv("SL_target.csv")

In [4]:
raw_df.head()
SL_train = raw_df.drop(["sub"], axis = 1)
SL_input = SL_train.iloc[:,1:]
SL_target = target_df.iloc[:,1:]

In [5]:
### Direct to the correct folder
path = 'C:/Users/a1003/OneDrive/桌面/Thesis/data/OLBT/SL_testing_data'
filenames = os.listdir(path)   ### List the files name in the folder
sub_id = []                    ### Initialize the subject name list
value_list = []               ### Initialzie the value list

### Organize the information into different list and then integrate them together as a dataframe
for sub in filenames:
    curr_file = path + '/' + sub
    curr_csv = pd.read_csv(curr_file)
    value = list(curr_csv.loc[0])
    value_list.append(value)
    ID = str(sub[:-4]) ### -25: sub0?, -4: remove .csv
    sub_id.append(ID)
            
print("Current subject number: " + str(len(sub_id)) + '\n' + "Current features number: " + str(len(value)))

Current subject number: 45
Current features number: 39


In [6]:
### Create the correct columns name
col_names = list(curr_csv.columns)
all_col_names = ["sub_ID"] + col_names

In [7]:
### Create the correct metrix which is going to be convert to dataframe
all_list = []
for i in range(len(sub_id)):
    v = [(sub_id[i])] + value_list[i]
    all_list.append(v)

In [8]:
### Use the list and metrix above the create a new table
raw_df = pd.DataFrame(all_list, columns = all_col_names)
raw_df.columns = ['sub_ID', 'SL_fore_var_height', 'SL_heel_var_height',
       'OL_fore_var_height', 'OL_heel_var_height', 'SL_fore_mean_height',
       'SL_heel_mean_height', 'OL_fore_mean_height', 'OL_heel_mean_height',
       'SL_fore_max_height', 'SL_heel_max_height', 'OL_fore_max_height',
       'OL_heel_max_height', 'SL_var_flex', 'OL_var_flex', 'SL_mean_flex',
       'OL_mean_flex', 'SL_max_flex', 'OL_max_flex', 'SL_duration_flex',
       'OL_duration_flex', 'SL_var_abd', 'OL_var_abd', 'SL_mean_abd',
       'OL_mean_abd', 'SL_max_abd', 'OL_max_abd', 'SL_duration_abd',
       'OL_duration_abd', 'stepping_freq', 'stepping_duration', 'var_RL_dis',
       'Max_min_RL_dis', 'sub', 'foot_height_count', 'angle_count',
       'stepping_count', 'hand_iliac_count', 'out_position_count', 'totel']
x_test_df = raw_df.drop(["sub"], axis = 1).sort_values("sub_ID")

In [9]:
x_test_df

Unnamed: 0,sub_ID,SL_fore_var_height,SL_heel_var_height,OL_fore_var_height,OL_heel_var_height,SL_fore_mean_height,SL_heel_mean_height,OL_fore_mean_height,OL_heel_mean_height,SL_fore_max_height,...,stepping_freq,stepping_duration,var_RL_dis,Max_min_RL_dis,foot_height_count,angle_count,stepping_count,hand_iliac_count,out_position_count,totel
0,sub01_sl_01,0.346398,0.118385,1.01444,1.3013,3.38605,2.73011,11.4582,25.0853,5.03583,...,0,0,21.9591,95.1949,3,1,0,2,0,6
1,sub01_sl_02,0.384351,0.179211,1.15733,0.913797,3.48595,2.78751,10.7546,25.4511,5.02729,...,0,0,0.55486,3.2615,4,0,0,1,0,5
2,sub01_wl_01,0.184049,0.066591,1.51785,1.88838,3.54662,2.18225,9.78238,23.3543,4.34254,...,0,0,0.2445,1.1988,0,0,0,0,0,0
3,sub02_sl_01,0.367686,0.191607,2.05034,2.42287,2.91314,2.10326,10.9186,25.0258,5.69173,...,0,0,0.77695,3.8504,1,0,0,0,0,1
4,sub02_sl_02,0.3387,0.18232,6.2629,7.13464,2.86603,2.0173,18.2625,33.1984,5.00449,...,5,1930,1.25794,15.5843,2,1,5,0,0,8
5,sub02_wl_01,0.319891,0.137237,6.55789,8.04261,3.2886,2.51311,24.5113,40.7178,4.31007,...,4,1868,1.2718,5.8806,1,6,4,0,0,11
6,sub03_sl_01,0.480433,0.120548,5.27699,6.70007,4.25075,3.22724,18.3135,33.8194,6.29457,...,2,1456,0.63589,5.3974,11,4,3,1,0,19
7,sub03_sl_02,0.338991,0.087869,4.99413,6.75907,4.0611,3.20843,22.1792,39.966,5.99522,...,1,987,0.41402,2.9685,3,0,1,1,0,5
8,sub03_wl_01,0.363354,0.112189,4.37746,5.84687,3.93537,2.77938,15.7115,31.8837,5.26333,...,3,1009,9.72803,83.0931,4,36,4,1,0,45
9,sub04_sl_01,0.250408,0.121234,2.97564,4.10627,3.09612,2.7112,16.3319,27.3344,5.02202,...,0,0,0.27613,1.3061,3,0,0,0,0,3


In [10]:
raw_y_test_df = pd.read_csv("C:/Users/a1003/OneDrive/桌面/Thesis/data/trained_data_BESS/BESS_human_rating_testing.csv")
y_test_df = raw_y_test_df[["sub_ID", "overall rating"]].sort_values("sub_ID")
y_test_df

Unnamed: 0,sub_ID,overall rating
0,sub01_sl_01,Good
1,sub01_sl_02,Good
2,sub01_wl_01,Good
3,sub02_sl_01,Good
4,sub02_sl_02,Bad
5,sub02_wl_01,Moderate
6,sub03_sl_01,Moderate
7,sub03_sl_02,Good
8,sub03_wl_01,Good
9,sub04_sl_01,Good


In [11]:
SL_test_input = x_test_df.iloc[:,1:]
SL_test_target = y_test_df.iloc[:,1:]

In [12]:
### fill the missing data with average
df = SL_test_input.copy()
df_without_na = df.copy()
for col in SL_test_input.columns:
    if df[col].isna().sum() != 0:
        df_without_na[col] = df_without_na[col].fillna(df[col].mean())
print("Amount of nan in original table: " + str(df.isna().sum().sum()) + '\n' + "Amount of nan in current table: " + str(df_without_na.isna().sum().sum()))

Amount of nan in original table: 3
Amount of nan in current table: 0


In [13]:
SL_test_input_without_nan = df_without_na

## Step 2: Preprocessing the data

In [14]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [15]:
scaler = StandardScaler()
onehotencoder = OneHotEncoder()

In [16]:
SL_input_processed = scaler.fit_transform(SL_input)
SL_target_processed = np.array(SL_target["overall_rate"])

In [17]:
x_test = scaler.fit_transform(SL_test_input_without_nan)
y_test = np.array(SL_test_target)

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [20]:
train_num = round(1 * len(SL_input_processed))
test_num = round(0 * len(SL_input_processed))

In [21]:
x_train = SL_input_processed[:train_num]
y_train = SL_target_processed[:train_num]


## Step 3: Feature selection

## Step 4: Train and testing the model

In [22]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

SVC_reg = SVC()
DT_reg = DecisionTreeClassifier()
Log_reg = LogisticRegression()
RF_reg = RandomForestClassifier()

In [23]:
from sklearn.model_selection import GridSearchCV

SVC_para = {
    "C":[1,1.2,2,2.5,3],
    "kernel":['linear', 'poly', 'rbf', 'sigmoid'],
    "degree":[3,4,5],
    "probability":[True]
}

DT_para = {
    "criterion": ["gini", "entropy"],
    'max_depth': [1,2,3,4,5], 
    'max_leaf_nodes': [2,3,4,5]
}

Log_para = {
    "penalty": ['l2', 'none'],
    "C":[1,1.2,2,2.5,3],
    "max_iter":[200, 300, 400],
    "solver":['lbfgs']
}

RF_para = {
    'criterion': ['gini', 'entropy'], 
    'max_depth': [1,2,3], 
    'max_leaf_nodes': [2,3,4,5], 
    'n_estimators': [ 100, 200, 300]
}


In [24]:
SVC_grid_search = GridSearchCV(SVC_reg, param_grid = SVC_para, cv = 10, return_train_score = True)
SVC_grid_search.fit(x_train, y_train.ravel())
SVC_grid_search.best_params_



{'C': 1, 'degree': 3, 'kernel': 'sigmoid', 'probability': True}

In [25]:
DT_grid_search = GridSearchCV(DT_reg, param_grid = DT_para, cv = 10, return_train_score = True)
DT_grid_search.fit(x_train, y_train)
DT_grid_search.best_params_



{'criterion': 'gini', 'max_depth': 2, 'max_leaf_nodes': 3}

In [26]:
SVC_best_reg = SVC_grid_search.best_estimator_
DT_best_reg = DT_grid_search.best_estimator_

## Step 4: Evaluate the model

In [27]:
SVC_best_reg.score(x_test, y_test.ravel())

0.7555555555555555

In [28]:
DT_best_reg.score(x_test, y_test.ravel())

0.7777777777777778

In [29]:
svc_pred = SVC_best_reg.predict_proba(x_test)

In [30]:
#dt_pred = DT.predict(x_test)

In [31]:
import scikitplot as skplt
import matplotlib.pyplot as plt



In [46]:
import scikitplot as skplt
import matplotlib.pyplot as plt


TypeError: plot_roc_curve() missing 2 required positional arguments: 'estimator' and 'X'

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

In [None]:
acc_3_t = accuracy_score(y_test.ravel(), SVC_best_reg.predict(x_test))
cm_3_t = confusion_matrix(y_test.ravel(), SVC_best_reg.predict(x_test))
recall = np.diag(cm_3_t) / np.sum(cm_3_t, axis = 1)
recall_3_t = np.mean(recall)
precision = np.diag(cm_3_t) / np.sum(cm_3_t, axis = 0)
precision_3_t = np.nanmean(precision)
print("acc: " + str(round(acc_3_t, 3)) + "\n" + "recall: " + str(round(recall_3_t,3)) + "\n" + "precision: " + str(round(precision_3_t,3)))

In [None]:
### sensitivity
labels = ["Good", "Moderate", "Bad"]
cm = confusion_matrix(y_test.ravel(), SVC_best_reg.predict(x_test), labels= labels)
B_index = list(labels).index("Bad")
M_index = list(labels).index("Moderate")
G_index = list(labels).index("Good")
B_predicted_B_rate = cm[B_index][B_index]/sum(cm[B_index][:])*100
M_predicted_M_rate = cm[M_index][M_index]/sum(cm[M_index][:])*100
G_predicted_G_rate = cm[G_index][G_index]/sum(cm[G_index][:])*100
print("SVC Sensitivity: \n Good prediction: {} \n Moderate prediction {} \n Bad prediction {}".format(G_predicted_G_rate, M_predicted_M_rate, B_predicted_B_rate)  )

In [None]:
PredictedB_is_B_rate = cm[B_index][B_index]/sum(cm[:,B_index])*100
PredictedM_is_M_rate = cm[M_index][M_index]/sum(cm[:,M_index])*100
PredictedG_is_G_rate = cm[G_index][G_index]/sum(cm[:,G_index])*100
print("SVC Precision: \n Good prediction: {} \n Moderate prediction {} \n Bad prediction {}".format(PredictedG_is_G_rate, PredictedM_is_M_rate, PredictedB_is_B_rate))


In [None]:
# confusion_matrix(comparison_df['Target'], comparison_df['prediction'], labels = labels)
labels = ["Good", "Moderate", "Bad"]
origin_cm = confusion_matrix(y_test.ravel(), SVC_best_reg.predict(x_test), labels = labels)
B_index = list(labels).index("Bad")
M_index = list(labels).index("Moderate")
G_index = list(labels).index("Good")
transform_cm =confusion_matrix(y_test.ravel(), SVC_best_reg.predict(x_test), labels = labels)
transform_cm = [[origin_cm[G_index][B_index], origin_cm[M_index][B_index], origin_cm[B_index][B_index]],
                [origin_cm[G_index][M_index], origin_cm[M_index][M_index], origin_cm[B_index][M_index]],
                [origin_cm[G_index][G_index], origin_cm[M_index][G_index], origin_cm[B_index][G_index]]
               ]


In [None]:
c = 0
for i in y_test:
    if i == "Good":
        c = c+1
c        

In [None]:
#labels = comparison_df['prediction'].unique()

cm = transform_cm
ax= plt.subplot()
sns.heatmap(cm, ax = ax, annot=True, cmap="Blues")
ax.set_xlabel('SVC labels')
ax.set_ylabel('Human labels')
ax.set_xticklabels(labels)
ax.set_yticklabels(["Bad", "Moderate", "Good"])
ax.set_title("SVC confusion matrix")

In [None]:
acc_3_t = accuracy_score(y_test.ravel(),DT_best_reg.predict(x_test))
cm_3_t = confusion_matrix(y_test.ravel(),DT_best_reg.predict(x_test))
recall = np.diag(cm_3_t) / np.sum(cm_3_t, axis = 1)
recall_3_t = np.mean(recall)
precision = np.diag(cm_3_t) / np.sum(cm_3_t, axis = 0)
precision_3_t = np.nanmean(precision)
print("acc: " + str(round(acc_3_t, 3)) + "\n" + "recall: " + str(round(recall_3_t,3)) + "\n" + "precision: " + str(round(precision_3_t,3)))

In [None]:
# confusion_matrix(comparison_df['Target'], comparison_df['prediction'], labels = labels)
labels = ["Good", "Moderate", "Bad"]
origin_cm = confusion_matrix(y_test.ravel(),DT_best_reg.predict(x_test), labels = labels)

In [None]:
### sensitivity
cm = origin_cm
B_index = list(labels).index("Bad")
M_index = list(labels).index("Moderate")
G_index = list(labels).index("Good")
B_predicted_B_rate = cm[B_index][B_index]/sum(cm[B_index][:])*100
M_predicted_M_rate = cm[M_index][M_index]/sum(cm[M_index][:])*100
G_predicted_G_rate = cm[G_index][G_index]/sum(cm[G_index][:])*100
print("DT Sensitivity: \n Good prediction: {} \n Moderate prediction {} \n Bad prediction {}".format(G_predicted_G_rate, M_predicted_M_rate, B_predicted_B_rate)  )

In [None]:
PredictedB_is_B_rate = cm[B_index][B_index]/sum(cm[:,B_index])*100
PredictedM_is_M_rate = cm[M_index][M_index]/sum(cm[:,M_index])*100
PredictedG_is_G_rate = cm[G_index][G_index]/sum(cm[:,G_index])*100
print("DT Precision: \n Good prediction: {} \n Moderate prediction {} \n Bad prediction {}".format(PredictedG_is_G_rate, PredictedM_is_M_rate, PredictedB_is_B_rate))


In [None]:
# confusion_matrix(comparison_df['Target'], comparison_df['prediction'], labels = labels)
labels = ["Good", "Moderate", "Bad"]
origin_cm = confusion_matrix(y_test.ravel(), DT_best_reg.predict(x_test), labels = labels)
B_index = list(labels).index("Bad")
M_index = list(labels).index("Moderate")
G_index = list(labels).index("Good")
transform_cm =confusion_matrix(y_test.ravel(), DT_best_reg.predict(x_test), labels = labels)
transform_cm = [[origin_cm[G_index][B_index], origin_cm[M_index][B_index], origin_cm[B_index][B_index]],
                [origin_cm[G_index][M_index], origin_cm[M_index][M_index], origin_cm[B_index][M_index]],
                [origin_cm[G_index][G_index], origin_cm[M_index][G_index], origin_cm[B_index][G_index]]
               ]


In [None]:
#labels = comparison_df['prediction'].unique()

cm = transform_cm
ax= plt.subplot()
sns.heatmap(cm, ax = ax, annot=True, cmap="Blues")
ax.set_xlabel('Human Tree labels')
ax.set_ylabel('Decision labels')
ax.set_xticklabels(labels)
ax.set_yticklabels(["Bad", "Moderate", "Good"])
ax.set_title("Decision tree confusion matrix")