# Load Kinecal Data ELEC872 Project
### Cameron Bishop | November 16th 2022

In [53]:
import pandas as pd
import numpy as np
import importlib
import matplotlib.pyplot as plt
import seaborn as sns
import time

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope
from hyperopt.pyll.stochastic import sample
import random

import xgboost
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder #Standardization and conversion of categorical labels to binary representation
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

import LoadKinecalFunctions
from LoadKinecalFunctions import *
importlib.reload(LoadKinecalFunctions)
from const import User, Exercise
import models
from models import *
importlib.reload(models)



<module 'models' from 'c:\\Users\\camb7\\Code Repositories\\KINECAL Balance Assessment\\KINECAL-Balance-Assessment\\models.py'>

In [54]:
user = User.CD
excerciseList = [Exercise.QSECFS, Exercise.QSEOFS, Exercise.STS, Exercise.TS]

QSClosedDf = readKinecalFiles(excerciseList[0], user) 
QSOpenDf = readKinecalFiles(excerciseList[1], user) 
SemiTandemDf = readKinecalFiles(excerciseList[2], user) 
TandemDf = readKinecalFiles(excerciseList[3], user) 


### Seperate X and Y Dataframes

In [55]:
yDf = QSClosedDf.iloc[:, 0:9]
yDf = yDf.drop(columns='movement')

QSClosedXDf = QSClosedDf.iloc[:, 9:-1]
QSOpenXDf = QSOpenDf.iloc[:, 9:-1]
SemiTandemXDf = SemiTandemDf.iloc[:, 9:-1]
TandemXDf = TandemDf.iloc[:, 9:-1]

print(yDf)

   part_id group sex  height  weight   BMI recorded_in_the_lab  \
0       10    HA   m    1.85    77.0  22.5                   1   
1      100    NF   f    1.55    82.5  34.3                   1   
2       11    HA   f    1.57    51.5  20.9                   1   
3       12    HA   m    1.64    68.5  25.5                   1   
4       13    HA   m    1.78    85.0  26.8                   1   
..     ...   ...  ..     ...     ...   ...                 ...   
85      84   FHm   f    1.56    60.0  24.7                   0   
86      87   FHs   f    1.60    77.1  30.1                   0   
87       9    HA   m    1.69    75.0  26.3                   1   
88      92    NF   m    1.83    97.0  29.0                   0   
89      96   FHs   m    1.78    81.2  25.6                   0   

   clinically_at_risk  
0                   0  
1                   0  
2                   0  
3                   0  
4                   0  
..                ...  
85                  0  
86             

### Replace Missing Data With Mean of That Class

In [56]:
QSClosedXDf = replaceMissingValues(QSClosedXDf, yDf)
QSOpenXDf = replaceMissingValues(QSOpenXDf, yDf)
SemiTandemXDf = replaceMissingValues(SemiTandemXDf, yDf)
TandemXDf = replaceMissingValues(TandemXDf, yDf)


[ 2 14 15]
[ 2 17 29 34 71]
[ 2 14 17 31 33 60 66]
[13 14 17 29 31 33 36 37 55 60 66 74 77]


### Perform Sample Normalization

In [57]:
#print(np.asarray(QS_Closed_x_df.iloc[0,:].values))
QSClosedXNormDf = datasetNormalization(QSClosedXDf)
QSOpenXNormDf = datasetNormalization(QSOpenXDf)
SemiTandemXNormDf = datasetNormalization(SemiTandemXDf)
TandemXNormDf = datasetNormalization(TandemXDf)

### Combine Excercises Into One Dataset

In [58]:
print(QSClosedXNormDf.shape, QSOpenXNormDf.shape, SemiTandemXNormDf.shape, TandemXNormDf.shape)
combinedXNormDf = pd.concat([QSClosedXNormDf, QSOpenXNormDf, SemiTandemXNormDf, TandemXDf], axis=1)
print(combinedXNormDf.shape)

(90, 16) (90, 16) (90, 16) (90, 16)
(90, 64)


### Seperate Into Training and Testing Sets

In [59]:
xTrain, xTest, yTrain, yTest = train_test_split(combinedXNormDf, yDf, test_size=0.2, random_state=42)

le = LabelEncoder()

yTest_orig = yTest
yTrain_orig = yTrain

xTest_orig = xTest
xTrain_orig = xTrain

yTest = le.fit_transform(yTest.group.values)
yTrain = le.fit_transform(yTrain.group.values)

X = np.asarray(combinedXNormDf)
y = np.asarray(le.fit_transform(yDf.group.values))

xTest = np.asarray(xTest)
xTrain = np.asarray(xTrain) 



### Train XGBoost Model

In [71]:

param_space = {
    'eta': hp.loguniform('eta', np.log(0.01), np.log(1)),
    'tree_method': hp.choice('tree_method', ['exact', 'gpu_hist']),
    'objective': hp.choice('objective', ['multi:softmax', 'multi:softprob']),
    'max_depth': scope.int(hp.quniform("max_depth", 3, 18, 1)),
    'n_estimators': scope.int(hp.quniform("n_estimators", 5, 200, 1)),
    'gamma': hp.uniform('gamma', 1,9),
    'alpha' : hp.quniform('reg_alpha', 40,180,1),
    'lambda' : hp.uniform('reg_lambda', 0,1),
    'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
    'min_child_weight' : scope.int(hp.quniform('min_child_weight', 0, 10, 1)),
}
def objective(params):
        clf = xgboost.XGBClassifier(**params)
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        score = cross_val_score(clf, xTrain, yTrain, cv=skf).mean()
        return {'loss': -score, 'status': STATUS_OK}

trials = Trials()
best_param = fmin(fn=objective, 
                    space=param_space, 
                    algo=tpe.suggest, 
                    max_evals=200, 
                    trials=trials)
print(f"Best parameters: {best_param}")

100%|██████████| 200/200 [05:26<00:00,  1.63s/trial, best loss: -0.11047619047619046]
Best parameters: {'colsample_bytree': 0.581073142624591, 'eta': 0.07308961138214025, 'gamma': 8.746543148247016, 'max_depth': 14.0, 'min_child_weight': 9.0, 'n_estimators': 150.0, 'objective': 1, 'reg_alpha': 125.0, 'reg_lambda': 0.3590171171629857, 'tree_method': 1}


### Create Optimal XGBoost Model

In [45]:
#optimalParameters = [{'kernel': ['rbf'], 'gamma':['scale'],'C':[250]}]
#svc = train_svm(xTrain,yTrain.group.values.shape,optimalParameters,modelDir,modelName)

clf = xgb.XGBClassifier(**best_hyperparams)
clf.fit(xTrain, yTrain)

y_pred = clf.predict(xTest)


XGBoostError: Invalid Parameter format for max_depth expect int but value='7.0'

### Create Confusion Matrix and Classification Report

In [None]:
classes = ['HA','NF','FHs','FHm']
cm_python = confusion_matrix(y_true=yTest, y_pred=y_pred, labels=classes)

plt.subplots(figsize=(6,4))

group_counts = ['{0:0.0f}'.format(value) for value in
                cm_python.flatten()]
group_percentages = ['{0:.2%}'.format(value) for value in
                     cm_python.flatten()/np.sum(cm_python)]
labels = [f'{v1}\n{v2}' for v1, v2 in
          zip(group_counts,group_percentages)]
labels = np.asarray(labels).reshape(4,4)
df_cm_python = pd.DataFrame(cm_python)

sns.heatmap(df_cm_python, annot=labels, fmt='',annot_kws={"size": 16}, xticklabels=classes, yticklabels=classes) # font size
plt.title('Test Model: Standard processing w/normalization')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.yticks(rotation=0)
plt.show()


print(classification_report(y_true=yTest.group.values, y_pred=y_pred))


### Remap Labels to Be Binary Single Fallers Included as Fallers

In [None]:
yTrainBinary = binaryLabelRemapping(yTrain, True)
yTestBinary = binaryLabelRemapping(yTest, True)

### Tune XGBoost

In [None]:
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-4, 1e-5, 'scale', 'auto'], 'C': [100, 250, 500, 750, 1000]},
                        {'kernel': ['sigmoid'], 'gamma': [1e-4, 1e-5, 'scale', 'auto'], 'C': [100, 250, 500, 750, 1000]}]
modelDir = './models/'
modelName = 'tunedSVM5Fold_binarylabelsTrue'
svmModel = tune_svm(xTrain,yTrainBinary.label_binary.values, tuned_parameters, 5)

### Train XGBoost with Optimal Parameters For Binary Labels Single Fallers are Fallers

In [None]:
clf = svm.SVC(kernel = 'rbf' , C = 250, gamma='scale', probability=True)
clf.fit(xTrain, yTrainBinary.label_binary.values)

y_pred = clf.predict(xTest)

### Create Confusion Matrix For Binary Labels

In [None]:
classes = ['Faller', 'Non-Faller']
cm_python = confusion_matrix(y_true=yTestBinary.label_binary.values, y_pred=y_pred, labels=classes)

plt.subplots(figsize=(6,4))

group_counts = ['{0:0.0f}'.format(value) for value in
                cm_python.flatten()]
group_percentages = ['{0:.2%}'.format(value) for value in
                     cm_python.flatten()/np.sum(cm_python)]
labels = [f'{v1}\n{v2}' for v1, v2 in
          zip(group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
df_cm_python = pd.DataFrame(cm_python)

sns.heatmap(df_cm_python, annot=labels, fmt='',annot_kws={"size": 16}, xticklabels=classes, yticklabels=classes) # font size
plt.title('Test Model: Standard processing w/normalization')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.yticks(rotation=0)
plt.show()


print(classification_report(y_true=yTestBinary.label_binary.values, y_pred=y_pred))


### Obtain Missclassifications

In [None]:
missclassifications = yTestBinary.loc[yTestBinary.label_binary!=y_pred]
print(missclassifications)

### Remap Labels to Be Binary Single Fallers Included as Non-Fallers

In [None]:
yTrainBinary = binaryLabelRemapping(yTrain, False)
yTestBinary = binaryLabelRemapping(yTest, False)

### Tune XGBoost

In [None]:
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-4, 1e-5, 'scale', 'auto'], 'C': [100, 250, 500, 750, 1000]},
                        {'kernel': ['sigmoid'], 'gamma': [1e-4, 1e-5, 'scale', 'auto'], 'C': [100, 250, 500, 750, 1000]}]
modelDir = './models/'
modelName = 'tunedSVM5Fold_binarylabelsFalse'
svmModel = tune_svm(xTrain,yTrainBinary.label_binary.values, tuned_parameters, 5)

### Train XGBoost with Optimal Parameters Single Fallers Considered Non-Fallers

In [None]:
clf = svm.SVC(kernel = 'sigmoid' , C = 750, gamma=0.0001, probability=True)
clf.fit(xTrain, yTrainBinary.label_binary.values)

y_pred = clf.predict(xTest)

### Create Confusion Matrix For Binary Labels FHs -> Non-Fallers

In [None]:
classes = ['Faller', 'Non-Faller']
cm_python = confusion_matrix(y_true=yTestBinary.label_binary.values, y_pred=y_pred, labels=classes)

plt.subplots(figsize=(6,4))

group_counts = ['{0:0.0f}'.format(value) for value in
                cm_python.flatten()]
group_percentages = ['{0:.2%}'.format(value) for value in
                     cm_python.flatten()/np.sum(cm_python)]
labels = [f'{v1}\n{v2}' for v1, v2 in
          zip(group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
df_cm_python = pd.DataFrame(cm_python)

sns.heatmap(df_cm_python, annot=labels, fmt='',annot_kws={"size": 16}, xticklabels=classes, yticklabels=classes) # font size
plt.title('Test Model: Standard processing w/normalization')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.yticks(rotation=0)
plt.show()


print(classification_report(y_true=yTestBinary.label_binary.values, y_pred=y_pred))


### Obtain Missclassifications

In [None]:
missclassifications = yTestBinary.loc[yTestBinary.label_binary!=y_pred]
print(missclassifications)