# Load Kinecal Data ELEC872 Project
### Cameron Bishop | November 16th 2022

In [123]:
import pandas as pd
import numpy as np
import importlib
import matplotlib.pyplot as plt
import seaborn as sns

from lazypredict.Supervised import LazyClassifier
import xgboost
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import label_binarize, LabelEncoder #Standardization and conversion of categorical labels to binary representation
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, classification_report, plot_roc_curve

import LoadKinecalFunctions
from LoadKinecalFunctions import *
importlib.reload(LoadKinecalFunctions)
from const import User, Exercise
import models
from models import *
importlib.reload(models)



<module 'models' from 'c:\\Users\\camb7\\Code Repositories\\KINECAL Balance Assessment\\KINECAL-Balance-Assessment\\models.py'>

In [124]:
user = User.CD
excerciseList = [Exercise.QSECFS, Exercise.QSEOFS, Exercise.STS, Exercise.TS]

QSClosedDf = readKinecalFiles(excerciseList[0], user) 
QSOpenDf = readKinecalFiles(excerciseList[1], user) 
SemiTandemDf = readKinecalFiles(excerciseList[2], user) 
TandemDf = readKinecalFiles(excerciseList[3], user) 


### Seperate X and Y Dataframes

In [125]:
yDf = QSClosedDf.iloc[:, 0:9]
yDf = yDf.drop(columns='movement')

QSClosedXDf = QSClosedDf.iloc[:, 9:-1]
QSOpenXDf = QSOpenDf.iloc[:, 9:-1]
SemiTandemXDf = SemiTandemDf.iloc[:, 9:-1]
TandemXDf = TandemDf.iloc[:, 9:-1]

print(yDf)

   part_id group sex  height  weight   BMI recorded_in_the_lab  \
0       10    HA   m    1.85   77.00 22.50                   1   
1      100    NF   f    1.55   82.50 34.30                   1   
2       11    HA   f    1.57   51.50 20.90                   1   
3       12    HA   m    1.64   68.50 25.50                   1   
4       13    HA   m    1.78   85.00 26.80                   1   
..     ...   ...  ..     ...     ...   ...                 ...   
85      84   FHm   f    1.56   60.00 24.70                   0   
86      87   FHs   f    1.60   77.10 30.10                   0   
87       9    HA   m    1.69   75.00 26.30                   1   
88      92    NF   m    1.83   97.00 29.00                   0   
89      96   FHs   m    1.78   81.20 25.60                   0   

   clinically_at_risk  
0                   0  
1                   0  
2                   0  
3                   0  
4                   0  
..                ...  
85                  0  
86             

### Replace Missing Data With Mean of That Class

In [126]:
QSClosedXDf = replaceMissingValues(QSClosedXDf, yDf)
QSOpenXDf = replaceMissingValues(QSOpenXDf, yDf)
SemiTandemXDf = replaceMissingValues(SemiTandemXDf, yDf)
TandemXDf = replaceMissingValues(TandemXDf, yDf)


[ 2 14 15]
[ 2 17 29 34 71]
[ 2 14 17 31 33 60 66]
[13 14 17 29 31 33 36 37 55 60 66 74 77]


### Perform Sample Normalization

In [127]:
#print(np.asarray(QS_Closed_x_df.iloc[0,:].values))
QSClosedXNormDf = datasetNormalization(QSClosedXDf)
QSOpenXNormDf = datasetNormalization(QSOpenXDf)
SemiTandemXNormDf = datasetNormalization(SemiTandemXDf)
TandemXNormDf = datasetNormalization(TandemXDf)

### Combine Excercises Into One Dataset

In [128]:
print(QSClosedXNormDf.shape, QSOpenXNormDf.shape, SemiTandemXNormDf.shape, TandemXNormDf.shape)
combinedXNormDf = pd.concat([QSClosedXNormDf, QSOpenXNormDf, SemiTandemXNormDf, TandemXDf], axis=1)
print(combinedXNormDf.shape)

(90, 16) (90, 16) (90, 16) (90, 16)
(90, 64)


### Seperate Into Training and Testing Sets

In [129]:
xTrain, xTest, yTrain, yTest = train_test_split(combinedXNormDf, yDf, test_size=0.2, random_state=42)

print(yTrain)
print(yTest)

le = LabelEncoder()
yTest = le.fit_transform(yTest.group.values)
yTrain = le.fit_transform(yTrain.group.values)
print(yTrain)

   part_id group sex  height  weight   BMI recorded_in_the_lab  \
49      42   FHm   f    1.66   87.00 31.60                   0   
62      51    NF   f    1.58   55.30 22.20                   0   
73     701   FHs   f    1.65   85.00 31.20                   0   
69      67    NF   f    1.68   63.50 22.50                   0   
76     705   FHs   m    1.88   87.00 24.60                   0   
..     ...   ...  ..     ...     ...   ...                 ...   
20      26    HA   m    1.73   76.00 25.40                   1   
60     504    HA   m    1.71   70.00 23.90                   1   
71       7    HA   f    1.70   59.00 20.40                   1   
14     201   FHm   f    1.52   54.00 23.40                   0   
51      44   FHs   f    1.65   50.80 18.70                   0   

   clinically_at_risk  
49                  0  
62                  0  
73                  0  
69                  0  
76                  0  
..                ...  
20                  0  
60             

### Train XGBoost Model

In [135]:
tuning_params =[{
    'learning_rate': [0.3, 0.4, 0.5],
    'objective': ['multi:softmax'],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'max_depth': [2, 3, 4, 5],
    'min_child_weight': [2, 3, 4],
    'subsample': [0.6, 0.65, 0.7],
    'colsample_bytree': [0.65 ,0.7, 0.75, 0.8],
    'n_estimators': [200, 225, 250, 275],
    "missing": [-999],
    "seed": [1337]
}]
xTest = np.asarray(xTest)
xTrain = np.asarray(xTrain)
grid = GridSearchCV(xgboost.XGBClassifier(), tuning_params, cv=5, scoring='accuracy', n_jobs=-1, verbose=2, refit=True)
grid.fit(xTrain, yTrain)

best_model = grid.best_estimator_
print(grid.best_score_)
print(grid.best_params_)

# xgb_model = tune_xgboost(xTrain, yTrain, tuning_params, 2, 2)

Fitting 5 folds for each of 3456 candidates, totalling 17280 fits
0.6514285714285715
{'colsample_bytree': 0.7, 'learning_rate': 0.5, 'max_depth': 2, 'min_child_weight': 4, 'missing': -999, 'n_estimators': 225, 'objective': 'multi:softmax', 'seed': 1337, 'subsample': 0.7}


In [131]:
lazyclass = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, preds = lazyclass.fit(xTrain, xTest, yTrain, yTest)
print(models)

100%|██████████| 29/29 [00:00<00:00, 34.59it/s]

                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
BernoulliNB                        0.39               0.53    None      0.38   
LinearSVC                          0.44               0.37    None      0.45   
RidgeClassifier                    0.44               0.37    None      0.45   
RandomForestClassifier             0.44               0.37    None      0.44   
BaggingClassifier                  0.39               0.33    None      0.38   
ExtraTreesClassifier               0.44               0.32    None      0.43   
LabelPropagation                   0.39               0.32    None      0.43   
LabelSpreading                     0.39               0.32    None      0.43   
LogisticRegression                 0.39               0.32    None      0.39   
PassiveAggressiveClassifier        0.39               0.31    None      0.38   
RidgeClassifierCV                  0.39 




### Create Optimal XGBoost Model

In [None]:
#optimalParameters = [{'kernel': ['rbf'], 'gamma':['scale'],'C':[250]}]
#svc = train_svm(xTrain,yTrain.group.values.shape,optimalParameters,modelDir,modelName)

clf = svm.SVC(kernel = 'rbf' , C = 250, gamma='scale', probability=True)
clf.fit(xTrain, yTrain.group.values)

y_pred = clf.predict(xTest)


### Create Confusion Matrix and Classification Report

In [None]:
classes = ['HA','NF','FHs','FHm']
cm_python = confusion_matrix(y_true=yTest.group.values, y_pred=y_pred, labels=classes)

plt.subplots(figsize=(6,4))

group_counts = ['{0:0.0f}'.format(value) for value in
                cm_python.flatten()]
group_percentages = ['{0:.2%}'.format(value) for value in
                     cm_python.flatten()/np.sum(cm_python)]
labels = [f'{v1}\n{v2}' for v1, v2 in
          zip(group_counts,group_percentages)]
labels = np.asarray(labels).reshape(4,4)
df_cm_python = pd.DataFrame(cm_python)

sns.heatmap(df_cm_python, annot=labels, fmt='',annot_kws={"size": 16}, xticklabels=classes, yticklabels=classes) # font size
plt.title('Test Model: Standard processing w/normalization')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.yticks(rotation=0)
plt.show()


print(classification_report(y_true=yTest.group.values, y_pred=y_pred))


### Remap Labels to Be Binary Single Fallers Included as Fallers

In [None]:
yTrainBinary = binaryLabelRemapping(yTrain, True)
yTestBinary = binaryLabelRemapping(yTest, True)

### Tune XGBoost

In [None]:
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-4, 1e-5, 'scale', 'auto'], 'C': [100, 250, 500, 750, 1000]},
                        {'kernel': ['sigmoid'], 'gamma': [1e-4, 1e-5, 'scale', 'auto'], 'C': [100, 250, 500, 750, 1000]}]
modelDir = './models/'
modelName = 'tunedSVM5Fold_binarylabelsTrue'
svmModel = tune_svm(xTrain,yTrainBinary.label_binary.values, tuned_parameters, 5)

### Train XGBoost with Optimal Parameters For Binary Labels Single Fallers are Fallers

In [None]:
clf = svm.SVC(kernel = 'rbf' , C = 250, gamma='scale', probability=True)
clf.fit(xTrain, yTrainBinary.label_binary.values)

y_pred = clf.predict(xTest)

### Create Confusion Matrix For Binary Labels

In [None]:
classes = ['Faller', 'Non-Faller']
cm_python = confusion_matrix(y_true=yTestBinary.label_binary.values, y_pred=y_pred, labels=classes)

plt.subplots(figsize=(6,4))

group_counts = ['{0:0.0f}'.format(value) for value in
                cm_python.flatten()]
group_percentages = ['{0:.2%}'.format(value) for value in
                     cm_python.flatten()/np.sum(cm_python)]
labels = [f'{v1}\n{v2}' for v1, v2 in
          zip(group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
df_cm_python = pd.DataFrame(cm_python)

sns.heatmap(df_cm_python, annot=labels, fmt='',annot_kws={"size": 16}, xticklabels=classes, yticklabels=classes) # font size
plt.title('Test Model: Standard processing w/normalization')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.yticks(rotation=0)
plt.show()


print(classification_report(y_true=yTestBinary.label_binary.values, y_pred=y_pred))


### Obtain Missclassifications

In [None]:
missclassifications = yTestBinary.loc[yTestBinary.label_binary!=y_pred]
print(missclassifications)

### Remap Labels to Be Binary Single Fallers Included as Non-Fallers

In [None]:
yTrainBinary = binaryLabelRemapping(yTrain, False)
yTestBinary = binaryLabelRemapping(yTest, False)

### Tune XGBoost

In [None]:
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-4, 1e-5, 'scale', 'auto'], 'C': [100, 250, 500, 750, 1000]},
                        {'kernel': ['sigmoid'], 'gamma': [1e-4, 1e-5, 'scale', 'auto'], 'C': [100, 250, 500, 750, 1000]}]
modelDir = './models/'
modelName = 'tunedSVM5Fold_binarylabelsFalse'
svmModel = tune_svm(xTrain,yTrainBinary.label_binary.values, tuned_parameters, 5)

### Train XGBoost with Optimal Parameters Single Fallers Considered Non-Fallers

In [None]:
clf = svm.SVC(kernel = 'sigmoid' , C = 750, gamma=0.0001, probability=True)
clf.fit(xTrain, yTrainBinary.label_binary.values)

y_pred = clf.predict(xTest)

### Create Confusion Matrix For Binary Labels FHs -> Non-Fallers

In [None]:
classes = ['Faller', 'Non-Faller']
cm_python = confusion_matrix(y_true=yTestBinary.label_binary.values, y_pred=y_pred, labels=classes)

plt.subplots(figsize=(6,4))

group_counts = ['{0:0.0f}'.format(value) for value in
                cm_python.flatten()]
group_percentages = ['{0:.2%}'.format(value) for value in
                     cm_python.flatten()/np.sum(cm_python)]
labels = [f'{v1}\n{v2}' for v1, v2 in
          zip(group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
df_cm_python = pd.DataFrame(cm_python)

sns.heatmap(df_cm_python, annot=labels, fmt='',annot_kws={"size": 16}, xticklabels=classes, yticklabels=classes) # font size
plt.title('Test Model: Standard processing w/normalization')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.yticks(rotation=0)
plt.show()


print(classification_report(y_true=yTestBinary.label_binary.values, y_pred=y_pred))


### Obtain Missclassifications

In [None]:
missclassifications = yTestBinary.loc[yTestBinary.label_binary!=y_pred]
print(missclassifications)