# **Improving Kyphosis Diagnosis with ML/DL: Classifying Patients as Having Kyphosis or Not**

In [1]:
!pip install xgboost lazypredict -q

## **Problem Statement**

Kyphosis is a spinal condition that can have significant impacts on patient health.In his notebook We aim to develop a machine learning model that can accurately classify patients as having kyphosis or not based on various features.
<center>

<img src="images/Kyphosis.png" width="500"/>

</center>

## Dataset Overview

*   kyphosis dataset has 81 rows and 4 columns :

    1.   Kyphosis : Target present/absent
    2.   Age : the number of months
    3.   Number : the number of vertebrae involved
    4.   Start: the number of the first vertebra operated on.

# **Importing Libraries and Loading the dataset**

In [48]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
import itertools
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from lazypredict.Supervised import LazyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV

%matplotlib inline

# Utils

In [53]:
def plot_confusion_matrix_plotly(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """

    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
  
    x_labels = ['Predicted Negative', 'Predicted Positive']
    y_labels = ['Actual Negative', 'Actual Positive']
    colorscale = [[0, '#FFFFFF'], [1, '#4B0082']]

    fig = ff.create_annotated_heatmap(
        z=cm,
        x=x_labels,
        y=y_labels,
        showscale=True,
        colorscale=colorscale,
        reversescale=False,
        font_colors=['#000000', '#FFFFFF'],
    )
    # Set the title and axis labels
    fig.update_layout(
        title='Confusion Matrix : Random Forest Classifier',
        xaxis_title='Predicted Label',
        yaxis_title='True Label',
    )
    fig.show()

# Exploratory Data Analysis

In [6]:
df = pd.read_csv('kyphosis.csv')
df = df[['Age', 'Number', 'Start', 'Kyphosis']]
df.head()

Unnamed: 0,Age,Number,Start,Kyphosis
0,71,3,5,absent
1,158,3,14,absent
2,128,4,5,present
3,2,5,1,absent
4,1,4,15,absent


In [7]:
df['Kyphosis'] = df['Kyphosis'].map({'absent':0, 'present':1})
df.head()

Unnamed: 0,Age,Number,Start,Kyphosis
0,71,3,5,0
1,158,3,14,0
2,128,4,5,1
3,2,5,1,0
4,1,4,15,0


### Dataset description

In [8]:
df.describe()

Unnamed: 0,Age,Number,Start,Kyphosis
count,81.0,81.0,81.0,81.0
mean,83.654321,4.049383,11.493827,0.209877
std,58.104251,1.619423,4.883962,0.409758
min,1.0,2.0,1.0,0.0
25%,26.0,3.0,9.0,0.0
50%,87.0,4.0,13.0,0.0
75%,130.0,5.0,16.0,0.0
max,206.0,10.0,18.0,1.0


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81 entries, 0 to 80
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   Age       81 non-null     int64
 1   Number    81 non-null     int64
 2   Start     81 non-null     int64
 3   Kyphosis  81 non-null     int64
dtypes: int64(4)
memory usage: 2.7 KB


In [10]:
# check for missing values
df.isnull().sum()

Age         0
Number      0
Start       0
Kyphosis    0
dtype: int64

### Data preprocessing

#### Visualizing Key Features in the dataset

In [11]:
# visualize the correlation between the features and the target with plotly
import plotly.express as px
fig = px.scatter_matrix(df, dimensions=['Age', 'Number', 'Start'], color='Kyphosis')
fig.show()

In [12]:
#calculate the correlation between the features
correlation = df.corr()
fig = px.imshow(correlation, text_auto=True)
fig.show()

In [13]:
fig = px.scatter_3d(df, x='Age', y='Number', z='Start', color='Kyphosis', color_continuous_scale='Viridis')
fig.show()

In [14]:
# boxplots of numerical features for outlier detection using plotly
fig = px.box(df, x='Age', color='Kyphosis')
fig.show()

fig = px.box(df, x='Number', color='Kyphosis')
fig.show()

fig = px.box(df, x='Start', color='Kyphosis')
fig.show()

In [22]:
X = df.drop('Kyphosis', axis=1)
y = df['Kyphosis']

## Data Augmentation

In [26]:
def balance_dataset(X, y):
    sm = SMOTE(random_state=42)
    X_res, y_res = sm.fit_resample(X, y)
    return X_res, y_res

X_res, y_res = balance_dataset(X, y) 

# check the balance of the dataset
y_res.value_counts()

0    64
1    64
Name: Kyphosis, dtype: int64

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, stratify=y_res, test_size=0.2, random_state=42)

# Modelling 

## Lazy Predict

In [37]:
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_res, X_test, y_res, y_test)

100%|██████████| 29/29 [00:02<00:00, 10.05it/s]


In [38]:
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AdaBoostClassifier,1.0,1.0,1.0,1.0,0.32
LabelPropagation,1.0,1.0,1.0,1.0,0.05
XGBClassifier,1.0,1.0,1.0,1.0,0.19
DecisionTreeClassifier,1.0,1.0,1.0,1.0,0.06
RandomForestClassifier,1.0,1.0,1.0,1.0,0.54
ExtraTreeClassifier,1.0,1.0,1.0,1.0,0.06
ExtraTreesClassifier,1.0,1.0,1.0,1.0,0.46
BaggingClassifier,1.0,1.0,1.0,1.0,0.12
LabelSpreading,1.0,1.0,1.0,1.0,0.08
SVC,0.96,0.96,0.96,0.96,0.04


In [39]:
predictions

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AdaBoostClassifier,1.0,1.0,1.0,1.0,0.32
LabelPropagation,1.0,1.0,1.0,1.0,0.05
XGBClassifier,1.0,1.0,1.0,1.0,0.19
DecisionTreeClassifier,1.0,1.0,1.0,1.0,0.06
RandomForestClassifier,1.0,1.0,1.0,1.0,0.54
ExtraTreeClassifier,1.0,1.0,1.0,1.0,0.06
ExtraTreesClassifier,1.0,1.0,1.0,1.0,0.46
BaggingClassifier,1.0,1.0,1.0,1.0,0.12
LabelSpreading,1.0,1.0,1.0,1.0,0.08
SVC,0.96,0.96,0.96,0.96,0.04


## Random Forest

In [41]:
rfc = RandomForestClassifier()
rfc.fit(X_res, y_res)

In [46]:
rfc_pred = rfc.predict(X_test)

print(classification_report(y_test, rfc_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      1.00      1.00        13

    accuracy                           1.00        26
   macro avg       1.00      1.00      1.00        26
weighted avg       1.00      1.00      1.00        26



In [47]:
plot_confusion_matrix_plotly(y_test, rfc_pred, classes=np.array(['absent', 'present']), normalize=False,
    title='Confusion matrix, without normalization')

Confusion matrix, without normalization
[[13  0]
 [ 0 13]]


### RandomizedSearchCV

In [49]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 250, num = 10)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
                'min_samples_leaf': min_samples_leaf,
                'bootstrap': bootstrap}

rfc_random = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

rfc_random.fit(X_res, y_res)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [50]:
# print the best parameters, the best score and the best estimator of the model after HPO
print("Best parameters : ",rfc_random.best_params_)
print("Best score : ",rfc_random.best_score_)
print("Best estimator",rfc_random.best_estimator_)

Best parameters :  {'n_estimators': 205, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 30, 'bootstrap': False}
Best score :  0.8905500184569952
Best estimator RandomForestClassifier(bootstrap=False, max_depth=30, min_samples_leaf=4,
                       min_samples_split=5, n_estimators=205)


In [54]:
rfc_random_pred = rfc_random.predict(X_test)
print(classification_report(y_test, rfc_random_pred))
plot_confusion_matrix_plotly(y_test, rfc_pred, classes=np.array(['absent', 'present']), normalize=False,
    title='Confusion matrix, without normalization')

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      1.00      1.00        13

    accuracy                           1.00        26
   macro avg       1.00      1.00      1.00        26
weighted avg       1.00      1.00      1.00        26

Confusion matrix, without normalization


## XGBoost 

In [56]:
# implement xgboost classifier 
from xgboost import XGBClassifier

xgb = XGBClassifier()
xgb.fit(X_res, y_res)

xgb_pred = xgb.predict(X_test)

print(classification_report(y_test, xgb_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      1.00      1.00        13

    accuracy                           1.00        26
   macro avg       1.00      1.00      1.00        26
weighted avg       1.00      1.00      1.00        26

[[13  0]
 [ 0 13]]


In [57]:
plot_confusion_matrix_plotly(y_test, xgb_pred, classes=np.array(['absent', 'present']), normalize=False,
    title='Confusion matrix, without normalization')

Confusion matrix, without normalization


## AdaBoost

In [59]:
from sklearn.ensemble import AdaBoostClassifier
adaclf = AdaBoostClassifier(n_estimators=100, random_state=0)
adaclf.fit(X_res, y_res)

ada_pred = adaclf.predict(X_test)

print(classification_report(y_test, ada_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      1.00      1.00        13

    accuracy                           1.00        26
   macro avg       1.00      1.00      1.00        26
weighted avg       1.00      1.00      1.00        26



In [60]:
plot_confusion_matrix_plotly(y_test, xgb_pred, classes=np.array(['absent', 'present']), normalize=False,
    title='Confusion matrix, without normalization')

Confusion matrix, without normalization
