In [150]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sklearn
import plotly.express as px
import plotly.graph_objects as go

from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier

In [151]:
# Settings
cv = 5

In [162]:
xls = pd.ExcelFile('Dataset - LBP RA.xlsx')
dataframe = pd.read_excel(xls, 'Training Dataset')
#dataframe = dataframe[(dataframe["Treatment"] == 1) | (dataframe["Treatment"] == 5)]
# dataframe = dataframe[(dataframe["Treatment"] != 5)]
dataframe_original = dataframe.copy(True)
print(dataframe.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1546 entries, 0 to 1545
Data columns (total 37 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Treatment                         1546 non-null   int64  
 1   Fever                             1512 non-null   float64
 2   Duration_of_pain                  1515 non-null   float64
 3   Sick_leave                        1546 non-null   int64  
 4   Earlier_hospitalization           1546 non-null   int64  
 5   Workoverload                      459 non-null    float64
 6   Familiy_history                   1546 non-null   int64  
 7   Depression                        1546 non-null   int64  
 8   Extremely_nervous                 1494 non-null   float64
 9   Stress                            1546 non-null   int64  
 10  Relationship_with_colleagues      979 non-null    float64
 11  Irrational_thoughts_risk_lasting  1475 non-null   float64
 12  Irrati

In [None]:
categorical_columns = ["Treatment", "Weightloss_per_year"]

boolean_columns = [
    "Fever",
    "Sick_leave",
    "Earlier_hospitalization",
    "Workoverload",
    "Familiy_history",
    "Depression",
    "Stress",
    "Uses_analgesics",
    "Uses_corticosteroids",
    "Serious_disease",
    "Neurogenic_signals",
    "Continuous_pain",
    "Nocturnal_pain",
    "Loss_muscle_strength",
    "Trauma",
    "Failure_symptoms",
    "Incoordination",
    "Paidwork",
]

ordinal_columns = [
    "Duration_of_pain",
    "Extremely_nervous",
    "Relationship_with_colleagues",
    "Irrational_thoughts_risk_lasting",
    "Irrational_thoughts_work",
    "Coping_strategy",
    "Kinesiophobia_physical_exercise",
    "Kinesiophobia_pain_stop",
    "Age",
    "neck_pain_intensity",
    "low_back_pain_intensity",
    "arm_left_pain_intensity",
    "arm_right_pain_intensity",
    "leg_left_pain_intensity",
    "leg_right_pain_intensity",
    "working_ability",
]

value_columns = ["Decreased_mobility"]

In [153]:
# Mapping integer colum
dataframe[value_columns] = dataframe[value_columns].astype("Int64")

# Mapping categories and boolean columns
dataframe[categorical_columns] = dataframe[categorical_columns].astype("category")
dataframe[boolean_columns] = dataframe[boolean_columns].astype("boolean")

# Mapping ordinal columns 
age_mapping = {
    "0-19": 0,
    "20-29": 1,
    "30-39": 2,
    "40-49": 3,
    "50-59": 4,
    "60-69": 5,
    "70-79":6,
    ">=80": 7,
}

dataframe["Age"] = dataframe["Age"].replace(age_mapping)

for column in ordinal_columns:
    dataframe[[column]] = dataframe[[column]].astype("Int64")
    dataframe[column].fillna(-1, inplace=True)
    dataframe[column] = pd.Categorical(dataframe[column], categories=sorted(dataframe[column].unique()), ordered=True)

In [154]:
missing_percentages = dataframe_original.isnull().mean()
columns_to_remove = missing_percentages[missing_percentages > 0.3].index.tolist()
dataframe = dataframe.drop(columns=columns_to_remove)

categorical_columns =  [col for col in categorical_columns if col not in columns_to_remove]
ordinal_columns = [col for col in ordinal_columns if col not in columns_to_remove]
boolean_columns = [col for col in boolean_columns if col not in columns_to_remove]
value_columns = [col for col in value_columns if col not in columns_to_remove]

print(dataframe.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1546 entries, 0 to 1545
Data columns (total 33 columns):
 #   Column                            Non-Null Count  Dtype   
---  ------                            --------------  -----   
 0   Treatment                         1546 non-null   category
 1   Fever                             1512 non-null   boolean 
 2   Duration_of_pain                  1546 non-null   category
 3   Sick_leave                        1546 non-null   boolean 
 4   Earlier_hospitalization           1546 non-null   boolean 
 5   Familiy_history                   1546 non-null   boolean 
 6   Depression                        1546 non-null   boolean 
 7   Extremely_nervous                 1546 non-null   category
 8   Stress                            1546 non-null   boolean 
 9   Irrational_thoughts_risk_lasting  1546 non-null   category
 10  Irrational_thoughts_work          1546 non-null   category
 11  Coping_strategy                   1546 non-null   catego

In [155]:
categorical_columns.remove("Treatment")

X_encoded = pd.get_dummies(dataframe[categorical_columns], drop_first=True)
X = pd.concat(
    [dataframe[value_columns + ordinal_columns + boolean_columns], X_encoded], axis=1
)
X_clean = X.dropna()

y = dataframe["Treatment"]
y_clean = y[X.index.isin(X_clean.index)]

minority_data = dataframe[(dataframe["Treatment"] != 1) & (dataframe["Treatment"] != 5)]
minority_data = pd.concat([minority_data] * 3)
minority_X_encoded = pd.get_dummies(minority_data[categorical_columns], drop_first=True)
minority_X = pd.concat(
    [
        minority_data[value_columns + ordinal_columns + boolean_columns],
        minority_X_encoded,
    ],
    axis=1,
)
minority_X_clean = minority_X.dropna()

minority_y = minority_data["Treatment"]
minority_y_clean = minority_y[minority_X.index.isin(minority_X_clean.index)]


X_Train = pd.concat([X_clean, minority_X_clean], axis=0)
y_Train = pd.concat([y_clean, minority_y_clean], axis=0)

print(y_clean.info())
print(y_Train.head())

<class 'pandas.core.series.Series'>
Index: 1216 entries, 1 to 1545
Series name: Treatment
Non-Null Count  Dtype   
--------------  -----   
1216 non-null   category
dtypes: category(1)
memory usage: 10.9 KB
None
1    3
2    1
5    1
6    1
8    1
Name: Treatment, dtype: category
Categories (4, int64): [1, 2, 3, 4]


In [156]:
# Decision Tree Model
param_grid = {
    "max_depth": [1, 2, 3, 4, 5, 10],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}
grid_search = GridSearchCV(
    DecisionTreeClassifier(), param_grid, cv=cv, scoring="accuracy"
)
grid_search.fit(X_Train, y_Train)
best_params = grid_search.best_params_
# print("Best Score:", grid_search.best_score_)

best_max_depth = grid_search.best_params_["max_depth"]
best_min_samples_split = grid_search.best_params_["min_samples_split"]
best_min_samples_leaf = grid_search.best_params_["min_samples_leaf"]

tree_model = DecisionTreeClassifier(
    max_depth=best_max_depth,
    min_samples_split=best_min_samples_split,
    min_samples_leaf=best_min_samples_leaf,
)

tree_model.fit(X_Train, y_Train)
tree_predicted = cross_val_predict(tree_model, X, y, cv=5)

# Evaluation
print("Simple k=" + str(cv) + " K fold CV")
print("Decision Tree Model:")
print(classification_report(y, tree_predicted))

Simple k=5 K fold CV
Decision Tree Model:
              precision    recall  f1-score   support

           1       0.45      0.42      0.43       659
           2       0.16      0.12      0.14       160
           3       0.14      0.06      0.09        65
           4       0.00      0.00      0.00        13
           5       0.44      0.53      0.48       649

    accuracy                           0.41      1546
   macro avg       0.24      0.23      0.23      1546
weighted avg       0.40      0.41      0.40      1546



In [157]:
# class_labels = [str(label) for label in tree_model.classes_]
# plt.figure(figsize=(135, 90))
# plot_tree(tree_model,max_depth=5, feature_names=X.columns, class_names=class_labels, filled=True, rounded=True)
# plt.show()

# Note that colors are based on the majority class in a leaf (with intensity being an indicator for how large this majority is over the others).

In [158]:
# Histogram-based Gradient Boosting Classification Tree Model
boosting_model = HistGradientBoostingClassifier(max_depth=5)
boosting_model.fit(X_Train, y_Train)
boosting_predicted = cross_val_predict(boosting_model, X, y, cv=cv)

# Evaluation
print("Simple k=" + str(cv) + " K fold CV")
print("Histogram-based Gradient Boosting Classification Tree Model:")
print(classification_report(y, boosting_predicted))

Simple k=5 K fold CV
Histogram-based Gradient Boosting Classification Tree Model:
              precision    recall  f1-score   support

           1       0.46      0.47      0.46       659
           2       0.17      0.07      0.10       160
           3       0.07      0.02      0.03        65
           4       0.00      0.00      0.00        13
           5       0.45      0.55      0.50       649

    accuracy                           0.44      1546
   macro avg       0.23      0.22      0.22      1546
weighted avg       0.40      0.44      0.42      1546



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [159]:
# Random forest model
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10)
rf_model.fit(X_Train, y_Train)
rf_predicted = cross_val_predict(rf_model, X_clean, y_clean, cv=cv)

# Evaluation
print("Simple k=" + str(cv) + " K fold CV")
print("Random forest model:")
print(classification_report(y_clean, rf_predicted))

Simple k=5 K fold CV
Random forest model:
              precision    recall  f1-score   support

           1       0.47      0.58      0.52       535
           2       1.00      0.01      0.01       133
           3       1.00      0.02      0.04        46
           4       0.00      0.00      0.00        10
           5       0.46      0.51      0.48       492

    accuracy                           0.46      1216
   macro avg       0.58      0.22      0.21      1216
weighted avg       0.54      0.46      0.43      1216



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [160]:
# KNN model
knn_model = KNeighborsClassifier(n_neighbors=7)  
knn_model.fit(X_Train, y_Train)
knn_predicted = cross_val_predict(knn_model, X_clean, y_clean, cv=cv)

# Evaluation
print("Simple k=" + str(cv) + " K fold CV")
print("K-Nearest Neighbors model:")
print(classification_report(y_clean, knn_predicted))

Simple k=5 K fold CV
K-Nearest Neighbors model:
              precision    recall  f1-score   support

           1       0.46      0.60      0.52       535
           2       0.10      0.02      0.04       133
           3       0.00      0.00      0.00        46
           4       0.00      0.00      0.00        10
           5       0.42      0.41      0.42       492

    accuracy                           0.43      1216
   macro avg       0.19      0.21      0.19      1216
weighted avg       0.38      0.43      0.40      1216



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [161]:
# Ensemble model - Duplication
ensemble_model = VotingClassifier(estimators=[
    ('decision_tree', tree_model),
    ('gradient_boosting', boosting_model),
    ('random_forest', rf_model)
    ,('knn',knn_model)
], voting='hard')

ensemble_predicted = cross_val_predict(ensemble_model, X_clean, y_clean, cv=cv)

print("Simple k=" + str(cv) + " K fold CV")
print("Ensemble Model:")
print(classification_report(y_clean, ensemble_predicted))

KeyboardInterrupt: 

In [None]:
# Ensemble model - No duplication
# ------------------------------------------------------------------------
# Decision Tree Model
param_grid = {
    "max_depth": [1, 2, 3, 4, 5, 10],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}
grid_search = GridSearchCV(
    DecisionTreeClassifier(), param_grid, cv=cv, scoring="accuracy"
)
grid_search.fit(X, y)
best_params = grid_search.best_params_
# print("Best Score:", grid_search.best_score_)

best_max_depth = grid_search.best_params_["max_depth"]
best_min_samples_split = grid_search.best_params_["min_samples_split"]
best_min_samples_leaf = grid_search.best_params_["min_samples_leaf"]

tree_model = DecisionTreeClassifier(
    max_depth=best_max_depth,
    min_samples_split=best_min_samples_split,
    min_samples_leaf=best_min_samples_leaf,
)

tree_model.fit(X_clean, y_clean)
# ------------------------------------------------------------------------
# Random forest model
rf_model = RandomForestClassifier(max_depth=5)
rf_model.fit(X_clean, y_clean)
# ------------------------------------------------------------------------
# Histogram-based Gradient Boosting Classification Tree Model
boosting_model = HistGradientBoostingClassifier(max_depth=5)
boosting_model.fit(X_clean, y_clean)
# ------------------------------------------------------------------------

ensemble_model = VotingClassifier(estimators=[
    ('decision_tree', tree_model),
    ('gradient_boosting', boosting_model),
    ('random_forest', rf_model)
], voting='hard')

ensemble_predicted = cross_val_predict(ensemble_model, X_clean, y_clean, cv=cv)

print("Simple k=" + str(cv) + " K fold CV")
print("Ensemble Model:")
print(classification_report(y_clean, ensemble_predicted))

Simple k=5 K fold CV
Ensemble Model:
              precision    recall  f1-score   support

           1       0.44      0.55      0.49       535
           2       0.50      0.01      0.01       133
           3       0.33      0.02      0.04        46
           4       0.00      0.00      0.00        10
           5       0.42      0.47      0.44       492

    accuracy                           0.43      1216
   macro avg       0.34      0.21      0.20      1216
weighted avg       0.43      0.43      0.40      1216



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
