In [1]:
from cashe import get_train_test_split
from src.model_train import ModelTrainer, encode_label
from src.data_application import train_columns
import pandas as pd

In [29]:
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

def round_two(val):
    return round(val,2)

def encode_label(labels:list):
    out = []
    for i in range(len(labels)):
        if labels[i] == 'approved':
            out.append(1)
        else:
            out.append(0)
    return out

class ModelTrainer:
    def __init__(self, X_train, y_train, X_test, y_test):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test

    def _train_sgd_classifier(self):
        self.sgd_model = SGDClassifier()
        self.sgd_model.fit(self.X_train, self.y_train)
    def _train_decision_tree(self):
        self.dt_model = DecisionTreeClassifier()
        self.dt_model.fit(self.X_train, self.y_train)

    def _train_neural_network(self):
        self.nn_model = MLPClassifier()
        self.nn_model.fit(self.X_train, self.y_train)

    def _train_lightgbm(self):
        self.lgbm_model = LGBMClassifier()
        self.lgbm_model.fit(self.X_train, self.y_train)

    def _train_xgboost_classifier(self):
        self.xgb_model = xgb.XGBClassifier()
        self.xgb_model.fit(self.X_train, self.y_train)

    def _get_prediction_metrics(self,model):
        y_pred = model.predict(self.X_test)
        mod_accuracy = accuracy_score(self.y_test, y_pred)
        mod_precision = precision_score(self.y_test, y_pred)
        mod_recall = recall_score(self.y_test, y_pred)

        dict_metrics = {
            "Accuracy": round_two(mod_accuracy),
            "Precision": round_two(mod_precision),
            "Recall": round_two(mod_recall)}

        return dict_metrics

    def train_models(self):
        self._train_lightgbm()
        self._train_decision_tree()
        self._train_neural_network()
        self._train_sgd_classifier()
        self._train_xgboost_classifier()

        print('\n\nLightGBM, Decision Tree, SGD and Neural Network are trained on dataset.')

    def evaluate_models(self):
        dt_dict   = self._get_prediction_metrics(self.dt_model)
        lgbm_dict = self._get_prediction_metrics(self.lgbm_model)
        nn_dict   = self._get_prediction_metrics(self.nn_model)
        sgd_dict  = self._get_prediction_metrics(self.sgd_model)
        xgb_dict  = self._get_prediction_metrics(self.xgb_model)

        return {
            "Decision Tree" : dt_dict,
            "LightGBM" : lgbm_dict,
            "SGD Classifier" : sgd_dict,
            "XGBoost" : xgb_dict,
            "Neural Network" : nn_dict
        }

    def get_decision_tree_feature_importance(self):
        return self.dt_model.feature_importances_

    def _get_lightgbm_feature_importance_sub(self):
        return self.lgbm_model.feature_importances_

    def _normalize_feature_importance(self, importance_values):
        total_importance = np.sum(importance_values)
        normalized_importance = importance_values / total_importance
        return normalized_importance

    def get_lightgbm_feature_importance(self):
        lgbm_feature_importance = self._get_lightgbm_feature_importance_sub()
        normalized_importance = self._normalize_feature_importance(lgbm_feature_importance)
        return normalized_importance

    def get_neural_network_feature_importance(self):

        coefficients = self.nn_model.coefs_[0]
        absolute_weights = np.abs(coefficients)

        total_absolute_weight = np.sum(absolute_weights)
        normalized_importance = absolute_weights / total_absolute_weight * 100
        return normalized_importance

    def get_sgd_classifier_feature_importance(self):
        coefficients = self.sgd_model.coef_
        absolute_weights = np.abs(coefficients)

        total_absolute_weight = np.sum(absolute_weights)
        normalized_importance = absolute_weights / total_absolute_weight *100

        return normalized_importance

    def get_xgboost_feature_importance(self):
        return self.xgb_model.feature_importances_

    def get_feature_importance(self):
        dt_feats = self.get_decision_tree_feature_importance()
        lgbm_feats = self.get_lightgbm_feature_importance()
        dnn_feats = self.get_neural_network_feature_importance()
        sgd_feats = self.get_sgd_classifier_feature_importance()
        xgb_feats = self.get_xgboost_feature_importance()

        return dt_feats, lgbm_feats, dnn_feats,sgd_feats, xgb_feats

In [30]:
df_train, df_test = get_train_test_split()

In [32]:
df_train.head(3)

Unnamed: 0,item_CreatedDate,transaction_RequestId,transaction_PatientAge,transaction_PatientEnGender,item_NameEn,item_Price,item_Status,item_Sequence,item_RequestQuantity,item_ResponseQuantity,transaction_DiagnosisIds,transaction_PhysicianIds,item_ResponseState
0,2023-09,8b577a2f-5bae-4781-a591-1651916efd53,55,0,74,90.0,6,1,1.0,0.0,9,",299,299",rejected
1,2023-10,12c269b4-fa50-4bdf-9715-67765daa8741,27,0,79,242.0,2,4,1.0,1.0,1,",196,196,196",approved
2,2024-02,645fa618-e1f3-42b5-a3ba-6e351e138b0a,33,0,45,151.67,2,2,1.0,1.0,9,",340",approved


In [33]:
X_train = df_train[train_columns[:-1]];  y_train = df_train[train_columns[-1]]
X_test = df_test[train_columns[:-1]];  y_test = df_test[train_columns[-1]]

In [35]:
y_test = encode_label(y_test)
y_train = encode_label(y_train)
y_test[0]

1

In [36]:
model_runner = ModelTrainer(X_train=X_train,y_train=y_train,X_test=X_test,y_test=y_test)

In [37]:
model_runner.train_models()

[LightGBM] [Info] Number of positive: 7416, number of negative: 7560
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000114 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 579
[LightGBM] [Info] Number of data points in the train set: 14976, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495192 -> initscore=-0.019231
[LightGBM] [Info] Start training from score -0.019231


LightGBM, Decision Tree, SGD and Neural Network are trained on dataset.


In [38]:
res = model_runner.evaluate_models()
res

{'Decision Tree': {'Accuracy': 0.7, 'Precision': 0.72, 'Recall': 0.66},
 'LightGBM': {'Accuracy': 0.75, 'Precision': 0.75, 'Recall': 0.76},
 'SGD Classifier': {'Accuracy': 0.51, 'Precision': 0.62, 'Recall': 0.03},
 'XGBoost': {'Accuracy': 0.76, 'Precision': 0.75, 'Recall': 0.77},
 'Neural Network': {'Accuracy': 0.64, 'Precision': 0.65, 'Recall': 0.6}}

In [39]:
dt_tran = model_runner.get_decision_tree_feature_importance()
dt_feats = pd.DataFrame(list(dt_tran),train_columns[:-1])

dt_feats[0].sort_values(ascending=False)

transaction_PatientAge         0.280368
item_Price                     0.224865
item_NameEn                    0.200458
item_Sequence                  0.151283
transaction_DiagnosisIds       0.094136
transaction_PatientEnGender    0.042211
item_RequestQuantity           0.006679
Name: 0, dtype: float64

In [40]:
gbm_tran = model_runner.get_lightgbm_feature_importance()
gbm_feats = pd.DataFrame(list(gbm_tran),train_columns[:-1])

gbm_feats[0].sort_values(ascending=False)

item_Price                     0.304000
item_NameEn                    0.255000
transaction_PatientAge         0.198000
transaction_DiagnosisIds       0.105667
item_Sequence                  0.099333
transaction_PatientEnGender    0.024333
item_RequestQuantity           0.013667
Name: 0, dtype: float64

In [41]:
nn_tran = model_runner.get_neural_network_feature_importance()
nn_feats = pd.DataFrame(list(nn_tran),train_columns[:-1])

nn_feats[0].sort_values(ascending=False)

transaction_PatientEnGender    0.193461
item_RequestQuantity           0.162856
transaction_DiagnosisIds       0.159356
item_Sequence                  0.151382
item_Price                     0.099707
item_NameEn                    0.083131
transaction_PatientAge         0.082109
Name: 0, dtype: float64