In [1]:
import json
import os
import random
from itertools import product

import numpy as np
import pandas as pd
import xgboost as xgb
from scipy.special import rel_entr
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

In [2]:
# Locations of the pre-processed CSV files, grouped by data split.

# Training split: feature matrix and target labels.
train_processed_sample, train_processed_target = (
    'data/processed/train_processed_sample1.csv',
    'data/processed/train_processed_target.csv',
)

# Test split: features, target labels and differential-diagnosis table.
test_processed_sample, test_processed_target, test_processed_diff = (
    'data/processed/test_processed_sample1.csv',
    'data/processed/test_processed_target.csv',
    'data/processed/test_processed_differential_diagnosis.csv',
)

# Validation split: features, target labels and differential-diagnosis table.
validate_processed_sample, validate_processed_target, validate_processed_diff = (
    'data/processed/validate_processed_sample.csv',
    'data/processed/validate_processed_target.csv',
    'data/processed/validate_processed_differential_diagnosis.csv',
)

In [3]:
# Hyperparameter grids swept by the search cell below.

# Values tried for 'max_depth'.
dep_list = [5, 8, 10, 15, 20, 30]

# Values tried for 'learning_rate'.
lr_list = [
    0.01, 0.03, 0.05, 0.1,
    0.3, 0.4, 0.5, 0.8,
]

# Values tried for 'subsample' (row sampling ratio).
subsam_list = [
    0.1, 0.2, 0.3, 0.4, 0.5,
    0.6, 0.7, 0.8, 0.9, 1,
]

# Values tried for 'colsample_bytree' (column sampling ratio).
colsam_list = [
    0.1, 0.2, 0.3, 0.4, 0.5,
    0.6, 0.7, 0.8, 0.9, 1,
]

# Values tried for num_boost_round: 10, 15, ..., 35.
rnd_list = list(range(10, 36, 5))

In [4]:
# Training split: read features and labels, wrap them in a DMatrix, then drop
# the DataFrames so only the DMatrix keeps a reference to the training data.
X_train, y_train = (
    pd.read_csv(train_processed_sample),
    pd.read_csv(train_processed_target),
)
dtrain = xgb.DMatrix(X_train, label=y_train)
del X_train, y_train

# Validation split: same construction, but the inputs are kept around because
# the labels are needed again for scoring.
X_val, y_val = (
    pd.read_csv(validate_processed_sample),
    pd.read_csv(validate_processed_target),
)
dval = xgb.DMatrix(X_val, label=y_val)

# From here on y_val is the plain array of the 'PATHOLOGY' column.
y_val = y_val['PATHOLOGY'].values

# Differential-diagnosis table for the validation split as a NumPy array.
diff_val = pd.read_csv(validate_processed_diff).to_numpy()

In [5]:
def _differential_scores(pred_prob, diff_prob, threshold=0.01):
    """Score predicted differentials against reference differentials.

    A class counts as "in the differential" of a row when its value is
    strictly greater than ``threshold``. For every row the recall
    (overlap / reference size) and precision (overlap / predicted size) are
    computed; both are then averaged over all rows and combined into F1.

    Args:
        pred_prob: 2-D array of predicted probabilities, one row per sample
            and one column per class.
        diff_prob: 2-D array of reference values. Only the leading
            ``pred_prob.shape`` rows/columns are read, which is the region the
            original element-wise loop indexed.
        threshold: Cut-off above which a class is counted.

    Returns:
        Tuple ``(recall, precision, f1)`` of Python floats.

    Raises:
        ValueError: If ``diff_prob`` has fewer rows or columns than
            ``pred_prob``.
    """
    pred_mask = np.asarray(pred_prob) > threshold
    n_rows, n_classes = pred_mask.shape
    diff_mask = np.asarray(diff_prob)[:n_rows, :n_classes] > threshold
    if diff_mask.shape != pred_mask.shape:
        raise ValueError(
            "differential table of shape %s does not cover predictions of shape %s"
            % (np.shape(diff_prob), pred_mask.shape)
        )
    if n_rows == 0:
        # Nothing to average; avoid a division by zero / NaN mean.
        return 0.0, 0.0, 0.0

    overlap = (pred_mask & diff_mask).sum(axis=1)
    pred_num = pred_mask.sum(axis=1)
    diff_num = diff_mask.sum(axis=1)

    # A row whose reference (or predicted) set is empty has an undefined
    # ratio; it contributes 0 instead of raising ZeroDivisionError.
    recall_rows = np.divide(overlap, diff_num, out=np.zeros(n_rows), where=diff_num > 0)
    precision_rows = np.divide(overlap, pred_num, out=np.zeros(n_rows), where=pred_num > 0)

    recall = float(recall_rows.mean())
    precision = float(precision_rows.mean())
    if precision + recall == 0:
        return recall, precision, 0.0
    return recall, precision, (2 * precision * recall) / (precision + recall)


NUM_CLASSES = 49   # number of target classes passed to XGBoost
MODEL_DIR = "model"  # every trained booster is written below this directory
threshold = 0.01   # cut-off used by the differential precision/recall metric

# save_model does not create missing directories; make sure the target exists
# before any training time is spent.
os.makedirs(MODEL_DIR, exist_ok=True)

# Intentionally endless: each pass sweeps the full grid with fresh random
# seeds and saves every model; stop the cell by interrupting the kernel.
while True:
    # product() iterates the right-most list fastest, i.e. the same order as
    # the equivalent nested for-loops (dep outermost, rnd innermost).
    for dep, lr, subsam, colsam, rnd in product(
        dep_list, lr_list, subsam_list, colsam_list, rnd_list
    ):
        seed = random.randint(0, 100000)

        params = {
            'objective': 'multi:softprob',  # Multi-class classification with probabilities
            'num_class': NUM_CLASSES,  # Number of classes
            'eval_metric': 'mlogloss',  # Multi-class log loss
            'max_depth': dep,  # Maximum depth of a tree
            'learning_rate': lr,  # Learning rate
            'subsample': subsam,  # Subsample ratio
            'colsample_bytree': colsam,  # Subsample ratio of columns
            'seed': seed,
            'device': 'cuda',
        }

        bst = xgb.train(params, dtrain, num_boost_round=rnd)

        # Eval: predict once and reuse the probabilities for both metrics.
        y_pred_prob = bst.predict(dval)

        # Top-1 accuracy against the validation labels.
        predicted_classes = np.argmax(y_pred_prob, axis=1)
        score = accuracy_score(y_val, predicted_classes)

        # Differential-diagnosis recall / precision / F1.
        DDrecall, DDprecision, F1 = _differential_scores(y_pred_prob, diff_val, threshold)

        # Save model for later use. The file name encodes the hyperparameters
        # and both scores. round() (not int()) is used for the float
        # hyperparameters so values such as 0.29 * 100 == 28.999... are not
        # truncated to the wrong tag; the scores keep int() truncation so
        # names stay comparable with previously saved models.
        model_filename = (
            f"{MODEL_DIR}/dep{dep}"
            f"_lr{round(lr * 100)}"
            f"_subsam{round(subsam * 10)}"
            f"_colsam{round(colsam * 10)}"
            f"_rnd{rnd}"
            f"_sed{seed}"
            f"_GTPA{int(score * 10000)}"
            f"_F1_{int(F1 * 10000)}.json"
        )
        bst.save_model(model_filename)


KeyboardInterrupt: 