In [1]:
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from scipy.special import rel_entr
import numpy as np

In [2]:
train_processed_sample = 'data/processed/train_processed_sample1.csv'
train_processed_target = 'data/processed/train_processed_target.csv'

test_processed_sample = 'data/processed/test_processed_sample1.csv'
test_processed_target = 'data/processed/test_processed_target.csv'
test_processed_diff = 'data/processed/test_processed_differential_diagnosis.csv'

validate_processed_sample = 'data/processed/validate_processed_sample.csv'
validate_processed_target = 'data/processed/validate_processed_target.csv'

In [3]:
X_train = pd.read_csv(train_processed_sample)
y_train = pd.read_csv(train_processed_target)

dtrain = xgb.DMatrix(X_train, label=y_train)

# del X_train
# del y_train

In [4]:
params = {
    'objective': 'multi:softprob',  # Multi-class classification with probabilities
    'num_class': 49,  # Number of classes
    'eval_metric': 'mlogloss',  # Multi-class log loss
    'max_depth': 10,  # Maximum depth of a tree
    'learning_rate': 0.4,  # Learning rate
    'subsample': 0.8,  # Subsample ratio
    'colsample_bytree': 0.8,  # Subsample ratio of columns
    'seed': 345,
    'device': 'cuda',
}

bst = xgb.train(params, dtrain, num_boost_round=20)

In [5]:
X_test = pd.read_csv(test_processed_sample)
y_test = pd.read_csv(test_processed_target)

dtest = xgb.DMatrix(X_test, label=y_test)

del X_test
del y_test

y_pred_prob = bst.predict(dtest)
diff_test = pd.read_csv(test_processed_diff)
diff_test = diff_test.to_numpy()

kl_divergence = np.sum(rel_entr(diff_test, y_pred_prob))/diff_test.shape[0]
print(f'KL Divergence: {kl_divergence:.4f}')

KL Divergence: inf


In [6]:
# Save the model to a file
model_filename = 'model/xgboost_model_dep10_rnd20_sed345_lr04.json'
bst.save_model(model_filename)

In [7]:
import json
from sklearn.metrics import accuracy_score

predicted_classes = np.argmax(y_pred_prob, axis=1)
y_test = pd.read_csv(test_processed_target)
y_test = y_test['PATHOLOGY'].values
score = accuracy_score(y_test, predicted_classes)
print(y_test, predicted_classes, score)


[35  8 39 ... 42 41 44] [35  8 14 ... 42 15 15] 0.44692965828929077


In [8]:
X_val = pd.read_csv(validate_processed_sample)
y_val = pd.read_csv(validate_processed_target)

dval = xgb.DMatrix(X_val, label=y_val)

del X_val
del y_val

y_pred_prob = bst.predict(dval)

predicted_classes = np.argmax(y_pred_prob, axis=1)
y_val = pd.read_csv(validate_processed_target)
y_val = y_val['PATHOLOGY'].values
score = accuracy_score(y_val, predicted_classes)
print(y_val, predicted_classes, score)


[ 3  5 33 ... 35 48 24] [ 3  4 33 ... 35 48 34] 0.44832689055327374
