In [24]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

In [25]:
data = pd.read_csv("../../data/tcga_pathology/processed/tcga_pathology_reports.csv")
X = data.drop(columns=['cancer_labels'])
y = data.cancer_labels.values

data.columns

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '759', '760', '761', '762', '763', '764', '765', '766', '767',
       'cancer_labels'],
      dtype='object', length=769)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
752,0.198680,0.024915,-0.195526,-0.033028,0.074169,0.076019,0.118719,0.334839,-0.153490,0.048702,...,-0.173195,0.055383,-0.267569,-0.147621,0.270156,-0.291197,0.268658,0.244785,-0.357056,-0.228224
2810,0.110508,-0.293786,-0.218045,-0.130518,0.198673,-0.010610,-0.005065,0.238553,-0.114730,0.118370,...,-0.187993,-0.061602,-0.167573,0.039786,0.276110,-0.250691,0.185378,0.090653,-0.127866,-0.349447
1674,0.142873,-0.344184,-0.162753,-0.027272,0.390561,-0.090615,-0.151586,0.150529,-0.150289,0.180605,...,-0.215279,0.018694,-0.141834,0.065933,0.152321,-0.263283,0.050630,-0.000109,0.009862,-0.328287
2172,-0.059717,-0.406814,-0.122087,-0.133889,0.437806,0.014836,-0.034618,0.075410,-0.184569,0.212918,...,-0.149908,-0.051076,-0.004629,0.008722,0.055604,-0.180921,0.058413,0.109738,-0.006655,-0.343475
891,0.184986,-0.261685,-0.204302,-0.155656,0.171001,-0.034816,0.092865,0.198130,-0.033367,0.075499,...,-0.107130,-0.103813,-0.212665,0.004173,0.249689,-0.226942,0.113310,0.250761,-0.168153,-0.200610
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9036,0.263727,-0.291597,-0.242645,0.045362,0.202823,0.030686,0.066699,0.300470,-0.214969,0.126700,...,-0.387786,-0.106691,-0.124229,0.030014,0.258119,-0.225108,0.044992,0.076146,-0.113441,-0.295821
560,0.078041,-0.254316,-0.217612,-0.129100,0.085639,0.081059,0.028686,0.191367,-0.098042,0.180097,...,-0.158204,-0.197579,-0.289611,0.036621,0.328815,-0.355035,0.205058,0.078785,-0.175643,-0.328230
3316,0.089882,-0.304006,-0.152675,0.061119,0.121093,-0.198027,0.049193,-0.006617,0.153249,0.009788,...,-0.221764,-0.071662,-0.129878,-0.126207,0.074800,-0.339821,-0.158300,0.168329,-0.004788,-0.280737
5311,0.000332,-0.303518,-0.090566,-0.151779,0.304763,-0.049618,0.021162,0.107809,-0.032449,0.152639,...,-0.133423,-0.056473,-0.222307,-0.049103,0.206966,-0.164749,0.146690,0.030818,-0.114340,-0.337290


In [27]:
from sklearn.metrics import f1_score, accuracy_score

xgb_classifier = XGBClassifier(
    objective='multi:softmax',
    eval_metric='mlogloss',
    n_estimators=200,
    learning_rate=0.1,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=1,
    n_jobs=-1,
    early_stopping_rounds=10
)

xgb_classifier.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    verbose=True
)


val_predictions = xgb_classifier.predict(X_val)

val_accuracy = accuracy_score(y_val, val_predictions)
val_macro_f1 = f1_score(y_val, val_predictions, average='macro')

print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Validation Macro-F1: {val_macro_f1:.4f}")

predictions = xgb_classifier.predict(X_test)

[0]	validation_0-mlogloss:2.98524
[1]	validation_0-mlogloss:2.72167
[2]	validation_0-mlogloss:2.53627
[3]	validation_0-mlogloss:2.38313
[4]	validation_0-mlogloss:2.24956
[5]	validation_0-mlogloss:2.13586
[6]	validation_0-mlogloss:2.04382
[7]	validation_0-mlogloss:1.95906
[8]	validation_0-mlogloss:1.88031
[9]	validation_0-mlogloss:1.81313
[10]	validation_0-mlogloss:1.74905
[11]	validation_0-mlogloss:1.69044
[12]	validation_0-mlogloss:1.63713
[13]	validation_0-mlogloss:1.58519
[14]	validation_0-mlogloss:1.53951
[15]	validation_0-mlogloss:1.49582
[16]	validation_0-mlogloss:1.45483
[17]	validation_0-mlogloss:1.41713
[18]	validation_0-mlogloss:1.37988
[19]	validation_0-mlogloss:1.34360
[20]	validation_0-mlogloss:1.31082
[21]	validation_0-mlogloss:1.27719
[22]	validation_0-mlogloss:1.24839
[23]	validation_0-mlogloss:1.22125
[24]	validation_0-mlogloss:1.19387
[25]	validation_0-mlogloss:1.16789
[26]	validation_0-mlogloss:1.14405
[27]	validation_0-mlogloss:1.12080
[28]	validation_0-mlogloss:1.0

In [28]:
from sklearn.metrics import classification_report

accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions, output_dict=True, zero_division=0)

f1 = report['macro avg']['f1-score']
recall = report['macro avg']['recall']
precision = report['macro avg']['precision']

print(f"Accuracy: {accuracy:.4f}")
print(f"F1: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")

Accuracy: 0.8409
F1: 0.7880
Recall: 0.7601
Precision: 0.8633
