In [1]:
import xgboost as xgb
import pandas as pd
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
import os
import numpy as np

## Load data

In [2]:
# --- Load data ---
# Each classification task (MLST, serotype, subspecies) has three CSV splits.
# Within every group the files are read in train, test, val order.
mlst_train_df, mlst_test_df, mlst_val_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../assets/mlst_train_set.csv',
        '../assets/mlst_test_set.csv',
        '../assets/mlst_val_set.csv',
    )
]

serotype_train_df, serotype_test_df, serotype_val_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../assets/serotype_train_set.csv',
        '../assets/serotype_test_set.csv',
        '../assets/serotype_val_set.csv',
    )
]

subspecies_train_df, subspecies_test_df, subspecies_val_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../assets/subspecies_train_set.csv',
        '../assets/subspecies_test_set.csv',
        '../assets/subspecies_val_set.csv',
    )
]

# Directories searched for the per-sample .npy arrays (k=5 and k=7 variants);
# "~" is expanded to the current user's home directory.
kmc5_arrays, kmc7_arrays = [
    os.path.expanduser(raw_dir)
    for raw_dir in (
        '~/PROJECTS/GaTech/FCGR_classifier/salmonella_kmc5_arrays/',
        '~/PROJECTS/GaTech/FCGR_classifier/salmonella_kmc7_arrays/',
    )
]

def load_kmer_arrays(df, array_dir, suffix):
    """Load the flattened ``.npy`` array and label for every sample listed in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose first column holds the sample ID and whose second column
        holds the label. Columns are addressed by position, not by name.
    array_dir : str
        Directory containing the array files.
    suffix : str
        Text inserted between the sample ID and the ``.npy`` extension, so the
        file looked up is ``<array_dir>/<sample_id><suffix>.npy``.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The stacked, flattened arrays and the matching labels, in the row
        order of ``df``. Samples whose array file does not exist are skipped
        (a warning line is printed for each), so both outputs stay aligned.
    """
    arrays = []
    labels = []
    # Iterate the two columns directly instead of ``iterrows()`` + ``row[0]``:
    # integer keys on a label-indexed Series are a deprecated positional
    # fallback (FutureWarning), and ``iterrows()`` coerces each row to one
    # dtype, which can turn an integer sample ID into e.g. ``7.0`` and make
    # the file lookup miss.
    for sample_id, label in zip(df.iloc[:, 0], df.iloc[:, 1]):
        array_path = os.path.join(array_dir, f"{sample_id}{suffix}.npy")
        if os.path.exists(array_path):
            arrays.append(np.load(array_path).flatten())
            labels.append(label)
        else:
            print(f"Warning: Array file {array_path} not found.")
    return np.array(arrays), np.array(labels)

# MLST -- k=5 arrays
X_train_mlst_5, y_train_mlst_5 = load_kmer_arrays(
    df=mlst_train_df, array_dir=kmc5_arrays, suffix='_k5_k5')
X_val_mlst_5, y_val_mlst_5 = load_kmer_arrays(
    df=mlst_val_df, array_dir=kmc5_arrays, suffix='_k5_k5')
X_test_mlst_5, y_test_mlst_5 = load_kmer_arrays(
    df=mlst_test_df, array_dir=kmc5_arrays, suffix='_k5_k5')

# MLST -- k=7 arrays
X_train_mlst_7, y_train_mlst_7 = load_kmer_arrays(
    df=mlst_train_df, array_dir=kmc7_arrays, suffix='_k7_k7')
X_val_mlst_7, y_val_mlst_7 = load_kmer_arrays(
    df=mlst_val_df, array_dir=kmc7_arrays, suffix='_k7_k7')
X_test_mlst_7, y_test_mlst_7 = load_kmer_arrays(
    df=mlst_test_df, array_dir=kmc7_arrays, suffix='_k7_k7')

# Serotype -- k=5 arrays
X_train_sero_5, y_train_sero_5 = load_kmer_arrays(
    df=serotype_train_df, array_dir=kmc5_arrays, suffix='_k5_k5')
X_val_sero_5, y_val_sero_5 = load_kmer_arrays(
    df=serotype_val_df, array_dir=kmc5_arrays, suffix='_k5_k5')
X_test_sero_5, y_test_sero_5 = load_kmer_arrays(
    df=serotype_test_df, array_dir=kmc5_arrays, suffix='_k5_k5')

# Serotype -- k=7 arrays
X_train_sero_7, y_train_sero_7 = load_kmer_arrays(
    df=serotype_train_df, array_dir=kmc7_arrays, suffix='_k7_k7')
X_val_sero_7, y_val_sero_7 = load_kmer_arrays(
    df=serotype_val_df, array_dir=kmc7_arrays, suffix='_k7_k7')
X_test_sero_7, y_test_sero_7 = load_kmer_arrays(
    df=serotype_test_df, array_dir=kmc7_arrays, suffix='_k7_k7')

# Subspecies -- k=5 arrays
X_train_sub_5, y_train_sub_5 = load_kmer_arrays(
    df=subspecies_train_df, array_dir=kmc5_arrays, suffix='_k5_k5')
X_val_sub_5, y_val_sub_5 = load_kmer_arrays(
    df=subspecies_val_df, array_dir=kmc5_arrays, suffix='_k5_k5')
X_test_sub_5, y_test_sub_5 = load_kmer_arrays(
    df=subspecies_test_df, array_dir=kmc5_arrays, suffix='_k5_k5')

# Subspecies -- k=7 arrays
X_train_sub_7, y_train_sub_7 = load_kmer_arrays(
    df=subspecies_train_df, array_dir=kmc7_arrays, suffix='_k7_k7')
X_val_sub_7, y_val_sub_7 = load_kmer_arrays(
    df=subspecies_val_df, array_dir=kmc7_arrays, suffix='_k7_k7')
X_test_sub_7, y_test_sub_7 = load_kmer_arrays(
    df=subspecies_test_df, array_dir=kmc7_arrays, suffix='_k7_k7')

  sample_id = row[0]
  label = row[1]


## Define model

In [4]:
def _macro_f1_from_probs(model, dmatrix, y_true_enc):
    """Predict class probabilities for ``dmatrix`` and return the macro F1 against ``y_true_enc``."""
    class_probs = model.predict(dmatrix)
    # One row per sample, one column per class: take the most probable class.
    return f1_score(y_true_enc, class_probs.argmax(axis=1), average='macro')


def train_xgboost(X_train, y_train, X_val, y_val, X_test, y_test, num_boost_round=200,
                  device='cuda', seed=42):
    """Train a multi-class XGBoost model and report macro F1 on val and test.

    Parameters
    ----------
    X_train, X_val, X_test : array-like
        Feature matrices passed to ``xgb.DMatrix``.
    y_train, y_val, y_test : array-like
        Class labels. They are integer-encoded with a ``LabelEncoder`` fitted
        on ``y_train`` only, so a label in ``y_val``/``y_test`` that never
        occurs in ``y_train`` makes ``transform`` raise ``ValueError``.
    num_boost_round : int, default 200
        Number of boosting rounds.
    device : str, default 'cuda'
        Value for the XGBoost ``device`` parameter; pass ``'cpu'`` on a
        machine without a CUDA-capable GPU.
    seed : int, default 42
        Value for the XGBoost ``seed`` parameter.

    Returns
    -------
    tuple
        ``(model, f1_val, f1_test)``: the trained booster and the macro F1
        scores on the validation and test sets. Both scores are also printed.
    """
    # Encode labels if categorical (LabelEncoder is imported at the top of the notebook)
    le = LabelEncoder()
    y_train_enc = le.fit_transform(y_train)
    y_val_enc = le.transform(y_val)
    y_test_enc = le.transform(y_test)

    # Convert to DMatrix
    dtrain = xgb.DMatrix(X_train, label=y_train_enc)
    dval = xgb.DMatrix(X_val, label=y_val_enc)
    dtest = xgb.DMatrix(X_test, label=y_test_enc)

    params = {
        'tree_method': 'hist',
        'device': device,
        'objective': 'multi:softprob',
        'num_class': len(le.classes_),
        'eval_metric': 'mlogloss',
        'seed': seed
    }

    # Train; the eval sets are scored every round but nothing is printed
    # because verbose_eval is False.
    evals = [(dtrain, 'train'), (dval, 'val')]
    model = xgb.train(params, dtrain, num_boost_round=num_boost_round, evals=evals, verbose_eval=False)

    # Score
    f1_val = _macro_f1_from_probs(model, dval, y_val_enc)
    f1_test = _macro_f1_from_probs(model, dtest, y_test_enc)

    print("Validation F1 macro:", f1_val)
    print("Test F1 macro:", f1_test)

    return model, f1_val, f1_test

## Run model on kmer=5

In [5]:
# Collect one result record (a dict) per dataset in this list
results = []

# Train/val/test splits for each task at k=5, in the positional order
# train_xgboost expects: X_train, y_train, X_val, y_val, X_test, y_test.
splits_k5 = {
    "MLST": (X_train_mlst_5, y_train_mlst_5,
             X_val_mlst_5, y_val_mlst_5,
             X_test_mlst_5, y_test_mlst_5),
    "Serotype": (X_train_sero_5, y_train_sero_5,
                 X_val_sero_5, y_val_sero_5,
                 X_test_sero_5, y_test_sero_5),
    "Subspecies": (X_train_sub_5, y_train_sub_5,
                   X_val_sub_5, y_val_sub_5,
                   X_test_sub_5, y_test_sub_5),
}

# Dicts keep insertion order, so the datasets run as MLST, Serotype, Subspecies.
for dataset_name, splits in splits_k5.items():
    _, f1_val, f1_test = train_xgboost(*splits)
    results.append({
        "dataset": dataset_name,
        "kmer": 5,
        "f1_val_macro": f1_val,
        "f1_test_macro": f1_test
    })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Save to CSV. Create the output directory first: without it, to_csv raises
# OSError only after all three models have finished training.
results_csv_k5 = "../results/xgboost_macro_f1_results_k5.csv"
os.makedirs(os.path.dirname(results_csv_k5), exist_ok=True)
results_df.to_csv(results_csv_k5, index=False)
print("complete")

Validation F1 macro: 0.8267972769617082
Test F1 macro: 0.8404815041686307
Validation F1 macro: 0.8335756974267287
Test F1 macro: 0.8249688434946545
Validation F1 macro: 0.9680453730856957
Test F1 macro: 0.9916614550760893
complete


## Run model on kmer=7

In [7]:
# Collect one result record (a dict) per dataset in this list
results_k7 = []

# Train/val/test splits for each task at k=7, in the positional order
# train_xgboost expects: X_train, y_train, X_val, y_val, X_test, y_test.
splits_k7 = {
    "MLST": (X_train_mlst_7, y_train_mlst_7,
             X_val_mlst_7, y_val_mlst_7,
             X_test_mlst_7, y_test_mlst_7),
    "Serotype": (X_train_sero_7, y_train_sero_7,
                 X_val_sero_7, y_val_sero_7,
                 X_test_sero_7, y_test_sero_7),
    "Subspecies": (X_train_sub_7, y_train_sub_7,
                   X_val_sub_7, y_val_sub_7,
                   X_test_sub_7, y_test_sub_7),
}

# Dicts keep insertion order, so the datasets run as MLST, Serotype, Subspecies.
for dataset_name, splits in splits_k7.items():
    _, f1_val, f1_test = train_xgboost(*splits)
    results_k7.append({
        "dataset": dataset_name,
        "kmer": 7,
        "f1_val_macro": f1_val,
        "f1_test_macro": f1_test
    })

# Convert results to DataFrame
results_k7_df = pd.DataFrame(results_k7)

# Save to CSV. Create the output directory first: without it, to_csv raises
# OSError only after all three models have finished training.
results_csv_k7 = "../results/xgboost_macro_f1_results_k7.csv"
os.makedirs(os.path.dirname(results_csv_k7), exist_ok=True)
results_k7_df.to_csv(results_csv_k7, index=False)
print("complete")

Validation F1 macro: 0.942288121124333
Test F1 macro: 0.9442759694192361
Validation F1 macro: 0.9466071472058517
Test F1 macro: 0.9399310257779535
Validation F1 macro: 0.9791564841968068
Test F1 macro: 1.0
complete
