In [3]:
import xgboost as xgb
import pandas as pd
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
import os
import numpy as np

In [4]:
# --- Load data ---
# Every classification task (MLST, serotype, subspecies) comes as three CSV
# splits. Each group below is read in the order train, test, validation.
mlst_train_df, mlst_test_df, mlst_val_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../assets/training_set_mlst.csv',
        '../assets/test_set_mlst.csv',
        '../assets/validation_set_mlst.csv',
    )
)

serotype_train_df, serotype_test_df, serotype_val_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../assets/training_set_serotype.csv',
        '../assets/test_set_serotype.csv',
        '../assets/validation_set_serotype.csv',
    )
)

subspecies_train_df, subspecies_test_df, subspecies_val_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../assets/training_set_subspecies.csv',
        '../assets/test_set_subspecies.csv',
        '../assets/validation_set_subspecies.csv',
    )
)

# Directories holding the per-sample .npy arrays for k=5 and k=7; "~" is
# expanded to the current user's home directory.
kmc5_arrays, kmc7_arrays = map(
    os.path.expanduser,
    (
        '~/PROJECTS/GaTech/FCGR_classifier/salmonella_kmc5_arrays/',
        '~/PROJECTS/GaTech/FCGR_classifier/salmonella_kmc7_arrays/',
    ),
)

def load_kmer_arrays(df, array_dir, suffix):
    """Load the flattened k-mer array and label for every sample listed in ``df``.

    The first column of ``df`` is used as the sample ID and the second column
    as the label. Both are selected by position, so column names do not matter.
    For each sample the file ``<array_dir>/<sample_id><suffix>.npy`` is loaded
    and flattened to 1-D. Samples whose file does not exist are skipped with a
    printed warning, so the returned arrays may be shorter than ``df``.

    Args:
        df: DataFrame with at least two columns (sample ID, label).
        array_dir: Directory containing the ``.npy`` files.
        suffix: Text inserted between the sample ID and the ``.npy`` extension.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: the stacked flattened arrays and the
        labels of the samples that were found, in the row order of ``df``.
    """
    arrays = []
    labels = []
    # Read the two columns positionally with .iloc. Integer keys on a
    # label-indexed row Series (`row[0]`) are deprecated in pandas, and
    # iterrows() upcasts mixed numeric columns to one dtype, which can turn an
    # integer sample ID into "1.0" and break the file name.
    for sample_id, label in zip(df.iloc[:, 0], df.iloc[:, 1]):
        array_path = os.path.join(array_dir, f"{sample_id}{suffix}.npy")
        if os.path.exists(array_path):
            arrays.append(np.load(array_path).flatten())
            labels.append(label)
        else:
            print(f"Warning: Array file {array_path} not found.")
    return np.array(arrays), np.array(labels)

def _load_splits(train_df, val_df, test_df, array_dir, suffix):
    """Load (X, y) for the train, validation and test frames, in that order."""
    X_train, y_train = load_kmer_arrays(train_df, array_dir, suffix)
    X_val, y_val = load_kmer_arrays(val_df, array_dir, suffix)
    X_test, y_test = load_kmer_arrays(test_df, array_dir, suffix)
    return X_train, y_train, X_val, y_val, X_test, y_test


# MLST
(X_train_mlst_5, y_train_mlst_5,
 X_val_mlst_5, y_val_mlst_5,
 X_test_mlst_5, y_test_mlst_5) = _load_splits(
    mlst_train_df, mlst_val_df, mlst_test_df, kmc5_arrays, '_k5_k5')

(X_train_mlst_7, y_train_mlst_7,
 X_val_mlst_7, y_val_mlst_7,
 X_test_mlst_7, y_test_mlst_7) = _load_splits(
    mlst_train_df, mlst_val_df, mlst_test_df, kmc7_arrays, '_k7_k7')

# Serotype
(X_train_sero_5, y_train_sero_5,
 X_val_sero_5, y_val_sero_5,
 X_test_sero_5, y_test_sero_5) = _load_splits(
    serotype_train_df, serotype_val_df, serotype_test_df, kmc5_arrays, '_k5_k5')

(X_train_sero_7, y_train_sero_7,
 X_val_sero_7, y_val_sero_7,
 X_test_sero_7, y_test_sero_7) = _load_splits(
    serotype_train_df, serotype_val_df, serotype_test_df, kmc7_arrays, '_k7_k7')

# Subspecies
(X_train_sub_5, y_train_sub_5,
 X_val_sub_5, y_val_sub_5,
 X_test_sub_5, y_test_sub_5) = _load_splits(
    subspecies_train_df, subspecies_val_df, subspecies_test_df, kmc5_arrays, '_k5_k5')

(X_train_sub_7, y_train_sub_7,
 X_val_sub_7, y_val_sub_7,
 X_test_sub_7, y_test_sub_7) = _load_splits(
    subspecies_train_df, subspecies_val_df, subspecies_test_df, kmc7_arrays, '_k7_k7')

  sample_id = row[0]
  label = row[1]


In [1]:
# --- XGBoost Model Training Function ---

def train_xgboost(X_train, y_train, X_val, y_val, X_test, y_test):
    """Fit an ``XGBClassifier`` on the training split and report val/test metrics.

    Labels are integer-encoded with a ``LabelEncoder`` fitted on ``y_train``
    only; the validation and test labels are transformed with that same
    encoder. A classification report and the macro-averaged F1 score are
    printed for the validation and test splits.

    NOTE: a later cell in this notebook defines another ``train_xgboost``
    (built on ``xgb.train``); once that cell runs it shadows this definition.

    Args:
        X_train, X_val, X_test: Feature matrices for each split.
        y_train, y_val, y_test: Labels for each split (any type
            ``LabelEncoder`` accepts).

    Returns:
        Tuple ``(model, f1_val_macro, f1_test_macro)``.
    """
    # Encode labels if they are categorical
    le = LabelEncoder()
    y_train_encoded = le.fit_transform(y_train)
    y_val_encoded = le.transform(y_val)
    y_test_encoded = le.transform(y_test)

    # Create and fit the model.
    # GPU training is requested with tree_method="hist" + device="cuda", the
    # same configuration the xgb.train-based definition in this notebook uses.
    # The older spellings (tree_method="gpu_hist", predictor="gpu_predictor",
    # use_label_encoder=False) are deprecated or ignored by XGBoost >= 2.0.
    xgb_model = xgb.XGBClassifier(
        eval_metric="mlogloss",
        tree_method="hist",
        device="cuda",
        random_state=42
    )
    xgb_model.fit(X_train, y_train_encoded)

    # --- VALIDATION SET ---
    y_val_pred_xgb = xgb_model.predict(X_val)
    print("XGBoost Validation Set Classification Report:")
    print(classification_report(y_val_encoded, y_val_pred_xgb))

    f1_val_macro = f1_score(y_val_encoded, y_val_pred_xgb, average="macro")
    print(f"Macro-averaged F1 (Validation): {f1_val_macro:.4f}\n")

    # --- TEST SET ---
    y_test_pred_xgb = xgb_model.predict(X_test)
    print("XGBoost Test Set Classification Report:")
    print(classification_report(y_test_encoded, y_test_pred_xgb))

    f1_test_macro = f1_score(y_test_encoded, y_test_pred_xgb, average="macro")
    print(f"Macro-averaged F1 (Test): {f1_test_macro:.4f}\n")

    # Return the trained model and both F1 scores
    return xgb_model, f1_val_macro, f1_test_macro

In [6]:
def train_xgboost(X_train, y_train, X_val, y_val, X_test, y_test, num_boost_round=200):
    """Train a multi-class booster with ``xgb.train`` and score it with macro F1.

    Labels are integer-encoded with a ``LabelEncoder`` fitted on ``y_train``;
    the validation and test labels go through the same encoder. The booster
    is trained on the training split for ``num_boost_round`` rounds, with the
    training and validation matrices passed as evaluation sets (evaluation
    output is silenced). Predictions are the arg-max over the per-class scores
    returned by ``predict``.

    Args:
        X_train, X_val, X_test: Feature matrices for each split.
        y_train, y_val, y_test: Labels for each split.
        num_boost_round: Number of boosting rounds (default 200).

    Returns:
        Tuple ``(model, f1_val, f1_test)`` with the trained booster and the
        macro-averaged F1 on the validation and test splits.
    """
    from sklearn.preprocessing import LabelEncoder

    # Fit the encoder on the training labels first, then reuse it.
    encoder = LabelEncoder()
    encoded = {
        'train': encoder.fit_transform(y_train),
        'val': encoder.transform(y_val),
        'test': encoder.transform(y_test),
    }

    # Wrap each split in a DMatrix for the native XGBoost API.
    dtrain = xgb.DMatrix(X_train, label=encoded['train'])
    dval = xgb.DMatrix(X_val, label=encoded['val'])
    dtest = xgb.DMatrix(X_test, label=encoded['test'])

    # Histogram tree method on a CUDA device, soft-probability objective.
    booster = xgb.train(
        {
            'tree_method': 'hist',
            'device': 'cuda',
            'objective': 'multi:softprob',
            'num_class': len(encoder.classes_),
            'eval_metric': 'mlogloss',
            'seed': 42,
        },
        dtrain,
        num_boost_round=num_boost_round,
        evals=[(dtrain, 'train'), (dval, 'val')],
        verbose_eval=False,
    )

    def macro_f1(dmatrix, y_true):
        # Pick the highest-scoring class per row, then compare to the truth.
        predicted = booster.predict(dmatrix).argmax(axis=1)
        return f1_score(y_true, predicted, average='macro')

    f1_val = macro_f1(dval, encoded['val'])
    f1_test = macro_f1(dtest, encoded['test'])

    print("Validation F1 macro:", f1_val)
    print("Test F1 macro:", f1_test)

    return booster, f1_val, f1_test

In [7]:
# Collect one result record per task for the k=5 features
results = []

# (task name, (X_train, y_train, X_val, y_val, X_test, y_test)) in run order
k5_runs = (
    ("MLST", (X_train_mlst_5, y_train_mlst_5,
              X_val_mlst_5, y_val_mlst_5,
              X_test_mlst_5, y_test_mlst_5)),
    ("Serotype", (X_train_sero_5, y_train_sero_5,
                  X_val_sero_5, y_val_sero_5,
                  X_test_sero_5, y_test_sero_5)),
    ("Subspecies", (X_train_sub_5, y_train_sub_5,
                    X_val_sub_5, y_val_sub_5,
                    X_test_sub_5, y_test_sub_5)),
)

for dataset_name, splits in k5_runs:
    # The trained model itself is not kept; only the two F1 scores are recorded.
    _, f1_val, f1_test = train_xgboost(*splits)
    results.append({
        "dataset": dataset_name,
        "kmer": 5,
        "f1_val_macro": f1_val,
        "f1_test_macro": f1_test
    })

# Tabulate the records and persist them next to the notebook
results_df = pd.DataFrame(results)
results_df.to_csv("xgboost_macro_f1_results_k5.csv", index=False)
print("F1 scores saved to xgboost_macro_f1_results_k5.csv")

Validation F1 macro: 0.964368671579205
Test F1 macro: 0.8149791657745349
Validation F1 macro: 0.967780652295746
Test F1 macro: 0.8356484335530661
Validation F1 macro: 0.9895689098126098
Test F1 macro: 0.9666361416361418
F1 scores saved to xgboost_macro_f1_results_k5.csv


In [8]:
# Collect one result record per task for the k=7 features
results_k7 = []

# (task name, (X_train, y_train, X_val, y_val, X_test, y_test)) in run order
k7_runs = (
    ("MLST", (X_train_mlst_7, y_train_mlst_7,
              X_val_mlst_7, y_val_mlst_7,
              X_test_mlst_7, y_test_mlst_7)),
    ("Serotype", (X_train_sero_7, y_train_sero_7,
                  X_val_sero_7, y_val_sero_7,
                  X_test_sero_7, y_test_sero_7)),
    ("Subspecies", (X_train_sub_7, y_train_sub_7,
                    X_val_sub_7, y_val_sub_7,
                    X_test_sub_7, y_test_sub_7)),
)

for dataset_name, splits in k7_runs:
    # The trained model itself is not kept; only the two F1 scores are recorded.
    _, f1_val, f1_test = train_xgboost(*splits)
    results_k7.append({
        "dataset": dataset_name,
        "kmer": 7,
        "f1_val_macro": f1_val,
        "f1_test_macro": f1_test
    })

# Tabulate the records and persist them next to the notebook
results_k7_df = pd.DataFrame(results_k7)
results_k7_df.to_csv("xgboost_macro_f1_results_k7.csv", index=False)
print("F1 scores for k=7 saved to xgboost_macro_f1_results_k7.csv")

Validation F1 macro: 0.9888716834763865
Test F1 macro: 0.9431231076280088
Validation F1 macro: 0.9892864743123514
Test F1 macro: 0.9420092226648146
Validation F1 macro: 0.9916391434347757
Test F1 macro: 0.9585157390035439
F1 scores for k=7 saved to xgboost_macro_f1_results_k7.csv
