In [7]:
import pandas as pd
import os
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score

In [6]:
# --- Load data ---
# Each task (MLST, serotype, subspecies) ships as three CSV splits; within a
# task the files are read in training, test, validation order.
mlst_train_df, mlst_test_df, mlst_val_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../assets/training_set_mlst.csv',
        '../assets/test_set_mlst.csv',
        '../assets/validation_set_mlst.csv',
    )
)

serotype_train_df, serotype_test_df, serotype_val_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../assets/training_set_serotype.csv',
        '../assets/test_set_serotype.csv',
        '../assets/validation_set_serotype.csv',
    )
)

subspecies_train_df, subspecies_test_df, subspecies_val_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../assets/training_set_subspecies.csv',
        '../assets/test_set_subspecies.csv',
        '../assets/validation_set_subspecies.csv',
    )
)

# Directories holding the per-sample .npy arrays for k=5 and k=7, with the
# leading "~" expanded to the current user's home directory.
kmc5_arrays, kmc7_arrays = map(
    os.path.expanduser,
    (
        '~/PROJECTS/GaTech/FCGR_classifier/salmonella_kmc5_arrays/',
        '~/PROJECTS/GaTech/FCGR_classifier/salmonella_kmc7_arrays/',
    ),
)

def load_kmer_arrays(df, array_dir, suffix):
    """Load the flattened ``.npy`` array of every sample listed in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose first column holds the sample identifier and whose second
        column holds the label. Both columns are read by position, so their
        names do not matter.
    array_dir : str
        Directory containing one ``<sample_id><suffix>.npy`` file per sample.
    suffix : str
        Text appended to the sample identifier to form the file name stem
        (for example ``'_k5_k5'``).

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        An array built from the flattened per-sample arrays and an array of
        the matching labels, both in the row order of ``df``. Samples whose
        file does not exist are reported with a printed warning and are left
        out of both arrays.
    """
    arrays = []
    labels = []
    # Select the two columns positionally with .iloc rather than indexing each
    # iterrows() row with an integer key: integer keys on a label-indexed
    # Series are deprecated positional fallback, and iterrows() coerces every
    # row to a single dtype, which can turn an integer id into "101.0".
    sample_ids = df.iloc[:, 0]
    sample_labels = df.iloc[:, 1]
    for sample_id, label in zip(sample_ids, sample_labels):
        array_path = os.path.join(array_dir, f"{sample_id}{suffix}.npy")
        if os.path.exists(array_path):
            arrays.append(np.load(array_path).flatten())
            labels.append(label)
        else:
            print(f"Warning: Array file {array_path} not found.")
    return np.array(arrays), np.array(labels)

# For every task and k-mer size, load the (X, y) pair of each split with
# load_kmer_arrays, always in train, validation, test order.

# MLST
(
    (X_train_mlst_5, y_train_mlst_5),
    (X_val_mlst_5, y_val_mlst_5),
    (X_test_mlst_5, y_test_mlst_5),
) = [
    load_kmer_arrays(split_df, kmc5_arrays, '_k5_k5')
    for split_df in (mlst_train_df, mlst_val_df, mlst_test_df)
]

(
    (X_train_mlst_7, y_train_mlst_7),
    (X_val_mlst_7, y_val_mlst_7),
    (X_test_mlst_7, y_test_mlst_7),
) = [
    load_kmer_arrays(split_df, kmc7_arrays, '_k7_k7')
    for split_df in (mlst_train_df, mlst_val_df, mlst_test_df)
]

# Serotype
(
    (X_train_sero_5, y_train_sero_5),
    (X_val_sero_5, y_val_sero_5),
    (X_test_sero_5, y_test_sero_5),
) = [
    load_kmer_arrays(split_df, kmc5_arrays, '_k5_k5')
    for split_df in (serotype_train_df, serotype_val_df, serotype_test_df)
]

(
    (X_train_sero_7, y_train_sero_7),
    (X_val_sero_7, y_val_sero_7),
    (X_test_sero_7, y_test_sero_7),
) = [
    load_kmer_arrays(split_df, kmc7_arrays, '_k7_k7')
    for split_df in (serotype_train_df, serotype_val_df, serotype_test_df)
]

# Subspecies
(
    (X_train_sub_5, y_train_sub_5),
    (X_val_sub_5, y_val_sub_5),
    (X_test_sub_5, y_test_sub_5),
) = [
    load_kmer_arrays(split_df, kmc5_arrays, '_k5_k5')
    for split_df in (subspecies_train_df, subspecies_val_df, subspecies_test_df)
]

(
    (X_train_sub_7, y_train_sub_7),
    (X_val_sub_7, y_val_sub_7),
    (X_test_sub_7, y_test_sub_7),
) = [
    load_kmer_arrays(split_df, kmc7_arrays, '_k7_k7')
    for split_df in (subspecies_train_df, subspecies_val_df, subspecies_test_df)
]

  sample_id = row[0]
  label = row[1]


In [8]:
def train_logistic_regression(X_train, y_train, X_val, y_val, X_test, y_test):
    """Fit a scaled logistic-regression pipeline and report its performance.

    The pipeline (StandardScaler followed by LogisticRegression with
    ``max_iter=1000`` and ``random_state=42``) is fitted on the training
    split only. For the validation split and then the test split, the
    classification report and the macro-averaged F1 score are printed.

    Parameters
    ----------
    X_train, y_train : features and labels used to fit the pipeline.
    X_val, y_val : features and labels of the validation split.
    X_test, y_test : features and labels of the test split.

    Returns
    -------
    tuple
        ``(model, f1_val_macro, f1_test_macro)``: the fitted pipeline and the
        macro-averaged F1 scores on the validation and test splits.
    """
    classifier = LogisticRegression(max_iter=1000, random_state=42)
    model = make_pipeline(StandardScaler(), classifier)
    model.fit(X_train, y_train)

    # Validation split: full per-class report, then the macro F1 summary.
    val_predictions = model.predict(X_val)
    print("Validation Set Classification Report:")
    val_report = classification_report(y_val, val_predictions)
    print(val_report)
    f1_val_macro = f1_score(y_val, val_predictions, average="macro")
    print(f"Macro-averaged F1 (Validation): {f1_val_macro:.4f}\n")

    # Test split: same reporting as for the validation split.
    test_predictions = model.predict(X_test)
    print("Test Set Classification Report:")
    test_report = classification_report(y_test, test_predictions)
    print(test_report)
    f1_test_macro = f1_score(y_test, test_predictions, average="macro")
    print(f"Macro-averaged F1 (Test): {f1_test_macro:.4f}\n")

    return model, f1_val_macro, f1_test_macro


In [9]:
# k-mer size 5

results_logreg = []

# Train and evaluate one model per task, in MLST, Serotype, Subspecies order.
# Each tuple of arrays follows the argument order of train_logistic_regression.
for dataset_name, split_arrays in (
    ("MLST", (X_train_mlst_5, y_train_mlst_5,
              X_val_mlst_5, y_val_mlst_5,
              X_test_mlst_5, y_test_mlst_5)),
    ("Serotype", (X_train_sero_5, y_train_sero_5,
                  X_val_sero_5, y_val_sero_5,
                  X_test_sero_5, y_test_sero_5)),
    ("Subspecies", (X_train_sub_5, y_train_sub_5,
                    X_val_sub_5, y_val_sub_5,
                    X_test_sub_5, y_test_sub_5)),
):
    _, f1_val, f1_test = train_logistic_regression(*split_arrays)
    results_logreg.append({"dataset": dataset_name, "kmer": 5, "f1_val_macro": f1_val, "f1_test_macro": f1_test})

# Save results to CSV
import pandas as pd
results_logreg_df = pd.DataFrame(results_logreg)
results_logreg_df.to_csv("logreg_macro_f1_results.csv", index=False)
print("Macro F1 scores saved to logreg_macro_f1_results.csv")

Validation Set Classification Report:
              precision    recall  f1-score   support

           1       1.00      0.97      0.99        80
           2       0.97      0.96      0.97        80
           4       1.00      1.00      1.00        80
           5       1.00      1.00      1.00        80
          10       1.00      1.00      1.00        80
          11       0.87      0.72      0.79        80
          13       1.00      1.00      1.00        80
          14       1.00      1.00      1.00        80
          15       1.00      1.00      1.00        80
          16       1.00      1.00      1.00        80
          17       1.00      1.00      1.00        80
          18       1.00      1.00      1.00        80
          19       0.92      0.81      0.86        80
          20       0.99      1.00      0.99        80
          22       0.98      0.99      0.98        80
          23       1.00      0.88      0.93        80
          24       1.00      0.99      0.99

In [10]:
# k-mer size 7
results_logreg = []

# Train and evaluate one model per task, in MLST, Serotype, Subspecies order.
# Each tuple of arrays follows the argument order of train_logistic_regression.
for dataset_name, split_arrays in (
    ("MLST", (X_train_mlst_7, y_train_mlst_7,
              X_val_mlst_7, y_val_mlst_7,
              X_test_mlst_7, y_test_mlst_7)),
    ("Serotype", (X_train_sero_7, y_train_sero_7,
                  X_val_sero_7, y_val_sero_7,
                  X_test_sero_7, y_test_sero_7)),
    ("Subspecies", (X_train_sub_7, y_train_sub_7,
                    X_val_sub_7, y_val_sub_7,
                    X_test_sub_7, y_test_sub_7)),
):
    _, f1_val, f1_test = train_logistic_regression(*split_arrays)
    results_logreg.append({"dataset": dataset_name, "kmer": 7, "f1_val_macro": f1_val, "f1_test_macro": f1_test})

# Save results to CSV
import pandas as pd
results_logreg_df = pd.DataFrame(results_logreg)
results_logreg_df.to_csv("logreg_macro_f1_results_k7.csv", index=False)
print("Macro F1 scores saved to logreg_macro_f1_results_k7.csv")


Validation Set Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        80
           2       1.00      0.97      0.99        80
           4       1.00      1.00      1.00        80
           5       1.00      1.00      1.00        80
          10       1.00      1.00      1.00        80
          11       0.99      0.94      0.96        80
          13       1.00      1.00      1.00        80
          14       1.00      1.00      1.00        80
          15       1.00      1.00      1.00        80
          16       1.00      1.00      1.00        80
          17       1.00      1.00      1.00        80
          18       1.00      1.00      1.00        80
          19       1.00      0.96      0.98        80
          20       0.99      1.00      0.99        80
          22       1.00      1.00      1.00        80
          23       0.97      0.96      0.97        80
          24       1.00      0.99      0.99