In [None]:
# Data (pre-)processing
import os
import pandas as pd
import numpy as np
from fcsy import DataFrame

# Performance & evaluation
from sklearn.model_selection import GroupKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from statistics import mean, median

# Modeling
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from GMMClassifier import GMMClassifier
from FlowSOMClassifier import FlowSOMClassifier
import joblib

# Set-up

In [None]:
# ---------------------------------------------------------------------------
# Input locations
# NOTE: these paths are combined with file names by plain string
# concatenation further down, so each must end with a path separator.
# ---------------------------------------------------------------------------
# Pre-processed BLAST110 FCS files
BLAST110_FCS_path = "data/BLAST110/FCS/"
# BLAST110 labeling (read as "<sample_id>.csv"); fill in before running
BLAST110_label_path = "<ENTER PATH>"
# Pre-processed LAIP29 FCS files
LAIP29_FCS_path = "data/LAIP29/FCS/"
# LAIP29 labeling (read as "<sample_id>_<tube>.csv"); fill in before running
LAIP29_label_path = "<ENTER PATH>"

# ---------------------------------------------------------------------------
# Output locations (created on first run)
# ---------------------------------------------------------------------------
BLAST110_output = "output/BLAST110/"
LAIP29_output = "output/LAIP29/"
for _out_dir in (BLAST110_output, LAIP29_output):
    if not os.path.exists(_out_dir):
        os.makedirs(_out_dir)

In [None]:
# Columns (markers) fed to every model as input features
features = [
    "SSC-A_scaled",
    "Horizon V500-A",
    "PerCP-A",
    "PC7-A",
]
# Parallelism setting passed to the estimators and to the hyperparameter
# searches below.
# NOTE(review): presumably relies on the joblib convention for negative values
# (n_cpus + 1 + n_jobs workers, i.e. nine cores left free) - confirm that the
# installed LightGBM version interprets negative values the same way.
n_jobs = -10

# Aggregate training data


In [None]:
# Build two training sets with 2K and 5K randomly sampled cells per FCS file.
# The work is skipped only when BOTH pickles already exist: the
# cross-validation cell reads the 2K file and the final-model cell the 5K file.
_pkl_2K = BLAST110_output + "/BLAST110_2K.pkl"
_pkl_5K = BLAST110_output + "/BLAST110_5K.pkl"
if not (os.path.exists(_pkl_2K) and os.path.exists(_pkl_5K)):
    dataframes_2K = []
    dataframes_5K = []
    for root, dirs, files in os.walk(BLAST110_FCS_path):
        # Sort in place so traversal (and therefore row order in the pickles)
        # does not depend on the filesystem's listing order
        dirs.sort()
        for file in sorted(files):
            # Ignore stray non-FCS files (e.g. OS metadata files)
            if not file.lower().endswith(".fcs"):
                continue
            # File names start with "<patient part 1>_<patient part 2>_<sample part>_..."
            sample_id = "_".join(file.split("_")[0:3])
            patient_id = "_".join(file.split("_")[0:2])
            # os.walk yields `root` without a trailing separator for
            # sub-directories, so join instead of concatenating
            ff = DataFrame.from_fcs(os.path.join(root, file))
            labels = pd.read_csv(BLAST110_label_path+sample_id+".csv", index_col=0)
            # Merges on the columns the two frames have in common
            ff = pd.merge(ff, labels)
            # Copy the column selection so the assignments below write to an
            # independent frame instead of a slice of the merged frame
            ff = ff[features + ["Blast", "event_ID"]].copy()
            ff["patient_id"] = patient_id
            ff["sample_id"] = sample_id
            # Sampling without replacement: pandas raises a ValueError when a
            # file holds fewer cells than requested
            dataframes_2K.append(ff.sample(n=2000, random_state=42))
            dataframes_5K.append(ff.sample(n=5000, random_state=42))

    # Pickle the output
    data = pd.concat(dataframes_2K)
    data = data.reset_index(drop=True)
    data.to_pickle(_pkl_2K)

    data = pd.concat(dataframes_5K)
    data = data.reset_index(drop=True)
    data.to_pickle(_pkl_5K)

# Models and hyperparameter space

In [None]:
def LR():
    """Build the logistic-regression candidate.

    Returns
    -------
    tuple
        ``(estimator, param_grid)``: an unfitted, class-balanced
        ``SGDClassifier`` with logistic loss, and a grid over the
        regularisation strength ``alpha`` (1e-1 down to 1e-6).
    """
    alpha_grid = 10.0**-np.arange(1,7)
    estimator = SGDClassifier(
        loss="log_loss",
        n_jobs=n_jobs,
        class_weight="balanced",
        max_iter=10000,
        random_state=42,
    )
    return estimator, {"alpha": alpha_grid}

def SVM():
    """Build the linear-SVM candidate.

    Returns
    -------
    tuple
        ``(estimator, param_grid)``: an unfitted, class-balanced
        ``SGDClassifier`` with hinge loss, and a grid over the
        regularisation strength ``alpha`` (1e-1 down to 1e-6).
    """
    alpha_grid = 10.0**-np.arange(1,7)
    estimator = SGDClassifier(
        loss="hinge",
        n_jobs=n_jobs,
        class_weight="balanced",
        max_iter=10000,
        random_state=42,
    )
    return estimator, {"alpha": alpha_grid}

def RF():
    """Build the random-forest candidate.

    Returns
    -------
    tuple
        ``(estimator, param_space)``: an unfitted, class-balanced
        ``RandomForestClassifier`` and the integer ranges to sample
        ``max_depth`` (2-10), ``min_samples_split`` (100-10000) and
        ``min_samples_leaf`` (100-10000) from.
    """
    search_space = {
        "max_depth": range(2, 11),
        "min_samples_split": range(100, 10001),
        "min_samples_leaf": range(100, 10001),
    }
    estimator = RandomForestClassifier(
        n_jobs=n_jobs,
        class_weight="balanced",
        random_state=42,
    )
    return estimator, search_space

def LightGBM():
    """Build the LightGBM candidate.

    Returns
    -------
    tuple
        ``(estimator, param_space)``: an unfitted binary
        ``LGBMClassifier`` (``is_unbalance=True``, silent logging) and the
        integer ranges to sample ``n_estimators`` (50-1000), ``num_leaves``
        (2-30), ``max_depth`` (2-10) and ``lambda_l2`` (0-200) from.
    """
    search_space = {
        "n_estimators": range(50, 1001),
        "num_leaves": range(2, 31),
        "max_depth": range(2, 11),
        "lambda_l2": range(0, 201),
    }
    estimator = lgb.LGBMClassifier(
        verbose=-1,
        n_jobs=n_jobs,
        objective="binary",
        is_unbalance=True,
        random_state=42,
    )
    return estimator, search_space

def GMMclf():
    """Build the GMM-classifier candidate.

    Returns
    -------
    tuple
        ``(estimator, param_space)``: an unfitted ``GMMClassifier`` and the
        ranges for the number of mixture components of class 0 (1-20) and
        class 1 (1-10).
    """
    search_space = {
        "n_components_class0": range(1, 21),
        "n_components_class1": range(1, 11),
    }
    estimator = GMMClassifier(random_state=42)
    return estimator, search_space

def FlowSOMclf():
    """Build the FlowSOM-classifier candidate.

    Returns
    -------
    tuple
        ``(estimator, param_grid)``: an unfitted ``FlowSOMClassifier`` and
        the candidate values for its ``ratio_threshold`` parameter.
    """
    ratio_thresholds = [0.5, 1, 2.5, 5, 7.5, 10]
    estimator = FlowSOMClassifier(random_state=42)
    return estimator, {"ratio_threshold": ratio_thresholds}
    
# Registry of candidate models: name -> (unfitted estimator, hyperparameter space).
# Every factory is called once here, so each name maps to a single estimator
# instance for the whole notebook.
models = {}
for _model_name, _model_factory in (("LR", LR),
                                    ("SVM", SVM),
                                    ("RF", RF),
                                    ("LightGBM", LightGBM),
                                    ("GMMclf", GMMclf),
                                    ("FlowSOMclf", FlowSOMclf)):
    models[_model_name] = _model_factory()

# Cross-validation

In [None]:
# Read the 2K-cells-per-file training set written by the aggregation cell
data_train = pd.read_pickle(BLAST110_output + "/BLAST110_2K.pkl")

# Feature matrix and binary target
X, y = data_train[features], data_train["Blast"]
# Patient IDs keep all cells of one patient in the same fold; sample IDs are
# used later to locate the full FCS file of each held-out sample
groups, samples = data_train["patient_id"], data_train["sample_id"]

# Both CV levels are 10-fold and grouped
inner_CV = GroupKFold(n_splits=10)
outer_CV = GroupKFold(n_splits=10)

# Modeling

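**NOTE: The following block takes multiple hours to run! Fold/model combinations whose outer-CV result file already exists are skipped, so an interrupted run can be resumed by re-running the cell.**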

In [None]:
# Nested, patient-grouped cross-validation.
# Outer loop: hold out a set of patients, tune every model on the remaining
# (sub-sampled) training cells with an inner grouped CV, then evaluate the
# refitted best model on the FULL FCS files of the held-out samples.
# Results are written per fold and model, which makes the cell resumable.
os.makedirs(BLAST110_output + "/CV/", exist_ok=True)

# Make the outer CV split
for i, (train_index, test_index) in enumerate(outer_CV.split(X, y, groups)):
    # Models that still lack an outer-CV result file for this fold. Checking
    # this first avoids reloading the full FCS files of a finished fold.
    pending_models = [model for model in models
                      if not os.path.isfile(BLAST110_output+"CV/fold"+str(i)+"_"+model+"_outerCV.csv")]
    if not pending_models:
        continue

    # Training set of this fold. The split yields positional indices, so use
    # .iloc rather than label-based [] lookup on the Series.
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    train_groups = groups.iloc[train_index]
    # Use sample IDs to load the full FCS files of the held-out samples
    sample_ids = samples.iloc[test_index].unique()

    # Load the full (not sub-sampled) test data for this outer fold
    test_frames = []
    for sample_id in sample_ids:
        ff = DataFrame.from_fcs(BLAST110_FCS_path+sample_id+"_preprocessed.fcs")
        labels = pd.read_csv(BLAST110_label_path+sample_id+".csv", index_col=0)
        ff = pd.merge(ff, labels)
        # Copy so the column assignments below do not write into a slice
        ff = ff[features + ["Blast", "event_ID"]].copy()
        # The patient ID is the sample ID without its last "_"-separated part
        ff["patient_id"] = "_".join(sample_id.split("_")[:-1])
        ff["sample_id"] = sample_id
        test_frames.append(ff)
    test_data = pd.concat(test_frames).reset_index(drop=True)

    # Perform 10-fold cross-validation in the inner folds
    for model in pending_models:
        print("Fitting")
        print(model)

        # Load the model with its associated hyperparameters
        clf, params = models[model]

        # Optimize hyperparameters; the configuration with the best mean
        # inner-CV F1 is refitted on the whole training fold (refit="f1")
        if model in ["RF", "LightGBM", "GMMclf"]:
            # Large search spaces (RF, LightGBM, GMMclf): evaluate 20 randomly
            # drawn configurations, seeded with the fold number
            opt = RandomizedSearchCV(clf, params, cv=inner_CV, n_iter=20,
                                     n_jobs=n_jobs, random_state=i, refit="f1",
                                     scoring=["accuracy", "precision", "recall", "f1"])
        else:
            # Small grids (LR, SVM, FlowSOMclf): exhaustive grid search
            opt = GridSearchCV(clf, params, cv=inner_CV, n_jobs=n_jobs, refit="f1",
                               scoring=["accuracy", "precision", "recall", "f1"])
        opt.fit(X_train, y_train, groups=train_groups)

        # Save the training results
        cv_results = pd.DataFrame(opt.cv_results_)
        cv_results.to_csv(BLAST110_output+"CV/fold"+str(i)+"_"+model+"_innerCV.csv")

        # Predict on the individual outer fold samples and save results
        model_results = []
        for sample_id in sample_ids:
            full_test = test_data[test_data["sample_id"]==sample_id]
            full_pred = opt.predict(full_test[features])

            # Ground-truth and predicted number of blast cells in this sample
            gt_count = len(full_test[full_test["Blast"]==1])
            pred_count = len(full_pred[full_pred == 1])

            # Store the results
            result_dict = {"fold": i,
                           "sample_id": sample_id,
                           "model": model,
                           "gt_count": gt_count,
                           "gt_perc": gt_count / len(full_test),
                           "pred_count": pred_count,
                           "pred_perc": pred_count / len(full_pred),
                           "accuracy": accuracy_score(full_test["Blast"], full_pred),
                           "precision": precision_score(full_test["Blast"], full_pred),
                           "recall": recall_score(full_test["Blast"], full_pred),
                           "f1": f1_score(full_test["Blast"], full_pred)}
            model_results.append(result_dict)
        # Save the model output; the existence of this file marks the
        # fold/model combination as done for later runs
        pd.DataFrame(model_results).to_csv(BLAST110_output+"CV/fold"+str(i)+"_"+model+"_outerCV.csv")

# Final GMMclf model

In [None]:
# Train a final GMMclf model based on the 5K training dataset.
# Skipped when the pickled model already exists.
if not os.path.exists(BLAST110_output+"/GMMclf.pkl"):
    # Set up training data
    # NOTE: this rebinds data_train / X / y / groups, which the
    # cross-validation cells also use; re-run the "Cross-validation" cell
    # before re-running the CV loop after this cell.
    data_train = pd.read_pickle(BLAST110_output + "/BLAST110_5K.pkl")
    X = data_train[features]
    y = data_train["Blast"]
    groups = data_train["patient_id"]
    CV = GroupKFold(n_splits=10)

    # Set up model and hyperparameter search. The estimator and its search
    # space come from the shared GMMclf() factory so they cannot drift apart
    # from the ones used during cross-validation. Unlike the randomized
    # search used there, the final model uses an exhaustive grid search.
    clf, params = GMMclf()
    opt = GridSearchCV(clf, params, cv=CV, n_jobs=n_jobs, refit="f1",
                       scoring=["accuracy", "precision", "recall", "f1"])
    opt.fit(X=X, y=y, groups=groups)

    # Save gridsearch results
    pd.DataFrame(opt.cv_results_).to_csv(BLAST110_output+"/GMMclf_gridsearch.csv")

    # Save model (best configuration by F1, refitted on all 5K data)
    clf = opt.best_estimator_
    joblib.dump(clf, BLAST110_output+"/GMMclf.pkl")

# LAIP29 evaluation

In [None]:
# Apply the final GMMclf model to every LAIP29 FCS file and store per-file
# counts and classification metrics. Skipped when the output already exists.
if not os.path.exists(LAIP29_output + "/GMMclf_predictions.csv"):
    clf = joblib.load(BLAST110_output+"/GMMclf.pkl")
    records = []
    for root, dirs, files in os.walk(LAIP29_FCS_path):
        # Sort in place so the row order of the output file does not depend on
        # the filesystem's listing order
        dirs.sort()
        for file in sorted(files):
            # File names start with "<3-part sample ID>_<tube>_..."
            sample_id = "_".join(file.split("_")[0:3])
            tube = file.split('_')[3]
            # os.walk yields `root` without a trailing separator for
            # sub-directories, so join instead of concatenating
            ff = DataFrame.from_fcs(os.path.join(root, file))
            labels = pd.read_csv(LAIP29_label_path+sample_id+"_"+tube+".csv", index_col=0, low_memory=False)
            # Merges on the columns the two frames have in common
            ff = pd.merge(ff, labels)
            ff["pred"] = clf.predict(ff[features])
            result_dict = {"file":file,
                           "total_count": len(ff),
                           "total_LAIP_cells": len(ff[ff["LAIP"]==1]),
                           # LAIP cells that the model also predicts as blasts
                           "total_conserved_LAIP_cells": len(ff[(ff["pred"]==1)&(ff["LAIP"]==1)]),
                           "gt_count": len(ff[ff["Blast"]==1]),
                           "pred_count": len(ff[ff["pred"]==1]),
                           "accuracy": accuracy_score(ff["Blast"], ff["pred"]),
                           "precision": precision_score(ff["Blast"], ff["pred"]),
                           "recall": recall_score(ff["Blast"], ff["pred"]),
                           "f1": f1_score(ff["Blast"], ff["pred"])}
            records.append(result_dict)
    results = pd.DataFrame(records)
    results.to_csv(LAIP29_output + "/GMMclf_predictions.csv")