In [1]:
# Data pre-processing
import os
import pandas as pd
import numpy as np
from fcsy import DataFrame
import joblib

# Modeling
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import LeaveOneGroupOut, GridSearchCV

# Set-up

In [6]:
# --- Input locations ---------------------------------------------------------
# The RBM18 files are confidential and will not be shared online.
# Folder containing the preprocessed RBM18 FCS files.
RBM18_FCS_path = "data/RBM18/FCS/"
# Folder containing the RBM18 label CSVs (read as "<sample_id>.csv").
# Replace the placeholder before running and keep the trailing "/".
RBM18_label_path = "<ENTER PATH>"

# --- Feature sets ------------------------------------------------------------
# Columns passed to the blast classifier (GMMclf).
blast_features = [
    "SSC-A_scaled",
    "Horizon V500-A",
    "PerCP-A",
    "PC7-A",
]
# Columns on which the reference GMMs are fitted and scored.
markers = [
    "FITC-A",
    "PE-A",
    "PerCP-A",
    "PC7-A",
    "APC-A",
    "APC-H7-A",
    "Horizon V450-A",
    "Horizon V500-A",
]

# --- Result locations --------------------------------------------------------
# Where this notebook writes its datasets, models and percentile tables.
RBM18_output = "output/RBM18"
# Where the pre-trained blast classifier (GMMclf.pkl) is loaded from.
BLAST110_output = "output/BLAST110"

# Parallelism handed to GridSearchCV. Under the joblib convention a value
# below -1 means "n_cpus + 1 + n_jobs" workers, i.e. -10 keeps 9 cores free.
n_jobs = -10

# Create aggregated datasets

For each tube (P1–P4), we create two aggregated datasets containing blasts from all 18 samples:
* Manually gated blasts
* Predicted blasts

Both datasets contain at most 4,000 blasts per sample; samples with more blasts are randomly downsampled to 4,000.

In [4]:
# Build the aggregated blast datasets: for every tube, one CSV with the
# manually gated blasts and one with the blasts predicted by GMMclf.
# The whole cell is skipped when the last file it writes already exists.
if not os.path.exists(RBM18_output+"/RBM18_P4_blasts_GMMclf.csv"):
    # exist_ok=True: the folder may already exist (e.g. after an interrupted
    # run), in which case a plain makedirs would raise FileExistsError.
    os.makedirs(RBM18_output, exist_ok=True)

    # Load the GMMclf
    clf = joblib.load(BLAST110_output+"/GMMclf.pkl")

    # Per-tube lists of per-sample DataFrames, concatenated at the end.
    RBM_manual = {'P1':[], 'P2':[], 'P3':[], 'P4':[]}
    RBM_GMMclf = {'P1':[], 'P2':[], 'P3':[], 'P4':[]}
    for root, dirs, files in os.walk(RBM18_FCS_path):
        # os.walk yields entries in arbitrary (filesystem) order; sort so the
        # row order of the aggregated CSVs is the same on every machine.
        dirs.sort()
        for file in sorted(files):
            # The first three "_"-separated tokens of the file name are used:
            # tokens 0-1 form the patient id, token 2 is the tube.
            name_parts = file.split("_")
            sample_id = "_".join(name_parts[0:3])
            patient_id = "_".join(name_parts[0:2])
            tube = name_parts[2]
            # os.path.join also works for files found in subdirectories,
            # where `root` carries no trailing separator.
            ff = DataFrame.from_fcs(os.path.join(root, file))
            ff["patient_id"] = patient_id
            ff["sample_id"] = sample_id
            labels = pd.read_csv(os.path.join(RBM18_label_path, sample_id+".csv"),
                                 index_col=0, low_memory=False)
            # NOTE(review): with no `on=` given, pandas joins on every column
            # name the two frames share (inner join) - confirm this is the
            # intended key between events and labels.
            ff = pd.merge(ff, labels)
            # Remove residual LAIPs identified in manual gating.
            # .copy() so the column added below goes into an independent frame
            # rather than a filtered view (avoids SettingWithCopyWarning).
            ff = ff[ff['LAIP']==0].copy()

            # Downsample and save the manually gated blasts
            manual_blasts = ff[ff["Blast"] == 1]
            if len(manual_blasts) > 4000:
                manual_blasts = manual_blasts.sample(n = 4000, random_state=42)
            RBM_manual[tube].append(manual_blasts)

            # Downsample and save the predicted blasts
            ff["GMMclf"] = clf.predict(ff[blast_features])
            GMMclf_blasts = ff[ff["GMMclf"] == 1]
            if len(GMMclf_blasts) > 4000:
                GMMclf_blasts = GMMclf_blasts.sample(n = 4000, random_state=42)
            RBM_GMMclf[tube].append(GMMclf_blasts)

    # Write one CSV per tube and gating strategy.
    for tube in ["P1", "P2", "P3", "P4"]:
        manual_blasts = pd.concat(RBM_manual[tube]).reset_index(drop=True)
        manual_blasts.to_csv(RBM18_output+"/RBM18_"+tube+"_blasts_manual.csv")

        GMMclf_blasts = pd.concat(RBM_GMMclf[tube]).reset_index(drop=True)
        GMMclf_blasts.to_csv(RBM18_output+"/RBM18_"+tube+"_blasts_GMMclf.csv")

# Train reference models

The number of mixture components (K) of each GMM is selected by a grid search over K = 1–15, using the BIC score and leave-one-patient-out cross-validation (LOOCV).

In [5]:
# BIC optimizer from https://scikit-learn.org/stable/auto_examples/mixture/plot_gmm_selection.html
def gmm_bic_score(estimator, X):
    """Scoring callable for GridSearchCV: the negated BIC of ``estimator`` on ``X``.

    GridSearchCV selects the candidate with the highest score, so the BIC
    returned by ``estimator.bic(X)`` is sign-flipped before it is returned.
    """
    bic = estimator.bic(X)
    return -bic

In [8]:
# Fit one reference GMM per tube for each blast-gating strategy
# (manually gated blasts and GMMclf-predicted blasts).
for gating in ("manual", "GMMclf"):
    # The P4 model is the last one written for a gating strategy; if it is
    # already on disk, this strategy is skipped.
    if os.path.exists(f"{RBM18_output}/GMMref_P4_{gating}.pkl"):
        continue

    for tube in ("P1", "P2", "P3", "P4"):
        data = pd.read_csv(f"{RBM18_output}/RBM18_{tube}_blasts_{gating}.csv")
        model_stem = f"{RBM18_output}/GMMref_{tube}_{gating}"

        # Grid search over the number of components (1..15), scored with the
        # negated BIC, leaving out one patient per cross-validation split.
        opt = GridSearchCV(
            GaussianMixture(random_state=42),
            param_grid={"n_components": range(1, 16)},
            scoring=gmm_bic_score,
            cv=LeaveOneGroupOut(),
            n_jobs=n_jobs,
        )
        opt.fit(data[markers], groups=data["patient_id"])

        # Save gridsearch results
        pd.DataFrame(opt.cv_results_).to_csv(model_stem + "_gridsearch.csv")

        # Save the best estimator found by the search
        joblib.dump(opt.best_estimator_, model_stem + ".pkl")

# Identify log-likelihood percentiles

In [9]:
# Percentiles (in %) of the per-cell log-likelihood to tabulate
cutoffs = [0.1, 1, 2.5, 5, 10, 25, 50]
for gating in ("manual", "GMMclf"):
    # The P4 table is the last one written for a gating strategy; if it is
    # already on disk, this strategy is skipped.
    if os.path.exists(f"{RBM18_output}/GMMref_P4_{gating}_percentiles.csv"):
        continue

    for tube in ("P1", "P2", "P3", "P4"):
        model_stem = f"{RBM18_output}/GMMref_{tube}_{gating}"
        data = pd.read_csv(f"{RBM18_output}/RBM18_{tube}_blasts_{gating}.csv")
        gmm = joblib.load(model_stem + ".pkl")

        # Score the aggregated blasts under the reference GMM fitted on them
        log_likelihood = gmm.score_samples(data[markers])

        # One-row table: a column per cutoff (named by its string form)
        cutoff_dict = {
            str(cutoff): np.percentile(log_likelihood, cutoff)
            for cutoff in cutoffs
        }
        pd.DataFrame([cutoff_dict]).to_csv(model_stem + "_percentiles.csv")