In [None]:
import os
import pandas as pd
import numpy as np
from fcsy import DataFrame
import joblib

# Set-up

In [None]:
# ---------------------------------------------------------------------------
# INPUT FOLDERS
# ---------------------------------------------------------------------------
LAIP29_FCS_path = "data/LAIP29/FCS/"
LAIP29_label_path = "<ENTER PATH>"  # placeholder: replace with the LAIP29 label location

# These files are confidential and will not be shared online
RBM18_FCS_path = "data/RBM18/FCS/"  # preprocessed RBM18 FCS files
RBM18_label_path = "<ENTER PATH>"   # placeholder: replace with the location of the RBM18 label CSVs

# ---------------------------------------------------------------------------
# OUTPUT FOLDERS (one per dataset; written without a trailing slash)
# ---------------------------------------------------------------------------
RBM18_output = "output/RBM18"
BLAST110_output = "output/BLAST110"
LAIP29_output = "output/LAIP29"

# Columns handed to the GMM classifier (`clf.predict`).
features = [
    "SSC-A_scaled",
    "Horizon V500-A",
    "PerCP-A",
    "PC7-A",
]

# Columns handed to the reference GMMs (`refGMMs[...][...].predict`).
markers = [
    "FITC-A",
    "PE-A",
    "PerCP-A",
    "PC7-A",
    "APC-A",
    "APC-H7-A",
    "Horizon V450-A",
    "Horizon V500-A",
]

# Load models

In [None]:
# GMM classifier, stored in the BLAST110 output folder.
# NOTE(review): joblib.load unpickles the file -- only load model files you trust.
clf = joblib.load(f"{BLAST110_output}/GMMclf.pkl")

# Reference GMMs, one per gating strategy and tube, read from the RBM18 output
# folder. `percentiles` gets one empty dict per gating strategy here; nothing
# in this cell fills it.
refGMMs = {}
percentiles = {}
for gating in ["GMMclf", "manual"]:
    refGMMs[gating] = {}
    percentiles[gating] = {}
    for tube in ["P1", "P2", "P3", "P4"]:
        model_file = f"{RBM18_output}/GMMref_{tube}_{gating}.pkl"
        refGMMs[gating][tube] = joblib.load(model_file)

# Generate model components for RBM18

In [None]:
# For every gating strategy, a list of one-row DataFrames (one per FCS file)
# holding the share of each reference-GMM component among the gated events,
# expressed as a percentage of the sample's WBC events.
percentages = {"GMMclf": [], "manual": []}

# Folder that receives one label CSV per sample.
labels_dir = os.path.join(RBM18_output, "labels")

# Column that marks the gated events for each gating strategy.
gate_columns = {"manual": "Blast", "GMMclf": "GMMclf"}

# The loop only runs when the labels folder does not exist yet.
# NOTE(review): the folder is created *before* processing starts, so a run that
# fails half-way is skipped on the next execution -- delete the folder to redo it.
if not os.path.exists(labels_dir):
    os.makedirs(labels_dir)
    for root, dirs, files in os.walk(RBM18_FCS_path):
        for file in files:
            print(file)
            # File names are split on "_": the first two tokens form the
            # patient id, the first three the sample id, the third is the tube.
            name_parts = file.split("_")
            sample_id = "_".join(name_parts[0:3])
            patient_id = "_".join(name_parts[0:2])
            tube = name_parts[2]
            # os.path.join instead of plain concatenation: os.walk yields `root`
            # without a trailing separator for sub-directories, and the label
            # path is user-supplied and may or may not end with one.
            ff = DataFrame.from_fcs(os.path.join(root, file))
            labels = pd.read_csv(os.path.join(RBM18_label_path, sample_id+".csv"),
                                 index_col=0, low_memory=False)
            # Merge on the columns the two frames have in common (pandas default).
            ff = pd.merge(ff, labels)
            # Number of events flagged as WBC; denominator of the percentages.
            WBC = len(ff[ff["WBC"]==1])
            ff["GMMclf"] = clf.predict(ff[features])
            for gating in ["manual", "GMMclf"]:
                component_col = "GMMref_"+gating+"_component"
                ff[component_col] = refGMMs[gating][tube].predict(ff[markers])
                # Shift the predicted labels by one (presumably from 0-based
                # to 1-based component numbers -- confirm against the model type).
                ff[component_col] = ff[component_col] + 1
                # Get the component percentages among the gated events
                subset = ff[ff[gate_columns[gating]]==1]
                counts = pd.DataFrame(subset[component_col].value_counts())
                counts = counts.transpose().reset_index(drop=True)
                # NOTE(review): WBC == 0 yields inf/NaN here rather than an error.
                percentage = (counts / WBC) * 100
                percentage["patient_id"] = patient_id
                percentage["sample_id"] = sample_id
                percentage["tube"] = tube
                percentages[gating].append(percentage)
            # Keep only the event id and the label columns for the output file.
            ff = ff[["event_ID", "Blast", "GMMclf", "GMMref_manual_component", "GMMref_GMMclf_component"]]
            ff.to_csv(os.path.join(labels_dir, sample_id+".csv"))