In [None]:
# Data (pre-)processing
import os
import pandas as pd
import numpy as np
from fcsy import DataFrame

# Performance & evaluation
from sklearn.model_selection import LeaveOneGroupOut, cross_val_predict

# Modeling
from sklearn.linear_model import LinearRegression
import joblib

# Set-up

In [None]:
# --- BLAST110 cohort ---
# Folder holding the preprocessed BLAST110 FCS files
BLAST110_FCS_path = "data/BLAST110/FCS/"
# Folder holding the BLAST110 label CSVs (placeholder: fill in before running,
# with a trailing separator like the FCS folder above)
BLAST110_label_path = "<ENTER PATH>"
# Folder receiving the BLAST110 results
BLAST110_output = "output/BLAST110"

# --- LAIP29 cohort ---
# Folder holding the preprocessed LAIP29 FCS files
LAIP29_FCS_path = "data/LAIP29/FCS/"
# Folder holding the LAIP29 label CSVs (placeholder: fill in before running,
# with a trailing separator like the FCS folder above)
LAIP29_label_path = "<ENTER PATH>"
# Folder receiving the LAIP29 results
LAIP29_output = "output/LAIP29"

# Trained GMM classifier, read from the BLAST110 output folder
clf = joblib.load("{}/GMMclf.pkl".format(BLAST110_output))

In [None]:
# Marker columns that are fed to the GMM for blast prediction
features = [
    "SSC-A_scaled",
    "Horizon V500-A",
    "PerCP-A",
    "PC7-A",
]

# Create train and test datasets

Each dataset is obtained by counting, for every sample, the number of cells assigned to each non-blast component of the GMM classifier (`GMMclf`).

In [None]:
def get_component_counts(file, label_path, sample_id, patient_id):
    """Count, for one sample, how many cells fall in each non-blast GMM component.

    Reads the notebook-level ``clf`` (trained GMM classifier) and ``features``
    (marker columns fed to the GMM); ``features`` must hold the marker column
    names at call time.

    Parameters
    ----------
    file : str
        Path of the sample's FCS file.
    label_path : str
        Path of the sample's label CSV; its first column is used as the index.
    sample_id, patient_id : str
        Identifiers stored in the returned row.

    Returns
    -------
    pandas.DataFrame
        A single row with one column per component of ``clf.gmm_class0``
        (labelled ``0 .. n_components - 1``, holding 0 where no cell was
        assigned), plus ``"sample_id"``, ``"patient_id"`` and ``"WBC_count"``
        (the number of rows whose ``WBC`` value equals 1).
    """
    ff = DataFrame.from_fcs(file)
    labels = pd.read_csv(label_path, index_col=0)
    # Without `on=`, pd.merge joins on every column name the two frames share.
    ff = pd.merge(ff, labels)
    # Predict the clusters based on the non-blast GMM
    ff["NB_GMM_cluster"] = clf.gmm_class0.predict(ff[features])
    # value_counts() only lists components that were actually observed.
    # Reindex over all components so an unused one is reported as 0 instead
    # of turning into a missing column (NaN once the samples are stacked).
    n_components = clf.gmm_class0.n_components
    component_counts = (
        ff["NB_GMM_cluster"]
        .value_counts()
        .reindex(range(n_components), fill_value=0)
    )
    # Format the component counts as a single row
    counts = component_counts.to_frame().transpose().reset_index(drop=True)
    counts["sample_id"] = sample_id
    counts["patient_id"] = patient_id
    counts["WBC_count"] = int((ff["WBC"] == 1).sum())
    return counts

In [None]:
def _collect_component_counts(fcs_dir, label_dir, n_sample_id_parts):
    """Build the stacked per-sample component-count table for one cohort.

    ``fcs_dir`` is walked recursively and every file found is passed to
    ``get_component_counts`` together with its label CSV
    (``<label_dir>/<sample_id>.csv``). ``sample_id`` is made of the first
    ``n_sample_id_parts`` "_"-separated tokens of the file name, and
    ``patient_id`` of the first two.

    Raises ValueError when no file is found under ``fcs_dir``.
    """
    per_sample = []
    for root, dirs, files in os.walk(fcs_dir):
        # Sorting in place fixes the traversal order, so the row order of the
        # resulting table is the same on every run.
        dirs.sort()
        for file in sorted(files):
            name_parts = file.split("_")
            sample_id = "_".join(name_parts[:n_sample_id_parts])
            patient_id = "_".join(name_parts[:2])
            label_path = os.path.join(label_dir, sample_id + ".csv")
            # os.walk yields sub-folder roots without a trailing separator,
            # so the file path must be joined rather than concatenated.
            fcs_path = os.path.join(root, file)
            per_sample.append(
                get_component_counts(fcs_path, label_path, sample_id, patient_id)
            )
    if not per_sample:
        raise ValueError("No FCS files found under " + repr(fcs_dir))
    return pd.concat(per_sample)


# Each table is only computed when its cached CSV does not exist yet.
if not os.path.exists(BLAST110_output + "/BLAST110_GMMclf_NB_counts.csv"):
    os.makedirs(BLAST110_output, exist_ok=True)
    data_train = _collect_component_counts(BLAST110_FCS_path, BLAST110_label_path, 3)
    data_train.to_csv(BLAST110_output + "/BLAST110_GMMclf_NB_counts.csv")

if not os.path.exists(LAIP29_output + "/LAIP29_GMMclf_NB_counts.csv"):
    os.makedirs(LAIP29_output, exist_ok=True)
    data_test = _collect_component_counts(LAIP29_FCS_path, LAIP29_label_path, 4)
    data_test.to_csv(LAIP29_output + "/LAIP29_GMMclf_NB_counts.csv")

# Modeling

In [None]:
# One regression input per non-blast GMM component. The labels are strings
# because the count tables are read back from CSV, where the component labels
# are text column headers.
features = [str(i) for i in range(clf.gmm_class0.n_components)]
model = LinearRegression()
logo = LeaveOneGroupOut()

train_counts_csv = BLAST110_output + "/BLAST110_GMMclf_NB_counts.csv"
train_pred_csv = BLAST110_output + "/BLAST110_GMMclf_WBC_predictions.csv"
test_counts_csv = LAIP29_output + "/LAIP29_GMMclf_NB_counts.csv"
test_pred_csv = LAIP29_output + "/LAIP29_GMMclf_WBC_predictions.csv"


def _load_component_counts(path):
    """Read a count table and make every component column present and NaN-free.

    A component that no cell was assigned to has no count, which shows up as
    a missing column or a NaN entry; both are treated as a count of 0.
    """
    counts = pd.read_csv(path)
    for col in features:
        if col not in counts.columns:
            counts[col] = 0
    counts[features] = counts[features].fillna(0)
    return counts


need_train_preds = not os.path.exists(train_pred_csv)
need_test_preds = not os.path.exists(test_pred_csv)

# The training table is needed both for the cross-validated predictions and
# for fitting the final model, so load it whenever either step has to run
# instead of relying on a variable left over from another branch or cell.
if need_train_preds or need_test_preds:
    data_train = _load_component_counts(train_counts_csv)

if need_train_preds:
    # Leave-one-patient-out cross-validated predictions on the training cohort
    data_train["WBC_count_pred"] = cross_val_predict(
        model,
        data_train[features],
        data_train["WBC_count"],
        groups=data_train["patient_id"],
        cv=logo,
    )
    # astype(int) truncates the predicted counts toward zero
    data_train["WBC_count_pred"] = data_train["WBC_count_pred"].astype(int)
    data_train.to_csv(train_pred_csv)

if need_test_preds:
    data_test = _load_component_counts(test_counts_csv)
    # Train the final model and get test set predictions
    model.fit(data_train[features], data_train["WBC_count"])

    joblib.dump(model, BLAST110_output + "/GMMclf_WBCreg.pkl")

    data_test["WBC_count_pred"] = model.predict(data_test[features])
    # astype(int) truncates the predicted counts toward zero
    data_test["WBC_count_pred"] = data_test["WBC_count_pred"].astype(int)
    data_test.to_csv(test_pred_csv)