In [None]:
# Data (pre-)processing
import os
import pandas as pd
import numpy as np
from fcsy import DataFrame

import joblib

from datetime import datetime

In [None]:
# ---- Input ----
# Folder that holds the preprocessed LAIP29 FCS files
LAIP29_FCS_path = "data/LAIP29/FCS/"

# ---- Output ----
# All output folders live under one common root
OUTPUT_ROOT = "output"
BLAST110_output = f"{OUTPUT_ROOT}/BLAST110"
LAIP29_output = f"{OUTPUT_ROOT}/LAIP29"
RBM18_output = f"{OUTPUT_ROOT}/RBM18"

In [None]:
# Columns passed to the blast classifier (clf.predict)
features = [
    "SSC-A_scaled",
    "Horizon V500-A",
    "PerCP-A",
    "PC7-A",
]

# Columns scored by the reference models (score_samples)
markers = [
    "FITC-A",
    "PE-A",
    "PerCP-A",
    "PC7-A",
    "APC-A",
    "APC-H7-A",
    "Horizon V450-A",
    "Horizon V500-A",
]

In [None]:
# Fitted blast classifier and the WBC regressor that goes with it
clf = joblib.load(f"{BLAST110_output}/GMMclf.pkl")
reg = joblib.load(f"{BLAST110_output}/GMMclf_WBCreg.pkl")

# Everything below is stored as <dict>[gating][tube]
gating_strategies = ("GMMclf", "manual")
tube_names = ("P1", "P2", "P3", "P4")

ref_data = {name: {} for name in gating_strategies}
refGMMs = {name: {} for name in gating_strategies}
percentiles = {name: {} for name in gating_strategies}

for gating in gating_strategies:
    for tube in tube_names:
        # Reference table stored per tube and gating strategy
        ref_data[gating][tube] = pd.read_csv(
            f"{RBM18_output}/RBM18_{tube}_blasts_{gating}.csv", index_col=0
        )

        # Reference model for this tube/gating combination
        model_stem = f"{RBM18_output}/GMMref_{tube}_{gating}"
        refGMMs[gating][tube] = joblib.load(f"{model_stem}.pkl")

        # Only the first row of the percentile table is kept, as a
        # {column name: value} dict
        cutoffs = pd.read_csv(f"{model_stem}_percentiles.csv", index_col=0)
        percentiles[gating][tube] = cutoffs.to_dict(orient="records")[0]

In [None]:
# Time each stage of the Python pipeline for every file found below
# LAIP29_FCS_path. One dict per file is collected (file name plus one
# datetime.timedelta per stage) and turned into a DataFrame at the end.
all_results = []

# Loop-invariant: one count column per component of clf.gmm_class0, in the
# column order that is handed to the WBC regressor.
count_cols = list(range(clf.gmm_class0.n_components))

for root, dirs, files in os.walk(LAIP29_FCS_path):
    for file in files:
        print(file)
        # The name is split on "_"; the third and fourth parts are read as
        # timepoint and tube. A file with fewer parts raises IndexError here.
        # NOTE(review): assumes the tree only contains FCS files that follow
        # this naming scheme - confirm.
        name_parts = file.split("_")
        sample_id = "_".join(name_parts[0:4])
        patient_id = "_".join(name_parts[0:2])
        timepoint = name_parts[2]
        tube = name_parts[3]

        results = {"file": file}

        # Stage 1: read the FCS file.
        # os.walk yields sub-directory roots without a trailing separator, so
        # the path is joined instead of concatenated.
        startTime = datetime.now()
        ff = DataFrame.from_fcs(os.path.join(root, file))
        results["read FCS (py)"] = datetime.now() - startTime

        # Stage 2: predict blasts
        startTime = datetime.now()
        ff["GMMclf"] = clf.predict(ff[features])
        results["GMMclf"] = datetime.now() - startTime
        # Keep only the events predicted as class 1. The explicit copy makes
        # the column assignment below write to an independent frame rather
        # than to a slice of the full one.
        ff = ff[ff["GMMclf"] == 1].copy()

        # Stage 3: predict the WBC count from per-cluster event counts.
        # NOTE(review): the cluster counts are computed on the frame that was
        # already filtered to predicted class 1 - confirm this is the input
        # the regressor was trained on.
        startTime = datetime.now()
        ff["NB_GMM_cluster"] = clf.gmm_class0.predict(ff[features])
        counts = pd.DataFrame(ff["NB_GMM_cluster"].value_counts())
        counts = counts.transpose().reset_index(drop=True)
        # Clusters without any event still need a (zero) column
        for i in count_cols:
            if i not in counts:
                counts[i] = 0
        # predict() returns an array; take its single element explicitly
        # instead of relying on implicit array-to-scalar conversion.
        WBC_count_pred = int(np.ravel(reg.predict(counts[count_cols]))[0])
        results["WBC"] = datetime.now() - startTime

        # Stage 4: score the remaining events with the reference GMM that
        # belongs to this tube (GMMclf gating)
        ref_model = refGMMs["GMMclf"][tube]

        startTime = datetime.now()
        scores = ref_model.score_samples(ff[markers])
        results["LLR"] = datetime.now() - startTime
        all_results.append(results)
all_results = pd.DataFrame(all_results)

In [None]:
# Attach the timings that were measured earlier in R
R_results = pd.read_csv(f"{BLAST110_output}/R_timing.csv", index_col=0)
# Reduce every entry of the "file" column to the part after the last "/"
R_results["file"] = R_results["file"].map(lambda path: path.split("/")[-1])
# Default merge: inner join on the columns both frames have in common
all_results = all_results.merge(R_results)

In [None]:
# Convert the Python timing columns from timedeltas to float seconds.
# Columns that are already numeric are skipped, so re-running this cell is a
# no-op instead of failing on floats that have no total_seconds().
for col in all_results.columns:
    if col not in ["read FCS (py)", "GMMclf", "WBC", "LLR"]:
        continue
    if pd.api.types.is_numeric_dtype(all_results[col]):
        continue
    # to_timedelta also covers object columns holding datetime.timedelta
    all_results[col] = pd.to_timedelta(all_results[col]).dt.total_seconds()

In [None]:
# Row-wise total over the numeric columns. A 'Sum' column left over from an
# earlier run of this cell is excluded, so re-running does not add the
# previous total to itself.
summed_columns = all_results.drop(columns="Sum", errors="ignore").select_dtypes(include=[int, float])
all_results['Sum'] = summed_columns.sum(axis=1)

# Mean and standard deviation of every numeric column (including 'Sum')
numeric_results = all_results.select_dtypes(include=[int, float])
mean_values = numeric_results.mean()
std_values = numeric_results.std()

print("Mean values:\n", mean_values)
print("\nStandard Deviation values:\n", std_values)