In [1]:
import pandas as pd
import numpy as np
import os
import FlowCal
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read the label sheet. The code below uses its 'file_flow_id' column as the
# lookup key and its 'label' column as the value.
labels_df = pd.read_excel("EU_label.xlsx")

# Map each file_flow_id to its label (on duplicate ids the last row wins).
labels_dict = {
    flow_id: label
    for flow_id, label in zip(labels_df['file_flow_id'], labels_df['label'])
}

# Read the marker/channel mapping sheet.
marker_mapping_df = pd.read_excel("EU_marker_channel_mapping.xlsx")

# Keep only the channel names of rows flagged with use == 1.
use_mask = marker_mapping_df['use'] == 1
marker_channels = marker_mapping_df.loc[use_mask, 'PxN(channel)'].tolist()

In [2]:
# Build one feature matrix per FCS file, paired with the label of the
# sub-folder it was found in:
#   feature_vectors[i] -> events of one file restricted to marker_channels
#   labels[i]          -> labels_dict entry for that file's sub-folder
feature_vectors = []
labels = []

# Root folder containing one sub-folder per patient. It can be overridden with
# the RAW_FCS_DIR environment variable so the notebook is not tied to a single
# machine; the original location stays the default.
main_folder = os.environ.get("RAW_FCS_DIR", "/Users/atifsarwar/Downloads/raw_fcs")

# os.listdir returns entries in arbitrary order. Sorting makes the order of
# feature_vectors/labels (and of every row derived from them) reproducible.
for sub_folder in sorted(os.listdir(main_folder)):
    sub_folder_path = os.path.join(main_folder, sub_folder)
    if not os.path.isdir(sub_folder_path):
        continue

    patient_id = sub_folder  # Assuming the sub-folder name is the patient ID
    if patient_id not in labels_dict:
        # No label for this folder, so its files cannot be used.
        continue

    for fcs_file in sorted(os.listdir(sub_folder_path)):
        # Case-insensitive match so files named "*.FCS" are not silently skipped.
        if not fcs_file.lower().endswith(".fcs"):
            continue

        fcs_file_path = os.path.join(sub_folder_path, fcs_file)
        fcs_data = FlowCal.io.FCSData(fcs_file_path)

        # Keep only the selected marker channels (columns). Indexing either
        # returns an array or raises, so no None check is needed.
        features = fcs_data[:, marker_channels]

        feature_vectors.append(features)
        labels.append(labels_dict[patient_id])

In [3]:
# Convert FCSData objects to plain NumPy arrays (one 2-D array per FCS file).
arrays = [file_events.view(np.ndarray) for file_events in feature_vectors]

# Number of rows (events) each file contributes. The explicit integer dtype
# keeps np.repeat working when there are no files at all.
events_per_file = np.array([len(file_events) for file_events in arrays], dtype=np.intp)

# Stack all files row-wise into a single (n_events, n_channels) matrix. This
# replaces a Python loop that appended every row individually, giving the same
# rows in the same order without creating one list entry per event.
if arrays:
    data = np.concatenate(arrays, axis=0)
else:
    # Nothing was loaded: keep an empty 1-D result, matching the shape an
    # empty list would report.
    data = np.empty(0)

# Every event inherits the label of the file it came from.
labels_1 = np.repeat(np.asarray(labels), events_per_file)

# Each row must have exactly one label.
assert len(data) == len(labels_1), "row/label count mismatch"

print(f"Values: Data_shape= {np.shape(data)}, lable_shape = {np.shape(labels_1)}")

Values: Data_shape= (5052720, 31), lable_shape = (5052720,)


In [4]:
# Single seed shared by the split and the model so a re-run reproduces the
# same metrics.
RANDOM_STATE = 42

# Hold out 20% of the rows for testing.
# NOTE(review): each row is a single event, and every event carries the label
# of the file it came from (see how labels_1 is built). A row-level random
# split therefore places events from the same file in both the training and
# the test set, so the scores below are event-level and may overstate how well
# the model generalises to unseen patients. Consider a group-aware split
# (e.g. sklearn's GroupShuffleSplit keyed on file/patient) - TODO confirm the
# intended evaluation unit.
X_train, X_test, y_train, y_test = train_test_split(
    data, labels_1, test_size=0.2, random_state=RANDOM_STATE
)

# Random Forest classifier. random_state makes training deterministic;
# n_jobs=-1 builds the trees on all available cores.
model = RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1)
model.fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = model.predict(X_test)

# Overall accuracy plus per-class precision/recall/F1.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

Accuracy: 0.8467983581120664
Classification Report:
              precision    recall  f1-score   support

     Healthy       0.83      0.67      0.74    331142
        Sick       0.85      0.93      0.89    679402

    accuracy                           0.85   1010544
   macro avg       0.84      0.80      0.82   1010544
weighted avg       0.85      0.85      0.84   1010544

