Import Data

In [1]:
import pandas as pd
import numpy as np
import os
import FlowCal
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the label data from "EU_label.xlsx"
labels_df = pd.read_excel("EU_label.xlsx")

# Assuming the label data has a column "Patient_ID" for patient IDs and a column "Label" for labels
labels_dict = dict(zip(labels_df['file_flow_id'], labels_df['label']))

# Load the marker-channel mapping data from "EU_marker_channel_mapping.xlsx"
marker_mapping_df = pd.read_excel("EU_marker_channel_mapping.xlsx")

# Filter for channels with "use" = 1
marker_channels = marker_mapping_df[marker_mapping_df['use'] == 1]['PxN(channel)'].tolist()

In [2]:
# Create lists to store feature vectors and labels
feature_vectors = []
labels = []

# Loop through sub-folders and FCS files
main_folder = "/Users/atifsarwar/Downloads/raw_fcs"  
for sub_folder in os.listdir(main_folder):
    sub_folder_path = os.path.join(main_folder, sub_folder)
    if os.path.isdir(sub_folder_path):
        patient_id = sub_folder  # Assuming the sub-folder name is the patient ID
        if patient_id in labels_dict:
            for fcs_file in os.listdir(sub_folder_path):
                if fcs_file.endswith(".fcs"):
                    fcs_file_path = os.path.join(sub_folder_path, fcs_file)
                    # Load and preprocess the FCS file for the patient
                    fcs_data = FlowCal.io.FCSData(fcs_file_path)
                    
                    # Extract data from specific marker channels based on your marker_channels list
                    features = fcs_data[:, marker_channels]
                    
                    if features is not None:
                        feature_vectors.append(features)
                        labels.append(labels_dict[patient_id])

Data Preprocessing

In [3]:
# Convert FCSData objects to NumPy arrays
arrays = [fcs_data.view(np.ndarray) for fcs_data in feature_vectors]
# Initialize empty lists for data and labels
data = []
labels_1 = []

# Iterate through the list of patient data and labels
for patient_data, label in zip(arrays, labels):
    # Iterate through the patient's data
    for row in patient_data:
        # Extend the data list with the row and the labels list with the corresponding label
        data.append(row)
        labels_1.append(label)

print(f"Values: Data_shape= {np.shape(data)}, lable_shape = {np.shape(labels_1)}")

Values: Data_shape= (5052720, 31), lable_shape = (5052720,)


Check missing Data

In [4]:
data_array = np.array(data)
missing_data_indices = np.isnan(data_array)

# Find the indices of missing data
missing_data_indices = np.isnan(data_array)

# Show the missing values
missing_values = data_array[missing_data_indices]

# Count the missing values
count_missing = np.sum(missing_data_indices)

print("Missing Values:")
print(missing_values)
print("Number of Missing Values:", count_missing)

Missing Values:
[]
Number of Missing Values: 0


Mapping labels

In [5]:
from sklearn.preprocessing import LabelEncoder

# Assuming y contains non-numeric class labels like 'Healthy' and 'Sick'
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(labels_1)

# Get the mapping from class labels to numeric values
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# Print the mapping
print(label_mapping)
# Output: {'Healthy': 0, 'Sick': 1}


{'Healthy': 0, 'Sick': 1}


ML Algorithm

In [7]:
import numpy as np
import xgboost as xgb

# Assuming X is your feature matrix and y is your target variable
# X should be a 2D array where rows are samples and columns are features
# y should be a 1D array or list of target labels

# Create an XGBoost classifier
model = xgb.XGBClassifier()

# Fit the XGBoost model to your data
model.fit(data_array, y_encoded)

# Get feature importances
feature_importances = model.feature_importances_

# Sort feature importances in descending order
sorted_feature_importances = np.argsort(feature_importances)[::-1]

# Select the top k features (e.g., top 5 features)
k = 20
selected_feature_indices = sorted_feature_importances[:k]

print("Selected Feature Indices:", selected_feature_indices)


Selected Feature Indices: [ 8 14 22  6 26  3 15 13 27 23  0 20  9 11 10  7 29 19 18  4]


In [8]:
# Extract the selected features from X
X = data_array[:, selected_feature_indices]

In [9]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Choose and train a machine learning model (Random Forest in this example)
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

Accuracy: 0.8394567678398961
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.67      0.73    331142
           1       0.85      0.92      0.89    679402

    accuracy                           0.84   1010544
   macro avg       0.83      0.79      0.81   1010544
weighted avg       0.84      0.84      0.83   1010544



In [10]:

# Create an XGBoost classifier
model = xgb.XGBClassifier()

# Train the model on the training data
model.fit(X_train, y_train)

#Prediction 
y_pred = model.predict(X_test)

# For classification problems
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.8332343767317405
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.68      0.73    331142
           1       0.85      0.91      0.88    679402

    accuracy                           0.83   1010544
   macro avg       0.82      0.79      0.80   1010544
weighted avg       0.83      0.83      0.83   1010544

