In [None]:
import pandas as pd

# Load the cell metadata table from CSV. Later cells read its 'TAG', 'sample',
# 'broad.cell.type' and 'age_first_ad_dx' columns.
CellMetadataSyn18485175 = pd.read_csv(
    filepath_or_buffer='CellMetadataSyn18485175.csv',
)

In [None]:
# Load the expression matrix from Parquet. Later cells select its columns by the
# metadata 'TAG' values and transpose the selection before modelling.
CellMatrixSyn18485175 = pd.read_parquet(
    path='CellMatrixSyn18485175.parquet',
)

In [None]:
import numpy as np
from sklearn.metrics import roc_curve

# Toy example: pick a decision threshold from a ROC curve with Youden's J.
# roc_curve requires exactly one score per label, so the two arrays below must
# always have the same length.
y_true = np.array([0, 1])
y_scores = np.array([0.2, 0.8])  # placeholder scores, one per entry of y_true

# ROC curve and the candidate thresholds it was evaluated at
fpr, tpr, thresholds = roc_curve(y_true, y_scores)

# Youden's J statistic (J = TPR - FPR) for every candidate threshold
youden_j = tpr - fpr
# thresholds[0] is an artificial "predict nothing" entry added by roc_curve
# (infinite in recent scikit-learn releases) and its J is always 0, so it is
# left out of the search; the +1 maps the result back onto `thresholds`.
optimal_idx = np.argmax(youden_j[1:]) + 1
optimal_threshold = thresholds[optimal_idx]

print(f"Optimal threshold based on Youden's J statistic: {optimal_threshold}")

In [None]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import (accuracy_score, classification_report, roc_auc_score,
                             average_precision_score, recall_score, precision_score,
                             f1_score, matthews_corrcoef, roc_curve)

print("step 1")
# Binary label per metadata row: 1 when 'age_first_ad_dx' holds a value, 0 when it is missing
CellMetadataSyn18485175['alzheimers_or_control'] = (
    CellMetadataSyn18485175['age_first_ad_dx'].notna().astype(int)
)

# Distinct values of the 'broad.cell.type' column; one model is trained per value
cell_types = CellMetadataSyn18485175['broad.cell.type'].unique()

# Per-cell-type results, keyed by cell type
results = dict()

print("step 2")
# Split at the level of 'sample' values (half for training, half for testing)
# so that every row of a given sample ends up on one side only
sample_train, sample_test = train_test_split(
    CellMetadataSyn18485175['sample'].unique(),
    test_size=0.5,
    random_state=42,
)

# Metadata rows belonging to the training samples and to the test samples
train_metadata = CellMetadataSyn18485175.loc[
    CellMetadataSyn18485175['sample'].isin(sample_train)
]
test_metadata = CellMetadataSyn18485175.loc[
    CellMetadataSyn18485175['sample'].isin(sample_test)
]

print("step 3")
# Number of stratified folds used to tune the decision threshold
n_splits = 5
# The splitter is configured identically for every cell type, so build it once
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Train and evaluate one classifier per cell type.
# NOTE(review): the folds below are drawn over cells, so cells of one sample can
# appear on both sides of a fold -- confirm whether sample-grouped folds are wanted.
for cell_type in cell_types:
    print(f'Processing cell type: {cell_type}')

    # Filter metadata for the current cell type
    cell_type_train_metadata = train_metadata[train_metadata['broad.cell.type'] == cell_type]
    cell_type_test_metadata = test_metadata[test_metadata['broad.cell.type'] == cell_type]

    # Skip cell types that cannot be modelled: every fold needs cells of both
    # classes (at least n_splits per class), and the ROC-based test metrics are
    # undefined when the test set holds a single class or is empty.
    train_class_counts = cell_type_train_metadata['alzheimers_or_control'].value_counts()
    test_class_total = cell_type_test_metadata['alzheimers_or_control'].nunique()
    if (
        len(train_class_counts) < 2
        or train_class_counts.min() < n_splits
        or test_class_total < 2
    ):
        print(f'Skipping cell type {cell_type}: too few cells of one class in train or test')
        continue

    # Extract cell names for training and testing sets
    train_cell_names = cell_type_train_metadata['TAG']
    test_cell_names = cell_type_test_metadata['TAG']

    # Select the matrix columns named by the cell TAGs, then transpose so that
    # each selected cell becomes a row
    X_train = CellMatrixSyn18485175[train_cell_names].T
    X_test = CellMatrixSyn18485175[test_cell_names].T

    # Extract labels in the same TAG order as the rows of X_train / X_test
    y_train = cell_type_train_metadata.set_index('TAG').loc[train_cell_names, 'alzheimers_or_control']
    y_test = cell_type_test_metadata.set_index('TAG').loc[test_cell_names, 'alzheimers_or_control']

    # Initialize LGBMClassifier with class weights balanced
    clf = lgb.LGBMClassifier(class_weight='balanced', random_state=42)

    print("step 4")

    cv_scores = []
    youden_thresholds = []
    for train_index, val_index in skf.split(X_train, y_train):
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

        # Fit the model on the current fold
        clf.fit(X_train_fold, y_train_fold)

        # Predict probabilities on the validation fold
        y_val_prob = clf.predict_proba(X_val_fold)[:, 1]

        # Calculate ROC curve and Youden's J statistic (J = TPR - FPR)
        fpr, tpr, thresholds = roc_curve(y_val_fold, y_val_prob)
        youden_j = tpr - fpr
        # thresholds[0] is an artificial "predict nothing" entry added by
        # roc_curve (infinite in recent scikit-learn releases) with J == 0.
        # Selecting it would make the averaged threshold infinite, so it is
        # excluded; the +1 maps the result back onto `thresholds`.
        optimal_idx = np.argmax(youden_j[1:]) + 1
        optimal_threshold = thresholds[optimal_idx]
        youden_thresholds.append(optimal_threshold)

        # Predict using the optimal threshold
        y_val_pred = (y_val_prob >= optimal_threshold).astype(int)

        # Calculate accuracy for the fold
        fold_accuracy = accuracy_score(y_val_fold, y_val_pred)
        cv_scores.append(fold_accuracy)

    # Final threshold: mean of the per-fold Youden thresholds
    final_threshold = np.mean(youden_thresholds)

    # Fit the model on the full training set and predict on the test set
    clf.fit(X_train, y_train)
    y_prob = clf.predict_proba(X_test)[:, 1]
    y_pred = (y_prob >= final_threshold).astype(int)

    # Threshold-free metrics use y_prob; thresholded metrics use y_pred
    test_accuracy = accuracy_score(y_test, y_pred)
    test_roc_auc = roc_auc_score(y_test, y_prob)
    test_avg_precision = average_precision_score(y_test, y_prob)
    test_recall = recall_score(y_test, y_pred)
    test_precision = precision_score(y_test, y_pred)
    test_f1 = f1_score(y_test, y_pred)
    test_mcc = matthews_corrcoef(y_test, y_pred)

    # Store results
    results[cell_type] = {
        'cross_val_scores': cv_scores,
        'final_threshold': final_threshold,
        'accuracy': test_accuracy,
        'roc_auc': test_roc_auc,
        'average_precision': test_avg_precision,
        'recall': test_recall,
        'precision': test_precision,
        'f1': test_f1,
        'mcc': test_mcc,
        'classification_report': classification_report(y_test, y_pred)
    }

print("step 5")
# Report every stored metric for each cell type held in `results`
for cell_type, result in results.items():
    print(f'Cell Type: {cell_type}')
    print(f'Cross-Validation Scores: {result["cross_val_scores"]}')
    print(f'Optimal Threshold from Youden: {result["final_threshold"]}')
    print(f'Test Set Accuracy: {result["accuracy"]}')
    print(f'Test Set ROC AUC: {result["roc_auc"]}')
    # Stored under 'average_precision' alongside the other metrics
    print(f'Test Set Average Precision: {result["average_precision"]}')
    print(f'Test Set Recall: {result["recall"]}')
    print(f'Test Set Precision: {result["precision"]}')
    print(f'Test Set F1 Score: {result["f1"]}')
    print(f'Test Set MCC: {result["mcc"]}')
    print('Classification Report:')
    print(result['classification_report'])
    print('\n' + '='*40 + '\n')


In [None]:
from sklearn.metrics import roc_curve
import numpy as np

# Stand-alone re-run of the threshold-tuning cross-validation.
# This cell defines none of skf, clf, X_train, y_train, X_test or accuracy_score;
# it uses whatever the per-cell-type loop left in the kernel, i.e. the data of the
# last cell type that loop processed.

# Start from fresh accumulators. The cv_scores list left by the loop is the very
# object stored under results[...]['cross_val_scores'], so appending to it would
# silently alter the stored results.
cv_scores = []
youden_thresholds = []

for train_index, val_index in skf.split(X_train, y_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Fit the model on the current fold
    clf.fit(X_train_fold, y_train_fold)

    # Predict probabilities on the validation fold
    y_val_prob = clf.predict_proba(X_val_fold)[:, 1]  # y_scores

    # Calculate ROC curve using true labels and predicted probabilities
    fpr, tpr, thresholds = roc_curve(y_val_fold, y_val_prob)  # y_true = y_val_fold

    # Youden's J statistic (J = TPR - FPR) for every candidate threshold
    youden_j = tpr - fpr
    # thresholds[0] is an artificial "predict nothing" entry added by roc_curve
    # (infinite in recent scikit-learn releases) with J == 0. Selecting it would
    # make the averaged threshold infinite, so it is excluded; the +1 maps the
    # result back onto `thresholds`.
    optimal_idx = np.argmax(youden_j[1:]) + 1
    optimal_threshold = thresholds[optimal_idx]
    youden_thresholds.append(optimal_threshold)

    # Predict using the optimal threshold
    y_val_pred = (y_val_prob >= optimal_threshold).astype(int)

    # Calculate accuracy for the fold
    fold_accuracy = accuracy_score(y_val_fold, y_val_pred)
    cv_scores.append(fold_accuracy)

# Final threshold: mean of the per-fold Youden thresholds
final_threshold = np.mean(youden_thresholds)

# Fit the model on the full training set and predict on the test set
clf.fit(X_train, y_train)
y_prob = clf.predict_proba(X_test)[:, 1]  # y_scores for the test set

# Use the final threshold determined from cross-validation
y_pred = (y_prob >= final_threshold).astype(int)