In [1]:
import pandas as pd

CellMetadataSyn18485175 = pd.read_csv('CellMetadataSyn18485175.csv')

In [2]:
CellMatrixSyn18485175 = pd.read_parquet('CellMatrixSyn18485175.parquet')

In [3]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import (accuracy_score, classification_report, roc_auc_score,
                             average_precision_score, recall_score, precision_score,
                             f1_score, matthews_corrcoef)
from sklearn.ensemble import HistGradientBoostingClassifier

print("step 1")
# Determine Alzheimer's or control status
CellMetadataSyn18485175['alzheimers_or_control'] = CellMetadataSyn18485175['age_first_ad_dx'].notnull().astype(int)

# Identify unique cell types
cell_types = CellMetadataSyn18485175['broad.cell.type'].unique()

# Dictionary to store results for each cell type
results = {}

print("step 2")
# Split data by sample_number
sample_train, sample_test = train_test_split(
    CellMetadataSyn18485175['sample'].unique(), test_size=0.5, random_state=42
)

# Filter metadata based on sample numbers
train_metadata = CellMetadataSyn18485175[CellMetadataSyn18485175['sample'].isin(sample_train)]
test_metadata = CellMetadataSyn18485175[CellMetadataSyn18485175['sample'].isin(sample_test)]

print("step 3")
# Iterate over each cell type
for cell_type in cell_types:
    print(f'Processing cell type: {cell_type}')
    
    # Filter metadata for the current cell type
    cell_type_train_metadata = train_metadata[train_metadata['broad.cell.type'] == cell_type]
    cell_type_test_metadata = test_metadata[test_metadata['broad.cell.type'] == cell_type]
    
    # Extract cell names for training and testing sets
    train_cell_names = cell_type_train_metadata['TAG']
    test_cell_names = cell_type_test_metadata['TAG']
    
    # Extract gene expression data for training and testing sets
    X_train = CellMatrixSyn18485175[train_cell_names]
    X_test = CellMatrixSyn18485175[test_cell_names]
    
    # Transpose the data to have cells as rows and genes as columns
    X_train = X_train.T
    X_test = X_test.T
    
    # Extract labels for training and testing sets
    y_train = cell_type_train_metadata.set_index('TAG').loc[train_cell_names, 'alzheimers_or_control']
    y_test = cell_type_test_metadata.set_index('TAG').loc[test_cell_names, 'alzheimers_or_control']
    
    # Initialize LGBMClassifier with class weights balanced
    clf = lgb.LGBMClassifier(class_weight='balanced', random_state=42)
    
    # Perform stratified K-fold cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    print("step 4")
    
    cv_scores = []
    for train_index, val_index in skf.split(X_train, y_train):
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # Fit the model on the current fold
        clf.fit(X_train_fold, y_train_fold)
        
        # Predict on the validation fold
        y_val_pred = clf.predict(X_val_fold)
        y_val_prob = clf.predict_proba(X_val_fold)[:, 1]
        
        # Calculate accuracy for the fold
        fold_accuracy = accuracy_score(y_val_fold, y_val_pred)
        cv_scores.append(fold_accuracy)
    
    # Fit the model on the full training set and predict on the test set
    #clf = HistGradientBoostingClassifier().fit(X_train, y_train)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_prob = clf.predict_proba(X_test)[:, 1]
    
    # Calculate metrics
    test_accuracy = accuracy_score(y_test, y_pred)
    test_roc_auc = roc_auc_score(y_test, y_prob)
    test_avg_precision = average_precision_score(y_test, y_prob)
    test_recall = recall_score(y_test, y_pred)
    test_precision = precision_score(y_test, y_pred)
    test_f1 = f1_score(y_test, y_pred)
    test_mcc = matthews_corrcoef(y_test, y_pred)
    
    # Store results
    results[cell_type] = {
        'cross_val_scores': cv_scores,
        'accuracy': test_accuracy,
        'roc_auc': test_roc_auc,
        'average_precision': test_avg_precision,
        'recall': test_recall,
        'precision': test_precision,
        'f1': test_f1,
        'mcc': test_mcc, #Matthewscorrelationcoefficient
        'classification_report': classification_report(y_test, y_pred)
    }

print("step 5")
# Print result for each cell type
for cell_type, result in results.items():
    print(f'Cell Type: {cell_type}')
    print(f'Cross-Validation Scores: {result["cross_val_scores"]}')
    print(f'Test Set Accuracy: {result["accuracy"]}')
    print(f'Test Set ROC AUC: {result["roc_auc"]}')
    print(f'Test Set Recall: {result["recall"]}')
    print(f'Test Set Precision: {result["precision"]}')
    print(f'Test Set F1 Score: {result["f1"]}')
    print(f'Test Set MCC: {result["mcc"]}')
    print('Classification Report:')
    print(result['classification_report'])
    print('\n' + '='*40 + '\n')



step 1
step 2
step 3
Processing cell type: Ex
step 4
[LightGBM] [Info] Number of positive: 2787, number of negative: 9860
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 5.611169 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 95416
[LightGBM] [Info] Number of data points in the train set: 12647, number of used features: 15836
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 2786, number of negative: 9861
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 5.645761 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 95280
[LightGBM] [Info] Number of data points in the train set: 12647, number of used features: 15843
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [