In [5]:
import pandas as pd

CellMetadataSyn18485175 = pd.read_csv('CellMetadataSyn18485175.csv')

In [6]:
CellMatrixSyn18485175 = pd.read_parquet('CellMatrixSyn18485175.parquet')

In [9]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedShuffleSplit, RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score, recall_score, precision_score, f1_score, matthews_corrcoef, confusion_matrix, roc_curve
import matplotlib.pyplot as plt

# Assuming CellMetadataSyn18485175 and CellMatrixSyn18485175 are already loaded and preprocessed

# Determine Alzheimer's or control status
CellMetadataSyn18485175['alzheimers_or_control'] = CellMetadataSyn18485175['age_first_ad_dx'].notnull().astype(int)

# Set cell type of interest
cell_type = 'Mic'

# Filter metadata for the cell type of interest
cell_type_metadata = CellMetadataSyn18485175[CellMetadataSyn18485175['broad.cell.type'] == cell_type]

# Extract unique sample names and labels
unique_samples = cell_type_metadata[['sample', 'alzheimers_or_control']].drop_duplicates()

# Use StratifiedShuffleSplit to ensure balanced class distribution
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Perform the split
for train_index, test_index in sss.split(unique_samples['sample'], unique_samples['alzheimers_or_control']):
    sample_train, sample_test = unique_samples['sample'].values[train_index], unique_samples['sample'].values[test_index]

print(f'Samples in training set: {sample_train}')
print(f'Samples in test set: {sample_test}')
print(f'Number of unique samples in training set: {len(sample_train)}')
print(f'Number of unique samples in test set: {len(sample_test)}')


# Filter metadata based on sample numbers -- these are unique identifiers for each individual
train_metadata = cell_type_metadata[cell_type_metadata['sample'].isin(sample_train)]
test_metadata = cell_type_metadata[cell_type_metadata['sample'].isin(sample_test)]

# Extract cell names for training and testing sets
train_cell_names = train_metadata['TAG']
test_cell_names = test_metadata['TAG']

# Extract gene expression data for training and testing sets
X_train = CellMatrixSyn18485175[train_cell_names]
X_test = CellMatrixSyn18485175[test_cell_names]

# Transpose the data to have cells as rows and genes as columns
X_train = X_train.T
X_test = X_test.T

# Extract labels for training and testing sets
y_train = train_metadata.set_index('TAG').loc[train_cell_names, 'alzheimers_or_control']
y_test = test_metadata.set_index('TAG').loc[test_cell_names, 'alzheimers_or_control']

# Convert y_train and y_test to pandas Series for value_counts method
y_train_series = pd.Series(y_train)
y_test_series = pd.Series(y_test)

# Check the class balance
train_class_balance = y_train_series.value_counts()
test_class_balance = y_test_series.value_counts()

print(f'Training set class balance:\n{train_class_balance}')
print(f'Testing set class balance:\n{test_class_balance}')

#coded that was added today
import numpy as np
import pandas as pd
from sklearn.model_selection import PredefinedSplit, GridSearchCV
import lightgbm as lgb

def create_unique_balanced_folds(train_metadata, n_splits=3):
    controls = train_metadata[train_metadata['alzheimers_or_control'] == 0]['sample'].unique()
    cases = train_metadata[train_metadata['alzheimers_or_control'] == 1]['sample'].unique()
    
    np.random.shuffle(controls)
    np.random.shuffle(cases)
    
    num_controls_per_fold = len(controls) // n_splits
    num_cases_per_fold = len(cases) // n_splits
    
    # Prepare fold assignment array
    fold_assignments = np.full(len(train_metadata), -1)  # Initialize with -1 indicating unassigned
    
    for i in range(n_splits):
        start_control = i * num_controls_per_fold
        end_control = (i + 1) * num_controls_per_fold
        start_case = i * num_cases_per_fold
        end_case = (i + 1) * num_cases_per_fold
        
        fold_controls = controls[start_control:end_control]
        fold_cases = cases[start_case:end_case]
        
        fold_samples = np.concatenate((fold_controls, fold_cases))
        fold_metadata = train_metadata[train_metadata['sample'].isin(fold_samples)]
        fold_assignments[fold_metadata.index] = i
        
        # Print details about this fold
        num_controls = (fold_metadata['alzheimers_or_control'] == 0).sum()
        num_cases = (fold_metadata['alzheimers_or_control'] == 1).sum()
        print(f"Fold {i + 1}:")
        print(f"  Number of controls: {num_controls}")
        print(f"  Number of cases: {num_cases}")
        print(f"  Samples in this fold: {fold_samples}")
        print()
    
    return fold_assignments

# Custom cross-validation strategy using the manually created folds
cv_results = []
best_params_list = []
# Create the fold assignments
fold_assignments = create_unique_balanced_folds(train_metadata, n_splits=3)

# Initialize PredefinedSplit with the created folds
ps = PredefinedSplit(test_fold=fold_assignments)

# Initialize LightGBM Classifier
clf = lgb.LGBMClassifier(class_weight='balanced', random_state=42)

#Parameter grid for hyperparameter tuning
param_grid = {
    'num_leaves': [20, 30, 40],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 150, 200],
    'subsample': [0.6, 0.7, 0.8],
    'colsample_bytree': [0.6, 0.7, 0.8],
    'verbosity': [0, 1, 2]
}

# Setup GridSearchCV with PredefinedSplit
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=ps,  # Use PredefinedSplit for cross-validation
    scoring='roc_auc',  # Metric to optimize
    n_jobs=-1,  # Use all available cores
    verbose=1
)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameters from the search
best_params = grid_search.best_params_

# Best score from the search
best_score = grid_search.best_score_
print(f'Best Parameters: {best_params}')
print(f'Best Score (ROC AUC): {best_score}')
      
for i in range(len(folds)):
    fold_val_idx = folds[i]
    fold_train_idx = np.concatenate([folds[j] for j in range(len(folds)) if j != i])
    
    X_fold_train = X_train.loc[fold_train_idx]
    y_fold_train = y_train.loc[fold_train_idx]
    X_fold_val = X_train.loc[fold_val_idx]
    y_fold_val = y_train.loc[fold_val_idx]
    
    clf = lgb.LGBMClassifier(class_weight='balanced', random_state=42)
    
    random_search = RandomizedSearchCV(
        clf, param_distributions=param_dist, n_iter=30, scoring='neg_log_loss', n_jobs=-1, cv=3, random_state=42
    )
    
    random_search.fit(X_fold_train, y_fold_train)
    best_params = random_search.best_params_
    best_params_list.append(best_params)
    
    clf_best = lgb.LGBMClassifier(**best_params, class_weight='balanced', random_state=42)
    clf_best.fit(X_fold_train, y_fold_train)
    
    y_fold_prob = clf_best.predict_proba(X_fold_val)[:, 1]
    
    fold_roc_auc = roc_auc_score(y_fold_val, y_fold_prob)
    cv_results.append(fold_roc_auc)

mean_cv_roc_auc = np.mean(cv_results)



# Fit the final model on the entire training set with the best parameters found during cross-validation
clf_final = lgb.LGBMClassifier(**best_params, class_weight='balanced', random_state=42)
clf_final.fit(X_train, y_train)
y_prob = clf_final.predict_proba(X_test)[:, 1]


#Youdens stat
youden_stat = []

#iterates through the different threshold values
#for threshold in thresholds:
    #y_pred = (y_prob >= threshold).astype(int)
    #f1_scores.append(f1_score(y_test, y_pred))

for threshold in thresholds:
    y_pred = (y_prob >= threshold).astype(int)
    tpr = recall_score(y_test, y_pred, pos_label=1)
    tnr = recall_score(y_test, y_pred, pos_label=0)
    youden_stat.append(tpr + tnr - 1)

y_pred_optimal = (y_prob >= optimal_threshold).astype(int)
optimal_threshold = thresholds[np.argmax(youden_stat)]

# Calculate metrics using the optimal threshold
test_accuracy = accuracy_score(y_test, y_pred_optimal)
test_roc_auc = roc_auc_score(y_test, y_prob)
test_avg_precision = average_precision_score(y_test, y_prob)
test_recall = recall_score(y_test, y_pred_optimal)
test_precision = precision_score(y_test, y_pred_optimal)
test_f1 = f1_score(y_test, y_pred_optimal)
test_mcc = matthews_corrcoef(y_test, y_pred_optimal)

# Create a confusion matrix
cm = confusion_matrix(y_test, y_pred_optimal)

# Print results
print(f'Cell Type: {cell_type}')
print(f'Optimal Threshold: {optimal_threshold}')
print(f'Test Set Accuracy: {test_accuracy}')
print(f'Test Set ROC AUC: {test_roc_auc}')
print(f'Test Set Average Precision: {test_avg_precision}')
print(f'Test Set Recall: {test_recall}')
print(f'Test Set Precision: {test_precision}')
print(f'Test Set F1 Score: {test_f1}')
print(f'Test Set MCC: {test_mcc}')
print('Confusion Matrix:')
print(cm)

# Plot ROC curve
plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % test_roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()


Samples in training set: [12 45 36 41  4  6  7 17  8 37 31 18  1 42 13 25 40 30 32  3 10  2 26 34
 38 35 22 44 28 15 23  5 21 48 16 19 29 24]
Samples in test set: [ 9 20 39 43 11 47 33 46 14 27]
Number of unique samples in training set: 38
Number of unique samples in test set: 10
Training set class balance:
alzheimers_or_control
0    1106
1     544
Name: count, dtype: int64
Testing set class balance:
alzheimers_or_control
0    154
1    116
Name: count, dtype: int64


IndexError: index 62407 is out of bounds for axis 0 with size 1650