# Top predictable genes

In [None]:
import torch
from scipy.stats import pearsonr, spearmanr

# Placeholder data: predictions can be the output of the model and observations
# can be the raw gene expression from adata. Each entry is a 2-D tensor whose
# columns are genes, keyed by slide name.
num_slides = 10
predictions = {"slide"+str(i): torch.randn(1000, 300) for i in range(num_slides)}
observations = {"slide"+str(i): torch.randn(1000, 300) for i in range(num_slides)}

# Per-slide lists (one entry per gene) of Pearson correlation and its p-value.
pcc_dic, p_v_dic = {}, {}
for slide, prediction in predictions.items():
    # Convert each tensor once per slide rather than once per gene.
    prediction = prediction.numpy()
    observation = observations[slide].numpy()

    # The hypothesis is that the correlation between predictions and
    # observations is positive (greater than zero), hence the one-sided test.
    # Each gene is tested exactly once; statistic and p-value come from the
    # same call.
    pcc, p_v = [], []
    for g in range(observation.shape[1]):
        r, p = pearsonr(prediction[:, g], observation[:, g], alternative="greater")
        pcc.append(r)
        p_v.append(p)

    # Store under the real slide key so lookups by slide name always match,
    # whatever the naming scheme of `predictions` is.
    pcc_dic[slide] = pcc
    p_v_dic[slide] = p_v


In [None]:
!pip install scanpy
import scanpy as sc
import numpy as np

# One AnnData per slide, holding the predictions plus per-gene metadata in `.var`.
# obs_adata_dic is created here but not filled in this cell.
pred_adata_dic, obs_adata_dic = {}, {}
for slide, pred_tensor in predictions.items():
    pred_matrix = pred_tensor.numpy()
    adata = sc.AnnData(pred_matrix)

    # Per-gene correlation statistics computed earlier for this slide.
    adata.var["PCC"] = pcc_dic[slide]
    adata.var["P_value"] = p_v_dic[slide]
    adata.var["Slide"] = slide
    adata.var["Gene_names"] = [f"gene{g}" for g in range(pred_matrix.shape[1])]

    # For each gene, the number of rows whose observed expression is >= 1.
    n_cells_by_counts = (observations[slide].numpy() >= 1).sum(axis=0)
    adata.var['n_cells_by_counts'] = n_cells_by_counts
    # Fraction of the slide's rows that count as positive for each gene.
    adata.var['Abundance'] = n_cells_by_counts / adata.n_obs

    pred_adata_dic[slide] = adata


In [None]:
import pandas as pd
# Stack the per-gene `.var` tables of every slide into one long table
# (one row per slide/gene pair), following the slide order of `predictions`.
slide_tables = [pred_adata_dic[name].var for name in predictions]
df = pd.concat(slide_tables)
df


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Keep only the slide/gene rows whose p-value is below 0.05.
# NOTE(review): no multiple-testing correction is applied across genes/slides;
# confirm that a raw 0.05 cut-off is intended.
predictable_genes = df[df["P_value"] < 0.05]

plt.figure(figsize=(18, 6))
# Pass the table via `data=`: older seaborn releases do not accept the frame
# as the first positional argument, while the keyword form works everywhere.
sns.boxplot(data=predictable_genes, x="Gene_names", y="PCC")


In [None]:
# Per-gene summary of PCC over the retained rows: mean, variance and
# standard deviation, with the gene name restored as a regular column.
pcc_by_gene = predictable_genes.groupby('Gene_names')['PCC']
summary_df = pcc_by_gene.agg(
    PCC_mean='mean',
    PCC_variance='var',
    PCC_std='std',
).reset_index()

# One-standard-deviation band around the mean PCC of each gene.
pcc_center = summary_df['PCC_mean']
pcc_spread = summary_df['PCC_std']
summary_df['High_bound'] = pcc_center + pcc_spread
summary_df['Low_bound'] = pcc_center - pcc_spread
summary_df


In [None]:
# Find the predictable genes with high PCC and low variance.
# We can filter genes by ranks or threshold.
topk = 50

# Genes whose variance is undefined (NaN, e.g. a single PCC value in the group)
# cannot be called low-variance, so they are excluded from this ranking.
low_var_genes = (
    summary_df.dropna(subset=['PCC_variance'])
    .sort_values(by='PCC_variance', ascending=True)["Gene_names"]
    .head(topk)
)
high_acc_genes = summary_df.sort_values(by='PCC_mean', ascending=False)["Gene_names"].head(topk)

# Keep the overlap ordered by descending mean PCC so the result is
# reproducible; a bare set intersection yields an arbitrary order.
low_var_set = set(low_var_genes)
overlap_genes = [gene for gene in high_acc_genes if gene in low_var_set]

print("Overlap elements:", overlap_genes)