In [None]:
import numpy as np
from statsmodels.stats.multitest import multipletests
from scipy.stats import ranksums
import pandas as pd
import os

# Create AUC files with cluster information

In [None]:
# Map each 5Ht basal cluster label to the list of cell IDs read from its
# cluster file. The first whitespace-separated token of every line is taken
# as the cell ID.
ht5_clust_list = {}
clusters = ['S1','S2','S3','U1']
for c in clusters:
    with open(f'../../Results/Results_no_ptprc_adgre1/Clusters_Cell_lists/5Ht_Basal_{c}.txt','r') as f:
        # Blank lines split into an empty list and would raise IndexError on
        # fields[0], so they are skipped. The `with` block closes the file,
        # so no explicit close() is needed.
        ht5_clust_list[c] = [fields[0] for fields in (line.split() for line in f) if fields]
ht5_clust_list

In [None]:
# Map each 6Ho basal cluster label to the list of cell IDs read from its
# cluster file. The first whitespace-separated token of every line is taken
# as the cell ID.
ho6_clust_list = {}
clusters = ['S1','S2','S3']
for c in clusters:
    with open(f'../../Results/Results_no_ptprc_adgre1/Clusters_Cell_lists/6Ho_Basal_{c}.txt','r') as f:
        # Blank lines split into an empty list and would raise IndexError on
        # fields[0], so they are skipped. The `with` block closes the file,
        # so no explicit close() is needed.
        ho6_clust_list[c] = [fields[0] for fields in (line.split() for line in f) if fields]
ho6_clust_list

In [None]:
AUC_5Ht = pd.read_csv('../../Results/Results_no_ptprc_adgre1/Cluster3/gene_cell_corr/Average_linkage/5Ht/Reorder_cells_reg_paper/AUC_mtx_5ht_basal_reord_paper_for_ClusterAnalysis.cdt',sep='\t')

# Skip the first two rows and remove the GID/GWEIGHT columns.
# NOTE(review): presumably the skipped rows are .cdt bookkeeping rows rather
# than cells - confirm. The 6Ho matrix is loaded with only one row skipped.
AUC_5Ht = AUC_5Ht.iloc[2:,:].drop(['GID','GWEIGHT'],axis = 1)

# Sets make the per-cell membership test cheap.
ht5_cluster_members = {key: set(cells) for key, cells in ht5_clust_list.items()}

# Label every row with its cluster. Each cell must match exactly one cluster:
# appending labels without this check lets a missing cell and a doubly
# assigned cell cancel out in length, silently shifting labels onto the
# wrong rows.
clusters = []
for cell in AUC_5Ht['Cell']:
    matches = [key for key, members in ht5_cluster_members.items() if cell in members]
    if len(matches) != 1:
        raise ValueError(
            f'Cell {cell!r} is assigned to {len(matches)} clusters {matches}; expected exactly one'
        )
    clusters.append(matches[0])

# Write the labels into 'NAME' first and rename afterwards, so the column
# keeps the position 'NAME' had in the file (if it was present).
AUC_5Ht['NAME'] = clusters
AUC_5Ht = AUC_5Ht.rename(columns={'NAME': 'Cluster'}).reset_index(drop=True)

AUC_5Ht.to_csv('../../Results/Results_no_ptprc_adgre1/Cluster_Analysis_avg_link_gene_cell_corr/AUC_mtx_with_clusters/5ht_basal_AUC_with_clusts.csv',index=False)

In [None]:
# Number of rows (cells) carrying each cluster label in the 5Ht matrix.
AUC_5Ht['Cluster'].value_counts()

In [None]:
AUC_6Ho = pd.read_csv('../../Results/Results_no_ptprc_adgre1/Cluster3/gene_cell_corr/Average_linkage/6Ho/Reorder_cells_reg_paper/AUC_mtx_6ho_basal_reord_paper_for_ClusterAnalysis.cdt',sep='\t')

# Skip the first row and remove the GID/GWEIGHT columns.
# NOTE(review): presumably the skipped row is a .cdt bookkeeping row rather
# than a cell - confirm. The 5Ht matrix is loaded with two rows skipped.
AUC_6Ho = AUC_6Ho.iloc[1:,:].drop(['GID','GWEIGHT'],axis = 1)

# Sets make the per-cell membership test cheap.
ho6_cluster_members = {key: set(cells) for key, cells in ho6_clust_list.items()}

# Label every row with its cluster. Each cell must match exactly one cluster:
# appending labels without this check lets a missing cell and a doubly
# assigned cell cancel out in length, silently shifting labels onto the
# wrong rows.
clusters = []
for cell in AUC_6Ho['Cell']:
    matches = [key for key, members in ho6_cluster_members.items() if cell in members]
    if len(matches) != 1:
        raise ValueError(
            f'Cell {cell!r} is assigned to {len(matches)} clusters {matches}; expected exactly one'
        )
    clusters.append(matches[0])

# Write the labels into 'NAME' first and rename afterwards, so the column
# keeps the position 'NAME' had in the file (if it was present).
AUC_6Ho['NAME'] = clusters
AUC_6Ho = AUC_6Ho.rename(columns={'NAME': 'Cluster'}).reset_index(drop=True)

AUC_6Ho.to_csv('../../Results/Results_no_ptprc_adgre1/Cluster_Analysis_avg_link_gene_cell_corr/AUC_mtx_with_clusters/6ho_basal_AUC_with_clusts.csv',index=False)

In [None]:
# Number of rows (cells) carrying each cluster label in the 6Ho matrix.
AUC_6Ho['Cluster'].value_counts()

# Get difference and p-value - taking the difference in mean AUC as the fold-change (FC) measure

In [None]:
def sort_dict_and_filter(dictionary,threshold):
    """
    Order each cluster's entries by their first value and keep those below a cutoff.

    Parameters:
        dictionary (dict): Maps a cluster key to an inner dict whose values are
            indexable; element 0 of each value is the sort and filter key.
        threshold: An entry is kept only if its element 0 is strictly below this.

    Returns:
        tuple: (dictionary, filtered). `dictionary` is the same object that was
        passed in, with every inner dict replaced by a copy ordered ascending
        by element 0. `filtered` maps each cluster key to the ordered subset of
        entries that passed the cutoff.
    """
    passing_by_cluster = {}
    for cluster_key in dictionary:
        ranked = dict(sorted(dictionary[cluster_key].items(), key=lambda item: item[1][0]))
        passing_by_cluster[cluster_key] = {
            name: stats for name, stats in ranked.items() if stats[0] < threshold
        }
        # The input is updated in place, so the caller's dict ends up sorted too.
        dictionary[cluster_key] = ranked
    return dictionary, passing_by_cluster

def sort_dict_and_filter_AUC_diff(dictionary,threshold):
    """
    Order each cluster's entries by their first value and keep large-magnitude ones.

    Parameters:
        dictionary (dict): Maps a cluster key to an inner dict whose values are
            indexable; element 0 is the sort key and element 2 is the quantity
            compared against the cutoff.
        threshold: An entry is kept only if abs(element 2) is strictly above this.

    Returns:
        tuple: (dictionary, filtered). `dictionary` is the same object that was
        passed in, with every inner dict replaced by a copy ordered ascending
        by element 0. `filtered` maps each cluster key to the ordered subset of
        entries that passed the cutoff.
    """
    passing_by_cluster = {}
    for cluster_key in dictionary:
        ranked = dict(sorted(dictionary[cluster_key].items(), key=lambda item: item[1][0]))
        passing_by_cluster[cluster_key] = {
            name: stats for name, stats in ranked.items() if abs(stats[2]) > threshold
        }
        # The input is updated in place, so the caller's dict ends up sorted too.
        dictionary[cluster_key] = ranked
    return dictionary, passing_by_cluster

def fdr_adjusted_pvalues(p_values):
    """
    Adjust p-values for multiple testing with the Benjamini/Hochberg FDR method.

    Parameters:
        p_values (array-like): List or array of p-values.

    Returns:
        array-like: Adjusted p-values, taken from the second element of the
        tuple returned by `multipletests`.
    """
    # method='fdr_bh' selects Benjamini/Hochberg (non-negative).
    correction = multipletests(p_values, method='fdr_bh')
    return correction[1]

# Similar Clusters

In [None]:
# Regulons present in both AUC matrices. The first two columns of each matrix
# are skipped (presumably the 'Cell' and 'Cluster' columns - verify the order).
regulons = set(AUC_5Ht.columns[2:]).intersection(AUC_6Ho.columns[2:])
common_clusts = [1,2,3]
all_cluster_stats = {}

for i in common_clusts:
    current_cluster = f'Cluster_{i}'

    # Row selection depends only on the cluster, so it is done once per
    # cluster instead of once per regulon.
    current_ht5_df = AUC_5Ht[AUC_5Ht['Cell'].isin(ht5_clust_list[f'S{i}'])]
    current_ho6_df = AUC_6Ho[AUC_6Ho['Cell'].isin(ho6_clust_list[f'S{i}'])]

    # Iterate in sorted order: set iteration order can differ between kernel
    # sessions, which would change how ties are ordered in later sorts.
    cluster_stats = {}
    for regulon in sorted(regulons):
        ht5_scores = np.array(current_ht5_df[regulon],dtype=float)
        ho6_scores = np.array(current_ho6_df[regulon],dtype=float)

        # Difference of mean AUC (6Ho minus 5Ht) and a rank-sum test between the two groups.
        AUC_diff = np.mean(ho6_scores)-np.mean(ht5_scores)
        _, pval = ranksums(ho6_scores,ht5_scores)
        cluster_stats[regulon] = [pval,AUC_diff,abs(AUC_diff)]

    # Replace each raw p-value (element 0) with its FDR-adjusted value,
    # correcting within the current cluster only.
    adjusted_pvals = fdr_adjusted_pvalues([stats[0] for stats in cluster_stats.values()])
    for stats, p_adj in zip(cluster_stats.values(), adjusted_pvals):
        stats[0] = p_adj

    all_cluster_stats[current_cluster] = cluster_stats

all_cluster_stats

In [None]:
# Order every cluster's regulons by adjusted p-value and keep those below 0.05.
# sort_dict_and_filter also sorts all_cluster_stats in place and returns it.
sorted_stats_ori, significant_stats = sort_dict_and_filter(
    dictionary=all_cluster_stats,
    threshold=0.05,
)
significant_stats

In [None]:
# From the significant regulons, keep those whose absolute AUC difference exceeds 0.15.
sorted_stats, significant_stats_AUC_diff_15 = sort_dict_and_filter_AUC_diff(
    dictionary=significant_stats,
    threshold=0.15,
)
significant_stats_AUC_diff_15

In [None]:
results_dir = '../../Results/Results_no_ptprc_adgre1/Cluster_Analysis_avg_link_gene_cell_corr/Sig_Regulons'
os.makedirs(results_dir, exist_ok=True)

# For every cluster write three CSV files that share one layout:
#   *_sorted.csv  - all regulons, ordered by adjusted p-value
#   *_sig.csv     - regulons that passed the p-value filter
#   *_15_sig.csv  - regulons that also passed the 0.15 AUC-difference filter
for c in significant_stats:
    curr_c = 'Cluster_S' + c.split('_')[-1]
    outputs = [
        ('sorted', sorted_stats_ori[c]),
        ('sig', significant_stats[c]),
        ('15_sig', significant_stats_AUC_diff_15[c]),
    ]
    for suffix, stats in outputs:
        filename = f'{results_dir}/{curr_c}_basal_reg_wilcoxon_AUC_diff_{suffix}.csv'
        with open(filename, 'w') as f:
            f.write("Regulon,P_adj,AUC_diff(6ho-5ht),Abs_AUC_diff\n")
            for key, value in stats.items():
                f.write(f"{key},{value[0]},{value[1]},{value[2]}\n")

# Unique Cluster - U1_wt

In [None]:
# Compare the U1 cells with all remaining 5Ht cells, one regulon at a time.
# The row selection does not depend on the regulon, so it is done once.
in_u1 = AUC_5Ht['Cell'].isin(ht5_clust_list['U1'])
current_u1_wt_df = AUC_5Ht[in_u1]
current_other_5ht_clust_df = AUC_5Ht[~in_u1]

# Iterate in sorted order: set iteration order can differ between kernel
# sessions, which would change how ties are ordered in later sorts.
unique_cluster_stats = {}
for regulon in sorted(regulons):
    u1_wt_scores = np.array(current_u1_wt_df[regulon],dtype=float)
    other_5ht_scores = np.array(current_other_5ht_clust_df[regulon],dtype=float)

    # Difference of mean AUC (U1 minus other 5Ht) and a rank-sum test between the two groups.
    AUC_diff = np.mean(u1_wt_scores)-np.mean(other_5ht_scores)
    _, pval = ranksums(u1_wt_scores, other_5ht_scores)
    unique_cluster_stats[regulon] = [pval,AUC_diff,abs(AUC_diff)]

unique_cluster_stats

In [None]:
# Adjust for FDR - BH correction.
# NOTE: element 0 of every entry is overwritten in place with the adjusted
# p-value. Re-running this cell without first recomputing unique_cluster_stats
# would apply the correction to already-corrected values.
pvals = [entry[0] for entry in unique_cluster_stats.values()]
adjusted_pvals = fdr_adjusted_pvalues(pvals)
for entry, p_adj in zip(unique_cluster_stats.values(), adjusted_pvals):
    entry[0] = p_adj

unique_cluster_stats

In [None]:
#Functions to sort
def sort_dict_and_filter_unique(dictionary, threshold):
    """
    Order a flat stats dict by the first element of each value and filter it.

    Parameters:
        dictionary (dict): Maps a key to an indexable value; element 0 is the
            sort and filter key.
        threshold: An entry is kept only if its element 0 is strictly below this.

    Returns:
        tuple: (sorted_dict, filtered_dict). Both are new dicts ordered
        ascending by element 0; the input dict is left untouched.
    """
    ranked = dict(sorted(dictionary.items(), key=lambda item: item[1][0]))
    passing = {name: stats for name, stats in ranked.items() if stats[0] < threshold}
    return ranked, passing


def sort_dict_and_filter_AUC_diff(dictionary, threshold):
    """
    Order stats by element 0 and keep entries whose abs(element 2) exceeds a cutoff.

    This definition replaces an earlier function of the same name that takes a
    per-cluster (nested) dict. Both layouts are accepted here, so cells written
    against the earlier definition still work if they are re-run after this one.

    Parameters:
        dictionary (dict): Either a flat dict mapping a key to an indexable
            value, or a dict mapping a cluster key to such a flat dict.
        threshold: An entry is kept only if abs(element 2) is strictly above this.

    Returns:
        tuple: For a flat dict, (sorted_dict, filtered_dict) as new dicts; the
        input is not modified. For a nested dict, (dictionary, filtered), where
        `dictionary` is the input object with each inner dict replaced by its
        sorted copy and `filtered` maps each cluster key to its filtered dict.
    """
    def _sort_and_filter(stats):
        # Sort one flat stats dict by element 0, then filter on abs(element 2).
        ranked = sorted(stats.items(), key=lambda item: item[1][0])
        kept = {name: value for name, value in ranked if abs(value[2]) > threshold}
        return dict(ranked), kept

    # Nested layout: the values are themselves dicts (one per cluster).
    if any(isinstance(value, dict) for value in dictionary.values()):
        filtered_by_cluster = {}
        for cluster_key in dictionary:
            ranked, kept = _sort_and_filter(dictionary[cluster_key])
            dictionary[cluster_key] = ranked
            filtered_by_cluster[cluster_key] = kept
        return dictionary, filtered_by_cluster

    return _sort_and_filter(dictionary)

In [None]:
# Order the U1 regulons by adjusted p-value and keep those below 0.05.
sorted_unique, filtered_pval_u1_wt = sort_dict_and_filter_unique(
    dictionary=unique_cluster_stats,
    threshold=0.05,
)

In [None]:
# From the significant U1 regulons, keep those whose absolute AUC difference exceeds 0.15.
_, filtered_pval_AUC_diff_u1_wt = sort_dict_and_filter_AUC_diff(
    dictionary=filtered_pval_u1_wt,
    threshold=0.15,
)

In [None]:
# Show the U1 regulons that passed both the p-value and the AUC-difference filters.
filtered_pval_AUC_diff_u1_wt

In [None]:
# Save the U1 results as three CSV files that share one layout:
#   *_sorted.csv  - all regulons, ordered by adjusted p-value
#   *_sig.csv     - regulons that passed the p-value filter
#   *_15_sig.csv  - regulons that also passed the 0.15 AUC-difference filter
u1_outputs = [
    ('sorted', sorted_unique),
    ('sig', filtered_pval_u1_wt),
    ('15_sig', filtered_pval_AUC_diff_u1_wt),
]
for suffix, stats in u1_outputs:
    filename = f'{results_dir}/U1_wt_basal_reg_wilcoxon_AUC_diff_{suffix}.csv'
    # The `with` block closes the file, so no explicit close() is needed.
    with open(filename, 'w') as f:
        f.write("Regulon,P_adj,AUC_diff(U1 - other 5ht),Abs_AUC_diff\n")
        for key, value in stats.items():
            f.write(f"{key},{value[0]},{value[1]},{value[2]}\n")