In [1]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns

In [2]:
def filter_GeneRanks(generank_dir, output_dir, experiment, pval_threshold=0.01):
    """
    Keep the genes whose p-values pass the threshold in every stage and save their ranks.

    Reads `GeneRanks.tsv` and `GeneRanks_PValues.tsv` from `generank_dir`, keeps the
    four stage columns for the given experiment type, renames them to the shared
    labels ('Naive', 'Early Extra-GC TFH', 'Late Extra-GC TFH', 'GC'), and retains
    only genes whose p-value is below `pval_threshold` in all four columns.

    Parameters:
    -----------
    generank_dir : str
        Directory containing `GeneRanks.tsv` and `GeneRanks_PValues.tsv`
    output_dir : str
        Directory where `<experiment>_processed.csv` is written
    experiment : str
        One of 'ATAC_RNA', 'H3K4me3_RNA', 'H3K4me1_RNA', 'H3K27Ac_RNA'
    pval_threshold : float
        Exclusive upper bound a gene's p-value must satisfy in every stage (default: 0.01)

    Returns:
    --------
    df_filtered : pd.DataFrame
        Gene-indexed rank table restricted to the genes that passed the filter

    Raises:
    -------
    ValueError
        If `experiment` is not one of the supported experiment names
    """
    atac_columns = {
        'Naive': 'Naive',
        'Early_Pre_TFH': 'Early Extra-GC TFH',
        'Late_Pre_TFH': 'Late Extra-GC TFH',
        'GC': 'GC',
    }
    cutnrun_columns = {
        'Naive': 'Naive',
        'CXCR5pos_PD1neg': 'Early Extra-GC TFH',
        'CXCR5pos_PD1int': 'Late Extra-GC TFH',
        'CXCR5hi_PD1hi': 'GC',
    }
    column_maps = {
        'ATAC_RNA': atac_columns,
        'H3K4me3_RNA': cutnrun_columns,
        'H3K4me1_RNA': cutnrun_columns,
        'H3K27Ac_RNA': cutnrun_columns,
    }

    # Fail early: an unrecognised experiment would otherwise leave 'Genes' as a
    # string column and break the numeric p-value comparison further down.
    if experiment not in column_maps:
        raise ValueError(
            "Unknown experiment '{0}'; expected one of {1}".format(experiment, sorted(column_maps))
        )
    column_map = column_maps[experiment]

    def _load(filename):
        # Both tables share one layout, so they are selected and renamed identically.
        table = pd.read_csv("{0}/{1}".format(generank_dir, filename), sep='\t')
        table = table.rename(columns={"Unnamed: 0": "Genes"})
        table = table[['Genes'] + list(column_map)].set_index('Genes')
        return table.rename(columns=column_map)

    df = _load("GeneRanks.tsv")
    df_pval = _load("GeneRanks_PValues.tsv")

    # Filter by p-value: a gene must pass in every stage column
    passes_all = (df_pval < pval_threshold).all(axis=1)
    df_filtered = df[df.index.isin(df_pval.index[passes_all])]

    df_filtered.to_csv('{0}/{1}_processed.csv'.format(output_dir, experiment))

    return df_filtered

In [3]:
### Rank the genes within each of the 4 Taiji result sets, reading the `<experiment>_processed.csv` files (GeneRanks.tsv with processed column names)
def rank_TFs(analysis, experiment_list):
    """
    Convert each stage column of the processed tables into a fractional rank.

    For every experiment, `<analysis>/<experiment>_processed.csv` is read and, one
    stage column at a time, the rows are sorted in descending order and given a
    `<stage>_rank` column equal to (position - 1) / n_rows. The top row gets 1e-4
    instead of 0 (presumably so a later log transform stays finite — confirm).
    The rank columns are written to `<analysis>/<experiment>_ranked.csv`.

    Parameters:
    -----------
    analysis : str
        Directory holding the `_processed.csv` inputs; also receives the `_ranked.csv` outputs
    experiment_list : list of str
        Experiment identifiers to process

    Returns:
    --------
    rank_df : pd.DataFrame or None
        Rank table of the last experiment processed, or None when
        `experiment_list` is empty
    """
    cols_list = ['Naive', 'Early Extra-GC TFH', 'Late Extra-GC TFH', 'GC']
    rank_cols = ['{0}_rank'.format(col) for col in cols_list]

    # Defined up front so an empty experiment list returns None rather than
    # raising UnboundLocalError at the return statement.
    rank_df = None

    for experiment in experiment_list:
        print('Processing experiment: {0}'.format(experiment))
        all_rank_df = pd.read_csv('{0}/{1}_processed.csv'.format(analysis, experiment)).set_index('Genes')

        # The rank depends only on row position, so one array serves every column.
        n_genes = len(all_rank_df)
        if n_genes:
            positional_ranks = np.arange(n_genes) / n_genes
            positional_ranks[0] = 10**-4
        else:
            positional_ranks = np.array([], dtype=float)

        for col in cols_list:
            print('Sorting by {0}'.format(col))
            # Each sort carries the rank columns already added; the final row
            # order is therefore the order of the last column sorted.
            all_rank_df = all_rank_df.sort_values(col, ascending=False)
            all_rank_df['{0}_rank'.format(col)] = positional_ranks

        rank_df = all_rank_df[rank_cols]
        print(rank_df)

        rank_df.to_csv('{0}/{1}_ranked.csv'.format(analysis, experiment), index=True)

    return rank_df

In [4]:
def get_top_genes(experiment_list, top_num, analysis_dir=None):
    """
    Collect the best-ranked genes of every experiment and save their -log ranks.

    For each experiment, `<analysis_dir>/<experiment>_ranked.csv` is read, genes are
    ordered by their smallest rank across the stage columns, and the first
    `top_num` are kept. The selections are stacked in experiment order and
    de-duplicated (first occurrence wins), then the rank values are transformed
    with -log and written to
    `<analysis_dir>/final_genes_all_sets_<number of genes>.csv`.

    NOTE: a later cell in this notebook defines another `get_top_genes` (with an
    `exclude_znf` option); whichever cell runs last is the one in effect.

    Parameters:
    -----------
    experiment_list : list of str
        Experiment identifiers; every entry is read, in the given order
    top_num : int
        Number of top genes to select from each dataset
    analysis_dir : str, optional
        Directory containing the `_ranked.csv` files. Defaults to the
        notebook-level `analysis` variable when omitted.

    Returns:
    --------
    final_genes : pd.DataFrame
        Gene-indexed -log-transformed rank values

    Raises:
    -------
    ValueError
        If `experiment_list` is empty
    """
    if analysis_dir is None:
        # Fall back to the notebook global so existing calls keep working.
        analysis_dir = analysis
    if not experiment_list:
        raise ValueError("experiment_list must contain at least one experiment")

    rank_cols = ['Naive_rank', 'Early Extra-GC TFH_rank', 'Late Extra-GC TFH_rank', 'GC_rank']

    top_per_experiment = []
    for experiment in experiment_list:
        ranked = pd.read_csv('{0}/{1}_ranked.csv'.format(analysis_dir, experiment)).set_index('Genes')

        ### Sort each dataset by Minimum Rank:
        ranked['Minimum'] = ranked.min(axis=1)
        ranked_ind = ranked.sort_values('Minimum', ascending=True).reset_index()

        ### Pick the top genes from each dataset
        top_per_experiment.append(ranked_ind[:top_num])

    top_genes = pd.concat(top_per_experiment, axis=0).reset_index(drop=True)

    final_genes = top_genes[['Genes'] + rank_cols]
    final_genes = final_genes.drop_duplicates(subset=['Genes']).set_index('Genes')
    final_genes = -np.log(final_genes)

    final_genes.to_csv("{0}/final_genes_all_sets_{1}.csv".format(analysis_dir, len(final_genes)))

    return final_genes
    

In [5]:
def get_top_genes(experiment_list, top_num, exclude_znf=True, analysis_dir=None):
    """
    Load ranked gene data from several experiments and return the top genes' -log ranks.

    For each experiment, `<analysis_dir>/<experiment>_ranked.csv` is read, genes are
    ordered by their smallest rank across the stage columns, and the first
    `top_num` are kept. The selections are stacked in experiment order, ZNF genes
    are optionally removed, duplicates are dropped (first occurrence wins), and
    the rank values are transformed with -log. Nothing is written to disk.

    Parameters:
    -----------
    experiment_list : list of str
        Experiment identifiers, e.g. in the order [ATAC, H3K4me1, H3K4me3, H3K27Ac];
        every entry is read, in the given order
    top_num : int
        Number of top genes to select from each dataset
    exclude_znf : bool
        Whether to exclude genes starting with 'ZNF' from the top list (default: True).
        The filter runs after the per-experiment selection, so removed genes are
        not replaced by the next-best ones.
    analysis_dir : str, optional
        Directory containing the `_ranked.csv` files. Defaults to the
        notebook-level `analysis` variable when omitted.

    Returns:
    --------
    final_genes : pd.DataFrame
        -log-transformed rank values of top genes (after optional filtering)

    Raises:
    -------
    ValueError
        If `experiment_list` is empty
    """
    if analysis_dir is None:
        # Fall back to the notebook global so existing calls keep working.
        analysis_dir = analysis
    if not experiment_list:
        raise ValueError("experiment_list must contain at least one experiment")

    rank_cols = ['Naive_rank', 'Early Extra-GC TFH_rank', 'Late Extra-GC TFH_rank', 'GC_rank']

    top_per_experiment = []
    for experiment in experiment_list:
        ranked = pd.read_csv(f'{analysis_dir}/{experiment}_ranked.csv').set_index('Genes')
        # Compute minimum rank for sorting
        ranked['Minimum'] = ranked.min(axis=1)
        # Sort by minimum rank, move 'Genes' back to a column, keep the leading rows
        top_per_experiment.append(ranked.sort_values('Minimum').reset_index()[:top_num])

    # Concatenate top genes
    top_genes = pd.concat(top_per_experiment, axis=0).reset_index(drop=True)

    # Optional: Remove ZNF genes
    if exclude_znf:
        # na=False keeps the mask boolean even if a gene name was parsed as missing.
        is_znf = top_genes['Genes'].str.startswith('ZNF', na=False)
        top_genes = top_genes[~is_znf]

    # Extract relevant rank columns and drop duplicates
    final_genes = top_genes[['Genes'] + rank_cols]
    final_genes = final_genes.drop_duplicates(subset=['Genes']).set_index('Genes')

    # Log transform
    final_genes = -np.log(final_genes)

    return final_genes

In [6]:
def get_all_gene_ranks(experiment_list, exclude_znf=True, analysis_dir=None):
    """
    Load ranked gene data from several experiments and return every gene's -log ranks.

    For each experiment, `<analysis_dir>/<experiment>_ranked.csv` is read and its
    genes are ordered by their smallest rank across the stage columns. All rows of
    all experiments are stacked in experiment order, ZNF genes are optionally
    removed, duplicates are dropped (first occurrence wins, so a gene takes its
    values from the earliest experiment that lists it), and the rank values are
    transformed with -log. No per-experiment cutoff is applied.

    Parameters:
    -----------
    experiment_list : list of str
        Experiment identifiers, e.g. in the order [ATAC, H3K4me1, H3K4me3, H3K27Ac];
        every entry is read, in the given order
    exclude_znf : bool
        Whether to exclude genes starting with 'ZNF' (default: True)
    analysis_dir : str, optional
        Directory containing the `_ranked.csv` files. Defaults to the
        notebook-level `analysis` variable when omitted.

    Returns:
    --------
    final_genes : pd.DataFrame
        -log-transformed rank values of all genes (after optional filtering)

    Raises:
    -------
    ValueError
        If `experiment_list` is empty
    """
    if analysis_dir is None:
        # Fall back to the notebook global so existing calls keep working.
        analysis_dir = analysis
    if not experiment_list:
        raise ValueError("experiment_list must contain at least one experiment")

    rank_cols = ['Naive_rank', 'Early Extra-GC TFH_rank', 'Late Extra-GC TFH_rank', 'GC_rank']

    ranked_per_experiment = []
    for experiment in experiment_list:
        ranked = pd.read_csv(f'{analysis_dir}/{experiment}_ranked.csv').set_index('Genes')
        # Compute minimum rank for sorting
        ranked['Minimum'] = ranked.min(axis=1)
        # Sort by minimum rank and move 'Genes' back to a column
        ranked_per_experiment.append(ranked.sort_values('Minimum').reset_index())

    # Concatenate the full tables of every experiment
    all_ranked = pd.concat(ranked_per_experiment, axis=0).reset_index(drop=True)

    # Optional: Remove ZNF genes
    if exclude_znf:
        # na=False keeps the mask boolean even if a gene name was parsed as missing.
        is_znf = all_ranked['Genes'].str.startswith('ZNF', na=False)
        all_ranked = all_ranked[~is_znf]

    # Extract relevant rank columns and drop duplicates
    final_genes = all_ranked[['Genes'] + rank_cols]
    final_genes = final_genes.drop_duplicates(subset=['Genes']).set_index('Genes')

    # Log transform
    final_genes = -np.log(final_genes)

    return final_genes

In [7]:
# Taiji output directories, one per experiment, listed in the same order as
# `experiment_list` (the two lists are paired by position when filtering).
generank_dir = [
    '/ix/djishnu/Alisa/Tfh/Taiji/ATAC_RNA/output_atac',
    '/ix/djishnu/Alisa/Tfh/Taiji/CUTnRUN_H3K4me1_RNA/output_cutnrun',
    '/ix/djishnu/Alisa/Tfh/Taiji/CUTnRUN_H3K4me3_RNA/output_cutnrun',
    '/ix/djishnu/Alisa/Tfh/Taiji/CUTnRUN_H3K27Ac_RNA/output_cutnrun',
]

# Directory for the processed / ranked tables. Alternative location kept for reference:
#analysis = '/ix/djishnu/Alisa/Tfh/Taiji/gene_analysis/combined'
analysis = '/ix/djishnu/Alisa/Tfh/ForPaper/processed_Taiji'

experiment_list = [
    'ATAC_RNA',
    'H3K4me1_RNA',
    'H3K4me3_RNA',
    'H3K27Ac_RNA',
]

In [26]:
# Load the ATAC gene-rank table and label its unnamed first column as 'Genes'.
all_genes = (
    pd.read_csv('/ix/djishnu/Alisa/Tfh/Taiji/ATAC_RNA/output_atac/GeneRanks.tsv', sep='\t')
    .rename(columns={'Unnamed: 0': "Genes"})
)

In [30]:
# Every other read of a GeneRanks.tsv file in this notebook passes sep='\t';
# without it pandas splits on commas and the table collapses into one column.
my_df = pd.read_csv('/ix/djishnu/Alisa/Tfh/Taiji/CUTnRUN_H3K4me1_RNA/output_cutnrun/GeneRanks.tsv', sep='\t')

In [7]:
# Run the p-value filter for each Taiji output directory, pairing it with the
# experiment name at the same position; results are written into `analysis`.
for idx, rank_dir in enumerate(generank_dir):
    filtered_df = filter_GeneRanks(
        rank_dir,
        output_dir=analysis,
        experiment=experiment_list[idx],
    )

In [15]:
# Top 25 genes per experiment, with the '_rank' suffix (and trailing ' TFH') dropped from the labels.
# NOTE(review): get_top_genes reads `<experiment>_ranked.csv`, which rank_TFs writes; no call to
# rank_TFs is visible in this part of the notebook — confirm those files exist on a fresh run.
top_genes = get_top_genes(experiment_list, top_num=25)
# A second rename keyed on 'Early Extra-GC TFH' / 'Late Extra-GC TFH' used to follow this one.
# Those labels never exist here (the columns carry '_rank' until this rename), so it did nothing
# and has been dropped.
top_genes = top_genes.rename(columns={'Naive_rank':'Naive','Early Extra-GC TFH_rank':'Early Extra-GC', 'Late Extra-GC TFH_rank':'Late Extra-GC','GC_rank':'GC'})

In [13]:
# -log ranks for every gene across the experiments, relabelled to the short stage names and saved.
stage_labels = {
    'Naive_rank': 'Naive',
    'Early Extra-GC TFH_rank': 'Early Extra-GC',
    'Late Extra-GC TFH_rank': 'Late Extra-GC',
    'GC_rank': 'GC',
}
all_genes = get_all_gene_ranks(experiment_list).rename(columns=stage_labels)
all_genes.to_csv('/ix/djishnu/Alisa/Tfh/ForPaper/processed_Taiji/final_genes_all_sets_ALLGENES.csv')