In [1]:
import pandas as pd
import numpy as np
import csv
import h5py
import pickle
import random
random.seed(42)

### Functions to process data:
make_hint_df - makes a dataframe from the HINT interactions and Gene IDs.

sig_set_nodot - removes all the proteins with . after gene name since these are duplicates.

In [2]:
### Make a dataframe with HINT interactions and the Gene IDs
def make_hint_df(hint_data):
    gene_sets = []
    for entry in hint_data:
        prot_1 = entry[0].decode('utf-8')
        prot_2 = entry[1].decode('utf-8')
        
        gene_sets.append([prot_1, prot_2])
    return gene_sets

In [3]:
###Removes genes with . in them (isoforms)
def sig_set_nodot(gene_set, set_name='PPS', output_dir='/ix/djishnu/Alisa/Tfh/ForPaper/first_degree_int', save='True', taiji='False'):
    nodot = []
    if taiji=='True':
        gene_list = pd.read_csv('{0}'.format(gene_set))
        gene_list = set(gene_list['Genes'])
    else:
        gene_list = list(pd.read_csv('{0}'.format(gene_set)))
        gene_list = set(gene_list)
    for gene in gene_list:
        new_gene = gene.split(".")[0]
        if new_gene not in nodot:  
            nodot.append(new_gene)        
    if save=='True':
        with open("{0}/{1}_nodot_genes.pkl".format(output_dir, set_name), "wb") as fp:   #Pickling
            pickle.dump(nodot, fp)
    
    return nodot

### Functions to get first degree interactors from the genes of interest
first_degree_interactors - from a list of interest find all interacting pairs (including first degrees) in the HINT dataset.

first_degree_goldstand_set - from a gene_set of interest (Vinuesa, Crotty, etc.) how many significant genes and first degree interactors are there. Keeps all genes from the gene_set of interest, and provides lables (found in both, found only in significant genes, found only in gene_set.

first_degree_cytoscape_set -  from a gene_set of interest (Vinuesa, Crotty, etc.) how many significant genes and first degree interactors are there. Keeps all genes from the gene_set of interest, and provides lables (found in both, found only in significant genes, found only in gene_set for cytoscape mapping). 

In [4]:
### Finds all first degree interactors of a significant gene list, saves the pairs in a df and a list of all unique genes.
def first_degree_interactors(sig_genes, hint_data, output_dir='/ix/djishnu/Alisa/Tfh/ForPaper/first_degree_int', set_name='PPS', save ='off'):
    gene_sets = []
    genes = []
    for entry in hint_data:
        prot_1 = entry[0].decode('utf-8')
        prot_2 = entry[1].decode('utf-8')

        if prot_1 in sig_genes or prot_2 in sig_genes:
            print('Hit:', prot_1, prot_2)
            gene_sets.append([prot_1, prot_2])

            if prot_1 not in genes:
                genes.append(prot_1)
            if prot_2 not in genes:
                genes.append(prot_2)
                
    if save == 'on':
        print('Saving...')
        gene_set_df = pd.DataFrame(gene_sets, columns = ['Prot_1', 'Prot_2'])
        gene_set_df.to_csv('{0}/{1}_protein_pairs_df.csv'.format(output_dir, set_name), sep =',', index = False)
        
        with open("{0}/{1}_unique_genes.pkl".format(output_dir, set_name), "wb") as fp:   #Pickling
            pickle.dump(genes, fp)
            
        my_genes_df = pd.DataFrame(genes, columns=['Genes'])
        my_genes_df.to_csv('{0}/{1}_unique_genes_df.csv'.format(output_dir, set_name), sep =',', index = False)
        
    
    return gene_set_df, genes, my_genes_df

In [5]:
### Compares the genes from a set to the found genes
def first_degree_goldstand_set(gene_set, sig_list, hint_data, save='off', set_name='PPS', output_dir='/ix/djishnu/Alisa/Tfh/ForPaper/first_degree_int'):
    gene_sets = []
    all_genes = []
    
    for entry in hint_data:
        prot_1 = entry[0].decode('utf-8')
        prot_2 = entry[1].decode('utf-8')
        
    
        if prot_1 in gene_set or prot_2 in gene_set:
            if prot_1 in sig_list or prot_2 in sig_list:
                print('Hit:', prot_1, prot_2)
                gene_sets.append([prot_1, prot_2])

                if prot_1 not in all_genes:
                    all_genes.append(prot_1)
                if prot_2 not in all_genes:
                    all_genes.append(prot_2)
    
    prot_label = []
    for my_gene in all_genes:
        if my_gene in gene_set and my_gene in sig_list:
            prot_label.append([my_gene, 0])
        if my_gene not in gene_set and my_gene in sig_list:
            prot_label.append([my_gene, 1])
        if my_gene in gene_set and my_gene not in sig_list:
            prot_label.append([my_gene, 2])
        if my_gene not in gene_set and my_gene not in sig_list:
            prot_label.append([my_gene, 3])
        
    if save == 'on':
        print('Saving files ...')
        gene_sets_df = pd.DataFrame(gene_sets, columns = ['Prot_1', 'Prot_2'])
        gene_sets_df.to_csv('{0}/{1}_nodot_gene_sets_df.csv'.format(output_dir, set_name), sep =',', index = False)
        
        gene_labels_df = pd.DataFrame(prot_label, columns = ['Prot', 'Label'])
        gene_labels_df.to_csv('{0}/{1}_nodot_gene_labels_df.csv'.format(output_dir, set_name), sep =',', index = False)
        
        with open("{0}/{1}_nodot_unique_genes.pkl".format(directory, output), "wb") as fp:   #Pickling
            pickle.dump(all_genes, fp)
            
    return gene_sets, all_genes

In [6]:
### Map the full gold standard set (gene_set)
def first_degree_cytoscape_set(gene_set, sig_list, sig_fd, hint_data, save='off', output='genes', directory='/ix/djishnu/Alisa/Tfh/Network_analysis/results/'):
    gene_sets = []
    all_genes = []
    
    for entry in hint_data:
        prot_1 = entry[0].decode('utf-8')
        prot_2 = entry[1].decode('utf-8')
        
    
        if prot_1 in gene_set and prot_2 in gene_set:
            gene_sets.append([prot_1, prot_2])
        
            if prot_1 not in all_genes:
                all_genes.append(prot_1)
            if prot_2 not in all_genes:
                all_genes.append(prot_2)
                
        if (prot_1 not in gene_set and prot_2 in gene_set) or (prot_1 in gene_set and prot_2  not in gene_set):
            if prot_1 in sig_list and prot_2 in sig_list:
                gene_sets.append([prot_1, prot_2])

                if prot_1 not in all_genes:
                    all_genes.append(prot_1)
                if prot_2 not in all_genes:
                    all_genes.append(prot_2)
    
    prot_label = []
    for my_gene in all_genes:
        if my_gene in gene_set and my_gene in sig_fd:
            prot_label.append([my_gene, 0])
        if my_gene not in gene_set and my_gene in sig_fd:
            prot_label.append([my_gene, 1])
        if my_gene in gene_set and my_gene not in sig_fd:
            prot_label.append([my_gene, 2])
        if my_gene not in gene_set and my_gene not in sig_fd:
            prot_label.append([my_gene, 3])
        
    if save == 'on':
        print('Saving files ...')
        gene_sets_df = pd.DataFrame(gene_sets, columns = ['Prot_1', 'Prot_2'])
        gene_sets_df.to_csv('{0}/{1}_nodot_gene_sets_df.csv'.format(directory, output), sep =',', index = False)
        
        gene_labels_df = pd.DataFrame(prot_label, columns = ['Prot', 'Label'])
        gene_labels_df.to_csv('{0}/{1}_nodot_gene_labels_df.csv'.format(directory, output), sep =',', index = False)
        
        with open("{0}/{1}_nodot_unique_genes.pkl".format(directory, output), "wb") as fp:   #Pickling
            pickle.dump(all_genes, fp)
            
    return gene_sets_df, all_genes, gene_labels_df

### Find overlap function:
find_overlap - gives either the percent, length, or genes that are found in the significant genes with the gene_sets of interest (vinuesa, Crotty, etc.)

In [7]:
def find_overlap(gene_list, list_name = 'patternB', output='percent'):
    vinuesa_overlap = vinuesa_list & gene_list
    vinuesa_overlap_percent = len(vinuesa_overlap)/len(vinuesa_list)

    beckys_overlap = beckys_genes & gene_list
    beckys_overlap_percent = len(beckys_overlap)/len(beckys_genes)

    crotty_overlap = crotty_mouse & gene_list
    crotty_overlap_percent = len(crotty_overlap)/len(crotty_mouse)

    TFH_review_overlap = TFH_reviews & gene_list
    TFH_review_overlap_percent = len(TFH_review_overlap)/len(TFH_reviews)

    #all_sets_overlap = all_sets & gene_list
    #all_sets_overlap_percent = len(all_sets_overlap)/len(all_sets)
    if output == 'genes':
        return vinuesa_overlap, TFH_review_overlap, crotty_overlap,  beckys_overlap #, liu_overlap, all_sets_overlap
    if output == 'gene_len':
        return len(vinuesa_overlap), len(TFH_review_overlap), len(crotty_overlap), len(beckys_overlap),  #, liu_overlap, all_sets_overlap
    if output == 'genes_gene_len':
        return len(vinuesa_overlap), len(TFH_review_overlap), len(crotty_overlap),  len(beckys_overlap), gene_list # liu_overlap, all_sets_overlap
    if output == 'percent':
        return vinuesa_overlap_percent, TFH_review_overlap_percent, crotty_overlap_percent, beckys_overlap_percent #, all_sets_overlap_percent

### Random Overlaps functions:
These find how many genes would overlapping if a set were randomly made. 

random_set - make a set which is of a specific length from the hint genes.

random_overlaps_perm - for a length of a sig_gene set, make a random set, and record a dataframe of how many genes are overlapping with known gene_sets, and repeat a desired amount.

get_random_gene_list - needed for the upset plot, will take the average random and make a gene list that would mach the reviews such that one list of genes will map to the correct random.

In [9]:
def random_set(gene_set):
    gene_sets = []
    all_genes = []

    random_size_matched_list = set(random.sample(all_hint_genes, len(gene_set)))

    return random_size_matched_list

In [10]:
def random_overlaps_perm(sig_genes, set_name = 'PatternB', num_perm = 1000, output='percent'):
    for i in range(0,num_perm):
        print('Num_perm=', i)
        random_gene_set = random_set(sig_genes)
        random_genes_overlapping = find_overlap(random_gene_set, output=output)
        df1 = {'Set':['Random_{0}_{1}'.format(set_name, i)],
               'Vinuesa':[random_genes_overlapping[0]], 'TFH Review': [random_genes_overlapping[1]],'Crotty': [random_genes_overlapping[2]], 'Beckys': [random_genes_overlapping[3]],
                'Genes':[random_genes_overlapping[4]]}
        #df1 = {'Set':['Random_{0}_{1}'.format(set_name, i)],
        #       'Vinuesa':[random_genes_overlapping[0]],'TFH Review': [random_genes_overlapping[1]], 'Genes':[random_genes_overlapping[2]]}
            
        #print(df1)
        if i==0:
            random_percents_df = pd.DataFrame(df1)
        else:
            df1 = pd.DataFrame(df1)
            random_percents_df = pd.concat([random_percents_df,df1], axis=0)
        print(random_percents_df)
    
    return random_percents_df

In [None]:
def get_random_gene_list(random_overlap_set, output_dir = '/ix/djishnu/Alisa/Tfh/ForPaper/first_degree_int', set_name = 'PPS', save='on'):
    random_overlap_set = random_overlap_set.reset_index().drop(columns=['index'])
    random_overlap_subset_human = random_overlap_set.loc[(random_overlap_set['Vinuesa'].eq(np.round(np.mean(random_overlap_set['Vinuesa'])))) 
                                                   & (random_overlap_set['TFH Review'].eq(np.round(np.mean(random_overlap_set['TFH Review']))))].reset_index()
    random_overlap_subset_mouse = random_overlap_set.loc[(random_overlap_set['Crotty'].eq(np.round(np.mean(random_overlap_set['Crotty']))))
                                                  & (random_overlap_set['Beckys'].eq(np.round(np.mean(random_overlap_set['Beckys']))))].reset_index()
    
    random_overlap_genes_human = random_overlap_subset_human.loc[0]['Genes']
    random_overlap_genes_mouse = random_overlap_subset_mouse.loc[0]['Genes']
    gene_list_human = []
    gene_list_mouse = []
    
    for gene in random_overlap_genes_human:
        gene_list_human.append(gene)
    
    for gene in random_overlap_genes_mouse:
        gene_list_mouse.append(gene)
    random_overlap_genes_df_human = pd.DataFrame()
    random_overlap_genes_df_human['Genes'] = gene_list_human
    
    random_overlap_genes_df_mouse = pd.DataFrame()
    random_overlap_genes_df_mouse['Genes'] = gene_list_mouse
    
    if save=='on':
        print('Saving File...')
        with open('{0}/{1}_random_upset_human.txt'.format(output_dir, set_name), 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(random_overlap_genes_human)
            
        with open("{0}/{1}_random_upset_human.pkl".format(output_dir, set_name), "wb") as fp:   #Pickling
            pickle.dump(random_overlap_genes_human, fp)
        
        random_overlap_genes_df_human.to_csv("{0}/{1}_random_upset_human.csv".format(output_dir, set_name), index=False )
        
        with open('{0}/{1}_random_upset_mouse.txt'.format(output_dir, set_name), 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(random_overlap_genes_mouse)
            
        with open("{0}/{1}_random_upset_mouse.pkl".format(output_dir, set_name), "wb") as fp:   #Pickling
            pickle.dump(random_overlap_genes_mouse, fp)
        
        random_overlap_genes_df_mouse.to_csv("{0}/{1}_random_upset_mouse.csv".format(output_dir, set_name), index=False )
    return(random_overlap_genes_human, random_overlap_genes_mouse)

### Load in datasets:

In [12]:
###Load HINT interactions:
ppi = h5py.File('/ix/djishnu/Alisa/Tfh/Network_analysis/data/HomoSapiens_binary_co_complex_Feb2023_1_ppr_0.4.h5', 'r')
#list(ppi.keys())
hint_data = ppi['edges']

with open("/ix/djishnu/Alisa/Tfh/Network_analysis/results/all_hint_genes_list.pkl", "rb") as fp:   #Pickling
    all_hint_genes = pickle.load(fp)

In [13]:
hint = make_hint_df(hint_data)
hint_df = pd.DataFrame(hint)

In [14]:
###Load in outputs from network approaches:

#taiji_output = pd.read_csv('/ix/djishnu/Alisa/Tfh/ForPaper/processed_Taiji/final_genes_all_sets_152.csv')
#taiji_set = set(taiji_output['Genes'])

#pps_output = pd.read_csv('/ix/djishnu/Alisa/Tfh/ForPaper/process_network_prop_output/PPS_1_significant_gene_list.csv')
#pps_set= set(pps_output['Genes'])

#pps_output = pd.read_csv('/ix/djishnu/Alisa/Tfh/ForPaper/process_network_prop_output/PPS_nomodule1_significant_gene_list_p0.05.csv')
#pps_set= set(pps_output['Genes'])

#logFC_rna = pd.read_csv('/ix/djishnu/Alisa/Tfh/Network_analysis/data/Pattern_1234_RNASeq_S1234_logFC_top328.csv')
#log_FC_rna_set = set(logFC_rna['gene'])

#pps_noprop = pd.read_csv('/ix/djishnu/Alisa/Tfh/Network_analysis/data/Pattern_1234_Genes_top328.csv')
#pps_noprop_set= set(pps_noprop['Gene'])

pps_noprop = pd.read_csv('/ix/djishnu/Alisa/Tfh/Network_analysis/data/Pattern_1234_Genes_top184.csv')
pps_noprop_set= set(pps_noprop['Gene'])

#rna_output = pd.read_csv('/ix/djishnu/Alisa/Tfh/ForPaper/process_network_prop_output/RNA_Early_Late_significant_gene_list_p0.05.csv')
#rna_set= set(rna_output['Genes'])

###Load in datasets from literature:
with open('/ix/djishnu/Alisa/Tfh/ForPaper/literature_sets/vinuesa_list.pkl',"rb") as fp:
    vinuesa_list = pickle.load(fp)

with open('/ix/djishnu/Alisa/Tfh/ForPaper/literature_sets/TFH_review_list.pkl',"rb") as fp:
    TFH_reviews = pickle.load(fp)

with open('/ix/djishnu/Alisa/Tfh/ForPaper/literature_sets/mouse_gene_list.pkl',"rb") as fp:
    crotty_mouse = pickle.load(fp)

with open('/ix/djishnu/Alisa/Tfh/ForPaper/literature_sets/beckys_list.pkl',"rb") as fp:
    beckys_genes = pickle.load(fp)


In [15]:
#Generate datasets with first degree interactors.
#taiji_genes = first_degree_interactors(sig_genes = taiji_set, hint_data= hint_data, set_name = 'taiji', save='off')
#pps_genes = first_degree_interactors(sig_genes = pps_set, hint_data= hint_data, set_name = 'pps', save='off')
#rna_genes = first_degree_interactors(sig_genes = rna_set, hint_data= hint_data, set_name = 'rna_2', save='on')
#pps_genes = first_degree_interactors(sig_genes = pps_set, hint_data= hint_data, set_name = 'pps_nomodule1', save='on')
#logFC_genes = first_degree_interactors(sig_genes = log_FC_rna_set, hint_data= hint_data, set_name = 'log_FC_rna_set', save='on')
pps_noprop = first_degree_interactors(sig_genes=pps_noprop_set, hint_data= hint_data, set_name = 'PPS_noprop_set_184', save='on' )

Hit: UBE2Q1 C1QTNF2
Hit: ZNF707 CASP8
Hit: ZC3H15 CEP44
Hit: RNF111 ACTN3
Hit: FOXQ1 ARHGAP10
Hit: XPA ARID3A
Hit: FBL CFLAR
Hit: GTSE1 BMP2K
Hit: NSL1 CBX3
Hit: NSL1 BRCA2
Hit: COL4A2 C1QTNF2
Hit: BCL2A1 BCL2L11
Hit: BCL2A1 BIK
Hit: BCL2A1 CT45A1
Hit: BCL2A1 SLC9A3R1
Hit: BCL2A1 BAX
Hit: BCL2A1 BAK1
Hit: BCL2A1 FAM9B
Hit: BCL2A1 BAD
Hit: BCL2A1 REL
Hit: BCL2A1 PMAIP1
Hit: BCL2A1 BBC3
Hit: SMAD9 ARID1B
Hit: SMAD2 ANTXR2
Hit: SMAD3 CFLAR
Hit: TCEA2 CEP44
Hit: TCEA2 AGR2
Hit: CCDC116 CEP44
Hit: ATP1A3 AKAP14
Hit: ATP1A3 CASP8
Hit: ATP1A1 ECPAS
Hit: ATP1A1 LIMK2
Hit: ATP1A1 TCTN3
Hit: ATP1A1 HIPK4
Hit: ATP1A1 BIRC3
Hit: ATP1A1 ILK
Hit: ATP1A1 NCSTN
Hit: ATP1A1 KIF14
Hit: ATP1A1 GOLT1B
Hit: ATP1A1 MBOAT1
Hit: ATP1A1 EZH2
Hit: ATP1A1 DYRK4
Hit: ATP1A1 CIT
Hit: ATP1A1 A2M
Hit: ATP1A1 PEBP1
Hit: ATP1A1 TMEM17
Hit: ATP1A1 CAND1
Hit: ATP1A1 TUBA1C
Hit: ATP1A1 PPT1
Hit: ATP1A1 JUN
Hit: ATP1A1 TRADD
Hit: ATP1A1 MAPK7
Hit: ATP1A1 AURKB
Hit: ATP1A1 MAP1LC3A
Hit: ATP1A1 NFKB1
Hit: ATP1A1 IRAK1
Hit: 

In [15]:
taiji_sig_set = sig_set_nodot('/ix/djishnu/Alisa/Tfh/ForPaper/processed_Taiji/final_genes_all_sets_152.csv', set_name='Taiji', output_dir='/ix/djishnu/Alisa/Tfh/ForPaper', save='True', taiji='True')

In [11]:
taiji_genes = pd.read_csv('/ix/djishnu/Alisa/Tfh/ForPaper/first_degree_int/taiji_unique_genes_df.csv')
#pps_genes = pd.read_csv('/ix/djishnu/Alisa/Tfh/ForPaper/first_degree_int/pps_unique_genes_df.csv')
#rna_genes = pd.read_csv('/ix/djishnu/Alisa/Tfh/ForPaper/first_degree_int/rna_unique_genes_df.csv')

In [34]:
unique_to_df1 = set(rna_genes['Genes']) - set(pps_genes['Genes'])

In [17]:
pps_taiji_genes_fd = set(taiji_genes['Genes']).union(set(pps_genes[2]['Genes']))
pps_taiji_fd_df = pd.DataFrame()
pps_taiji_fd_df['uGenes'] = list(pps_taiji_genes_fd)
pps_taiji_fd_df.to_csv('/ix/djishnu/Alisa/Tfh/ForPaper/first_degree_int/pps_taiji_nomodule1_unique_genes_df.txt', index=False)
#pps_taiji = pps_set.union(taiji_set)

In [42]:
pps_taiji_rna_genes_fd = set(taiji_genes['Genes']).union(set(pps_genes['Genes'])).union(set(rna_genes['Genes']))
pps_taiji_rna_fd_df = pd.DataFrame()
pps_taiji_rna_fd_df['uGenes'] = list(pps_taiji_rna_genes_fd)
pps_taiji_rna_fd_df.to_csv('/ix/djishnu/Alisa/Tfh/ForPaper/first_degree_int/pps_taiji_rna_unique_genes_df.txt', index=False)
#pps_taiji = pps_set.union(taiji_set)

In [43]:
pps_unique = pd.read_csv('/ix/djishnu/Alisa/Tfh/ForPaper/first_degree_int/pps_unique_genes_df.csv')
taiji_unique = pd.read_csv('/ix/djishnu/Alisa/Tfh/ForPaper/first_degree_int/taiji_unique_genes_df.csv')
rna_unique = pd.read_csv('/ix/djishnu/Alisa/Tfh/ForPaper/first_degree_int/rna_unique_genes_df.csv')

In [44]:
pps_unique = list(pps_unique['Genes'])
taiji_unique= list(taiji_unique['Genes'])
rna_unique = list(rna_unique['Genes'])

In [73]:
#Find overlap with known datasets:
vin_taiji_overlap = first_degree_goldstand_set(vinuesa_list, taiji_genes, hint_data, set_name= 'taiji')

In [16]:
#taiji_overlapping = find_overlap(taiji, list_name='taiji_151', output='percent')

#taiji_overlapping_random = random_overlaps_perm(taiji_set, set_name = 'taiji', num_perm=1000,output= 'genes_gene_len')
#taiji_overlapping_random_fd = random_overlaps_perm_(taiji_genes['Genes'], set_name = 'taiji_fd', num_perm=1000, output= 'genes_gene_len')
#taiji_overlapping_random.to_csv('/ix/djishnu/Alisa/Tfh/Network_analysis/results/percent_recouped_plots/random_taiji_overlaps_df.csv')
#taiji_overlapping_random_fd.to_csv('/ix/djishnu/Alisa/Tfh/Network_analysis/results/percent_recouped_plots/random_taiji_fd_overlaps_df.csv')

#pps_overlapping_random = random_overlaps_perm(pps_set, set_name = 'pps', num_perm=1000,output= 'genes_gene_len')
#pps_overlapping_random_fd = random_overlaps_perm(pps_genes['Genes'], set_name = 'pps_fd', num_perm=1000, output= 'genes_gene_len')
#pps_overlapping_random.to_csv('/ix/djishnu/Alisa/Tfh/Network_analysis/results/percent_recouped_plots/random_pps_overlaps_df.csv')
#pps_overlapping_random_fd.to_csv('/ix/djishnu/Alisa/Tfh/Network_analysis/results/percent_recouped_plots/random_pps_fd_overlaps_df.csv')

#logFC_random_fd = random_overlaps_perm(logFC_genes[1], set_name = 'logFC_fd', num_perm=1000, output= 'genes_gene_len')
#logFC_random_fd_random_fd.to_csv('/ix/djishnu/Alisa/Tfh/Network_analysis/results/percent_recouped_plots/random_logFC_random_fd_overlaps_df.csv')

pps_noprop_random_fd = random_overlaps_perm(pps_noprop[1], set_name = 'pps_noprop_fd', num_perm=1000, output= 'genes_gene_len')
pps_noprop_random_fd.to_csv('/ix/djishnu/Alisa/Tfh/Network_analysis/results/percent_recouped_plots/random_pps_noprop_random_fd_overlaps_df_184.csv')

Num_perm= 0
                      Set  Vinuesa  TFH Review  Crotty  Beckys  \
0  Random_pps_noprop_fd_0        6           8      11       8   

                                               Genes  
0  {TBX19, ZC3HAV1L, ELSPBP1, FZD7, RIMS4, SLC4A8...  
Num_perm= 1
                      Set  Vinuesa  TFH Review  Crotty  Beckys  \
0  Random_pps_noprop_fd_0        6           8      11       8   
0  Random_pps_noprop_fd_1        7           9       9       6   

                                               Genes  
0  {TBX19, ZC3HAV1L, ELSPBP1, FZD7, RIMS4, SLC4A8...  
0  {FCHO2, TCEA1, YLPM1, ZC3HAV1L, APOM, BRD2, LR...  
Num_perm= 2
                      Set  Vinuesa  TFH Review  Crotty  Beckys  \
0  Random_pps_noprop_fd_0        6           8      11       8   
0  Random_pps_noprop_fd_1        7           9       9       6   
0  Random_pps_noprop_fd_2        3           6      11      11   

                                               Genes  
0  {TBX19, ZC3HAV1L, ELSPBP1, FZD7, R

In [1]:
#taiji_overlapping_random.to_csv('/ix/djishnu/Alisa/Tfh/Network_analysis/results/percent_recouped_plots/random_taiji_overlaps_df.csv')
#taiji_overlapping_random_fd.to_csv('/ix/djishnu/Alisa/Tfh/Network_analysis/results/percent_recouped_plots/random_taiji_fd_overlaps_TR_df.csv')

#pps_overlapping_random = random_overlaps_perm(pps_set, set_name = 'pps', num_perm=1000,output= 'genes_gene_len')
#pps_overlapping_random_fd = random_overlaps_perm(pps_genes['Genes'], set_name = 'pps_fd', num_perm=1000, output= 'genes_gene_len')
#pps_overlapping_random.to_csv('/ix/djishnu/Alisa/Tfh/Network_analysis/results/percent_recouped_plots/random_pps_overlaps_df.csv')
#pps_overlapping_random_fd.to_csv('/ix/djishnu/Alisa/Tfh/Network_analysis/results/percent_recouped_plots/random_pps_fd_overlaps_df.csv')

In [99]:
pps_taiji_overlapping_random_fd =  random_overlaps_perm(pps_taiji_genes_fd, set_name = 'pps_taiji_fd', num_perm=1000, output= 'genes_gene_len')
pps_taiji_overlapping_random_fd.to_csv('/ix/djishnu/Alisa/Tfh/Network_analysis/results/percent_recouped_plots/random_pps_taiji_fd_overlaps_df.csv')

Num_perm= 0
                     Set  Vinuesa  TFH Review  Crotty  Beckys  \
0  Random_pps_taiji_fd_0       11          15      26       9   

                                               Genes  
0  {GKN2, ADSS1, APEH, MTHFS, PP2135, LRP1B, RBM4...  
Num_perm= 1
                     Set  Vinuesa  TFH Review  Crotty  Beckys  \
0  Random_pps_taiji_fd_0       11          15      26       9   
0  Random_pps_taiji_fd_1       17          18      27      16   

                                               Genes  
0  {GKN2, ADSS1, APEH, MTHFS, PP2135, LRP1B, RBM4...  
0  {Q9H368, APEH, GLRX2, NDUFAF1, CKM, FAM120B, T...  
Num_perm= 2
                     Set  Vinuesa  TFH Review  Crotty  Beckys  \
0  Random_pps_taiji_fd_0       11          15      26       9   
0  Random_pps_taiji_fd_1       17          18      27      16   
0  Random_pps_taiji_fd_2       14          17      19      12   

                                               Genes  
0  {GKN2, ADSS1, APEH, MTHFS, PP2135, LRP1B, R

In [101]:
pps_taiji_overlapping_random =  random_overlaps_perm(pps_taiji, set_name = 'pps_taiji', num_perm=1000, output= 'genes_gene_len')
pps_taiji_overlapping_random.to_csv('/ix/djishnu/Alisa/Tfh/Network_analysis/results/percent_recouped_plots/random_pps_taiji_overlaps_df.csv')

Num_perm= 0
                  Set  Vinuesa  TFH Review  Crotty  Beckys  \
0  Random_pps_taiji_0        0           0       0       1   

                                               Genes  
0  {PRPF8, CA14, Q5VU21, SMYD1, ARMC7, ANO2, PER1...  
Num_perm= 1
                  Set  Vinuesa  TFH Review  Crotty  Beckys  \
0  Random_pps_taiji_0        0           0       0       1   
0  Random_pps_taiji_1        1           0       1       1   

                                               Genes  
0  {PRPF8, CA14, Q5VU21, SMYD1, ARMC7, ANO2, PER1...  
0  {B4DH63, SOX18, DNAJC14, ALDH1A3, DDX41, AGO2,...  
Num_perm= 2
                  Set  Vinuesa  TFH Review  Crotty  Beckys  \
0  Random_pps_taiji_0        0           0       0       1   
0  Random_pps_taiji_1        1           0       1       1   
0  Random_pps_taiji_2        0           0       1       1   

                                               Genes  
0  {PRPF8, CA14, Q5VU21, SMYD1, ARMC7, ANO2, PER1...  
0  {B4DH63, SOX18,

In [83]:
print(np.round(np.mean(taiji_overlapping_random['Vinuesa'])))

0.0


In [84]:
taiji_random_gene_set = taiji_overlapping_random_fd.loc[(taiji_overlapping_random_fd['Vinuesa'].eq(6)) 
                                                        & (taiji_overlapping_random_fd['TFH Review'].eq(8))
                                                        & (taiji_overlapping_random_fd['Crotty'].eq(11))
                                                        #& (taiji_overlapping_random_fd['Beckys'].eq(7))
                                                       ].reset_index()

In [18]:
#get_random_gene_list(logFC_random_fd, set_name = 'logFC_RNA_fd', save='on')
get_random_gene_list(pps_noprop_random_fd, set_name = 'PPS_noprop_fd_184', save='on')

Saving File...


({'TBX19',
  'ZC3HAV1L',
  'ELSPBP1',
  'FZD7',
  'RIMS4',
  'SLC4A8',
  'WHAMM',
  'YA61',
  'ARAF',
  'BCKDHB',
  'ATPAF1',
  'TBX22',
  'MARS2',
  'LUZP2',
  'TNMD',
  'HMCN1',
  'DOCK8',
  'NUP62',
  'ANKHD1',
  'ZNF175',
  'TRAF3IP2',
  'WDR73',
  'LCE1F',
  'FCGR2C',
  'ATP6V0E2',
  'ZNF3',
  'LRRC10',
  'GTF2F1',
  'PARD6G',
  'G3V3K5',
  'UPI00004DF2EA',
  'FRG1',
  'GALNT13',
  'SLC36A1',
  'ARMC12',
  'PPP2R5C',
  'GK',
  'CCND1',
  'B4DRT3',
  'F2RL2',
  'ZBTB22',
  'SORL1',
  'B4DU58',
  'HSPB3',
  'CENPS-CORT',
  'SHPK',
  'GRB14',
  'BRCA2',
  'D6RCP2',
  'TRPA1',
  'RGP1',
  'E7EPB4',
  'MAD2L1BP',
  'NEXN',
  'SINHCAF',
  'PIGA',
  'ALDH4A1',
  'UTP6',
  'UPI000013D87D',
  'ADAP1',
  'HDAC4',
  'Q8TA90',
  'HIGD1A',
  'B4DL80',
  'FLNB',
  'PFAS',
  'METTL2A',
  'RAG2',
  'EPAS1',
  'CTC1',
  'UBXN1',
  'LPAR2',
  'RCAN3',
  'PSMD13',
  'C4ORF42',
  'Q53HG0',
  'ZNF599',
  'B4DET5',
  'GPATCH2',
  'FBXO22',
  'ANKIB1',
  'C22ORF9',
  'NLRC5',
  'DLGAP4',
  'FAM160B1',
 

In [86]:
get_random_gene_list(taiji_overlapping_random_fd, set_name = 'taiji_fd')
get_random_gene_list(taiji_overlapping_random, set_name = 'taiji')
get_random_gene_list(pps_overlapping_random, set_name = 'pps')
get_random_gene_list(pps_overlapping_random_fd, set_name = 'pps_fd')
get_random_gene_list(pps_taiji_overlapping_random_fd, set_name = 'pps_taiji_fd')
get_random_gene_list(pps_taiji_overlapping_random, set_name = 'pps_taiji')

Saving File...


({'NAP1L1',
  'TLK2',
  'Q5VU21',
  'INSYN1',
  'PDGFC',
  'ADARB2',
  'IL17RA',
  'P78451',
  'FLJ10842',
  'PPP1R3C',
  'PHC1',
  'ZNF69',
  'MTHFS',
  'FCHSD2',
  'MPP6',
  'SRFBP1',
  'LRP1B',
  'SDHD',
  'PTHLH',
  'OTUD1',
  'GLB1L3',
  'SRGAP2B',
  'AGFG2',
  'RBM41',
  'DACT1',
  'MET',
  'SLCO3A1',
  'THBS3',
  'TRBC2',
  'SERPINA10',
  'ZNF354A',
  'PRR20E',
  'VHLL',
  'EPHB6',
  'NUDT3',
  'SOD3',
  'MBD6',
  'ATAT1',
  'RPL6',
  'CCNG2',
  'ARP3BETA',
  'PCGF1',
  'MICOS13',
  'DOCK1',
  'Q8NAS7',
  'LSM6',
  'SUPT7L',
  'P2RY6',
  'RCN1',
  'MNT',
  'BIRC3',
  'LAMTOR3',
  'DYNC1I1',
  'OARD1',
  'TNK2',
  'SLC5A4',
  'CORO6',
  'ACBD6',
  'HLA-DRB1',
  'BMP3',
  'GPD1L',
  'SLC29A2',
  'BFAR',
  'GALNT8',
  'PAX9',
  'LYPLA2',
  'SLC16A10',
  'NOX1',
  'RAD21',
  'PIGV',
  'RFT1',
  'H3BP91',
  'PPM1E',
  'RIPPLY2',
  'A8KA83',
  'SNX33',
  'PCBD2',
  'Q9NWG8',
  'DDA1',
  'GRIN2A',
  'HINT1',
  'MLYCD',
  'GRK7',
  'KCNK16',
  'COL3A1',
  'USP26',
  'WNT7A',
  'RTKN',
 