### Import Libraries

In [13]:
import pandas as pd
import numpy as np

### Download the human (H. sapiens) FunCoup network file from https://funcoup.org/downloads/ and load it

In [14]:
# Read the FunCoup H. sapiens file (tab-separated) into a DataFrame
funcoup_path = 'C:/Users/angel/Desktop/Applied Bioinformatics/Thesis/pleiotropy/data/FC5.0_H.sapiens_full'
human_funcoup = pd.read_csv(funcoup_path, sep='\t')

In [15]:
# Preview the first five rows to check that the columns were parsed as expected
human_funcoup.head(n=5)

Unnamed: 0,#0:PFC,1:FBS_max,2:Gene1,3:Gene2,4:FBS_PPI,5:FBS_Metabolic,6:FBS_Signaling,7:FBS_Complex,8:LLR_MEX,9:LLR_PIN,...,30:LLR_SPO,31:LLR_BSU,32:LLR_ECO,33:LLR_PFA,34:LLR_DDI,35:LLR_SSO,36:LLR_SSC,37:LLR_MJA,38:LLR_OSA,39:Max_category
0,0.127,4.978,ENSG00000185008,ENSG00000196549,-0.572,4.978,2.823,-0.221,2.594,-0.038,...,,,,,,,,,,Metabolic
1,0.255,5.838,ENSG00000185008,ENSG00000115306,4.267,4.733,3.712,5.838,2.696,-0.208,...,,,,,,,,,,Complex
2,0.176,5.367,ENSG00000185008,ENSG00000149972,1.378,5.367,2.143,1.478,2.324,-0.038,...,,,,,,,-0.171,,,Metabolic
3,0.162,5.264,ENSG00000185008,ENSG00000083857,5.218,4.96,5.264,2.765,2.332,-0.017,...,,,,,,,,,,Signaling
4,0.132,5.021,ENSG00000185008,ENSG00000196090,2.433,3.29,5.021,0.659,2.721,-0.017,...,,,,,,,,,,Signaling


In [16]:
# (rows, columns) of the table as loaded, before any filtering
human_funcoup.shape

(5036826, 40)

### Filter and format 

In [17]:
# Reduce the table to the PFC score and the two gene columns, keeping only
# the rows whose PFC is strictly greater than 0.9
selected_columns = ['#0:PFC', '2:Gene1', '3:Gene2']
condition = human_funcoup['#0:PFC'].gt(0.9)
human_funcoup = human_funcoup.loc[condition, selected_columns]

# view the result
human_funcoup

Unnamed: 0,#0:PFC,2:Gene1,3:Gene2
20,0.999,ENSG00000185008,ENSG00000082397
148,0.961,ENSG00000185008,ENSG00000109472
261,0.984,ENSG00000185008,ENSG00000116703
286,0.933,ENSG00000049541,ENSG00000126067
289,0.983,ENSG00000049541,ENSG00000105372
...,...,...,...
5036698,0.978,ENSG00000177370,ENSG00000100347
5036766,0.982,ENSG00000106153,ENSG00000100347
5036775,0.947,ENSG00000144821,ENSG00000264424
5036807,0.958,ENSG00000204392,ENSG00000111875


In [18]:
# Stack the Gene1 column on top of the Gene2 column so that every gene
# occurrence becomes one row of a single 'gene_ID' column
gene_concat_df = pd.concat(
    [human_funcoup['2:Gene1'], human_funcoup['3:Gene2']],
    ignore_index=True,
).to_frame(name='gene_ID')
# view the result
gene_concat_df

Unnamed: 0,gene_ID
0,ENSG00000185008
1,ENSG00000185008
2,ENSG00000185008
3,ENSG00000049541
4,ENSG00000049541
...,...
814931,ENSG00000100347
814932,ENSG00000100347
814933,ENSG00000264424
814934,ENSG00000111875


In [19]:
# Count how many times each gene appears, then turn the tally into a
# two-column frame ('gene_ID', 'counts')
occurrence_tally = gene_concat_df.value_counts()
occurrence_tally = occurrence_tally.rename_axis('gene_ID')
gene_counts = occurrence_tally.reset_index(name='counts')
# view the result
gene_counts

Unnamed: 0,gene_ID,counts
0,ENSG00000156508,1595
1,ENSG00000204628,1556
2,ENSG00000109971,1527
3,ENSG00000132341,1450
4,ENSG00000143947,1353
...,...,...
11496,ENSG00000174576,1
11497,ENSG00000174606,1
11498,ENSG00000132128,1
11499,ENSG00000174804,1


In [21]:
# verify the results: every row of gene_concat_df should be counted exactly once,
# so the counts must add up to the number of rows in gene_concat_df
column_sum = gene_counts['counts'].sum()
print("Sum of counts column:", column_sum)
# Turn the visual check into a real one. This fails if rows were lost while
# counting, e.g. missing gene IDs, which value_counts() skips by default.
assert column_sum == len(gene_concat_df), (
    f"counts sum to {column_sum}, but gene_concat_df has {len(gene_concat_df)} rows"
)

Sum of counts column: 814936


In [22]:
# Write the gene IDs, one per line with no header or index, as the input
# file for the UniProt ID Mapper
gene_id_path = 'C:/Users/angel/Desktop/Applied Bioinformatics/Thesis/pleiotropy/data/gene_id.txt'
gene_counts['gene_ID'].to_csv(gene_id_path, header=False, index=False)

### Mapping results manipulation

1) Go to the UniProt ID Mapper (https://www.uniprot.org/id-mapping).
2) Upload the "gene_id.txt" file (created in the previous cell).
3) Save the .tsv results file and keep the list of IDs that were not mapped.
4) Submit the IDs that UniProt could not map to Ensembl BioMart and save the result as a .txt file (sep='\t').
5) Add the Revigo results output file (sep='\t').

In [48]:
# Ensembl -> UniProt mapping saved from the UniProt ID Mapper (tab-separated)
uniprot_path = 'C:/Users/angel/Desktop/Applied Bioinformatics/Thesis/pleiotropy/data/Ensemble_to_Uniprot_2023_09_19.tsv'
uniprot_df = pd.read_csv(uniprot_path, sep='\t')

# Ensembl BioMart mapping for the IDs that UniProt did not map (tab-separated)
biomart_path = 'C:/Users/angel/Desktop/Applied Bioinformatics/Thesis/pleiotropy/data/ensemble_BioMart_idmapping.txt'
biomart_df = pd.read_csv(biomart_path, sep='\t')

# Revigo result file; it has no header row.
# NOTE(review): the step list above this cell gives sep='\t' for this file,
# but it is read here with a single-space separator. Confirm which is intended.
revigo_path = 'C:/Users/angel/Desktop/Applied Bioinformatics/Thesis/pleiotropy/Revigo results/human_simrel_7_human.txt'
human_simrel_07_df = pd.read_csv(revigo_path, sep=' ', header=None)

In [49]:
# Drop the unnecessary annotation columns from uniprot_df, leaving only the
# gene ID and the UniProt ID.
# The result is reassigned instead of using inplace=True, and errors='ignore'
# is passed, so the cell can be re-run: on a second run the columns are
# already gone and the drop does nothing instead of raising KeyError.
uniprot_df = uniprot_df.drop(
    columns=['Reviewed', 'Entry Name', 'Protein names', 'Gene Names'],
    errors='ignore',
)
# Exactly two columns must remain; otherwise this assignment raises ValueError
uniprot_df.columns = ['gene_ID', 'UniProt_ID']
# view the result
uniprot_df

Unnamed: 0,gene_ID,UniProt_ID
0,ENSG00000156508,P68104
1,ENSG00000156508,A0A087WV01
2,ENSG00000156508,A0A087WVQ9
3,ENSG00000156508,A0A7I2V3H3
4,ENSG00000156508,A0A7I2V5N4
...,...,...
55822,ENSG00000132128,A0A0B4J2G4
55823,ENSG00000174804,Q9ULV1
55824,ENSG00000000003,O43657
55825,ENSG00000000003,A0A087WYV6


In [50]:
# Process the BioMart output file accordingly: drop the rows that have no
# UniProt ID and use the same column names as uniprot_df.
# The UniProt column is renamed first and NaN rows are dropped on the new name,
# so the cell can be re-run: on a second run the rename does nothing and
# dropna still finds 'UniProt_ID', instead of raising KeyError on the old header.
biomart_df = (
    biomart_df
    .rename(columns={'UniProtKB Gene Name ID': 'UniProt_ID'})
    .dropna(subset=['UniProt_ID'])
)
# Exactly two columns must be present; otherwise this assignment raises ValueError
biomart_df.columns = ['gene_ID', 'UniProt_ID']
biomart_df

Unnamed: 0,gene_ID,UniProt_ID
0,ENSG00000099984,P0CG29
14,ENSG00000285292,A0A090N7Y2


In [51]:
# Combine the UniProt and BioMart mappings with an outer join on both
# columns, so pairs that appear in only one of the two tables are kept
mapping_keys = ['gene_ID', 'UniProt_ID']
uniprot_biomart_merged_df = uniprot_df.merge(biomart_df, how='outer', on=mapping_keys)
uniprot_biomart_merged_df

Unnamed: 0,gene_ID,UniProt_ID
0,ENSG00000156508,P68104
1,ENSG00000156508,A0A087WV01
2,ENSG00000156508,A0A087WVQ9
3,ENSG00000156508,A0A7I2V3H3
4,ENSG00000156508,A0A7I2V5N4
...,...,...
55824,ENSG00000000003,O43657
55825,ENSG00000000003,A0A087WYV6
55826,ENSG00000000003,A0A087WZU5
55827,ENSG00000099984,P0CG29


In [52]:
# Format the Revigo result file: only its first column, the UniProt IDs, is
# needed; the other two columns ('counts' and 'GO terms') are discarded.
# The column is selected by position and the result reassigned, so the cell
# can be re-run: a second run sees the already-reduced one-column frame and
# leaves it as is, instead of failing when three names are assigned to one column.
if human_simrel_07_df.shape[1] not in (1, 3):
    raise ValueError(
        f"expected 3 columns (or 1 after formatting), got {human_simrel_07_df.shape[1]}"
    )
human_simrel_07_df = human_simrel_07_df.iloc[:, [0]].copy()
human_simrel_07_df.columns = ['UniProt_ID']
human_simrel_07_df

Unnamed: 0,UniProt_ID
0,A0A024RBG1
1,A0A075B6H5
2,A0A075B6H7
3,A0A075B6H8
4,A0A075B6H9
...,...
17813,S4R3P1
17814,S4R3Y5
17815,U3KPV4
17816,W5XKT8


In [54]:
# Keep only the gene/UniProt pairs whose UniProt ID also appears in the
# Revigo result (inner join on 'UniProt_ID')
final_df = uniprot_biomart_merged_df.merge(
    human_simrel_07_df,
    how='inner',
    on='UniProt_ID',
)
final_df

Unnamed: 0,gene_ID,UniProt_ID
0,ENSG00000156508,P68104
1,ENSG00000204628,P63244
2,ENSG00000109971,P11142
3,ENSG00000132341,P62826
4,ENSG00000143947,P62979
...,...,...
10884,ENSG00000174606,Q5VTE6
10885,ENSG00000132128,Q15345
10886,ENSG00000174804,Q9ULV1
10887,ENSG00000000003,O43657


In [55]:
# Attach the per-gene counts to the mapped IDs, put the columns in their
# output order, and give the count column its final name
final_merged_df = (
    gene_counts
    .merge(final_df, on='gene_ID')
    .loc[:, ['UniProt_ID', 'gene_ID', 'counts']]
    .rename(columns={'counts': 'Number of protein interactions'})
)
final_merged_df

Unnamed: 0,UniProt_ID,gene_ID,Number of protein interactions
0,P68104,ENSG00000156508,1595
1,P63244,ENSG00000204628,1556
2,P11142,ENSG00000109971,1527
3,P62826,ENSG00000132341,1450
4,P62979,ENSG00000143947,1353
...,...,...,...
10884,Q8IUM7,ENSG00000174576,1
10885,Q5VTE6,ENSG00000174606,1
10886,Q15345,ENSG00000132128,1
10887,Q9ULV1,ENSG00000174804,1


In [56]:
# Save the final table as a CSV file without the index column
output_path = 'C:/Users/angel/Desktop/Applied Bioinformatics/Thesis/pleiotropy/data/human_simrel_7_protein_interactions.csv'
final_merged_df.to_csv(output_path, index=False)