In [18]:
import pandas as pd
import numpy as np
import itertools

In [19]:
# Load the gzip-compressed, tab-separated human homology table.
df = pd.read_csv(
    "pipeline2/downloads/homology_databases/homo_sapiens.homologies.tsv.gz",
    sep="\t",
    compression="gzip",
)

In [20]:
# Preview the first five homology rows.
df.head(5)

Unnamed: 0,gene_stable_id,protein_stable_id,species,identity,homology_type,homology_gene_stable_id,homology_protein_stable_id,homology_species,homology_identity,dn,ds,goc_score,wga_coverage,is_high_confidence,homology_id
0,ENSG00000271254,ENSP00000480818,homo_sapiens,27.4336,ortholog_many2many,ENSNLEG00000033214,ENSNLEP00000009514,nomascus_leucogenys,72.5752,,,0.0,4.03,0.0,54878896
1,ENSG00000271254,ENSP00000480818,homo_sapiens,20.9861,ortholog_many2many,ENSNLEG00000028011,ENSNLEP00000042954,nomascus_leucogenys,66.6667,,,0.0,75.36,0.0,54878908
2,ENSG00000271254,ENSP00000480818,homo_sapiens,24.6523,ortholog_one2many,ENSPTRG00000000232,ENSPTRP00000058531,pan_troglodytes,80.5785,,,0.0,6.69,0.0,55560202
3,ENSG00000271254,ENSP00000480818,homo_sapiens,14.5386,ortholog_many2many,ENSCHOG00000007713,ENSCHOP00000006838,choloepus_hoffmanni,18.6992,,,0.0,38.74,0.0,61939974
4,ENSG00000271254,ENSP00000480818,homo_sapiens,12.1365,ortholog_many2many,ENSCHOG00000007714,ENSCHOP00000006839,choloepus_hoffmanni,16.9312,,,0.0,12.31,0.0,61939989


In [21]:
# Load the gzip-compressed, tab-separated family table. Column names are supplied
# here, so the first line of the file is read as data rather than as a header.
hmm = pd.read_csv(
    "pipeline3/config/hmm_table.tsv.gz",
    sep="\t",
    compression="gzip",
    header=None,
    names=["family", "gene_id", "sequence_id", "species"],
)

In [22]:
# Preview the first five family assignments.
hmm.head(5)

Unnamed: 0,family,gene_id,sequence_id,species
0,PTHR11432_SF3,ENSTNIG00000004376,ENSTNIP00000007010,tetraodon_nigroviridis
1,PTHR22773_SF41,ENSTNIG00000004377,ENSTNIP00000007011,tetraodon_nigroviridis
2,PTHR10422,ENSTNIG00000000533,ENSTNIP00000003835,tetraodon_nigroviridis
3,PTHR22888,ENSTNIG00000000112,ENSTNIP00000002153,tetraodon_nigroviridis
4,TF656662,ENSTNIG00000001181,ENSTNIP00000003540,tetraodon_nigroviridis


In [23]:
# Load the space-separated species list. Column names are supplied here, so the
# first line of the file is read as data rather than as a header.
species = pd.read_csv(
    "pipeline3/config/Valid_species_new_way.txt",
    sep=" ",
    header=None,
    names=["species", "species_id"],
)

In [24]:
# Preview the first five entries of the species list.
species.head(5)

Unnamed: 0,species,species_id
0,ailuropoda_melanoleuca,ENSAMEG0
1,amphiprion_percula,ENSAPEG0
2,astatotilapia_calliptera,ENSACLG0
3,astyanax_mexicanus,ENSAMXG0
4,bos_taurus,ENSBTAG0


In [25]:
# Keep only homology rows whose homologous species appears in the species list.
df = df.loc[df["homology_species"].isin(species["species"])]

In [26]:
# Draw up to 100 ortholog rows per homologous species, without replacement.
# The previous approach sampled 100 rows with replacement and then dropped the
# repeats, which (a) was unseeded, so the sample changed on every run, and
# (b) could return fewer than 100 rows even for species with enough orthologs.
# Shuffling once with a fixed seed and keeping the first 100 rows of each species
# yields a uniform random subset per species: species with at least 100 orthologs
# contribute exactly 100 rows, smaller species contribute every row they have.
ortholog_samples = (
    df[df.homology_type.str.contains("ortholog")]
    .sample(frac=1, random_state=0)  # seeded shuffle keeps the draw reproducible
    .groupby("homology_species")
    .head(100)
    # groupby().head() keeps the shuffled row order; a stable sort regroups the
    # rows by species, the layout groupby().sample() used to return.
    .sort_values("homology_species", kind="stable")
)

## Generating gene pairs in the same protein family between two species

In [10]:
# Species to compare against human; restrict the family table to these two species.
comparison_species = "ailuropoda_melanoleuca"
temp_hmm = hmm.loc[hmm["species"].isin(["homo_sapiens", comparison_species])]

In [11]:
# Display the family table restricted to homo_sapiens and comparison_species.
temp_hmm

Unnamed: 0,family,gene_id,sequence_id,species
3544665,PTHR26451_SF247,ENSAMEG00000005298,ENSAMEP00000041297,ailuropoda_melanoleuca
3544669,PTHR26451_SF319,ENSAMEG00000027833,ENSAMEP00000019703,ailuropoda_melanoleuca
3544671,PTHR22754,ENSAMEG00000025036,ENSAMEP00000025763,ailuropoda_melanoleuca
3544673,PTHR11006_SF52,ENSAMEG00000031399,ENSAMEP00000043819,ailuropoda_melanoleuca
3544683,PTHR22754,ENSAMEG00000001136,ENSAMEP00000001303,ailuropoda_melanoleuca
...,...,...,...,...
3786593,PTHR19271,ENSG00000198727,ENSP00000354554,homo_sapiens
3786594,PTHR12231_SF9,ENSG00000277666,ENSP00000482263,homo_sapiens
3786595,PTHR24118_SF46,ENSG00000276760,ENSP00000478796,homo_sapiens
3786596,PTHR26451_SF53,ENSG00000275249,ENSP00000484075,homo_sapiens


In [12]:
# Build, family by family, every (human gene, comparison-species gene) pair whose
# members are assigned to the same protein family.
family_pairs = []
# Group once instead of re-filtering temp_hmm for each family. sort=False visits
# families in order of first appearance, the same order a drop_duplicates() walk
# over the family column would produce.
for family, members in temp_hmm.groupby("family", sort=False):
    # split the family's rows into the human side and the comparison-species side
    human = members[members.species == "homo_sapiens"]
    other = members[members.species == comparison_species]
    # skip families present in only one of the two species
    if human.empty or other.empty:
        continue
    # enumerate all cross-species gene pairs within this family
    pairs = pd.Series(itertools.product(list(human.gene_id), list(other.gene_id)))
    # collect the per-family Series; they are concatenated in a later cell
    family_pairs.append(pairs)
    

In [13]:
# Stack the per-family Series into one Series holding every same-family gene pair.
# Each piece keeps its own 0-based labels, so index values repeat in the result.
family_pairs = pd.concat(objs=family_pairs, axis=0)

In [14]:
# Display the concatenated Series of same-family gene pairs.
family_pairs

0    (ENSG00000183024, ENSAMEG00000005298)
0    (ENSG00000172150, ENSAMEG00000027833)
1    (ENSG00000172150, ENSAMEG00000028317)
2    (ENSG00000172146, ENSAMEG00000027833)
3    (ENSG00000172146, ENSAMEG00000028317)
                     ...                  
1    (ENSG00000198908, ENSAMEG00000019140)
0    (ENSG00000198088, ENSAMEG00000010461)
0    (ENSG00000080572, ENSAMEG00000010454)
0    (ENSG00000101901, ENSAMEG00000011941)
0    (ENSG00000169684, ENSAMEG00000012716)
Length: 65289, dtype: object

In [15]:
# Collect (human gene, comparison-species gene) tuples for every row of df that is
# an ortholog relationship with comparison_species. Row labels from df are kept.
ortholog_pairs = (
    df.loc[
        (df["homology_species"] == comparison_species)
        & df["homology_type"].str.contains("ortholog"),
        ["gene_stable_id", "homology_gene_stable_id"],
    ]
    .apply(tuple, axis=1)
)

In [16]:
# Count the same-family pairs that also appear among the ortholog pairs.
is_ortholog_pair = family_pairs.isin(ortholog_pairs)
is_ortholog_pair.sum()

16710

In [17]:
# Display the Series of ortholog gene pairs for comparison_species.
ortholog_pairs

149        (ENSG00000276017, ENSAMEG00000027505)
378        (ENSG00000277196, ENSAMEG00000011835)
654        (ENSG00000278633, ENSAMEG00000027505)
707        (ENSG00000274847, ENSAMEG00000007524)
1087       (ENSG00000276345, ENSAMEG00000002153)
                           ...                  
3982737    (ENSG00000182899, ENSAMEG00000019159)
3982933    (ENSG00000185621, ENSAMEG00000004187)
3983644    (ENSG00000198899, ENSAMEG00000025297)
3984843    (ENSG00000277666, ENSAMEG00000014514)
3985208    (ENSG00000275249, ENSAMEG00000026002)
Length: 18702, dtype: object

In [None]:
# Show the homology type of every ortholog pair that is absent from family_pairs.
# ortholog_pairs was built row-wise from df and carries df's row labels, so the
# matching rows are fetched with one label lookup. The previous loop re-scanned
# all of df once per missing pair and printed a separate Series each time.
missing_pairs = ortholog_pairs[~ortholog_pairs.isin(family_pairs)]
df.loc[
    missing_pairs.index,
    ["gene_stable_id", "homology_gene_stable_id", "homology_type"],
]

In [27]:
# Look up the homology rows linking one specific human gene to one specific homolog.
df.loc[
    df["gene_stable_id"].eq("ENSG00000088782")
    & df["homology_gene_stable_id"].eq("ENSAMEG00000011292")
]

Unnamed: 0,gene_stable_id,protein_stable_id,species,identity,homology_type,homology_gene_stable_id,homology_protein_stable_id,homology_species,homology_identity,dn,ds,goc_score,wga_coverage,is_high_confidence,homology_id
9044,ENSG00000088782,ENSP00000371825,homo_sapiens,65.6566,ortholog_one2one,ENSAMEG00000011292,ENSAMEP00000011868,ailuropoda_melanoleuca,65.0,,,100.0,,1.0,83864229


In [28]:
# Look up the homology rows linking one specific human gene to one specific homolog.
df.loc[
    df["gene_stable_id"].eq("ENSG00000163959")
    & df["homology_gene_stable_id"].eq("ENSAMEG00000003867")
]

Unnamed: 0,gene_stable_id,protein_stable_id,species,identity,homology_type,homology_gene_stable_id,homology_protein_stable_id,homology_species,homology_identity,dn,ds,goc_score,wga_coverage,is_high_confidence,homology_id
3977993,ENSG00000163959,ENSP00000296327,homo_sapiens,84.4118,ortholog_one2one,ENSAMEG00000003867,ENSAMEP00000004087,ailuropoda_melanoleuca,84.9112,,,100.0,,1.0,83859110
