# Step 5 -- Genome Mapping
-- Alex Warwick Vesztrocy, February 2024

Generate genome mapping tables for each of the species. This is necessary as PANTHER genomes are imported from UniProt RPs, whereas bgee is using ensembl data directly.

This is going to be a combination of 3 mapping techniques for most of the files.

    1. PANTHER cross references inside the tree files
    2. bgee-uniprot cross references from the UniProt website
    3. ensembl-uniprot cross references taken from the relevant assembly used in bgee

Then for certain species these will be supplemented with bespoke mapping files from 3rd party resources. e.g., RAT, MOUSE, HUMAN.

    - RAT: using RGD gene definition file (contains ensembl cross-references). Downloaded on 7.3.24
    - MOUSE: using MGI gene model co-ordinates file (contains Ensembl gene IDs). Downloaded on 7.3.24
    - HUMAN: use HGNC to ensembl mapping. Generated and downloaded on 7.3.24
    

---

In [1]:
from tqdm.auto import tqdm
import pandas as pd

from lib.PantherParser import PantherTrees

In [2]:
# Handle on the PANTHER 18.0 tree files plus the accompanying species tree;
# iterated family-by-family by load_panther_mapping.
pt = PantherTrees(
     "./data/panther-18.0/trees/",
     taxonomy_fn="./data/panther-18.0/species_tree.nhx",
)

In [3]:
def load_panther_mapping(pt):
    """Flatten every gene of every PANTHER family into one DataFrame.

    Parameters
    ----------
    pt : object exposing ``iter_fams()``; each yielded family must provide an
        ``id`` attribute and a ``genes`` dict whose values are dicts of
        per-gene fields.

    Returns
    -------
    pandas.DataFrame
        One row per gene: the gene's own fields plus a ``fam_id`` column.
        Any field whose name starts with 'ensembl' (case-insensitive) is cut
        at the first '.' (presumably dropping a version suffix).
    """
    def clean(field, value):
        # only ensembl-style fields are truncated; everything else passes through
        if field.lower().startswith('ensembl'):
            return value.split('.')[0]
        return value

    rows = (
        {field: clean(field, value)
         for (field, value) in ({'fam_id': family.id} | gene).items()}
        for family in pt.iter_fams()
        for gene in family.genes.values()
    )
    return pd.DataFrame(rows)

In [4]:
# One row per gene across all PANTHER families, including a `fam_id` column.
pthr_genes = load_panther_mapping(pt)

For each family:   0%|          | 0/15693 [00:00<?, ?it/s]

In [5]:
# species that have data and are in PANTHER
# Keyed by dataset name; each value is a set of species codes that is matched
# against the `species` column of `pthr_genes` further down.
dataset = {'animals': {'ANOCA','BOVIN','CAEEL','CANLF','CHICK','DANRE','DROME','FELCA','GORGO','HORSE','HUMAN','LEPOC','MACMU','MONDO','MOUSE','ORNAN','ORYLA','PANTR','PIG','RAT', 'XENTR'},
           'plants': {'AMBTC','ARATH','BRANA','CAPAN','CUCSA','HELAN','MAIZE','MANES','MARPO','MEDTR','ORYSJ','PHYPA','POPTR','SELML','SETIT','SOLLC','SOLTU','SOYBN','VITVI','WHEAT'}}

---
## Animal Gene Mapping to bgee

Ensuring we get a good mapping between bgee identifiers and PANTHER.

In [6]:
# Restrict the PANTHER gene table to the animal species listed in `dataset`.
pthr_genes_animal = pthr_genes[pthr_genes.species.isin(dataset['animals'])]

In [7]:
# UniProt -> Bgee cross-reference table, reduced to the two columns used for
# joining: the UniProt accession ('Entry', renamed to 'UniProtKB') and 'Bgee'.
bgee_map = (
    pd.read_csv('./data/uniprot/bgee-uniprot.tsv.gz', sep='\t')
    # drop the final character of every Bgee value (the trailing ';')
    .assign(Bgee=lambda df: df['Bgee'].map(lambda xref: xref[:-1]))
    .rename(columns={'Entry': 'UniProtKB'})
    .loc[:, ['UniProtKB', 'Bgee']]
)

In [8]:
# HGNC -> ensembl table: whatever the first two columns of the file are called,
# they become 'hgnc_id' and 'ensembl_hgnc_map' respectively.
hgnc_map = pd.read_csv('./data/mapping/hgnc/hgnc.map', sep='\t')
hgnc_map = hgnc_map.rename(columns={hgnc_map.columns[0]: 'hgnc_id',
                                    hgnc_map.columns[1]: 'ensembl_hgnc_map'})
# keep only what follows the first ':' in each identifier
hgnc_map['hgnc_id'] = hgnc_map['hgnc_id'].map(lambda hgnc: hgnc.split(':')[1])

In [9]:
# load RGD map
def load_rgd():
    """Read the RGD rat gene file and return its RGD -> ensembl pairs.

    Rows without an ENSEMBL_ID are skipped. An ENSEMBL_ID cell may hold several
    ';'-separated identifiers; each one becomes its own output row.

    Returns
    -------
    pandas.DataFrame
        Columns 'rgd_id' (GENE_RGD_ID converted with ``str``) and
        'ensembl_rgd_map' (a single identifier from ENSEMBL_ID).
    """
    genes = pd.read_csv('./data/mapping/rgd/GENES_RAT.txt', sep='\t', comment='#', low_memory=False)
    with_ensembl = genes[genes['ENSEMBL_ID'].notna()]
    pairs = [
        (str(rgd_id), ensembl_id)
        for (rgd_id, joined_ids) in zip(with_ensembl['GENE_RGD_ID'], with_ensembl['ENSEMBL_ID'])
        for ensembl_id in joined_ids.split(';')
    ]
    return pd.DataFrame(pairs, columns=['rgd_id', 'ensembl_rgd_map'])

# RGD id -> ensembl id table, merged onto the RAT genes in the animal mapping loop.
rgd_map = load_rgd()

In [10]:
# load MGI map
def load_mgi():
    """Read the MGI gene-model coordinate report and return MGI -> ensembl pairs.

    Only rows that have an Ensembl gene id are kept. The MGI accession is
    rewritten from '<prefix>:<number>' to 'MGI=<number>'.

    Returns
    -------
    pandas.DataFrame
        Columns 'mgi_id' and 'ensembl_mgi_map'; the index is that of the
        retained rows of the input file.
    """
    coords = pd.read_csv('./data/mapping/mgi/MGI_Gene_Model_Coord.rpt', sep='\t', low_memory=False, index_col=False)
    coords = coords.rename(columns={'1. MGI accession id': 'mgi_id',
                                    '11. Ensembl gene id': 'ensembl_mgi_map'})
    has_ensembl = coords['ensembl_mgi_map'].notna()
    mapped = coords.loc[has_ensembl, ['mgi_id', 'ensembl_mgi_map']]
    return mapped.assign(
        mgi_id=mapped['mgi_id'].map(lambda acc: 'MGI={}'.format(acc.split(':')[1]))
    )

# MGI id -> ensembl id table, merged onto the MOUSE genes in the animal mapping loop.
mgi_map = load_mgi()

In [11]:
stats = []
res = {}

# Species with a bespoke third-party table translating a database ID carried by
# the PANTHER entry into an ensembl gene ID:
#   species -> (table, PANTHER column, table key column, table xref column, type label)
bespoke_maps = {
    'HUMAN': (hgnc_map, 'HGNC', 'hgnc_id', 'ensembl_hgnc_map', 'hgnc'),
    'RAT': (rgd_map, 'RGD', 'rgd_id', 'ensembl_rgd_map', 'rgd'),
    'MOUSE': (mgi_map, 'MGI', 'mgi_id', 'ensembl_mgi_map', 'mgi'),
}


def _tagged_xrefs(df, xref_col, label):
    """Return the (UniProtKB, xref) pairs held in `xref_col`, labelled with their source."""
    pairs = df[['UniProtKB', xref_col]].rename(columns={xref_col: 'xref'})
    pairs['type'] = label
    return pairs


for (sp, zdf) in tqdm(pthr_genes_animal.groupby('species')):
    # join with the ensembl-uniprot map
    ensembl_map = pd.read_csv(f'./data/ensembl/{sp}.tsv.gz', sep='\t')
    ensembl_map = ensembl_map[['gene_stable_id', 'xref']].rename(columns={'xref': 'UniProtKB', 'gene_stable_id': 'ensembl_xref'})
    zdf = pd.merge(zdf, ensembl_map, how='left', on='UniProtKB')

    # join with the uniprot-bgee map
    zdf = pd.merge(zdf, bgee_map, how='left', on='UniProtKB')

    # these are the genes that we have data on...
    expr_df = pd.read_hdf('./results/expr/animal_tpm_expr.h5', sp)
    genes_with_data = set(expr_df.index)

    # Candidate mappings, highest priority first. The concatenation is ordered
    # and duplicates are dropped afterwards (first occurrence wins), so each
    # UniProtKB keeps the mapping from the best source that has expression
    # data: those in source (PANTHER, or via HGNC/RGD/MGI), then those from
    # ensembl, then those from bgee.
    candidates = [
        _tagged_xrefs(zdf, 'EnsemblGenome', 'panther_ensemblgenome'),
        _tagged_xrefs(zdf, 'Ensembl', 'panther_ensembl'),
        _tagged_xrefs(zdf, 'ensembl_xref', 'ensembl-uniprot'),
        _tagged_xrefs(zdf, 'Bgee', 'uniprot-bgee'),
    ]
    if sp in bespoke_maps:
        (table, pthr_col, key_col, xref_col, label) = bespoke_maps[sp]
        zdf = pd.merge(zdf, table, how='left', left_on=pthr_col, right_on=key_col)
        # the bespoke mapping outranks every generic source
        candidates.insert(0, _tagged_xrefs(zdf, xref_col, label))

    map_df = pd.concat(candidates)
    map_df = map_df[map_df.xref.notna()]

    # filter the map to xref that are in the expression data, then keep only
    # the highest-priority mapping per UniProtKB
    map_df = map_df[map_df['xref'].isin(genes_with_data)].drop_duplicates('UniProtKB')

    all_panther_fam_genes = set(zdf['UniProtKB'])
    mapped = map_df[map_df.UniProtKB.isin(all_panther_fam_genes)]
    n_mapped = len(set(mapped.UniProtKB))
    n_in_fam = len(all_panther_fam_genes)
    stats.append({'species': sp,
                  'dataset': 'animal',
                  'n_mapped_with_data': n_mapped,
                  'n_in_fam': n_in_fam,
                  # '.02%' = two decimal places, matching the plant statistics
                  # ('02%' without the dot is a zero-padded width and prints
                  # six decimals)
                  'percent_with_data': '{:.02%}'.format(n_mapped / n_in_fam),
                 })

    res[sp] = mapped

  0%|          | 0/21 [00:00<?, ?it/s]

---

## Plant Mapping
Check the consistency for the plant mapping files and reformat.

In [12]:
def find_uniprot(x):
    """Extract the UniProtKB value from a '|'-separated identifier string.

    The first field starting with 'UniProtKB' is taken and the text between its
    first and second '=' is returned. Returns None when no field starts with
    'UniProtKB'.
    """
    uniprot_fields = (field for field in x.split('|') if field.startswith('UniProtKB'))
    first_match = next(uniprot_fields, None)
    if first_match is None:
        return None
    return first_match.split('=')[1]

In [13]:
# Restrict the PANTHER gene table to the plant species listed in `dataset`.
pthr_genes_plants = pthr_genes[pthr_genes.species.isin(dataset['plants'])]

In [14]:
# Per-species PANTHER gene tables for the plants, keyed by species code.
# Derived from `pthr_genes` rather than from `pthr_genes_plants` itself, so the
# cell can be re-run safely (rebinding the name from its own previous value
# would fail on the second run, when it is already a dict).
pthr_genes_plants = {
    sp: zdf
    for (sp, zdf) in pthr_genes[pthr_genes.species.isin(dataset['plants'])].groupby('species')
}

In [28]:
# NOTE(review): `plant_expr` is not used by any cell visible here — confirm before removing.
plant_expr = 'results/tpm_expr/plants'

# sorted() because iterating the raw set gives a run-dependent order, which
# would shuffle the rows of the stats table between runs.
for sp in tqdm(sorted(dataset['plants'])):
    # load the expression to get all gene identifiers
    general_df = pd.read_hdf('./results/expr/plant_general_tpm_expr.h5', sp)
    specific_df = pd.read_hdf('./results/expr/plant_specific_tpm_expr.h5', sp)
    sp_gene_ids = set(general_df.index) | set(specific_df.index)

    # load mapping file (two unnamed tab-separated columns: gene id, PANTHER id)
    map_df = pd.read_csv(f'data/plant_expr/mapping/{sp}.conversion.panther.txt', sep='\t', names=['gene_id', 'panther_id'])
    map_df['uniprot_id'] = map_df.panther_id.apply(find_uniprot)
    # every PANTHER id is expected to carry a UniProtKB accession
    assert map_df.uniprot_id.isna().sum() == 0, f'{sp}: mapping rows without a UniProtKB accession'

    # keep only mappings whose gene id has expression data
    map_df1 = map_df[map_df.gene_id.isin(sp_gene_ids)]

    # UniProtKB accessions of this species that sit in a PANTHER family
    pthr_uniprot = set(pthr_genes_plants[sp].UniProtKB)

    # identify proportion that we have mapped
    n_mapped = len(pthr_uniprot & set(map_df1.uniprot_id))
    n_in_fam = len(pthr_uniprot)
    stats.append({'species': sp,
                  'dataset': 'plant',
                  'n_mapped_with_data': n_mapped,
                  'n_in_fam': n_in_fam,
                  'percent_with_data': '{:.02%}'.format(n_mapped / n_in_fam),
                 })

    # save in the same layout as the animals: UniProtKB, xref, type
    map_df1 = map_df1[['uniprot_id', 'gene_id']].rename(columns={'uniprot_id': 'UniProtKB', 'gene_id': 'xref'})
    map_df1['type'] = 'plant_mapping'

    res[sp] = map_df1[map_df1.UniProtKB.isin(pthr_uniprot)]

  0%|          | 0/20 [00:00<?, ?it/s]

In [29]:
# Write one HDF5 node per species. `key` is passed by keyword: positional use
# is deprecated since pandas 2.1 and keyword-only in pandas 3.
# NOTE(review): to_hdf opens in append mode by default, so nodes left in
# gene_map.h5 by an earlier run are not removed — confirm that is intended.
for sp in tqdm(res):
    res[sp].to_hdf('./results/expr/gene_map.h5', key=sp, complevel=9, complib='zlib')

  0%|          | 0/41 [00:00<?, ?it/s]

In [30]:
# One row per species (animals and plants) with the mapping coverage counts.
stats_df = pd.DataFrame(stats)

In [31]:
# Persist the coverage table as a tab-separated file (row index omitted).
stats_df.to_csv('./results/tables/gene_map_stats.tsv', sep='\t', index=False)

In [32]:
# NOTE(review): in the recorded output CANLF (~2.9%) and DANRE (~28.9%) map far
# below the other species — worth confirming the identifier sources for these two.
stats_df

Unnamed: 0,species,dataset,n_mapped_with_data,n_in_fam,percent_with_data
0,ANOCA,animal,12149,18355,66.189049%
1,BOVIN,animal,19996,22367,89.399562%
2,CAEEL,animal,12191,14390,84.718555%
3,CANLF,animal,565,19481,2.900262%
4,CHICK,animal,14436,16357,88.255793%
5,DANRE,animal,6946,24063,28.865894%
6,DROME,animal,10804,10985,98.352299%
7,FELCA,animal,16747,18549,90.285191%
8,GORGO,animal,16535,19527,84.677626%
9,HORSE,animal,17963,19857,90.461802%
