# Step 3 - Structure data

-- Alex Warwick Vesztrocy, Irene Julca, January 2024

This notebook contains the code to download the structure data, compute the Foldseek alignments, and combine the alignment scores into a single results file.


## 1 -- Download the structure data

In [1]:
from lib.PantherParser import PantherTrees
import glob
from tqdm.auto import tqdm
import pandas as pd 
import sys
import warnings

warnings.filterwarnings("ignore")

from parallel_pandas import ParallelPandas

In [None]:
# Build the PantherTrees accessor from the tree directory and the species
# tree file; `pt` is consumed below when the gene mapping table is built.
trees_dir = "./data/panther-18.0/trees/"
species_tree_fn = "./data/panther-18.0/species_tree.nhx"
pt = PantherTrees(trees_dir, taxonomy_fn=species_tree_fn)

In [None]:
def load_panther_mapping(pt):
    """Flatten every gene of every family in ``pt`` into one DataFrame.

    Each row is the gene's own mapping with an extra ``fam_id`` entry taken
    from the family it belongs to. For every key whose lower-cased name
    starts with ``ensembl`` the value is cut at the first ``.`` (everything
    from the first dot onwards is discarded).

    Parameters
    ----------
    pt : object
        Must provide ``iter_fams()``, yielding families that expose ``id``
        and a ``genes`` mapping whose values are dict-like gene records.

    Returns
    -------
    pandas.DataFrame
        One row per gene, columns being the union of all record keys.
    """
    def _row(fam, gene):
        # Merge into a new dict so the gene record itself is left untouched.
        merged = {'fam_id': fam.id} | gene
        return {
            key: (val.split('.')[0] if key.lower().startswith('ensembl') else val)
            for (key, val) in merged.items()
        }

    rows = [
        _row(fam, gene)
        for fam in pt.iter_fams()
        for gene in fam.genes.values()
    ]
    return pd.DataFrame(rows)

# Build the per-gene mapping table once; it is grouped by species further down.
panther_genes = load_panther_mapping(pt)

In [None]:
!mkdir -p /results/structure/genes

for (sp, zdf) in panther_genes[['species', 'UniProtKB']].groupby('species'):
    zdf[['UniProtKB']].sort_values('UniProtKB').to_csv(f'results/structure/genes/{sp}.tsv.gz', sep='\t', index=False, header=False)

In [None]:
for file in glob.glob('results/structure/genes/*'):
    sp = file.split('/')[-1].split('.')[0]
    cmd = f"./structure/get.py {file} {sp} results/structure/db"
    !cmd

## 2 -- Align with foldseek

In [2]:
!mkdir -p results/structure/res_foldseek

! for SPECIES in `zcat results/outgroup_tests_structure.tsv.gz | cut -d $'\t' -f 20 | sort | uniq`; do\
if [ "$SPECIES" != "species" ]; \
then echo $SPECIES;\
PART=1;\
echo structure/align_foldseek.py results/outgroup_tests_structure.tsv.gz ${SPECIES} results/structure/db results/structure/res_foldseek/${SPECIES}_${PART}.h5 ${PART};\
PART=2;\
echo structure/align_foldseek.py results/outgroup_tests_structure.tsv.gz ${SPECIES} results/structure/db results/structure/res_foldseek/${SPECIES}_${PART}.h5 ${PART};\
PART=3;\
echo structure/align_foldseek.py results/outgroup_tests_structure.tsv.gz ${SPECIES} results/structure/db results/structure/res_foldseek/${SPECIES}_${PART}.h5 ${PART};\
fi;\
done


AMBTC
structure/align_foldseek.py ../results/outgroup_tests_structure.tsv.gz AMBTC ../results/structure/db ../results/structure/res_foldseek/AMBTC_1.h5 1
structure/align_foldseek.py ../results/outgroup_tests_structure.tsv.gz AMBTC ../results/structure/db ../results/structure/res_foldseek/AMBTC_2.h5 2
structure/align_foldseek.py ../results/outgroup_tests_structure.tsv.gz AMBTC ../results/structure/db ../results/structure/res_foldseek/AMBTC_3.h5 3
ANOCA
structure/align_foldseek.py ../results/outgroup_tests_structure.tsv.gz ANOCA ../results/structure/db ../results/structure/res_foldseek/ANOCA_1.h5 1
structure/align_foldseek.py ../results/outgroup_tests_structure.tsv.gz ANOCA ../results/structure/db ../results/structure/res_foldseek/ANOCA_2.h5 2
structure/align_foldseek.py ../results/outgroup_tests_structure.tsv.gz ANOCA ../results/structure/db ../results/structure/res_foldseek/ANOCA_3.h5 3
ANOGA
structure/align_foldseek.py ../results/outgroup_tests_structure.tsv.gz ANOGA ../results/struct

## 3 -- Combine the results

In [None]:
ParallelPandas.initialize()

pair_fn = 'results/outgroup_tests_structure.tsv.gz'
db_path = 'results/structure/res_foldseek/'
out_fn = 'results/structure/structure_results_FOLDSEEK.h5'

# One entry per foldseek result file:
# (output column, PART number in the file name, first gene column, second gene column)
lddt_comparisons = [
    ('struct_ldo_mdo_lddt', 1, 'ldo_gene', 'mdo_gene'),
    ('struct_ldo_out_lddt', 2, 'ldo_gene', 'out_gene'),
    ('struct_mdo_out_lddt', 3, 'mdo_gene', 'out_gene'),
]

pair_df = pd.read_csv(pair_fn, sep='\t')
for (sp, zdf) in tqdm(pair_df.groupby('species')):
    print(sp)
    # Work on an explicit copy so that adding columns never writes into a
    # view of pair_df.
    zdf = zdf.copy()

    for (col, part, gene_a, gene_b) in lddt_comparisons:
        # (gene1, gene2) -> lddt lookup for this species and comparison.
        lddt = pd.read_hdf(f'{db_path}/{sp}_{part}.h5', sp).set_index(['gene1', 'gene2'])['lddt'].to_dict()
        # Rows without a second gene (e.g. no outgroup) get None, as do gene
        # pairs that are absent from the foldseek results.
        zdf[col] = zdf.p_apply(
            lambda x: lddt.get((x[gene_a], x[gene_b]), None) if pd.notna(x[gene_b]) else None,
            axis=1,
        )

    # Each species is stored under its own key in the combined HDF5 file.
    # `key` is passed by keyword because positional use is deprecated in pandas.
    zdf.to_hdf(out_fn, key=sp)
