# Overview
Previous results were obtained without the Halovivax sp. KZCA124 data. This notebook updates the analysis to incorporate it.
In this notebook:
- I get the FsxA of Halovivax from the previously inferred homolog groups.
- Infer the position of the ectodomain for it (by means of an hmmsearch employing an Fsx ectodomain HMM as query against the sequence).
- Extract the ectodomain and join it to the data sent by Martin.
- Infer a new FsxA-ectodomain phylogeny
- Get metadata for the genome from NCBI.

# Getting Halovivax FsxA

In [38]:
# import libraries
import os
import subprocess
import glob
from Bio import SeqIO
import pandas as pd

# define auxiliary function
def create_dir(dir):
    """Create directory <dir> if nothing exists at that path yet.

    Missing parent directories are created as well, so nested targets do not
    depend on being created in a particular order.

    Parameters
    ----------
    dir : str
        Path of the directory to create.

    Returns
    -------
    None
    """
    if not os.path.exists(dir):
        # exist_ok covers the case where the directory appears between the check and the call
        os.makedirs(dir, exist_ok = True)
        
# make sure the output directories exist
# (a parent is listed before its child so that it is created first)
target_dirs = ['../data/new_subset/halovivax_seqs',
               '../data/new_subset/halovivax_seqs/hmmsearch_results',
               '../data/new_subset/including_halovivax']

for target_dir in target_dirs:
    create_dir(target_dir)

# ORF annotation table for the archaeal sequences
orf_annot_table = pd.read_csv(
    '../../analysis_mobile_elements_with_MAGs_and_Halovivax/results/MEs_annotated_features_vs_predicted_orfs_BRHs.tsv',
    sep = '\t'
)

# members of the CG_3 cluster (i.e. FsxAs), together with their identifiers
fsxa_seqs = list(SeqIO.parse('../../analysis_mobile_elements_with_MAGs_and_Halovivax/results/trying_to_catch_homologs/results/connected_groups_filtered/fastas/protein/CG_3.faa', 'fasta'))
fsxa_ids = [fsxa.id for fsxa in fsxa_seqs]

# genome / taxonomy table
taxa_info = pd.read_csv('../../analysis_mobile_elements_with_MAGs_and_Halovivax/data/genomes_and_taxonomy.csv')

# genome ID of the first entry whose taxonomy mentions Halovivax
halovivax_rows = taxa_info.query("`Taxonomy`.str.contains('Halovivax')")
halovivax_id = halovivax_rows['Genome_id'].to_list()[0]

# identify the Halovivax FsxA among the CG_3 members and save it
# NOTE(review): the contig accession is hard-coded here, while <halovivax_id> is computed
# earlier in this cell and never used -- confirm whether both are meant to be the same value
halovivax_contig = 'NZ_CP071597.1'

# select the matching rows once; the accession is matched literally (regex = False) so that
# its '.' cannot match an arbitrary character, and missing subject IDs count as non-matches
is_halovivax_fsxA = (orf_annot_table['subject_id'].isin(fsxa_ids)
                     & orf_annot_table['subject_id'].str.contains(halovivax_contig, regex = False, na = False))
halovivax_fsxA_hits = orf_annot_table.loc[is_halovivax_fsxA]

if halovivax_fsxA_hits.empty:
    raise ValueError('No CG_3 (FsxA) member found on contig {0}'.format(halovivax_contig))

# as before, the first matching row is the one used
halovivax_fsxA_id = halovivax_fsxA_hits['subject_id'].to_list()[0]
halovivax_fsxA_name = halovivax_fsxA_hits['query_id'].to_list()[0]

halovivax_fsxA_record = [record for record in fsxa_seqs if record.id == halovivax_fsxA_id]

if not halovivax_fsxA_record:
    raise ValueError('Sequence {0} not found among the CG_3 records'.format(halovivax_fsxA_id))

# rename (use the query ID as FASTA header, drop the description) and save
halovivax_fsxA_record[0].id = halovivax_fsxA_name
halovivax_fsxA_record[0].name = halovivax_fsxA_name
halovivax_fsxA_record[0].description = ''

# the FASTA is only written the first time; an existing file is left untouched
if not os.path.exists('../data/new_subset/halovivax_seqs/halovivax_fsxA.faa'):
    with open('../data/new_subset/halovivax_seqs/halovivax_fsxA.faa', 'w') as handle_fasta:
        SeqIO.write(halovivax_fsxA_record, handle_fasta, 'fasta')

# Extract Halovivax FsxA ectodomain

In [50]:
# perform hmmsearch to get ectodomain
# all hmmsearch reports share this path prefix and differ only in their extension
hmmsearch_prefix = '../data/new_subset/halovivax_seqs/hmmsearch_results/fsx_ectodomain_vs_halovivax_fsxA'

# the per-domain table doubles as the "already done" marker: the search is skipped when it exists
if not os.path.exists(hmmsearch_prefix + '.domtblout'):
    hmmsearch_cmd = ['hmmsearch',
                     '-o', hmmsearch_prefix + '.hmmsearchout',        # human-readable report
                     '--tblout', hmmsearch_prefix + '.tsv',           # per-sequence hits
                     '--domtblout', hmmsearch_prefix + '.domtblout',  # per-domain hits
                     '--pfamtblout', hmmsearch_prefix + '.pfamtblout',
                     '--cpu', '10',
                     '../../pR1SE/results/hmmer_search/data/fsx.ectos.hmm',       # query: Fsx ectodomain HMM
                     '../data/new_subset/halovivax_seqs/halovivax_fsxA.faa']      # target: Halovivax FsxA
    # check = True raises CalledProcessError if hmmsearch exits with an error,
    # instead of silently carrying on with missing or partial reports
    subprocess.run(hmmsearch_cmd, check = True)

In [51]:
%%bash

# print the human-readable hmmsearch report (the file passed to -o in the previous cell)
cat ../data/new_subset/halovivax_seqs/hmmsearch_results/fsx_ectodomain_vs_halovivax_fsxA.hmmsearchout

# hmmsearch :: search profile(s) against a sequence database
# HMMER 3.1b2 (February 2015); http://hmmer.org/
# Copyright (C) 2015 Howard Hughes Medical Institute.
# Freely distributed under the GNU General Public License (GPLv3).
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# query HMM file:                  ../../pR1SE/results/hmmer_search/data/fsx.ectos.hmm
# target sequence database:        ../data/new_subset/halovivax_seqs/halovivax_fsxA.faa
# output directed to file:         ../data/new_subset/halovivax_seqs/hmmsearch_results/fsx_ectodomain_vs_halovivax_fsxA.hmmsearchout
# per-seq hits tabular output:     ../data/new_subset/halovivax_seqs/hmmsearch_results/fsx_ectodomain_vs_halovivax_fsxA.tsv
# per-dom hits tabular output:     ../data/new_subset/halovivax_seqs/hmmsearch_results/fsx_ectodomain_vs_halovivax_fsxA.domtblout
# pfam-style tabular hit output:   ../data/new_subset/halovivax_seqs/hmmsearch_results/fsx_ectodomain_vs_halovivax_fsxA.pfamtblout
# 

In [55]:
# extract ectodomain sequence
# ectodomain boundaries as 1-based, inclusive residue positions; the slice and the
# '/start-end' suffix of the record name are both derived from these two values
# NOTE(review): presumably taken from the hmmsearch hit above -- confirm against the domtblout
ectodomain_start, ectodomain_end = 24, 501
ectodomain_label = '{0}/{1}-{2}'.format(halovivax_fsxA_record[0].id, ectodomain_start, ectodomain_end)

# Python slices are 0-based and end-exclusive, hence start - 1 and an unchanged end
halovivax_fsxA_ectodomain_record = halovivax_fsxA_record[0][ectodomain_start - 1:ectodomain_end]
halovivax_fsxA_ectodomain_record.id = ectodomain_label
halovivax_fsxA_ectodomain_record.name = ectodomain_label

# save into FASTA (only written the first time; an existing file is left untouched)
if not os.path.exists('../data/new_subset/halovivax_seqs/halovivax_fsxA_ectodomain.faa'):
    with open('../data/new_subset/halovivax_seqs/halovivax_fsxA_ectodomain.faa', 'w') as handle_fasta:
        SeqIO.write(halovivax_fsxA_ectodomain_record, handle_fasta, 'fasta')

In [56]:
%%bash

# print the ectodomain FASTA so that its header and sequence can be checked by eye
cat ../data/new_subset/halovivax_seqs/halovivax_fsxA_ectodomain.faa

>lcl|NZ_CP071597.1_prot_WP_207587115.1_2894/24-501
AAVTSVDSIQFDSNSKFFSGEVFVIQYISNFDTDRIDVVLSSNDLEQAADGEVDQDLSID
VVQQDTAALYSISPSGEPRLGNLELTTAEKSTFEELDRWAWDTCYDINGDGGKEYAYESV
LTWSGTVYRGYCARENGFYGPVGRITKDREVFTTEWRVEASGESAQTATLSNGDTGRGVV
SDIGDHVKVRWDGNLDTGEEAPPADKEYALHGNQFEDGWRIIDRGRYADWRQHVRDLDTA
YEQWRDGDRSRDYLQNQLDTATEQAAAEYTGSPLTSAETVSSTYTDGQLRLEMDTDLAYP
SFTVYVDGAEYVSVSKPVGRPEITSTNGDEFGELDTGYVTGTVRNVGDGEGSFAGRLTSC
SDGFSFDSTQRTQRVDPGASVTYEFPVSFTSTGDQDEVGGSCTIEVTDTGSGERDFATAA
VTGVQENECSPGERFSKVASGGQHVIYQCSEDGMTFTEVERCEQGEEARQIDSELQCV


# Join to previous FASTA, align and infer ectodomain phylogeny

In [66]:
# FsxA ectodomains of the previous selection, with the Halovivax ectodomain added at the end
combined_fasta = '../data/new_subset/including_halovivax/NewSelEcoTree_with_Halovivax.fa'

fsxA_ectodomains_records = list(SeqIO.parse('../data/new_subset/NewSelEcoTree.fa', 'fasta'))
fsxA_ectodomains_records += [halovivax_fsxA_ectodomain_record]

# write the combined set, unless the FASTA is already there
if not os.path.exists(combined_fasta):
    with open(combined_fasta, 'w') as handle_fasta:
        SeqIO.write(fsxA_ectodomains_records, handle_fasta, 'fasta')



**Note**: the sequence of Haloterrigena's ectodomain was also added by hand!

In [70]:
# run MAFFT under linsi, then infer the phylogeny with IQ-TREE
fasta_file = '../data/new_subset/including_halovivax/NewSelEcoTree_with_Halovivax.fa'
alignment_file = '../data/new_subset/including_halovivax/NewSelEcoTree_with_Halovivax_linsi.fa'
iqtree_prefix = '../data/new_subset/including_halovivax/NewSelEcoTree_iq2'

# each step is skipped when its output file already exists
if not os.path.exists(alignment_file):
    mafft_command = ['mafft', '--maxiterate', '1000', '--localpair', fasta_file] # line for L-INS-I
    try:
        # MAFFT writes the alignment to stdout; the with-block guarantees the handle is closed
        with open(alignment_file, 'w') as out_file:
            subprocess.run(mafft_command, stdout = out_file, check = True)
    except (OSError, subprocess.CalledProcessError):
        # opening the file already created it: remove the empty/partial alignment so that
        # the existence check above does not take it for a finished run next time
        if os.path.exists(alignment_file):
            os.remove(alignment_file)
        raise

if not os.path.exists(iqtree_prefix + '.treefile'):
    # run IQTree
    iqtree_cmd = ['iqtree2',
                  '-s', alignment_file,
                  '-nt', 'AUTO',
                  '-m', 'MFP',
                  '-safe',
                  '-alrt', '1000',
                  '-bb', '1000',
                  '-pre', iqtree_prefix]
    # check = True raises CalledProcessError if IQ-TREE exits with an error
    subprocess.run(iqtree_cmd, check = True)

# Retrieve metadata of Halovivax from NCBI

In [68]:
# import libraries
from Bio import Entrez
import collections
# imported up front so that a missing package raises here instead of being
# reported as a row of 'ERROR' values by the try/except below
import xmltodict

# contact e-mail sent along with the Entrez requests
Entrez.email = 'mauricio.langleib@gmail.com'


def _as_list(node):
    """Return <node> as a list.

    xmltodict gives a single mapping when an element occurs once and a list
    when it occurs several times; this lets callers iterate in both cases.
    None becomes an empty list.
    """
    if node is None:
        return []
    if isinstance(node, list):
        return node
    return [node]


def _geo_row(biosample_id, geo_loc_name, lat, lon, isolation_source, title, taxa_id, organism):
    """Build the one-row DataFrame used for both parsed and failed BioSamples."""
    return pd.DataFrame.from_dict({'BioSample ID': [biosample_id],
                                   'Geographical location': [geo_loc_name],
                                   'Latitude': [str(lat)],
                                   'Longitude': [str(lon)],
                                   'Isolation source': [isolation_source],
                                   'Title': [title],
                                   'Taxa ID': [taxa_id],
                                   'Organism': [organism]})


def _biosample_to_row(biosample_id, sample):
    """Turn one parsed <BioSample> element into a one-row DataFrame.

    Parameters
    ----------
    biosample_id : str
        BioSample accession written into the 'BioSample ID' column.
    sample : dict
        Mapping produced by xmltodict for a single <BioSample> element.

    Returns
    -------
    pandas.DataFrame
        One row; every field that is absent from <sample> is set to 'NA'.
    """
    # every field starts at 'NA' for each sample, so nothing carries over between samples
    geo_loc_name = lat = lon = isolation_source = 'NA'

    attributes = (sample.get('Attributes') or {}).get('Attribute')
    for attribute in _as_list(attributes):
        if not isinstance(attribute, dict):
            continue
        attribute_name = attribute.get('@attribute_name')
        attribute_text = attribute.get('#text', 'NA')
        if attribute_name == 'geo_loc_name':
            geo_loc_name = attribute_text
        elif attribute_name == 'lat_lon':
            # expected as four space-separated fields: value, hemisphere, value, hemisphere;
            # anything else (e.g. a free-text placeholder) leaves both coordinates at 'NA'
            lat_lon_fields = attribute_text.split(' ')
            if len(lat_lon_fields) >= 4:
                lat = lat_lon_fields[0] + ' ' + lat_lon_fields[1]
                lon = lat_lon_fields[2] + ' ' + lat_lon_fields[3]
        elif attribute_name == 'isolation_source':
            isolation_source = attribute_text

    description = sample.get('Description')
    if not isinstance(description, dict):
        description = {}
    title = description.get('Title', 'NA')

    organism_node = description.get('Organism')
    if not isinstance(organism_node, dict):
        organism_node = {}
    taxa_id = organism_node.get('@taxonomy_id', 'NA')
    organism = organism_node.get('OrganismName', 'NA')

    return _geo_row(biosample_id, geo_loc_name, lat, lon, isolation_source, title, taxa_id, organism)


# list of one-row DataFrames, concatenated at the end
geo_data_rows = []

# BioSample to get geographical data for
biosample_id = 'SAMN18203374'
try:
    # first get the UID(s) for the given BioSample ID
    handle = Entrez.esearch(db = 'biosample', term = biosample_id)
    biosample_search_dict = Entrez.read(handle)
    handle.close()
    biosample_uid = biosample_search_dict['IdList']
    if not biosample_uid:
        raise LookupError('no BioSample UID found for {0}'.format(biosample_id))
    # now fetch the record(s) for those UID(s) and parse the XML
    # NOTE(review): <type> is passed through unchanged from the original call; the usual
    # efetch arguments are rettype / retmode -- confirm it is intended
    handle = Entrez.efetch(db = 'biosample', id = biosample_uid, type = 'text')
    biosample_dict = xmltodict.parse(handle)
    handle.close()
    # one <BioSample> comes back as a mapping, several as a list: handle both the same way
    for sample in _as_list(biosample_dict['BioSampleSet']['BioSample']):
        geo_data_rows.append(_biosample_to_row(biosample_id, sample))
    if not geo_data_rows:
        raise LookupError('no BioSample record returned for {0}'.format(biosample_id))
except Exception as error:
    # best effort: keep the notebook running, but say what went wrong and flag the row
    print('WARNING: could not retrieve metadata for {0}: {1!r}'.format(biosample_id, error))
    geo_data_rows.append(_geo_row(biosample_id, 'ERROR', 'ERROR', 'ERROR', 'ERROR', 'ERROR', 'ERROR', 'ERROR'))

halovivax_geo_data_table = pd.concat(geo_data_rows)

In [69]:
# display the BioSample metadata table built in the previous cell
halovivax_geo_data_table

Unnamed: 0,BioSample ID,Geographical location,Latitude,Longitude,Isolation source,Title,Taxa ID,Organism
0,SAMN18203374,China,,,Salt lake,Microbe sample from Halovivax sp. KZCA124,2817025,Halovivax sp. KZCA124


**Extra info**: Halovivax species appear to be mesophiles that are usually grown under near neutral conditions (e.g. https://www.dsmz.de/collection/catalogue/details/culture/DSM-18321 and its growth medium specification https://www.dsmz.de/microorganisms/medium/pdf/DSMZ_Medium1460.pdf).
