# Double-check an example pair in the database

In [36]:
import learn2therm.database
import pandas as pd

In [3]:
# Open the learn2therm database stored one directory above this notebook.
db = learn2therm.database.L2TDatabase('../data/database')

### Get one high-confidence protein pair from the database (first row that passes strict filters)

In [78]:
# Fetch one protein pair together with the taxa metadata of both organisms
# (source file path, taxa index, 16S sequence) and the 16S comparison stats of
# the corresponding taxa pair.
# Filters applied: both 16S lengths > 1300, protein E-value < 1e-46, query and
# subject alignment coverage > 0.95, scaled percent identity > 0.7.
# NOTE(review): LIMIT 1 is used without ORDER BY, so which qualifying row is
# returned is left to the database engine -- confirm if a stable example matters.
protein = db.execute("""
    SELECT 
        taxa_m.filepath AS meso_file,
        taxa_t.filepath AS thermo_file,
        taxa_m.taxa_index AS meso_taxa_index,
        taxa_t.taxa_index AS thermo_taxa_index,
        taxa_m.seq_16srRNA AS meso_seq_16srRNA,
        taxa_t.seq_16srRNA AS thermo_seq_16srRNA,
        protein_pairs.scaled_local_symmetric_percent_id,
        protein_pairs.local_E_value, 
        protein_pairs.subject_align_cov, 
        protein_pairs.query_align_cov, 
        protein_pairs.thermo_protein_int_index,
        protein_pairs.meso_protein_int_index,
        taxa_pairs.local_E_value AS E_16s,
        taxa_pairs.scaled_local_symmetric_percent_id AS pid_16s
    FROM protein_pairs
    INNER JOIN taxa AS taxa_m ON (protein_pairs.meso_index=taxa_m.taxa_index)
    INNER JOIN taxa AS taxa_t ON (protein_pairs.thermo_index=taxa_t.taxa_index)
    INNER JOIN taxa_pairs ON (taxa_pairs.meso_index=protein_pairs.meso_index AND taxa_pairs.thermo_index=protein_pairs.thermo_index)
    WHERE
        taxa_m.len_16s>1300
        AND taxa_t.len_16s>1300
        AND protein_pairs.local_E_Value< 1e-46
        AND protein_pairs.query_align_cov > 0.95
        AND protein_pairs.subject_align_cov > 0.95
        AND protein_pairs.scaled_local_symmetric_percent_id > 0.7
    LIMIT 1
""")

In [79]:
# Collapse the single-row query result into a Series keyed by column name.
# NOTE: this rebinds `protein`, so the cell is meant to run once per query.
protein = pd.Series(protein.transpose()[0])

In [80]:
# Display the selected pair: one entry per column of the query above.
protein

meso_file                            ./data/refseq/bacteria/GCF_013409865.1_ASM1340...
thermo_file                          ./data/refseq/bacteria/GCF_006363815.1_ASM6363...
meso_taxa_index                                                                   4886
thermo_taxa_index                                                                14963
meso_seq_16srRNA                     TTTACGGAGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTG...
thermo_seq_16srRNA                   TTGATGGAGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTG...
scaled_local_symmetric_percent_id                                                  0.8
local_E_value                                                                      0.0
subject_align_cov                                                             0.969697
query_align_cov                                                               0.962406
thermo_protein_int_index                                                      28682671
meso_protein_int_index                     

In [81]:
# Look up the thermo protein's sequence by the index carried in the selected
# pair rather than a hard-coded literal, so this cell stays in sync with
# whatever row the pair query returned.
# int() guarantees that only an integer literal is interpolated into the SQL.
thermo_protein_index = int(protein['thermo_protein_int_index'])
thermo_prot_seq = db.execute(
    f"""SELECT * FROM proteins WHERE protein_int_index={thermo_protein_index}"""
)['protein_seq'][0]
thermo_prot_seq

'MPPRSKAGAKKVRRKEKKNIAHGHAHIKSTFNNTIVSITDPQGNVISWASAGHVGFKGSRKSTPFAAQQAAEAAARRAMEHGMRKVDVFVKGPGSGRETAIRSLQATGLEVGSIQDVTPVPHNGCRPPKRRRV'

In [82]:
# Look up the meso protein's sequence by the index carried in the selected
# pair rather than a hard-coded literal, so this cell stays in sync with
# whatever row the pair query returned.
# int() guarantees that only an integer literal is interpolated into the SQL.
meso_protein_index = int(protein['meso_protein_int_index'])
meso_prot_seq = db.execute(
    f"""SELECT * FROM proteins WHERE protein_int_index={meso_protein_index}"""
)['protein_seq'][0]
meso_prot_seq

'MATPKSAARKPRKKEKKNVAVGQAHIKSTFNNTIVSITDTTGAVISWASSGGVGFKGSRKSTPFAAQLAAESAARQAQEHGMKKVDVFVKGPGSGRETAIRSLQAAGLEVGSINDVTPQAHNGCRPPKRRRV'

> #### Online BLAST of the two proteins reports 96% query coverage and 82% identity. Our value of 80% makes sense because our percent identity is scaled. This is pretty convincing.

***

## Manually open files, get 16s sequence

In [83]:
import learn2therm.io

In [84]:
# The stored file path is relative to the directory above this notebook, so
# prefix it with '../' before opening the file in GenBank format.
thermo_path = f"../{protein['thermo_file']}"
thermo = learn2therm.io.seq_io_gnuzipped(thermo_path, 'genbank')

In [86]:
# Scan every feature of every record in the thermo file and collect:
#   * seq_16srRNA       - the sequence of the 16S rRNA feature
#   * protein_sequences - the translation and product description of each CDS
# NOTE: if several 16S features exist, each is printed and the last one found
# is the one left in seq_16srRNA.
protein_sequences = {
    'sequence': [],
    'desc': [],
}
seq_16srRNA = None
for record in thermo:
    for feature in record.features:
        qualifiers = feature.qualifiers
        # 'product' may be missing from a feature; look it up once with a
        # None fallback so an rRNA feature without it cannot raise KeyError.
        product = (qualifiers.get('product') or [None])[0]
        # check for 16s
        if feature.type == 'rRNA' and product == '16S ribosomal RNA':
            seq_16srRNA = feature.extract(record.seq)
            print(seq_16srRNA)
        # check for protein with a translation
        if feature.type == 'CDS' and 'translation' in qualifiers:
            translations = qualifiers['translation']
            if len(translations) > 1:
                raise ValueError('Multiple translations for feature')
            protein_sequences['sequence'].append(translations[0])
            # desc is None when the CDS carries no product qualifier
            protein_sequences['desc'].append(product)

TTGATGGAGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAGCGGAAAGGCCCCTTCGGGGGTACTCGAGCGGCGAACGGGTGAGTAACACGTGAGCAACCTGCCCTCGACTCTGGGATAAGCCTGGGAAACCGGGTCTAATACCGGATATGACCCGTCATCGCATGATGTGCGGGTGGAAAGTTCCCCTTTTTGGGGTTCGGTCGGGGATGGGCTCGCGGCCTATCAGCTTGTTGGTGGGGTAACGGCCTACCAAGGCGACGACGGGTAGCCGGCCTGAGAGGGCGACCGGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCGCAATGGGCGGAAGCCTGACGCAGCGACGCCGCGTGAGGGATGAAGGCCTTCGGGTTGTAAACCTCTTTCAGCACCGACGAATTCGGACGGTAGGTGCAGAAGAAGCGCCGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGGCGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGCTTGTCGCGTCTGCCGTGAAAGCCCACGGCTTAACCGTGGGTCTGCGGTGGATACGGGCAGGCTAGAGGCAGGTAGGGGAGCATGGAATTCCCGGTGTAGCGGTGAAATGCGCAGATATCGGGAGGAACACCGGTGGCGAAGGCGGTGCTCTGGGCCTGTCCTGACGCTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGTTGGGCGCTAGGTGTGGGGTTCTTCCACGGGCTCCGCGCCGTAGCTAACGCATTAAGCGCCCCGCCTGGGGAGTACGGCCGCAAGGCTAAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGCGGAGCATGTTGCTTAATTCGACGCAACGCGAAGAACCTTACCAGGGCTTGACATCACCCGAAAACCTGCAGAGA

In [87]:
# Keep the thermo results under dedicated names; the generic names
# seq_16srRNA / protein_sequences are reused when the meso file is parsed.
thermo_16s, thermo_proteins = seq_16srRNA, protein_sequences

In [88]:
# Is the protein sequence stored in the database among the CDS translations
# read from the thermo source file?
thermo_prot_seq in thermo_proteins['sequence']

True

> #### The thermo protein was in the original file... yay!

In [95]:
# Does the 16S sequence stored in the database match the one extracted from
# the thermo source file?
# NOTE(review): thermo_16s comes from feature.extract(...), presumably a
# sequence object rather than a plain str -- confirm == compares by content.
protein['thermo_seq_16srRNA'] == thermo_16s

True

> #### The thermo 16S sequence was in the original file, yay!

In [96]:
# The stored file path is relative to the directory above this notebook, so
# prefix it with '../' before opening the file in GenBank format.
meso_path = f"../{protein['meso_file']}"
meso = learn2therm.io.seq_io_gnuzipped(meso_path, 'genbank')

In [97]:
# Scan every feature of every record in the meso file and collect:
#   * seq_16srRNA       - the sequence of the 16S rRNA feature
#   * protein_sequences - the translation and product description of each CDS
# NOTE: if several 16S features exist, each is printed and the last one found
# is the one left in seq_16srRNA.
protein_sequences = {
    'sequence': [],
    'desc': [],
}
seq_16srRNA = None
for record in meso:
    for feature in record.features:
        qualifiers = feature.qualifiers
        # 'product' may be missing from a feature; look it up once with a
        # None fallback so an rRNA feature without it cannot raise KeyError.
        product = (qualifiers.get('product') or [None])[0]
        # check for 16s
        if feature.type == 'rRNA' and product == '16S ribosomal RNA':
            seq_16srRNA = feature.extract(record.seq)
            print(seq_16srRNA)
        # check for protein with a translation
        if feature.type == 'CDS' and 'translation' in qualifiers:
            translations = qualifiers['translation']
            if len(translations) > 1:
                raise ValueError('Multiple translations for feature')
            protein_sequences['sequence'].append(translations[0])
            # desc is None when the CDS carries no product qualifier
            protein_sequences['desc'].append(product)

TTTACGGAGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGGTGAAGCAGAGCTTGCTCTGTGGATCAGTGGCGAACGGGTGAGTAACACGTGAGTAACCTGCCCTTGACTCTGGGATAAGCGTTGGAAACGACGTCTAATACCGGATACGAGCTGAGACCGCATGGTCATCAGTTGGAAAGATTTTTTGGTCAAGGATGGACTCGCGGCCTATCAGCTTGTTGGTGAGGTAATGGCTCACCAAGGCGACGACGGGTAGCCGGCCTGAGAGGGTGACCGGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCAACGCCGCGTGAGGGATGACGGCCTTCGGGTTGTAAACCTCTTTTAGTAGGGAAGAAGCGAAAGTGACGGTACCTGCAGAAAAAGCACCGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGGTGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTCGCGTCTGCTGTGAAAACTGGAGGCTCAACCTCCAGCCTGCAGTGGGTACGGGCAGACTAGAGTGCGGTAGGGGAGATTGGAATTCCTGGTGTAGCGGTGGAATGCGCAGATATCAGGAGGAACACCGATGGCGAAGGCAGATCTCTGGGCCGTAACTGACGCTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGTTGGGAACTAGATGTGGGGGCCATTCCACGGTCTCCGTGTCGCAGCTAACGCATTAAGTTCCCCGCCTGGGGAGTACGGCCGCAAGGCTAAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGCGGAGCATGCGGATTAATTCGATGCAACGCGAAGAACCTTACCAAGGCTTGACATATACGAGAACGGGCCAGAAATGGT

In [98]:
# Keep the meso results under dedicated names instead of the generic
# seq_16srRNA / protein_sequences produced by the parsing cell.
meso_16s, meso_proteins = seq_16srRNA, protein_sequences

In [99]:
# Is the protein sequence stored in the database among the CDS translations
# read from the meso source file?
meso_prot_seq in meso_proteins['sequence']

True

> #### The meso protein was in the original file... yay!

In [103]:
# Does the 16S sequence stored in the database match the one extracted from
# the meso source file?
# NOTE(review): meso_16s comes from feature.extract(...), presumably a
# sequence object rather than a plain str -- confirm == compares by content.
protein['meso_seq_16srRNA'] == meso_16s

True

> #### The meso 16S sequence was in the original file, yay!

In [104]:
# Show the meso 16S sequence from the database row (e.g. to paste into an
# external alignment tool).
protein['meso_seq_16srRNA']

'TTTACGGAGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGGTGAAGCAGAGCTTGCTCTGTGGATCAGTGGCGAACGGGTGAGTAACACGTGAGTAACCTGCCCTTGACTCTGGGATAAGCGTTGGAAACGACGTCTAATACCGGATACGAGCTGAGACCGCATGGTCATCAGTTGGAAAGATTTTTTGGTCAAGGATGGACTCGCGGCCTATCAGCTTGTTGGTGAGGTAATGGCTCACCAAGGCGACGACGGGTAGCCGGCCTGAGAGGGTGACCGGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCAACGCCGCGTGAGGGATGACGGCCTTCGGGTTGTAAACCTCTTTTAGTAGGGAAGAAGCGAAAGTGACGGTACCTGCAGAAAAAGCACCGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGGTGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTCGCGTCTGCTGTGAAAACTGGAGGCTCAACCTCCAGCCTGCAGTGGGTACGGGCAGACTAGAGTGCGGTAGGGGAGATTGGAATTCCTGGTGTAGCGGTGGAATGCGCAGATATCAGGAGGAACACCGATGGCGAAGGCAGATCTCTGGGCCGTAACTGACGCTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGTTGGGAACTAGATGTGGGGGCCATTCCACGGTCTCCGTGTCGCAGCTAACGCATTAAGTTCCCCGCCTGGGGAGTACGGCCGCAAGGCTAAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGCGGAGCATGCGGATTAATTCGATGCAACGCGAAGAACCTTACCAAGGCTTGACATATACGAGAACGGGCCAGAAATGG

In [105]:
# Show the thermo 16S sequence from the database row (e.g. to paste into an
# external alignment tool).
protein['thermo_seq_16srRNA']

'TTGATGGAGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAGCGGAAAGGCCCCTTCGGGGGTACTCGAGCGGCGAACGGGTGAGTAACACGTGAGCAACCTGCCCTCGACTCTGGGATAAGCCTGGGAAACCGGGTCTAATACCGGATATGACCCGTCATCGCATGATGTGCGGGTGGAAAGTTCCCCTTTTTGGGGTTCGGTCGGGGATGGGCTCGCGGCCTATCAGCTTGTTGGTGGGGTAACGGCCTACCAAGGCGACGACGGGTAGCCGGCCTGAGAGGGCGACCGGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCGCAATGGGCGGAAGCCTGACGCAGCGACGCCGCGTGAGGGATGAAGGCCTTCGGGTTGTAAACCTCTTTCAGCACCGACGAATTCGGACGGTAGGTGCAGAAGAAGCGCCGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGGCGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGCTTGTCGCGTCTGCCGTGAAAGCCCACGGCTTAACCGTGGGTCTGCGGTGGATACGGGCAGGCTAGAGGCAGGTAGGGGAGCATGGAATTCCCGGTGTAGCGGTGAAATGCGCAGATATCGGGAGGAACACCGGTGGCGAAGGCGGTGCTCTGGGCCTGTCCTGACGCTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGTTGGGCGCTAGGTGTGGGGTTCTTCCACGGGCTCCGCGCCGTAGCTAACGCATTAAGCGCCCCGCCTGGGGAGTACGGCCGCAAGGCTAAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGCGGAGCATGTTGCTTAATTCGACGCAACGCGAAGAACCTTACCAGGGCTTGACATCACCCGAAAACCTGCAGAG

> #### An online alignment of the two 16S sequences reports 87% identity, pretty much the same as what we got.