# Hi

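This notebook requires [synbiolib](https://github.com/EndyLab/synbiolib), in addition to NumPy, pandas, matplotlib, seaborn and Biopython. Install it from source in editable mode: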

```bash
git clone https://github.com/EndyLab/synbiolib
cd synbiolib
pip install -e .
```

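Alternatively, copy the functions this notebook uses from synbiolib's `codon` module (`load_codon_table`, `codon_table_10plus` and `optimize_protein`) directly into the notebook.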

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
%load_ext autoreload
%autoreload 2

import synbiolib as sb
import synbiolib.codon as codon

In [3]:
from Bio import Seq, SeqRecord, SeqIO, Entrez

In [4]:
# Contact address that Bio.Entrez attaches to NCBI requests (module-level setting).
# NOTE(review): a personal address is hard-coded here; consider reading it from an
# environment variable before sharing the notebook.
Entrez.email = "acjs@stanford.edu"

CP016816.2 with 996 features


In [22]:
def get_genome(id):
    """Fetch one GenBank record from the NCBI nucleotide database and parse it.

    A one-line summary (record id and number of features) is printed before
    the record is returned.

    Parameters
    ----------
    id
        Identifier forwarded unchanged to ``Entrez.efetch`` as ``id``.

    Returns
    -------
    The record produced by ``SeqIO.read(handle, "gb")``.
    """
    fetch_args = dict(db="nucleotide", id=id, rettype="gb", retmode="text")
    with Entrez.efetch(**fetch_args) as response:
        record = SeqIO.read(response, "gb")

    feature_count = len(record.features)
    print("%s with %i features" % (record.id, feature_count))
    return record
    
def optimize_coding_sequences(seq_record):
    """Codon-optimize every translated CDS feature of a record with the 'ecoli' table.

    For each feature of type ``'CDS'`` that carries a ``translation`` qualifier,
    the protein sequence is passed to ``codon.optimize_protein`` together with
    the table returned by ``codon.codon_table_10plus`` for the ``'ecoli'`` codon
    table. Progress is printed one locus per line.

    Parameters
    ----------
    seq_record
        Object exposing ``features``; each feature needs ``type``,
        ``qualifiers`` (a mapping of qualifier name -> list of values) and
        ``location`` (only used in the message for features without a locus tag).

    Returns
    -------
    dict
        Maps each CDS's first ``locus_tag`` value to whatever
        ``codon.optimize_protein`` returned for its first ``translation`` value.
        CDS features lacking a ``translation`` or a ``locus_tag`` are reported
        on stdout and left out of the result.
    """
    # The filtered table does not depend on the feature, so build it once
    # instead of once per CDS.
    # Assumes codon.codon_table_10plus has no side effects -- TODO confirm.
    table = codon.codon_table_10plus(codon.load_codon_table('ecoli'))

    genes = dict()
    for feature in seq_record.features:
        if feature.type != 'CDS':
            continue

        locus_tags = feature.qualifiers.get('locus_tag')
        if not locus_tags:
            # Without a locus tag there is no key to store the result under;
            # report it rather than aborting the whole run with a KeyError.
            print("CDS at {} has no locus_tag; skipping".format(feature.location))
            continue
        locus = locus_tags[0]

        if 'translation' not in feature.qualifiers:
            print("{} doesn't have a translation".format(locus))
            continue

        print(locus)
        trans = feature.qualifiers['translation'][0]
        genes[locus] = codon.optimize_protein(table, trans)

    return genes

In [19]:
# Optimized coding sequences per genome: {genome label: {locus_tag: optimized sequence}}.
genomes = {}

# No earlier cell defines `genes`, so the original `genomes['syn3a'] = genes` fails
# with a NameError on a fresh kernel. Build the entry explicitly instead.
# NOTE(review): CP016816.2 is the accession printed in the stale "996 features" output
# near the top of the notebook -- confirm it is the record `genes` was built from.
genomes['syn3a'] = optimize_coding_sequences(get_genome('CP016816.2'))

In [23]:
# Fetch each additional accession from NCBI and store its optimized coding
# sequences in `genomes`, keyed by the accession string itself.
for gen in ['L43967.2', 'CP017343.1']:
    print(gen)  # header line ahead of the per-locus progress output
    rec = get_genome(gen)
    genomes[gen] = optimize_coding_sequences(rec)
    print()  # blank line separating one genome's output from the next
    

L43967.2
L43967.2 with 1084 features
MG_001
MG_002
MG_003
MG_004
MG_005
MG_006
MG_007
MG_008
MG_009
MG_010
MG_011
MG_012
MG_013
MG_014
MG_015
MG_018
MG_019
MG_020
MG_021
MG_022
MG_023
MG_024
MG_025
MG_026
MG_027
MG_028
MG_029
MG_030
MG_031
MG_032
MG_033
MG_034
MG_035
MG_036
MG_037
MG_038
MG_039
MG_040
MG_041
MG_042
MG_043
MG_044
MG_045
MG_046
MG_047
MG_048
MG_049
MG_050
MG_051
MG_052
MG_053
MG_054
MG_055
MG_473
MG_474
MG_056
MG_057
MG_058
MG_059
MG_060
MG_061
MG_062
MG_063
MG_064
MG_065
MG_066
MG_067
MG_068
MG_069
MG_070
MG_071
MG_072
MG_073
MG_074
MG_075
MG_076
MG_077
MG_078
MG_079
MG_080
MG_081
MG_082
MG_083
MG_084
MG_085
MG_086
MG_087
MG_088
MG_089
MG_090
MG_091
MG_092
MG_093
MG_094
MG_095
MG_096
MG_097
MG_098
MG_099
MG_100
MG_101
MG_102
MG_103
MG_476
MG_104
MG_105
MG_106
MG_107
MG_108
MG_109
MG_110
MG_111
MG_112
MG_113
MG_114
MG_115
MG_116
MG_117
MG_118
MG_119
MG_120
MG_121
MG_122
MG_123
MG_124
MG_125
MG_126
MG_127
MG_128
MG_129
MG_130
MG_132
MG_133
MG_134
MG_135
MG_136
MG_137
MG_1

In [34]:
# Print, for every genome, the combined length of all its optimized sequences.
for gen, seqs in genomes.items():
    print(gen, sum(map(len, seqs.values())))

syn3a 476499
L43967.2 526596
CP017343.1 716835


In [32]:
# Display the locus -> sequence mapping stored under the first entry of `genomes`.
# NOTE(review): this renders the entire mapping, which makes for a very large output.
list(genomes.values())[0]

{'JCVSYN2_00005': 'ATGAATGTTAACGACATTCTGAAAGAATTAAAACTGTCCTTGATGGCCAACAAAAATATTGATGAAAGCGTTTACAATGATTATATTAAAACGATCAACATTCACAAGAAAGGCTTTAGTGATTATATTGTTGTAGTGAAAAGCCAGTTCGGTCTCTTAGCGATCAAGCAATTTCGTCAAACCATCGAGAATGAAATTAAAAACATCCTCAAAGAACCAGTTAATATTAGCTTTACCTACGAACAGGAATACAAAAAACAGCTGGAAAAGGATGAACTCATCAATAAAGATCACAGTGACATTATTACAAAAAAAGTAAAGAAAACTAACGAAAACACATTTGAGAACTTCGTGATTGGAGCATCCAACGAACAGGCTTTTATTGCCGTGCAGACGGTTTCCAAAAATCCTGGCATTAGTTATAACCCGCTTTTTATCTACGGCGAATCCGGTATGGGTAAAACACACCTGTTAAAAGCCGCGAAGAATTATATTGAAAGCAATTTCTCGGACTTAAAGGTCAGTTATATGAGCGGTGATGAATTCGCTCGTAAAGCGGTGGACATTTTACAAAAAACCCATAAAGAGATTGAGCAATTCAAAAACGAGGTGTGTCAGAATGACGTATTGATTATTGACGATGTGCAGTTTCTGAGCTACAAAGAGAAGACTAATGAAATTTTTTTTACGATTTTTAACAACTTCATCGAAAATGATAAACAGCTGTTTTTCTCCTCAGATAAATCCCCGGAACTGCTGAATGGCTTCGACAATCGTCTGATTACCCGTTTCAATATGGGTCTGAGCATCGCGATTCAGAAATTAGATAACAAAACCGCGACCGCAATCATTAAAAAAGAGATTAAAAACCAGAACATCAAGAGTGAAGTCACGAGTGAGGCGATCAATTTTATTAGTAACTATTACAGCGATGATGTGCGTAAAATTAAAGGCAGTGTCTCTCGCCTTAACTTTTGGTCC