In [1]:
pip install genbank

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install optipyzer

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Load the antigen table from its CSV file (path is relative to this notebook)
import pandas as pd

antigens_path = '../antigens/antigens.csv'
antigens = pd.read_csv(antigens_path)

In [4]:
# One row per distinct protein: keep the first row seen for each "Protein ID"
# and retain only the ID and sequence columns.
protein_columns = ["Protein ID", "Protein Sequence"]
is_first_occurrence = ~antigens.duplicated(subset='Protein ID')
protiens = antigens.loc[is_first_occurrence, protein_columns]
protiens

Unnamed: 0,Protein ID,Protein Sequence
0,Smp_075800.1,MMLFSLFLISILHILLVKCQLDTNYEVSDETVSDNNKWAVLVAGSN...
2,Smp_179950.1,MLISVLYIASLISHLEAHISIKNEKFEPLSDDIISYINEHPNAGWR...
3,Smp_158420.1,MLISVLCIASLITHLEAHISIKNEKFEPLSHDIISYINKHLDARRE...
4,MEG4.161011091,MNIYLIGILCIVGLIISQGSTANGSPLDDRFNDVNTINKKQFTEEE...
5,MEG4.1610110,MNIYLIGILCIVGLIISQGSTANGSPLDDRFNDVNTINKKQFTEEE...
6,MEG4.11101100,MNIYLIGILCIVGLIISQGSTANGSPLDDRFNDVNTINKKQFTEEE...
7,MEG4.1700100,MNIYLIGILCIVGLIISQGSTANGSPLDDRFNDVNTINKKQFTEEE...
8,MEG4.1200001,MNIYLIGILCIVGLIISQGSTANGSPLDDRFNDVNTINKKQFTEEE...
9,MEG4.11711010172,MNIYLIGILCIVGLIISQGSTANGSPLDDRFNKIFQTDKFHLKEYI...
10,MEG4.1700110,MNIYLIGILCIVGLIISQGSTANGSPLDDRFNDVNTINKKQFTEEE...


In [5]:
from Bio import SeqIO, SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation, ExactPosition
from Bio.Seq import Seq

# Read the GenBank template file.
# `SeqRecord` is imported above as the Bio.SeqRecord *module*, so the record class
# used for the annotation is `SeqRecord.SeqRecord`.
record: SeqRecord.SeqRecord = SeqIO.read('SpyTag-Antigen-H6 Template.gb', 'genbank')
record



SeqRecord(seq=Seq('GCGCCACGCTTCCCGAAGGGAGAAAGGCGGACAGGTATCCGGTAAGCGGCAGGG...AAA'), id='SpyTag-Antigen-H6_Templ', name='SpyTag-Antigen-H6_Templ', description='', dbxrefs=[])

In [6]:
# find the location to insert
# Start from None so that a template without a "Linker" feature trips the assertion
# below instead of raising NameError (or silently reusing a stale kernel value).
insert_location = None
for feature in record.features:
    labels = feature.qualifiers.get('label') or []
    if labels and labels[0] == "Linker":
        # The antigen is inserted immediately after the linker. If several features
        # are labelled "Linker", the last one in the feature list wins.
        insert_location = feature.location.end  # End of "Linker"

assert insert_location is not None, 'No feature labelled "Linker" found in the template'

insert_location

ExactPosition(3236)

In [7]:
import optipyzer

# Shared optipyzer API object; optimize() below calls api.optimize(...) on it.
api = optipyzer.API()

def optimize(protein: str) -> str:
    """Codon-optimize a protein sequence for expression in E. coli.

    Args:
        protein: Amino-acid sequence, passed to the optimizer as ``seq`` with
            ``seq_type='protein'``.

    Returns:
        The value stored under ``'optimized_sd'`` in the optimizer's response.
    """
    # Only E. coli codon usage is weighted because that is the expression host.
    request = {
        'seq': protein,
        'seq_type': 'protein',
        'weights': {'e_coli': 1.0},
    }
    response = api.optimize(**request)
    return response['optimized_sd']

In [8]:
import copy

# Name Biopython writes into the header of the exported template record; every
# occurrence in the generated GenBank text is replaced by the construct name.
TEMPLATE_NAME = "SpyTag-Antigen-H6_Templ"
STOP_CODONS = frozenset({"TAA", "TAG", "TGA"})


def _strip_stop_codon(dna):
    """Drop the final codon of ``dna`` only when it is a stop codon (TAA, TAG or TGA).

    An unconditional ``[:-3]`` would silently delete the last residue whenever the
    optimizer returns a sequence without a trailing stop codon.
    """
    if len(dna) >= 3 and dna[-3:].upper() in STOP_CODONS:
        return dna[:-3]
    return dna


def _relocate(location, position, insert_length):
    """Return ``location`` adjusted for ``insert_length`` bases inserted at ``position``.

    Each part of the location is handled on its own:
      * parts starting at or after ``position`` are shifted downstream,
      * parts strictly spanning ``position`` grow to include the insert,
      * parts ending at or before ``position`` are returned unchanged.
    The strand of every part is preserved.
    """
    def move(part):
        if part.start >= position:
            return FeatureLocation(
                part.start + insert_length, part.end + insert_length, strand=part.strand
            )
        if part.end > position:
            return FeatureLocation(part.start, part.end + insert_length, strand=part.strand)
        return part

    moved = [move(part) for part in location.parts]
    if len(moved) == 1:
        return moved[0]
    # More than one part means ``location`` is a compound location; rebuild the same
    # kind of location with the same operator (e.g. "join").
    return type(location)(moved, location.operator)


def _build_construct(template, insert_dna, position, label):
    """Return a copy of ``template`` with ``insert_dna`` spliced in at ``position``.

    Args:
        template: Template record; it is deep-copied and never modified.
        insert_dna: Nucleotide string to insert.
        position: 0-based index in the template sequence where the insert begins.
        label: Label for the ``misc_feature`` annotating the inserted region.

    Returns:
        A new record whose existing features are shifted or extended to account
        for the insert, followed by one new feature covering the insert itself.
    """
    construct = copy.deepcopy(template)
    insert_length = len(insert_dna)
    construct.seq = template.seq[:position] + Seq(insert_dna) + template.seq[position:]

    for feature in construct.features:
        old_length = len(feature.location)
        feature.location = _relocate(feature.location, position, insert_length)
        if feature.type == "CDS" and len(feature.location) != old_length:
            # The stored translation no longer matches the extended CDS.
            feature.qualifiers.pop('translation', None)

    # Add the new feature for the inserted sequence
    construct.features.append(
        SeqFeature(
            FeatureLocation(position, position + insert_length),
            type="misc_feature",
            qualifiers={
                "label": [label],
                "ApEinfo_revcolor": "#ff0000",  # Red color for the inserted antigen
                "ApEinfo_fwdcolor": "#ff0000",
            },
        )
    )
    return construct


for protien_id, protien_seq in protiens.to_numpy():
    # Run codon optimization and remove the stop codon (if present) so the
    # sequence downstream of the insert stays in frame with the antigen.
    nucleotide_seq = _strip_stop_codon(optimize(protien_seq))

    # Work on a fresh copy of the template each time: `record` is never mutated, so
    # the cell can be re-run safely even after a failure part-way through the loop.
    construct = _build_construct(record, nucleotide_seq, insert_location, protien_id)

    # Render the GenBank text in memory, rename the construct, and write the file once.
    construct_name = f'SpyTag-{protien_id}-H6'
    filename = f'{construct_name}.gb'
    genbank_text = construct.format('genbank').replace(TEMPLATE_NAME, construct_name)
    with open(filename, 'w') as output_file:
        output_file.write(genbank_text)

