In [2]:
# start coding here

# Insert summary

In [3]:
import pandas as pd
from Bio import SeqIO
from Bio.Restriction import KpnI, XbaI
from Bio.Seq import Seq
from pydna.dseq import Dseq
from pydna.genbankrecord import GenbankRecord
from pydna.readers import read
%load_ext rpy2.ipython

In [4]:
# Paths of the insert GenBank files, taken from the workflow's named input 'insert_genbank'.
# NOTE(review): `snakemake` is not defined anywhere in this notebook; presumably it is
# injected by Snakemake's notebook integration, so this cell fails when run standalone.
inserts = snakemake.input['insert_genbank']

In [5]:
def read_genbank_files(*args):
    """Load every given GenBank file as a ``GenbankRecord``.

    Each positional argument is converted with ``str`` before being passed to
    pydna's ``read``; the parsed result is then wrapped in ``GenbankRecord``.

    Returns:
        A list of records in the same order as the arguments (an empty list
        when called without arguments).
    """
    return [GenbankRecord(read(str(path))) for path in args]

In [6]:
# Parse every insert file up front; later cells index and iterate over this list.
insert_records = read_genbank_files(*inserts)

## Insert sequence lengths

In [7]:
def clean_string(s):
    """Return ``s`` with underscores replaced by spaces and outer whitespace removed.

    Only leading and trailing whitespace (newlines included) is stripped;
    whitespace inside the string is left as it is.
    """
    spaced = ' '.join(s.split('_'))
    return spaced.strip()

In [8]:
def clean_insert_name(name):
    """Return the text after the last underscore in ``name``.

    A name without any underscore is returned unchanged; a name that ends in
    an underscore yields an empty string.
    """
    _prefix, _separator, suffix = name.rpartition('_')
    return suffix

In [9]:
def ordered_feature_names(record):
    """List the cleaned names of ``record``'s features in annotation order.

    For each entry of ``record.features`` the feature is extracted from the
    record and the ``name`` of the extracted object is passed through
    ``clean_string``.
    NOTE(review): this relies on the extracted sub-record's ``name`` reflecting
    the feature (rather than the parent record) - confirm against pydna.

    Returns:
        A list of strings, one per feature, in ``record.features`` order.
    """
    return [
        clean_string(feature.extract(record).name)
        for feature in record.features
    ]

In [10]:
def ordered_insert_names(names):
    """Return the insert names as a new list, sorted by their numeric suffix.

    The sort key is the integer after the last ``-`` in each name (the whole
    name when it contains no ``-``). Names whose suffix is not an integer
    raise ``ValueError``. Ties keep their input order.
    """
    def numeric_suffix(insert_name):
        return int(insert_name.rsplit('-', 1)[-1])

    ordered = list(names)
    ordered.sort(key=numeric_suffix)
    return ordered

Add restriction sites as features.

In [11]:
def add_restriction_site_as_feature(record, enzyme):
    """Annotate the first occurrence of ``enzyme``'s site on ``record``.

    The site is located with a plain substring search on the watson strand
    (``record.seq.watson``), so only the first match on that strand is
    annotated. When the site is found, a feature spanning it is added to
    ``record`` in place, labelled with ``str(enzyme)``. Nothing happens when
    the site is absent.

    Returns:
        None.
    """
    recognition_site = enzyme.site
    # Substring search is used because, per the original author, using cut or
    # casting GenbankRecord to string caused AttributeError (possible bug).
    position = record.seq.watson.find(recognition_site)
    if position == -1:
        return
    record.add_feature(position, position + len(recognition_site), label=str(enzyme))

In [12]:
# Exploratory: display the result of `unique_cutters()` for the first insert.
# NOTE(review): presumably the enzymes that cut this record exactly once - confirm in the pydna docs.
insert_records[0].unique_cutters()

Create dataframe of each insert's features and their lengths for plotting with ggplot.

In [None]:
def seq_len_df(records):
    """Build a long-format table of feature lengths, one row per feature per record.

    Args:
        records: iterable of records exposing ``name`` and ``features``; each
            feature must provide ``extract(record)`` returning an object with
            a ``name`` and a length.

    Returns:
        A ``pandas.DataFrame`` with the columns ``insert`` (cleaned record
        name), ``feature`` (cleaned name of the extracted feature) and
        ``feature_length`` (length of the extracted feature). The three
        columns are present even when there are no rows, so downstream code
        can always reference them.
    """
    columns = ['insert', 'feature', 'feature_length']
    df_rows = []
    for each_record in records:
        insert_name = clean_insert_name(each_record.name)
        for each_feature in each_record.features:
            # Extract once and reuse the result for both the name and the length.
            extracted = each_feature.extract(each_record)
            df_rows.append(
                {
                    'insert': insert_name,
                    'feature': clean_string(extracted.name),
                    'feature_length': len(extracted),
                }
            )
    return pd.DataFrame(df_rows, columns=columns)

In [None]:
# Long-format table of feature lengths; it is also passed to the R plotting cell.
len_df = seq_len_df(insert_records)
# Leave the frame as the cell's last expression so Jupyter renders it as a rich table.
len_df

In [None]:
# Feature order is taken from the first insert only and reused for every insert in the plot.
feat_order = ordered_feature_names(insert_records[0])
# Inserts are ordered by the number at the end of their cleaned name.
insert_order = ordered_insert_names(
    clean_insert_name(record.name) for record in insert_records
)

In [None]:
%%R -i len_df -i feat_order -i insert_order -w 7 -h 7 --units in -r 300
# Magic flags: -i imports len_df, feat_order and insert_order from the Python namespace;
# the figure is rendered at 7 by 7 inches with 300 dpi resolution.
# Fix the factor levels so features and inserts appear in the order given by
# feat_order and insert_order.
len_df$feature <- factor(len_df$feature, levels=feat_order)
len_df$insert <- factor(len_df$insert, levels=insert_order)
library(ggplot2)
library(ggpubr)
# Stacked bar per insert: each segment is one feature, its height the feature length.
ggplot(len_df, aes(fill=feature, x=insert, y=feature_length)) + 
       geom_bar(position='stack', stat='identity', color='black') + labs(x='Insert', y='Length') +
        theme_pubr() + scale_fill_brewer(palette = "Dark2") +
        theme(axis.text.x = element_text(angle = 45, hjust=1))

Lengths of all feature types should be the same.

In [None]:
# Cross-check with plain Biopython: parse the first insert file and extract its first feature.
# SeqIO is already imported in the notebook's import cell, so it is not re-imported here.
g = SeqIO.read(inserts[0], format='genbank')
g.features[0].extract(g)

In [None]:
import Bio

In [None]:
# Scratch check: wrap a literal sequence in `Seq` (presumably Bio.Seq.Seq) and cast it back to `str`.
str(Seq('AAGAGAGAGTGTTGTAG'))

In [None]:
# Scratch check of restriction-site handling on the first insert.
# The value of this first expression is not shown, because only a cell's last expression is echoed.
insert_records[0].seq.watson.find(KpnI.site)
# NOTE(review): a comment in add_restriction_site_as_feature reports that using `cut`
# caused an AttributeError - confirm whether this call still fails before relying on it.
insert_records[0].cut(XbaI)