* Load up the sequencing counts produced by proteins.py
* Combine data from two sequencing gates
* Take input FASTA files
* Perform needle alignment for input files
* Retrieve counts from sequencing for those variants

In [35]:
# Perform the initialization and imports
import sys
import pickle
import re
import os
import csv
import argparse
import math
import pprint

from string import ascii_lowercase
from collections import Counter, defaultdict

import pandas as pd
import matplotlib.pyplot as plt
#import seaborn as sns   This is just for plots

from Bio import SeqIO, AlignIO
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
from Bio.Emboss.Applications import NeedleallCommandline

# Demand Python 3.
if sys.version_info[0] < 3:
    # Format the message inside the print call: applying % to the result of
    # print(...) only works while print is a statement and raises TypeError
    # wherever print is a function.
    print("Python 3 is required, but you are using Python %i.%i.%i" % (
        sys.version_info[0], sys.version_info[1], sys.version_info[2]))
    sys.exit(1)



This cell specifies where the InDelScanner scripts are located: modify `indels_path` as needed.

In [5]:
# Make the InDelScanner checkout importable, then pull in the alignment
# helpers this notebook uses from indels.ind
indels_path = "/home/maya/InDelScanner"  # /PATH/TO/InDelScanner
if sys.path.count(indels_path) == 0:
    sys.path.append(indels_path)
from indels.ind import (
    trim_read,
    findEnds,
    endMatch,
    findGap,
    gapAlign,
)

In [31]:
def protein_needle(prot_files, reffile, gapopen=3, gapextend=5):
    """
    Align each sequence file against the reference with EMBOSS needleall.

    For every input file the alignment is written to the same path with the
    extension replaced by '.aln', in FASTA alignment format.
    :param prot_files: list of sequence file names to align (passed as bsequence)
    :param reffile: name of reference file (passed as asequence)
    :param gapopen: gap opening penalty handed to needleall (default 3)
    :param gapextend: gap extension penalty handed to needleall (default 5)
    :return: list of alignment file names, in the same order as prot_files
    """
    aln_files = []

    for fqname in prot_files:
        # Only the stem is needed: the original extension is swapped for '.aln'
        alnname = os.path.splitext(fqname)[0] + '.aln'
        needle_cline = NeedleallCommandline(r'needleall', asequence=reffile, bsequence=fqname,
                                            gapopen=gapopen, gapextend=gapextend,
                                            verbose=False, outfile=alnname, aformat='fasta')
        needle_cline()  # runs the external needleall program; its stdout/stderr are not used
        aln_files.append(alnname)

    return aln_files

Change directory to where the sequencing count dictionaries are located, load them, and combine the counts into `mek`: a dictionary holding one Counter of sequencing results per fraction (`high`, `med` and the combined `low-t`).

In [18]:
# Work from the directory that holds the pickled sequencing-count dictionaries.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
os.chdir("/mnt/c/Users/Maya/Dropbox/mek_results")

# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('Remkes_protein.p', 'rb') as f:
    all_ref = pickle.load(f)
with open('Remkes_protein_low.p', 'rb') as f:
    low = pickle.load(f)

# Copy the 'low-v2' counts from the second pickle into the main dictionary
# (this mutates all_ref in place).
all_ref['mek']['low-v2'] = low['mek']['low-v2']

# Build one Counter per fraction; 'high' and 'med' are taken as they are.
mek = {}
for fraction in ['high', 'med']:
    mek[fraction] = Counter(all_ref['mek'][fraction])
# 'low-t' adds the 'low' and 'low-v2' counts together. Counter addition keeps
# only keys whose summed count is positive.
mek['low-t'] = Counter(all_ref['mek']['low']) + Counter(all_ref['mek']['low-v2'])

In [36]:
def find_protein_short(read, ref, ends):
    """
    Summarise how an aligned read differs from the reference as a short label.

    The two aligned strings are compared position by position between
    ends['start'] and ends['end'], with '-' marking a gap:
      * gap in the read  -> '<position>Δ' (deletion)
      * gap in the ref   -> entries produced by format_prot_insertion
      * differing letter -> '<position><read letter>' (substitution)
    Positions are 1-based and counted from ends['start'] + 1; they do not
    advance across an insertion.

    NOTE(review): indel_len and format_prot_insertion are not defined or
    imported in the cells shown here - confirm they are in scope, otherwise
    the insertion branch raises NameError.

    :param read: aligned read sequence
    :param ref: aligned reference sequence
    :param ends: dict with 'start' and 'end' alignment indices
    :return: 'wt' when nothing differs, otherwise the changes joined by '/'
    """
    pos = ends.get('start')
    residue_no = ends.get('start') + 1  # reference amino acid index
    stop_at = ends.get('end')
    changes = []

    while pos < stop_at:
        read_aa = read[pos]
        ref_aa = ref[pos]

        # Gap in the reference only: insertion, possibly several residues long
        if ref_aa == '-' and read_aa != '-':
            span = indel_len(ref, pos)
            stop, inserted = format_prot_insertion(residue_no - 1, read[pos:pos + span])
            changes.extend(inserted)
            if stop:
                break
            pos += span  # the reference index stays put across an insertion
            continue

        # Deletion (gap in the read) or substitution: one label per position
        if read_aa != ref_aa:
            label = 'Δ' if read_aa == '-' else read_aa
            changes.append(str(residue_no) + label)

        pos += 1
        residue_no += 1

    return '/'.join(changes) if changes else 'wt'

In [24]:
# Extract information from individually sequenced clones
def retrieve_ind_clones(mek, filename, outfile):
    """
    Look up the sequencing counts for individually sequenced clones and write them to a CSV file.

    The alignment file is read as consecutive (reference, read) pairs. Each
    pair is trimmed with trim_read, checked with findEnds / endMatch, and
    turned into a short protein label by find_protein_short. Pairs that fail
    the endMatch check are skipped. Each label is also printed.
    :param mek: dict mapping fraction name to a mapping of protein label -> count
    :param filename: FASTA alignment file holding reference/read pairs
    :param outfile: name of the CSV file to write
    :return: None; one row per accepted clone is written, with the columns
             'Sample', 'Protein' and one column per key of mek
    """
    columns = ['Sample', 'Protein'] + list(mek.keys())

    # newline='' is what the csv module expects (it prevents blank rows on
    # platforms that translate line endings). utf-8 is set explicitly because
    # the labels can contain 'Δ', which not every default locale encoding can write.
    with open(outfile, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, delimiter=',', fieldnames=columns)
        writer.writeheader()

        for pair in AlignIO.parse(filename, "fasta", seq_count=2):
            # first record of each pair is the reference, second is the read
            ref = str(pair[0].seq)
            read = str(pair[1].seq)
            readname = pair[1].id

            ref, read = trim_read(ref, read)

            # check that there is no frame shift or gross mistranslation
            # NOTE(review): the meaning of the literal arguments 0 and 2 is
            # defined in indels.ind - confirm there
            ends = findEnds(read, ref, 0)
            if not endMatch(read, ref, ends, 2):
                continue

            protein = find_protein_short(read, ref, ends)
            print(protein)
            row = {'Sample': readname, 'Protein': protein}
            for fraction in mek.keys():
                row[fraction] = mek[fraction][protein]

            writer.writerow(row)
    


In [32]:
# Align the clones in PosNeg.fa against the reference in Xref.fa.
# aln_file is a list of alignment file names, one per input file.
aln_file = protein_needle(['PosNeg.fa'], 'Xref.fa')


['PosNeg.aln']


In [37]:
# Write the counts for each clone in the first alignment file to PosNeg_counts.csv
# (the protein label of every accepted clone is printed as it is processed).
retrieve_ind_clones(mek, aln_file[0], 'PosNeg_counts.csv')

6P/9I/11L/13P
6P/9A/11A/13P
