In [42]:
from datetime import datetime

import numpy as np
import pandas as pd

import json
import os
import glob
import re
import math

import shutil
from config import *

from Bio import pairwise2
from Bio import Seq
from Bio import SeqIO
from Bio.Blast import NCBIXML

In [43]:
## ============================================
## ESTABLISH PATHS AND NAMES
## ============================================
PIPELINE_PATH = BASE_PATH + "/pipeline"
BUILDS_PATH = BASE_PATH + "/builds"
DATA_PATH = BASE_PATH + "/data"
BACKBONE_PATH = BASE_PATH + "/sequencing_files/popen_v1-1_backbone.fasta"
DICTIONARY_PATH = PIPELINE_PATH + "/testing/data_testing/10K_CDS.csv"
DATABASE_PATH = BASE_PATH + "/raw_files/BLAST_db/current_BLAST_db.fsa"

forward_primer = "M13-Forward---20-"
reverse_primer = "M13-Reverse"

## ============================================
## TAKE IN BUILD INFORMATION
## ============================================

def choose_build(build_num,path):
    '''Checks to make sure that the build exists and if not asks for a different one'''
    if glob.glob(path):
        return build_num
    else:
        build_num = input("Not a valid build number. Please try again: ")
        build_num = "build{}".format(str(build_num).zfill(3))
        path = "{}/builds/{}/{}_20*.csv".format(BASE_PATH,build_num,build_num)
        return choose_build(build_num,path)

#build_num = "build"+str(input("Which build: ")).zfill(3)
build_num = input("Enter build number: ")
build_num = "build{}".format(str(build_num).zfill(3))
path = "{}/builds/{}/{}_20*.csv".format(BASE_PATH,build_num,build_num)
# build_num = choose_build(path)

build_num = "build007"
print("build_num",build_num)

SEQFILE_PATH = "{}/{}/{}_seq_files".format(BUILDS_PATH,build_num,build_num)


# Create a dictionary to link the gene name to the corresponding id number
data = pd.read_csv(DICTIONARY_PATH)
dictionary = dict(zip(data['gene_name'], data['idnum']))

## ============================================
## GENERATE BLAST DATABASE
## ============================================
# Create a multientry FASTA file with all seqs from database
db_counter = 1
with open(DATABASE_PATH,"w+") as fsa:
    for file in glob.glob(DATA_PATH + "/*/*.json"):
        with open(file) as json_file:
            data = json.load(json_file)
        gene_id = data["gene_id"]
        sequence = data["sequence"]["optimized_sequence"]
        fsa.write(">{}|BLAST_db|{}\n{}\n".format(gene_id,db_counter,sequence))
        db_counter += 1
        #if db_counter > 10:
        #    break
    # fsa.close()
# Convert the FSA file into a BLAST database
os.system("makeblastdb -in {} -parse_seqids -dbtype nucl".format(DATABASE_PATH))

count = 0
align_data = []
unknown_data = []
nan = []
small = []
check = []

# REVIEW: Change this to take in the fasta file for the backbone
# REVIEW: When using the full backbone it would throw off the alignment so this is just the insert
backbone_seq = Seq.Seq("ATGCGGTCTTCCGCATCGCCTGGCACGACAGGTTTCCCGACTGGAAAGCGGGCAGTGAGCGCAACGCAATTAATGTGAGTTAGCTCACTCATTAGGCACCCCAGGCTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGGAATTGTGAGCGGATAACAATTTCACACATACTAGAGAAAGAGGAGAAATACTAGATGGCTTCCTCCGAAGATGTTATCAAAGAGTTCATGCGTTTCAAAGTTCGTATGGAAGGTTCCGTTAACGGTCACGAGTTCGAAATCGAAGGTGAAGGTGAAGGTCGTCCGTACGAAGGTACCCAGACCGCTAAACTGAAAGTTACCAAAGGTGGTCCGCTGCCGTTCGCTTGGGACATCCTGTCCCCGCAGTTCCAGTACGGTTCCAAAGCTTACGTTAAACACCCGGCTGACATCCCGGACTACCTGAAACTGTCCTTCCCGGAAGGTTTCAAATGGGAACGTGTTATGAACTTCGAGGACGGTGGTGTTGTTACCGTTACCCAGGACTCCTCCCTGCAAGACGGTGAGTTCATCTACAAAGTTAAACTGCGTGGTACCAACTTCCCGTCCGACGGTCCGGTTATGCAGAAAAAAACCATGGGTTGGGAAGCTTCCACCGAACGTATGTACCCGGAGGACGGTGCTCTGAAAGGTGAAATCAAAATGCGTCTGAAACTGAAAGACGGTGGTCACTACGACGCTGAAGTTAAAACCACCTACATGGCTAAAAAACCGGTTCAGTTACCGGGTGCTTACAAAACCGACATCAAACTGGACATCACCTCCCACAACGAGGACTACACCATCGTTGAACAGTACGAACGTGCTGAAGGTCGTCACTCCACCGGTGCTTAAGCGATGTTGAAGACCATGA")


Enter build number: 7
build_num build007


In [44]:
## ============================================
## DEFINE INITIAL FUNCTIONS
## ============================================

## Function to read in the sequencing file and trim it based on the phred score
def loadsequencing(file, threshold=0.9):
    '''Load sequencing reads and trim them based on the specified threshold'''
    seq = SeqIO.read(file, 'abi')

    maxq = np.max(seq.letter_annotations['phred_quality'])
    rolling = pd.Series(seq.letter_annotations['phred_quality']).rolling(window=20).mean()
    start = (rolling > maxq * threshold).idxmax()
    end = (rolling > maxq * threshold)[::-1].idxmax()

    return seq.seq[start:end], seq, start, end, np.mean(seq.letter_annotations['phred_quality'][start:end])

align_data = []
def align_reads(forward, reverse, target_seq):
    '''Generates a forward and reverse alignments and then determines the result'''
    forward_align = pairwise2.align.globalms(target_seq, forward,1,0,-1,-1, one_alignment_only=True, penalize_end_gaps=False)
    reverse_align = pairwise2.align.globalms(target_seq, reverse,1,0,-1,-1, one_alignment_only=True, penalize_end_gaps=False)
    forward_align = forward_align[0]
    reverse_align = reverse_align[0]

    for_raw = len(forward)
    rev_raw = len(reverse)
    target_length = len(target_seq)

    if for_raw <= target_length:
        for_score = for_raw - forward_align[2]
    else:
        for_score = target_length - forward_align[2]
    if rev_raw <= target_length:
        rev_score = rev_raw - reverse_align[2]
    else:
        rev_score = target_length - reverse_align[2]

    if for_raw < 100 and rev_raw < 100:
        outcome = "Bad_reads"
    elif for_score == 0:
        if rev_score == 0:
            outcome = "Perfect"
        elif rev_score < 10:
            outcome = "Mutation: {} {}".format(for_score,rev_score)
        else:
            outcome = "Bad Reverse"
    elif for_score < 10:
        if rev_score == 0:
            outcome = "Mutation: {} {}".format(for_score,rev_score)
        elif rev_score < 10:
            outcome = "Mutation: {} {}".format(for_score,rev_score)
        else:
            outcome = "Mutation: {} {}".format(for_score,rev_score)
    else:
        if rev_score == 0:
            outcome = "Bad Forward"
        elif rev_score < 10:
            outcome = "Mutation: {} {}".format(for_score,rev_score)
        else:
            outcome = "Bad clone"
    return [outcome, for_raw, forward_align[2], rev_raw, reverse_align[2], target_length]

def verify_sequence(id_num,forward,reverse,gene_seq,backbone_seq):
    '''
    Runs alignments against both the target sequence and the backbone and
    returns the result.
    '''
    g_res = align_reads(forward,reverse,gene_seq)
    b_res = align_reads(forward,reverse,backbone_seq)

    if g_res[0] == "Perfect" and b_res[0] == "Bad clone":
        final = "Good_sequence"
    elif "Mutation" in g_res[0] and b_res[0] == "Bad clone":
        final = "Point_mutation"
    elif g_res[0] == "Bad clone" and b_res[0] == "Perfect":
        final = "Original_vector_sequence"
    elif g_res[0] == "Bad clone" and "Mutation" in b_res[0]:
        final = "Original_vector_sequence"
    elif g_res[0] == "Bad clone" and b_res[0] == "Bad clone":
        final = "Unknown_sequence"
    elif g_res[0] == "Bad_reads" and b_res[0] == "Bad_reads":
        final = "Bad_reads"
    else:
        final = "CHECK"
    return [final] + g_res + b_res

def blast_seq(name,build_num,sequence,direction, E_VALUE_THRESH=0.04):
    '''Runs a BLAST search with a read against the database'''
    # Create fasta files to run the BLAST search
    query = "{}/{}/{}.fasta".format(BUILDS_PATH,build_num,direction)
    with open(query,"w+") as seq_file:
        seq_file.write(">{}\n{}".format(name,sequence))

    # Run the BLAST search
    output = "{}/{}/forward_results.xml".format(BUILDS_PATH,build_num)
    os.system("blastn -query {} -db {}  -out {} -evalue 0.001 -outfmt 5".format(query,DATABASE_PATH,output))

    # Take in the blast results
    result_handle = open(output)
    blast_record = NCBIXML.read(result_handle)
    os.remove(output)

    # Return the top BLAST hit
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < E_VALUE_THRESH:
                hit_id = alignment.title.split("|")
                return hit_id[0]

def align_unknown(name,build_num,forward_sequence,reverse_sequence):
    '''
    Runs a BLAST search on both reads and determines if they share a common hit,
    if they do it preforms an alignment of the reads on the target.
    '''
    print("Using align unknown on {}".format(name))
    for_hit = blast_seq(name,build_num,forward_sequence,'forward')
    rev_hit = blast_seq(name,build_num,reverse_sequence,'reverse')
    if for_hit == rev_hit:
        print("match")
        with open("{}/{}/{}.json".format(DATA_PATH,for_hit,for_hit),"r") as json_file:
            data = json.load(json_file)
        target = data["sequence"]["optimized_sequence"]
        target = Seq.Seq(target)
        new_row = align_reads(forward,reverse,target)
        new_row = [name,for_hit] + new_row
        return new_row
    else:
        return [name,"no hit", "empty", "empty", "empty", "empty", "empty", "empty"]



In [45]:
## ============================================
## TAKE IN ALL OF THE SEQUENCING FILES
## ============================================

for forfile in glob.glob("{}/*{}*.ab1".format(SEQFILE_PATH,forward_primer)):
    #print(forfile)

    # There has been inconsistency in the naming of samples so this is made to account for them
    if "BBF10K" in forfile:
        initials, order_number, plate_number, well_number, sample_name, sample_number, well_address = re.match(
            r'.*/([A-Z]+)_([0-9]+)-([0-9])([0-9]+)_([A-Za-z0-9]+)_([0-9]+)_M13-Forward---20-_?([A-H][0-9]{2}).ab1',
            forfile).groups()
        revfile = "{}/{}_{}-{}{}_{}_{}_{}_{}.ab1".format(os.path.dirname(forfile), initials, order_number, (int(plate_number)+1), well_number, sample_name, sample_number, reverse_primer, well_address)
        id_num = sample_name + "_" + sample_number
        unknown = False
    elif "MMSYN" in forfile:
        initials, order_number, plate_number, well_number, hyphen, sample_name, primer_name, well_address = re.match(
            r'.*/([A-Z]+)_([0-9]+)-([0-9])([0-9]+)_(-?)([A-Za-z0-9_-]+)_([A-Za-z0-9-]+)_([A-H][0-9]{2}).ab1',
            forfile).groups()
        revfile = "{}/{}_{}-{}{}_{}{}_{}_{}.ab1".format(os.path.dirname(forfile), initials, order_number, (int(plate_number)+1), well_number, hyphen, sample_name, reverse_primer, well_address)
        id_num = dictionary[sample_name[:-2]]
        unknown = False
    elif "Unk" in forfile:
        initials, order_number, plate_number, well_number, sample_name, sample_number, well_address = re.match(
            r'.*/([A-Z]+)_([0-9]+)-([0-9])([0-9]+)_([A-Za-z0-9]+)_([0-9]+)_M13-Forward---20-_?([A-H][0-9]{2}).ab1',
            forfile).groups()
        revfile = "{}/{}_{}-{}{}_{}_{}_{}_{}.ab1".format(os.path.dirname(forfile), initials, order_number, (int(plate_number)+1), well_number, sample_name, sample_number, reverse_primer, well_address)
        id_num = sample_name + "_" + sample_number
        unknown = True

    for_seq_file = forfile.split("/")[-1]
    print("Forward: ",for_seq_file)
    rev_seq_file = revfile.split("/")[-1]
    print("Reverse: ",rev_seq_file)
        
    # Generates a new directory for each gene with their reads for a specific build
    #    new_dir = "{}/{}/{}_seq_files".format(DATA_PATH,id_num,build_num)
    #    if os.path.exists(new_dir):
    #        print("{} already exists".format(new_dir))
    #    else:
    #        os.makedirs(new_dir)
    #        shutil.copy(forfile, new_dir)
    #        shutil.copy(revfile, new_dir)

    # Trim the reads and check their quality
    forward_untrim, _, _, _, forward_qual = loadsequencing(forfile)
    reverse_untrim, revseq, _, _, reverse_qual = loadsequencing(revfile)
    print(id_num, "Quality", forward_qual, reverse_qual)

    # Check that the reads have sufficient length and if not decrease the threshold and reload
    if math.isnan(forward_qual) or math.isnan(reverse_qual):
        forward_untrim, _, _, _, forward_qual = loadsequencing(forfile, threshold=0.8)
        reverse_untrim, revseq, _, _, reverse_qual = loadsequencing(revfile, threshold=0.8)
        nan.append(id_num)
        print("Quality", forward_qual, reverse_qual)
    if len(forward_untrim) < 50 or len(reverse_untrim) < 50:
        forward_untrim, _, _, _, forward_qual = loadsequencing(forfile, threshold=0.8)
        reverse_untrim, revseq, _, _, reverse_qual = loadsequencing(revfile, threshold=0.8)
        small.append(id_num)

    # REVIEW: Not all sequences are CDS's and so we need to grab bases from the beginning and
    # of the target sequence or something to trim it.

    # Trim the reads to start at the start or stop codons
    forward = forward_untrim[forward_untrim.find('ATG'):] # Start at start codon
    reverse = reverse_untrim[reverse_untrim.find('TCA'):].reverse_complement() # Stop at stop

    if unknown:
        print("_____________UNKNOWN__________________")
        row = align_unknown(id_num,build_num,forward,reverse) + [well_address,for_seq_file,rev_seq_file]
        unknown_data.append(row)

    else:
        with open("{}/{}/{}.json".format(DATA_PATH,id_num,id_num),"r") as json_file:
            data = json.load(json_file)

        gene_seq = data["sequence"]["optimized_sequence"]
        gene_seq = Seq.Seq(gene_seq)
        gene_name = data["gene_name"]

        row = verify_sequence(id_num,forward,reverse,gene_seq,backbone_seq)
        if row[0] == "Unknown_sequence":
            row = align_unknown(id_num,build_num,forward,reverse) + [well_address,for_seq_file,rev_seq_file]
            unknown_data.append(row)

        else:
            row = [id_num] + [gene_name] + row + [well_address,for_seq_file,rev_seq_file]
            align_data.append(row)


Forward:  CM_534180-1001_BBF10K_000497_M13-Forward---20-_A01.ab1
Reverse:  CM_534180-2001_BBF10K_000497_M13-Reverse_A01.ab1
BBF10K_000497 Quality 60.9239332096 59.6731707317
Forward:  CM_534180-1002_BBF10K_000468_M13-Forward---20-_B01.ab1
Reverse:  CM_534180-2002_BBF10K_000468_M13-Reverse_B01.ab1
BBF10K_000468 Quality 60.8339041096 59.7619808307
Forward:  CM_534180-1003_BBF10K_000485_M13-Forward---20-_C01.ab1
Reverse:  CM_534180-2003_BBF10K_000485_M13-Reverse_C01.ab1
BBF10K_000485 Quality 57.6511627907 57.8863636364
Forward:  CM_534180-1004_BBF10K_000493_M13-Forward---20-_D01.ab1
Reverse:  CM_534180-2004_BBF10K_000493_M13-Reverse_D01.ab1
BBF10K_000493 Quality 59.8091743119 59.2865064695
Forward:  CM_534180-1005_BBF10K_000499_M13-Forward---20-_E01.ab1
Reverse:  CM_534180-2005_BBF10K_000499_M13-Reverse_E01.ab1
BBF10K_000499 Quality 60.7441016334 60.0051194539
Forward:  CM_534180-1006_BBF10K_000478_M13-Forward---20-_F01.ab1
Reverse:  CM_534180-2006_BBF10K_000478_M13-Reverse_F01.ab1
BBF10K

Forward:  CM_534180-1046_BBF10K_000518_M13-Forward---20-_F06.ab1
Reverse:  CM_534180-2046_BBF10K_000518_M13-Reverse_F06.ab1
BBF10K_000518 Quality 60.7478411054 59.9394904459
Using align unknown on BBF10K_000518
match
Forward:  CM_534180-1047_BBF10K_000513_M13-Forward---20-_G06.ab1
Reverse:  CM_534180-2047_BBF10K_000513_M13-Reverse_G06.ab1
BBF10K_000513 Quality 60.8629441624 60.4422382671
Using align unknown on BBF10K_000513
match
Forward:  CM_534180-1048_BBF10K_000516_M13-Forward---20-_H06.ab1
Reverse:  CM_534180-2048_BBF10K_000516_M13-Reverse_H06.ab1
BBF10K_000516 Quality 60.9418386492 58.6178217822
Forward:  CM_534180-1049_BBF10K_000533_M13-Forward---20-_A07.ab1
Reverse:  CM_534180-2049_BBF10K_000533_M13-Reverse_A07.ab1
BBF10K_000533 Quality 60.5311355311 60.0307971014
Forward:  CM_534180-1050_BBF10K_000519_M13-Forward---20-_B07.ab1
Reverse:  CM_534180-2050_BBF10K_000519_M13-Reverse_B07.ab1
BBF10K_000519 Quality 60.3961937716 59.9518900344
Using align unknown on BBF10K_000519
match
F

Forward:  CM_534180-1086_BBF10K_000560_M13-Forward---20-_F11.ab1
Reverse:  CM_534180-2086_BBF10K_000560_M13-Reverse_F11.ab1
BBF10K_000560 Quality 61.2136445242 54.9551934827
Forward:  CM_534180-1087_BBF10K_000557_M13-Forward---20-_G11.ab1
Reverse:  CM_534180-2087_BBF10K_000557_M13-Reverse_G11.ab1
BBF10K_000557 Quality 56.2339055794 56.3239740821
Forward:  CM_534180-1088_BBF10K_000555_M13-Forward---20-_H11.ab1
Reverse:  CM_534180-2088_BBF10K_000555_M13-Reverse_H11.ab1
BBF10K_000555 Quality 60.8269896194 60.030141844
Forward:  CM_534180-1089_BBF10K_000553_M13-Forward---20-_A12.ab1
Reverse:  CM_534180-2089_BBF10K_000553_M13-Reverse_A12.ab1
BBF10K_000553 Quality 60.4708737864 60.207079646
Forward:  CM_534180-1090_Unk_03_M13-Forward---20-_B12.ab1
Reverse:  CM_534180-2090_Unk_03_M13-Reverse_B12.ab1
Unk_03 Quality 59.8932676519 60.2892857143
_____________UNKNOWN__________________
Using align unknown on Unk_03
match
Forward:  CM_534180-1091_Unk_04_M13-Forward---20-_C12.ab1
Reverse:  CM_534180-

In [91]:
align_data = np.array(align_data)
unknown_data = np.array(unknown_data)

unknown_df = pd.DataFrame({
    "Gene ID" : unknown_data[:,0],
    "Hit ID" : unknown_data[:,1],
    "Hit Name" : names,
    "Intended Well" : intended_wells,
    "Hit Result" : unknown_data[:,2],
    "Hit For Length" : unknown_data[:,3],
    "Hit For Score" : unknown_data[:,4],
    "Hit Rev Length" : unknown_data[:,5],
    "Hit Rev Score" : unknown_data[:,6],
    "Hit Length" : unknown_data[:,7],
    "Well" : unknown_data[:,8],
    "For Read" : unknown_data[:,9],
    "Rev Read" : unknown_data[:,10]
})
unknown_df = unknown_df[["Gene ID","Hit ID","Hit Name","Hit Result","Well","Intended Well","Hit Result","Hit For Length","Hit For Score","Hit Rev Length","Hit Rev Score","Hit Length","For Read","Rev Read"]]
unknown_df.to_csv("{}/{}/{}_alignment_results-unknown.csv".format(BUILDS_PATH,build_num,build_num))
# unknown_df.to_csv("./testing/alignment_results-unknown.csv")

intended_unknown = pd.DataFrame({
    "Gene ID" : unknown_data[:,0],
    "Well" : unknown_data[:,8],
    "Outcome" : "Failed",
    "For Read" : unknown_data[:,9],
    "Rev Read" : unknown_data[:,10],
    "Actual" : False
})

actual_unknown = pd.DataFrame({
    "Gene ID" : unknown_data[:,1],
    "Well" : unknown_data[:,8],
    "Outcome" : unknown_data[:,2],
    "For Read" : unknown_data[:,9],
    "Rev Read" : unknown_data[:,10],
    "Actual" : True
})

complete_unknown = pd.concat([intended_unknown,actual_unknown])
complete_unknown

array = pd.DataFrame({
    "Gene ID" : align_data[:,0],
    "Gene Name" : align_data[:,1],
    "Outcome" : align_data[:,2],
    "Gene Result" : align_data[:,3],
    "Gene For Length" : align_data[:,4],
    "Gene For Score" : align_data[:,5],
    "Gene Rev Length" : align_data[:,6],
    "Gene Rev Score" : align_data[:,7],
    "Gene Length" : align_data[:,8],
    "Template Result" : align_data[:,9],
    "Template For Length" : align_data[:,10],
    "Template For Score" : align_data[:,11],
    "Template Rev Length" : align_data[:,12],
    "Template Rev Score" : align_data[:,13],
    "Template Length" : align_data[:,14],
    "Well" : align_data[:,15]
})
array = array[["Gene ID","Gene Name","Outcome","Well","Gene Result","Gene For Length","Gene For Score","Gene Rev Length","Gene Rev Score","Gene Length","Template Result","Template For Length","Template For Score" ,"Template Rev Length","Template Rev Score" ,"Template Length"]]
array.to_csv("{}/{}/{}_alignment_results-array.csv".format(BUILDS_PATH,build_num,build_num))

outcomes = pd.DataFrame({
    "Well" : array["Well"],
    "Gene ID" : array["Gene ID"],
    "Outcome" : array["Outcome"],
    "Actual" : True,
    "For Read" : align_data[:,16],
    "Rev Read" : align_data[:,17]
})

complete = pd.concat([outcomes,complete_unknown])
complete = complete.sort_values('Well')
complete



Unnamed: 0,Actual,For Read,Gene ID,Outcome,Rev Read,Well
0,True,CM_534180-1001_BBF10K_000497_M13-Forward---20-...,BBF10K_000497,Original_vector_sequence,CM_534180-2001_BBF10K_000497_M13-Reverse_A01.ab1,A01
8,True,CM_534180-1009_BBF10K_000481_M13-Forward---20-...,BBF10K_000481,Point_mutation,CM_534180-2009_BBF10K_000481_M13-Reverse_A02.ab1,A02
15,True,CM_534180-1017_BBF10K_000495_M13-Forward---20-...,BBF10K_000495,Good_sequence,CM_534180-2017_BBF10K_000495_M13-Reverse_A03.ab1,A03
23,True,CM_534180-1025_BBF10K_000494_M13-Forward---20-...,BBF10K_000494,Good_sequence,CM_534180-2025_BBF10K_000494_M13-Reverse_A04.ab1,A04
31,True,CM_534180-1033_BBF10K_000477_M13-Forward---20-...,BBF10K_000477,Point_mutation,CM_534180-2033_BBF10K_000477_M13-Reverse_A05.ab1,A05
6,False,CM_534180-1041_BBF10K_000503_M13-Forward---20-...,BBF10K_000503,Failed,CM_534180-2041_BBF10K_000503_M13-Reverse_A06.ab1,A06
6,True,CM_534180-1041_BBF10K_000503_M13-Forward---20-...,BBF10K_000550,Perfect,CM_534180-2041_BBF10K_000503_M13-Reverse_A06.ab1,A06
36,True,CM_534180-1049_BBF10K_000533_M13-Forward---20-...,BBF10K_000533,Original_vector_sequence,CM_534180-2049_BBF10K_000533_M13-Reverse_A07.ab1,A07
17,True,CM_534180-1057_BBF10K_000504_M13-Forward---20-...,BBF10K_000549,Perfect,CM_534180-2057_BBF10K_000504_M13-Reverse_A08.ab1,A08
17,False,CM_534180-1057_BBF10K_000504_M13-Forward---20-...,BBF10K_000504,Failed,CM_534180-2057_BBF10K_000504_M13-Reverse_A08.ab1,A08


In [94]:
for index, row in complete.iterrows():
    if "BBF10K_" not in row["Gene ID"]:
        print("Skipped: ", row["Gene ID"])
        continue
    with open("{}/{}/{}.json".format(DATA_PATH,row["Gene ID"],row["Gene ID"]),"r") as json_file:
        data = json.load(json_file)
    num_builds = len(data["status"]["build_attempts"])
    print("Num builds: ",num_builds)
    for build in data["status"]["build_attempts"]:
        print()
        print(row["Gene ID"])
        if len(build["build_well"]) == 2:
            build["build_well"] = str(build["build_well"][0]+"0"+build["build_well"][1])
        if build["build_number"] == build_num:
            if build["build_well"] == row["Well"]:
                build["build_outcome"] = row["Outcome"]
                build["forward_read"] = row["For Read"]
                build["reverse_read"] = row["Rev Read"]
            else:
                data["status"]["build_attempts"].append({
                    "build_well" : row["Well"],
                    "build_number" : build_num,
                    "build_outcome" : row["Outcome"],
                    "forward_read" : row["For Read"],
                    "reverse_read" : row["Rev Read"]
                })
    if row["Outcome"] == "Good_sequence" or row["Outcome"] == "Perfect":
        data["status"]["build_complete"] = True
    else:
        data["status"]["build_complete"] = False
    with open("{}/{}/{}.json".format(DATA_PATH,row["Gene ID"],row["Gene ID"]),"w") as json_file:
        json.dump(data,json_file,indent=2)

        
complete.to_csv("{}/{}/{}_alignment_results-complete.csv".format(BUILDS_PATH,build_num,build_num))

#     print(data)
#     input()

Num builds:  1

BBF10K_000497
Num builds:  1

BBF10K_000481
Num builds:  1

BBF10K_000495
Num builds:  1

BBF10K_000494
Num builds:  1

BBF10K_000477
Num builds:  1

BBF10K_000503
Num builds:  1

BBF10K_000550

BBF10K_000550
Num builds:  1

BBF10K_000533
Num builds:  1

BBF10K_000549

BBF10K_000549
Num builds:  1

BBF10K_000504
Num builds:  1

BBF10K_000531
Num builds:  1

BBF10K_000521

BBF10K_000521
Num builds:  1

BBF10K_000511
Skipped:  no hit
Num builds:  1

BBF10K_000535
Num builds:  1

BBF10K_000517
Num builds:  1

BBF10K_000553
Num builds:  1

BBF10K_000468
Num builds:  1

BBF10K_000476
Num builds:  1

BBF10K_000481

BBF10K_000481
Num builds:  1

BBF10K_000479
Num builds:  1

BBF10K_000469
Num builds:  1

BBF10K_000498
Num builds:  1

BBF10K_000508
Num builds:  1

BBF10K_000544

BBF10K_000544
Num builds:  1

BBF10K_000533

BBF10K_000533
Num builds:  1

BBF10K_000519
Num builds:  1

BBF10K_000524
Num builds:  1

BBF10K_000528

BBF10K_000528
Num builds:  1

BBF10K_000537
Skipped:

In [None]:
# print(result)
# result.to_csv("{}/{}/{}_alignment_results-comb.csv".format(BUILDS_PATH,build_num,build_num))

# print(array)
# print("nan sequences:", nan)
# print("small sequences:", small)

# input("continue?")

# new_dir = "{}/{}/{}_fasta_files".format(BUILDS_PATH,build_num,build_num)
# if os.path.exists(new_dir):
#     print("{} already exists".format(new_dir))
# else:
#     os.makedirs(new_dir)
#     print("made directory")
# 
# for index,row in array.iterrows():
#     for fasta in glob.glob("{}/{}/{}.fasta".format(DATA_PATH,row["Gene ID"],row["Gene ID"])):
#         shutil.copy(fasta, new_dir)

# print("\n",array['Outcome'].value_counts(),"\n")

# stop = datetime.now()
# runtime = stop - start
# print("Total runtime is: ", runtime)

In [61]:
## Check unknown sequence locations to debug build

# unknown_dict = dict(zip(unknown_data[:,0],unknown_data[:,8]))
# print(unknown_dict)
# location_dict = dict(zip(align_data[:,0],align_data[:,15]))
# print(location_dict)
# location_dict.update(unknown_dict)
# print(location_dict)

# intended_wells = []
# for gene in unknown_data[:,1]:
#     if gene in location_dict:
#         intended_wells.append(location_dict[gene])
#     else:
#         intended_wells.append("not_in_build")

# plates = []
# for gene in unknown_data[:,0]:
#     if "Unk" in gene:
#         continue
#     with open("{}/data/{}/{}.json".format(BASE_PATH,gene,gene),"r") as json_file:
#         data = json.load(json_file)
#     for frag in data["location"]["fragments"]:
#         plate,well = data["location"]["fragments"][frag].split("_")
#         plates.append(plate)
# plates = pd.Series(plates)
# print(plates.value_counts())

# plates = []
# for gene in unknown_data[:,1]:
#     if "no hit" in gene:
#         continue
#     with open("{}/data/{}/{}.json".format(BASE_PATH,gene,gene),"r") as json_file:
#         data = json.load(json_file)
#     for frag in data["location"]["fragments"]:
#         plate,well = data["location"]["fragments"][frag].split("_")
#         plates.append(plate)
# plates = pd.Series(plates)
# # print(plates.value_counts())

# names = []
# for gene in unknown_data[:,1]:
#     if "no hit" in gene:
#         names.append("no hit")
#         continue
#     with open("{}/data/{}/{}.json".format(BASE_PATH,gene,gene),"r") as json_file:
#         data = json.load(json_file)
#     names.append(data["gene_name"])
    
# print(names)

# deck = ['A01','B02','C03','D04','E05','F06','G07','H08','G09','F10','E11','D12']
# slots = pd.Series(deck)
# rows = sorted(slots.str[0].unique())
# columns = sorted(slots.str[1:3].unique(), reverse=False)
# layout_table = pd.DataFrame(index=rows, columns=columns)
# layout_table.fillna("---", inplace=True)
# print(layout_table)

# locations = dict(zip(unknown_df["Well"],unknown_df["Intended Well"]))
# # Fill in the dataframe with the locations
# for obj in locations:
#     layout_table.loc[obj[0], obj[1:3]] = locations[obj]
# layout_table

In [62]:
# print(outcomes)

# unknown_outcomes = pd.DataFrame({
#     "Well" : unknown_df["Well"],
#     "Gene ID" : unknown_df["Gene ID"],
#     "Outcome" : unknown_df["Hit Result"]
# })
# unknown_outcomes.set_index("Well")
# unknown_outcomes
# outcomes = outcomes.append(unknown_outcomes)
# outcomes