# DNA Barcoding Chromatogram Explorer

Let's open an `.ab1` file (also known as a trace file or chromatogram) using Biopython.

In [1]:
import os
from ipywidgets import widgets

# Offer only ABI trace files, in a stable alphabetical order.
# os.listdir() returns entries in arbitrary order and also includes anything
# else that happens to live in the folder (hidden files, checkpoint
# directories, ...), which the 'abi' parser further down cannot read.
trace_file_names = sorted(
    name for name in os.listdir('./trace_files')
    if name.lower().endswith('.ab1')
)

sequence_dropdown = widgets.Dropdown(options=trace_file_names)
display(sequence_dropdown)

Dropdown(options=('oc.laedaporous-ITS1-F.ab1', 'pink.fuzzy.mold-ITS1-F.ab1'), value='oc.laedaporous-ITS1-F.ab1…

Let's take a look at some of the annotation data in the file.

In [2]:
import pandas as pd
from Bio import SeqIO

# Resolve the dropdown's current choice to a path inside ./trace_files.
selected_file = sequence_dropdown.value
selected_file_path = f'./trace_files/{selected_file}'

In [3]:
# Parse the selected trace file with Biopython's 'abi' reader.
chromatogram = SeqIO.read(selected_file_path, 'abi')

# One row per annotation key; the annotation payload goes in a single
# 'Value' column.
annotations_df = pd.DataFrame.from_dict(
    data=chromatogram.annotations,
    columns=['Value'],
    orient='index',
)
display(annotations_df)

Unnamed: 0,Value
sample_well,b'D11'
dye,b'Z-BigDyeV3'
polymer,b'POP7 '
machine_model,b'3730'
run_start,2020-11-25 20:47:19
run_finish,2020-11-25 22:03:24
abif_raw,"{'AEPt1': 9925, 'AEPt2': 9925, 'APFN2': b'KBan..."
molecule_type,DNA


The chromatogram data lives in the `abif_raw` row. Let's take a closer look.

In [4]:
# The 'abif_raw' cell holds a dict of raw ABIF tags; list the tag names.
annotations_df.loc['abif_raw', 'Value'].keys()

dict_keys(['AEPt1', 'AEPt2', 'APFN2', 'APXV1', 'APrN1', 'APrV1', 'APrX1', 'ARTN1', 'ASPF1', 'ASPt1', 'ASPt2', 'AUDT1', 'B1Pt1', 'B1Pt2', 'BCTS1', 'BufT1', 'CMNT1', 'CTID1', 'CTNM1', 'CTOw1', 'CTTL1', 'CpEP1', 'DATA1', 'DATA2', 'DATA3', 'DATA4', 'DATA5', 'DATA6', 'DATA7', 'DATA8', 'DATA9', 'DATA10', 'DATA11', 'DATA12', 'DCHT1', 'DSam1', 'DySN1', 'Dye#1', 'DyeN1', 'DyeN2', 'DyeN3', 'DyeN4', 'DyeW1', 'DyeW2', 'DyeW3', 'DyeW4', 'EPVt1', 'EVNT1', 'EVNT2', 'EVNT3', 'EVNT4', 'FTab1', 'FVoc1', 'FWO_1', 'Feat1', 'GTyp1', 'HCFG1', 'HCFG2', 'HCFG3', 'HCFG4', 'InSc1', 'InVt1', 'LANE1', 'LIMS1', 'LNTD1', 'LsrP1', 'MCHN1', 'MODF1', 'MODL1', 'NAVG1', 'NLNE1', 'NOIS1', 'PBAS1', 'PBAS2', 'PCON1', 'PCON2', 'PDMF1', 'PDMF2', 'PLOC1', 'PLOC2', 'PSZE1', 'PTYP1', 'PXLB1', 'RGNm1', 'RGOw1', 'RMXV1', 'RMdN1', 'RMdV1', 'RMdX1', 'RPrN1', 'RPrV1', 'RUND1', 'RUND2', 'RUND3', 'RUND4', 'RUNT1', 'RUNT2', 'RUNT3', 'RUNT4', 'Rate1', 'RunN1', 'S/N%1', 'SCAN1', 'SMED1', 'SMLt1', 'SMPL1', 'SPAC1', 'SPAC2', 'SPAC3', 'SVER

To make sense of these fields within the raw ABIF data, we can refer to the [ABIF File Format spec](docs/ABIF_File_Format.pdf).

Here is a more human-readable summary of some additional info found in the raw data.

In [5]:
abif_raw = annotations_df.loc['abif_raw', 'Value']

# Human-readable row label -> ABIF tag whose value is shown in that row.
# Insertion order is the row order of the resulting table.
summary_tags = {
    'User':                      'User1',
    'Stop Point':                'AEPt1',
    'Start Point':               'ASPt1',
    'Peak Area Ratio':           'phAR1',
    'Peak Spacing':              'SPAC3',
    'Max Quality Value':         'phQL1',
    'Last Successful Analysis':  'BCTS1',
    'Signal Level for Each Dye': 'S/N%1',
    'Sample Comment':            'CMNT1',
    'Container ID':              'CTID1',
    'Container Name':            'CTNM1',
    'Container Owner':           'CTOw1',
}

# Look every tag up in the raw data and show the result as a one-column table.
pd.DataFrame.from_dict(
    {label: abif_raw[tag] for label, tag in summary_tags.items()},
    orient='index',
    columns=['Raw Data Fields'])

Unnamed: 0,Raw Data Fields
User,b'genewiz'
Stop Point,9925
Start Point,1730
Peak Area Ratio,-1
Peak Spacing,14.2801
Max Quality Value,99
Last Successful Analysis,b'2020-11-25 22:08:05 -05:00'
Signal Level for Each Dye,"(1514, 1845, 2257, 2273)"
Sample Comment,b'2377975-D11-JoshMcGinnis-30-449493217-JM7'
Container ID,b'09A000036479'


### Now let's take a look at the sequence data

In [6]:
# 'PBAS1' holds the basecalls as a bytes object; decode before displaying.
sequence = abif_raw['PBAS1']
print(f'Bases: {len(sequence)}')

# Space-separate the bases so the long sequence is easier to scan.
' '.join(sequence.decode('UTF-8'))

Bases: 601


'N N N N G N N T C N G T N G G T G A C N N C G G A N G G A N A T T A T A G A G T T T T C T A A A C T C C C A A C C C A T G T G A A C T T A C C A T T G T T G C C T C G G C A G A A G C T G C T C G G T G C A C C C T A C C T T G G A A C G G C C T A C C C T G T A G C G C C T T A C C C T G G A A C G G C T T A C C C T G T A A C G G C T G C C G G T G G A C T A C C A A A C T C T T G T T A T T T T A T T G T A A T C T G A G C G T C T T A T T C T A A T A A G T C A A A A C T T T C A A C A A C G G A T C T C T T G G T T C T G G C A T C G A T G A A G A A C G C A G C G A A A T G C G A T A A G T A A T G T G A A T T G C A G A A T T C A G T G A A T C A T C G A A T C T T T G A A C G C A C A T T G C G C C C A T T A G T A T T C T A G T G G G C A T G C C T G T T C G A G C G T C A T T T C A A C C C T T A A G C C T A G C T T A G T G T T G G G A G C C T A C T G C T T T T G C T A G C T G T A G C T C C T G A A A T A C A A C G G C G G A T C T G C G A T A T C C T C T G A G C G T A G T A A A T T T T T A T C T C G C T

### Let's take a look at the Phred (_quality_) scores for the bases in the sequence.

In [128]:
%matplotlib widget

import matplotlib.pyplot as plt
from matplotlib.widgets import Slider

plt.style.use('fast')
plt.rcParams['xtick.labelsize'] = 11

fig = plt.figure(figsize=(10, 2.5))

# Position the main graph and the slider inside the figure.
# Each rectangle is [left, bottom, width, height].
main_axis = fig.add_axes([0.07, 0.37, 0.9, 0.50])
slider_axis = fig.add_axes([0.12, 0.10, 0.75, 0.06])

# NOTE(review): the base letters come from abif_raw['PBAS1'] while the scores
# are whatever Biopython attached to the record as 'phred_quality' -- confirm
# both refer to the same set of basecalls.
quality_scores = chromatogram.letter_annotations['phred_quality']
sequence_list = list(sequence.decode('UTF-8'))

# Number of bases shown per window; also used as the slider's step size.
valstep = 50

def build_bar_plot(start, end):
    """Redraw the quality bar chart for the bases in ``[start, end)``.

    Clears ``main_axis``, then draws one bar per base in the window, with the
    bar height taken from ``quality_scores`` and the x tick label taken from
    ``sequence_list``.

    Parameters
    ----------
    start : int
        Index of the first base in the window.
    end : int
        Index one past the last base in the window.
    """
    window = slice(start, end)
    bases = sequence_list[window]
    positions = range(len(bases))

    # Start from an empty axis so bars from the previous window do not linger.
    main_axis.clear()

    # Anchor both axes at zero, then label every bar with its base letter.
    main_axis.set_ylim(0)
    main_axis.set_xlim(0)
    main_axis.set_yticks([0, 20, 40, 60, 80])
    main_axis.set_xticks(positions)
    main_axis.set_xticklabels(bases)

    main_axis.bar(positions, quality_scores[window], align='center')
    plt.show()
    
# Make the slider. Its value is used as the END index of the visible window
# (the callback subtracts valstep to get the start), so it begins at valstep
# and moves in valstep-sized jumps up to the number of quality scores.
q_slider = Slider(
    ax=slider_axis,
    label='Click to Scroll',
    valmin=valstep,
    valmax=len(quality_scores),
    valinit=valstep,
    valstep=valstep,
)

def update(val):
    """Slider callback: redraw the window of bases that ends at ``val``.

    Parameters
    ----------
    val : float or int
        Current slider value, used as the exclusive end index of the window.
        Matplotlib documents slider callbacks as receiving a float, and a
        float cannot be used as a list slice bound, so the value is coerced
        to ``int`` before it is passed on.
    """
    end = int(round(val))
    # Never let the start go negative: a negative slice bound would silently
    # count from the end of the sequence instead of from its beginning.
    start = max(end - valstep, 0)
    build_bar_plot(start, end)
    
# Redraw the chart whenever the slider moves...
q_slider.on_changed(update)

# ...and draw the first window right away so the figure is not empty.
build_bar_plot(start=0, end=valstep)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …