In [1]:
import pandas as pd
import scipy as sp
from scipy.sparse import diags
import numpy as np
from numpy import linalg as LA
import sys

from os import path

import matplotlib.pyplot as plt

#importing seaborn for plotting
import seaborn as sns

#for plotting purposes
%pylab inline
sns.set_style('ticks')
sns.set_context('paper')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import matplotlib as mpl

from scipy.signal import find_peaks

# mpl.rcParams
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['axes.titlesize'] = 16
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['legend.fontsize'] = 12
mpl.rcParams['figure.figsize'] = [8, 16/3]

Populating the interactive namespace from numpy and matplotlib


### converting ladder to nts

In [3]:
data_dir = './raw_data/'
fig_dir = './plots/'
processed_dir = './processed_data/'

In [4]:
#list of filenames 
filenames = ["Eukaryote Total RNA Nano_2020-12-02_14-31-30",\
             "Eukaryote Total RNA Nano_2020-12-02_14-38-16",\
             "Eukaryote Total RNA Nano_2020-12-02_15-29-46",\
             "Eukaryote Total RNA Nano_2020-12-02_15-24-46",\
             "Eukaryote Total RNA Nano_2020-12-02_15-48-34",\
             "Eukaryote Total RNA Nano_2020-12-02_15-54-46",\
             "Eukaryote Total RNA Nano_2020-12-02_16-14-33",\
             "Eukaryote Total RNA Nano_2020-12-02_16-22-53"]
"""
from files:
Eukaryote Total RNA Nano_2020-12-02_14-31-30_WK_1.pdf
Eukaryote Total RNA Nano_2020-12-02_14-38-16_WK_2.pdf
Eukaryote Total RNA Nano_2020-12-02_15-24-46_WK_4.pdf
Eukaryote Total RNA Nano_2020-12-02_15-29-46_WK_3.pdf
Eukaryote Total RNA Nano_2020-12-02_15-48-34_WK_5.pdf
Eukaryote Total RNA Nano_2020-12-02_15-54-46_WK_6.pdf
Eukaryote Total RNA Nano_2020-12-02_16-14-33_WK_7.pdf
Eukaryote Total RNA Nano_2020-12-02_16-22-53_WK_8.pdf
"""

for filename in filenames:
    
    """
    Plotting and identifying peaks from the ladder traces from BioAnalyzer.
    Must be done per chip/run, since ladders may elute at slightly different times per run.
    Generates plots that shows identified peaks.
    As of 12-02-2020, should identify 6 peaks.
    """
    
    ladder_dict = {}
    ladder_values = {}

    ladder_df = pd.read_csv(data_dir+filename+'_Ladder.csv', skiprows=17)[:-1]
    ladder_time = np.array(ladder_df['Time'].astype(float))
    ladder_value = np.array(ladder_df['Value'].astype(float))

    peaks,_ = find_peaks(ladder_value, height=5, distance=15)
    ladder_dict = list(peaks[:6])

    #plotting ladder trace + peak identification
    plot(ladder_time, ladder_value, label='ladder trace')
    plot(ladder_time[peaks], ladder_value[peaks], 'x', label='peaks', markersize=15)
    title('Ladder Peaks: '+filename, fontsize=16)
    legend(fontsize=14)
    xlabel('Time', fontsize=14)
    ylabel('FU', fontsize=14)
    xticks(fontsize=14)
    yticks(fontsize=14)
    tight_layout()
    
    #saving figure
    savefig(fig_dir+filename+'_ladder.png', dpi=300)
    clf()
    
    
    """
    Converting times (elution times from BioAnalyzer) to nucleotides using
    a 4th order polynomial fit
    """
    
    #returns the times when peaks occur, will serve as x on polynomial fit
    peak_times = ladder_time[ladder_dict]
    
    #ladder nucleotide sizes, will serve as y on polynomial fit
    peak_fu = np.array([25, 200, 500, 1000, 2000, 4000])
    
    #assert that the lengths of x and y match (cannot perform fit otherwise)
    assert (len(peak_times) == len(peak_fu))

    new_height = 5
    while (len(peak_times) < len(peak_fu)):
        new_height = new_height-0.1
        peaks,_ = find_peaks(ladder_value, height=new_height, distance=15)
        ladder_dict = list(peaks[:6])
        peak_times = ladder_time[ladder_dict]
        
    #fitting a 4th order polynomial to peaks/nucleotides
    ladder_fit = np.polyfit(x=peak_times, y = peak_fu, deg = 4)
    lf = np.poly1d(ladder_fit)
    ladder_nts = lf
    
    #plotting
    plot(peak_fu, lf(peak_times), 'o',label='calculated nts vs. ladder nts')
    plot(peak_fu, peak_fu,label='perfect correlation')
    ylabel('Calculated nts (from time)', fontsize=14)
    xlabel('Ladder nts', fontsize=14)
    yticks(fontsize=12)
    xticks(fontsize=12)
    title('Ladder polynomial fit: '+filename, fontsize=16)
    legend(fontsize=12)
    tight_layout()
    
    #saving figure
    savefig(fig_dir+'ladder_fit-'+filename+'.png', dpi=300)
    clf()

    
    """
    Applying fits to samples as well, per ladder
    """
    
    sample_numbers = np.arange(1,13)

    for number in sample_numbers:
        file = filename+'_Sample'+str(number)+".csv"
        print(file)
        if (path.exists(data_dir + file)):
            sample_df = pd.read_csv(data_dir + file, skiprows=17)[:-1]
            times = np.array(sample_df['Time'].astype(float))
            sample_df['Nucleotides'] = ladder_nts(times)
            sample_df.to_csv(processed_dir+'nts-'+ file)
        else:
            continue

'\nfrom filenames:\nEukaryote Total RNA Nano_2020-12-02_14-31-30_WK_1.pdf\nEukaryote Total RNA Nano_2020-12-02_14-38-16_WK_2.pdf\nEukaryote Total RNA Nano_2020-12-02_15-24-46_WK_4.pdf\nEukaryote Total RNA Nano_2020-12-02_15-29-46_WK_3.pdf\nEukaryote Total RNA Nano_2020-12-02_15-48-34_WK_5.pdf\nEukaryote Total RNA Nano_2020-12-02_15-54-46_WK_6.pdf\nEukaryote Total RNA Nano_2020-12-02_16-14-33_WK_7.pdf\nEukaryote Total RNA Nano_2020-12-02_16-22-53_WK_8.pdf\n'

'\n    Plotting and identifying peaks from the ladder traces from BioAnalyzer.\n    Must be done per chip/run, since ladders may elute at slightly different times per run.\n    Generates plots that shows identified peaks.\n    As of 12-02-2020, should identify 6 peaks.\n    '

[<matplotlib.lines.Line2D at 0x7fb5c17a5c90>]

[<matplotlib.lines.Line2D at 0x7fb5d05dee50>]

Text(0.5, 1.0, 'Ladder Peaks: Eukaryote Total RNA Nano_2020-12-02_14-31-30')

<matplotlib.legend.Legend at 0x7fb5d05decd0>

Text(0.5, 0, 'Time')

Text(0, 0.5, 'FU')

(array([10., 20., 30., 40., 50., 60., 70., 80.]),
 <a list of 8 Text major ticklabel objects>)

(array([-5.,  0.,  5., 10., 15., 20., 25., 30.]),
 <a list of 8 Text major ticklabel objects>)

'\n    Converting times (elution times from BioAnalyzer) to nucleotides using\n    a 4th order polynomial fit\n    '

[<matplotlib.lines.Line2D at 0x7fb5d060bad0>]

[<matplotlib.lines.Line2D at 0x7fb5d05b9550>]

Text(0, 0.5, 'Calculated nts (from time)')

Text(0.5, 0, 'Ladder nts')

(array([-500.,    0.,  500., 1000., 1500., 2000., 2500., 3000., 3500.,
        4000., 4500.]),
 <a list of 11 Text major ticklabel objects>)

(array([-500.,    0.,  500., 1000., 1500., 2000., 2500., 3000., 3500.,
        4000., 4500.]),
 <a list of 11 Text major ticklabel objects>)

Text(0.5, 1.0, 'Ladder polynomial fit: Eukaryote Total RNA Nano_2020-12-02_14-31-30')

<matplotlib.legend.Legend at 0x7fb5902efed0>

'\n    Applying fits to samples as well, per ladder\n    '

Eukaryote Total RNA Nano_2020-12-02_14-31-30_Sample1.csv
Eukaryote Total RNA Nano_2020-12-02_14-31-30_Sample2.csv
Eukaryote Total RNA Nano_2020-12-02_14-31-30_Sample3.csv
Eukaryote Total RNA Nano_2020-12-02_14-31-30_Sample4.csv
Eukaryote Total RNA Nano_2020-12-02_14-31-30_Sample5.csv
Eukaryote Total RNA Nano_2020-12-02_14-31-30_Sample6.csv
Eukaryote Total RNA Nano_2020-12-02_14-31-30_Sample7.csv
Eukaryote Total RNA Nano_2020-12-02_14-31-30_Sample8.csv
Eukaryote Total RNA Nano_2020-12-02_14-31-30_Sample9.csv
Eukaryote Total RNA Nano_2020-12-02_14-31-30_Sample10.csv
Eukaryote Total RNA Nano_2020-12-02_14-31-30_Sample11.csv
Eukaryote Total RNA Nano_2020-12-02_14-31-30_Sample12.csv


'\n    Plotting and identifying peaks from the ladder traces from BioAnalyzer.\n    Must be done per chip/run, since ladders may elute at slightly different times per run.\n    Generates plots that shows identified peaks.\n    As of 12-02-2020, should identify 6 peaks.\n    '

[<matplotlib.lines.Line2D at 0x7fb5c1787050>]

[<matplotlib.lines.Line2D at 0x7fb5c1776550>]

Text(0.5, 1.0, 'Ladder Peaks: Eukaryote Total RNA Nano_2020-12-02_14-38-16')

<matplotlib.legend.Legend at 0x7fb5902ef110>

Text(0.5, 0, 'Time')

Text(0, 0.5, 'FU')

(array([10., 20., 30., 40., 50., 60., 70., 80.]),
 <a list of 8 Text major ticklabel objects>)

(array([-5.,  0.,  5., 10., 15., 20., 25.]),
 <a list of 7 Text major ticklabel objects>)

'\n    Converting times (elution times from BioAnalyzer) to nucleotides using\n    a 4th order polynomial fit\n    '

[<matplotlib.lines.Line2D at 0x7fb5c17b28d0>]

[<matplotlib.lines.Line2D at 0x7fb5c17a4b90>]

Text(0, 0.5, 'Calculated nts (from time)')

Text(0.5, 0, 'Ladder nts')

(array([-500.,    0.,  500., 1000., 1500., 2000., 2500., 3000., 3500.,
        4000., 4500.]),
 <a list of 11 Text major ticklabel objects>)

(array([-500.,    0.,  500., 1000., 1500., 2000., 2500., 3000., 3500.,
        4000., 4500.]),
 <a list of 11 Text major ticklabel objects>)

Text(0.5, 1.0, 'Ladder polynomial fit: Eukaryote Total RNA Nano_2020-12-02_14-38-16')

<matplotlib.legend.Legend at 0x7fb5c17e1190>

'\n    Applying fits to samples as well, per ladder\n    '

Eukaryote Total RNA Nano_2020-12-02_14-38-16_Sample1.csv
Eukaryote Total RNA Nano_2020-12-02_14-38-16_Sample2.csv
Eukaryote Total RNA Nano_2020-12-02_14-38-16_Sample3.csv
Eukaryote Total RNA Nano_2020-12-02_14-38-16_Sample4.csv
Eukaryote Total RNA Nano_2020-12-02_14-38-16_Sample5.csv
Eukaryote Total RNA Nano_2020-12-02_14-38-16_Sample6.csv
Eukaryote Total RNA Nano_2020-12-02_14-38-16_Sample7.csv
Eukaryote Total RNA Nano_2020-12-02_14-38-16_Sample8.csv
Eukaryote Total RNA Nano_2020-12-02_14-38-16_Sample9.csv
Eukaryote Total RNA Nano_2020-12-02_14-38-16_Sample10.csv
Eukaryote Total RNA Nano_2020-12-02_14-38-16_Sample11.csv
Eukaryote Total RNA Nano_2020-12-02_14-38-16_Sample12.csv


'\n    Plotting and identifying peaks from the ladder traces from BioAnalyzer.\n    Must be done per chip/run, since ladders may elute at slightly different times per run.\n    Generates plots that shows identified peaks.\n    As of 12-02-2020, should identify 6 peaks.\n    '

FileNotFoundError: [Errno 2] File ./raw_data/Eukaryote Total RNA Nano_2020-12-02_15-29-46_Ladder.csv does not exist: './raw_data/Eukaryote Total RNA Nano_2020-12-02_15-29-46_Ladder.csv'

<Figure size 576x384 with 0 Axes>