In [4]:
import pandas as pd
import scipy as sp
from scipy.sparse import diags
import numpy as np
from numpy import linalg as LA
import sys

from os import path

import matplotlib.pyplot as plt

#importing seaborn for plotting
import seaborn as sns

#for plotting purposes
# NOTE: `%pylab inline` populates the interactive namespace from numpy and
# matplotlib (see the cell output), which is why later cells can call
# plot(), title(), savefig(), clf(), ... without a `plt.` prefix.
# It must run before those cells on a fresh kernel.
%pylab inline
# seaborn styling is applied after the magic above; order kept as-is.
sns.set_style('ticks')
sns.set_context('paper')

# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

import matplotlib as mpl

from scipy.signal import find_peaks

# Global matplotlib defaults for every figure produced by this notebook:
# font sizes for labels, titles, ticks and legends, plus a 3:2 figure size.
mpl.rcParams.update({
    'axes.labelsize': 14,
    'axes.titlesize': 16,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'legend.fontsize': 12,
    'figure.figsize': [8, 16/3],
})

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


### Converting ladder elution times to nucleotides (nts)

In [5]:
# Directory layout, relative to the notebook's working directory.
data_dir, fig_dir, processed_dir = (
    './raw_data/',        # ladder and sample CSVs are read from here
    './plots/',           # figures are saved here
    './processed_data/',  # sample CSVs with a 'Nucleotides' column are written here
)

In [7]:
# Run names, one per row of platenumber_filename.csv (second column).
# Each name is the common prefix of that run's '_Ladder.csv' and
# '_SampleN.csv' files.
filenames = []

# Runs whose ladder is processed with 5 reference peaks instead of 6
# (the per-run loop below switches on membership in this list).
incomplete_ladder = ["Eukaryote Total RNA Nano_2020-12-10_07-47-43",
                     "Eukaryote Total RNA Nano_2020-12-10_08-35-37",
                     "Eukaryote Total RNA Nano_2020-12-10_08-43-35"]

# The table is only read, so open it read-only ('r+' needlessly requires
# write permission on the file).
with open("platenumber_filename.csv", 'r') as f:
    next(f, None)  # skip the header row
    for line in f:
        fields = line.split(',')
        # Skip blank/short lines (e.g. a trailing newline at end of file)
        # instead of failing with IndexError.
        if len(fields) < 2:
            continue
        # Strip surrounding whitespace/newline so the name can be used in
        # file paths and compared against incomplete_ladder.
        filename = fields[1].strip()
        if filename:
            filenames.append(filename)
        
for filename in filenames:

    # ------------------------------------------------------------------
    # Plotting and identifying peaks from the ladder traces from BioAnalyzer.
    # Must be done per chip/run, since ladders may elute at slightly
    # different times per run. Generates plots that show identified peaks.
    # As of 12-10-2020, should identify 6 peaks (5 for the runs listed in
    # incomplete_ladder).
    # ------------------------------------------------------------------

    #ladder nucleotide sizes, will serve as y on polynomial fit
    #(variable name kept from the original; values are nts, not FU)
    if filename in incomplete_ladder:
        peak_fu = np.array([25, 500, 1000, 2000, 4000])
    else:
        peak_fu = np.array([25, 200, 500, 1000, 2000, 4000])
    peak_number = len(peak_fu)

    # skiprows=17 skips the export header; [:-1] drops the last row.
    ladder_df = pd.read_csv(data_dir+filename+'_Ladder.csv', skiprows=17)[:-1]
    ladder_time = np.array(ladder_df['Time'].astype(float))
    ladder_value = np.array(ladder_df['Value'].astype(float))

    peaks,_ = find_peaks(ladder_value, height=5, distance=30)

    # If too few peaks were found, retry with a progressively lower height
    # threshold and a smaller minimum peak spacing. This retry used to sit
    # *after* an assert on the peak count and was therefore unreachable; it
    # also always took 6 peaks, which is wrong for 5-peak ladders, and had no
    # lower bound on the threshold.
    new_height = 5
    while len(peaks) < peak_number and new_height - 0.1 > 0:
        new_height = new_height-0.1
        peaks,_ = find_peaks(ladder_value, height=new_height, distance=15)

    # Indices (into the trace) of the ladder peaks used for the fit.
    # NOTE: despite its name this is a list, not a dict; name kept in case
    # later cells reference it.
    ladder_dict = list(peaks[:peak_number])

    #returns the times when peaks occur, will serve as x on polynomial fit
    peak_times = ladder_time[ladder_dict]

    #assert that the lengths of x and y match (cannot perform fit otherwise)
    assert len(peak_times) == len(peak_fu), (
        'found {} ladder peaks, expected {}: {}'.format(
            len(peak_times), len(peak_fu), filename))

    #plotting ladder trace + peak identification
    plt.plot(ladder_time, ladder_value, label='ladder trace')
    plt.plot(ladder_time[peaks], ladder_value[peaks], 'x', label='peaks', markersize=15)
    plt.title('Ladder Peaks: '+filename, fontsize=16)
    plt.legend(fontsize=14)
    plt.xlabel('Time', fontsize=14)
    plt.ylabel('FU', fontsize=14)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.tight_layout()

    #saving figure
    plt.savefig(fig_dir+filename+'_ladder.png', dpi=300)
    plt.clf()


    # ------------------------------------------------------------------
    # Converting times (elution times from BioAnalyzer) to nucleotides using
    # a 4th order polynomial fit.
    # NOTE(review): with 5 ladder points a degree-4 polynomial passes exactly
    # through every point, and with 6 points it has a single residual degree
    # of freedom, so the fit-quality plot below will always look near-perfect.
    # ------------------------------------------------------------------

    #fitting a 4th order polynomial to peaks/nucleotides
    ladder_fit = np.polyfit(x=peak_times, y = peak_fu, deg = 4)
    lf = np.poly1d(ladder_fit)
    ladder_nts = lf

    #print filename and ladder fit
    print('-----------')
    print('printing filename and ladder fit:')
    print(filename)
    print(lf)
    print('-----------')

    #plotting
    plt.plot(peak_fu, lf(peak_times), 'o',label='calculated nts vs. ladder nts')
    plt.plot(peak_fu, peak_fu,label='perfect correlation')
    plt.ylabel('Calculated nts (from time)', fontsize=14)
    plt.xlabel('Ladder nts', fontsize=14)
    plt.yticks(fontsize=12)
    plt.xticks(fontsize=12)
    plt.title('Ladder polynomial fit: '+filename, fontsize=16)
    plt.legend(fontsize=12)
    plt.tight_layout()

    #saving figure
    plt.savefig(fig_dir+'ladder_fit-'+filename+'.png', dpi=300)
    plt.clf()


    # ------------------------------------------------------------------
    # Applying fits to samples as well, per ladder: every existing
    # '<run>_SampleN.csv' (N = 1..12) gets a 'Nucleotides' column computed
    # from its 'Time' column and is written to processed_dir.
    # ------------------------------------------------------------------

    sample_numbers = np.arange(1,13)

    for number in sample_numbers:
        file = filename+'_Sample'+str(number)+".csv"
        print(file)
        # Missing sample files are skipped silently (name is still printed).
        if (path.exists(data_dir + file)):
            sample_df = pd.read_csv(data_dir + file, skiprows=17)[:-1]
            times = np.array(sample_df['Time'].astype(float))
            sample_df['Nucleotides'] = ladder_nts(times)
            sample_df.to_csv(processed_dir+'nts-'+ file)

-----------
printing filename and ladder fit:
Eukaryote Total RNA Nano_2020-12-10_07-39-50
           4           3         2
0.0009715 x + 0.01734 x - 2.729 x + 67.9 x - 568.8
-----------
Eukaryote Total RNA Nano_2020-12-10_07-39-50_Sample1.csv
Eukaryote Total RNA Nano_2020-12-10_07-39-50_Sample2.csv
Eukaryote Total RNA Nano_2020-12-10_07-39-50_Sample3.csv
Eukaryote Total RNA Nano_2020-12-10_07-39-50_Sample4.csv
Eukaryote Total RNA Nano_2020-12-10_07-39-50_Sample5.csv
Eukaryote Total RNA Nano_2020-12-10_07-39-50_Sample6.csv
Eukaryote Total RNA Nano_2020-12-10_07-39-50_Sample7.csv
Eukaryote Total RNA Nano_2020-12-10_07-39-50_Sample8.csv
Eukaryote Total RNA Nano_2020-12-10_07-39-50_Sample9.csv
Eukaryote Total RNA Nano_2020-12-10_07-39-50_Sample10.csv
Eukaryote Total RNA Nano_2020-12-10_07-39-50_Sample11.csv
Eukaryote Total RNA Nano_2020-12-10_07-39-50_Sample12.csv
-----------
printing filename and ladder fit:
Eukaryote Total RNA Nano_2020-12-10_07-47-43
          4         3       2
-0.

-----------
printing filename and ladder fit:
Eukaryote Total RNA Nano_2020-12-10_15-17-25
           4          3         2
-0.001523 x + 0.3473 x - 18.75 x + 407.5 x - 3219
-----------
Eukaryote Total RNA Nano_2020-12-10_15-17-25_Sample1.csv
Eukaryote Total RNA Nano_2020-12-10_15-17-25_Sample2.csv
Eukaryote Total RNA Nano_2020-12-10_15-17-25_Sample3.csv
Eukaryote Total RNA Nano_2020-12-10_15-17-25_Sample4.csv
Eukaryote Total RNA Nano_2020-12-10_15-17-25_Sample5.csv
Eukaryote Total RNA Nano_2020-12-10_15-17-25_Sample6.csv
Eukaryote Total RNA Nano_2020-12-10_15-17-25_Sample7.csv
Eukaryote Total RNA Nano_2020-12-10_15-17-25_Sample8.csv
Eukaryote Total RNA Nano_2020-12-10_15-17-25_Sample9.csv
Eukaryote Total RNA Nano_2020-12-10_15-17-25_Sample10.csv
Eukaryote Total RNA Nano_2020-12-10_15-17-25_Sample11.csv
Eukaryote Total RNA Nano_2020-12-10_15-17-25_Sample12.csv
-----------
printing filename and ladder fit:
Eukaryote Total RNA Nano_2020-12-10_15-22-13
            4          3         2


-----------
printing filename and ladder fit:
Eukaryote Total RNA Nano_2020-12-11_09-00-11
           4          3         2
-0.001199 x + 0.3108 x - 17.34 x + 384.3 x - 3079
-----------
Eukaryote Total RNA Nano_2020-12-11_09-00-11_Sample1.csv
Eukaryote Total RNA Nano_2020-12-11_09-00-11_Sample2.csv
Eukaryote Total RNA Nano_2020-12-11_09-00-11_Sample3.csv
Eukaryote Total RNA Nano_2020-12-11_09-00-11_Sample4.csv
Eukaryote Total RNA Nano_2020-12-11_09-00-11_Sample5.csv
Eukaryote Total RNA Nano_2020-12-11_09-00-11_Sample6.csv
Eukaryote Total RNA Nano_2020-12-11_09-00-11_Sample7.csv
Eukaryote Total RNA Nano_2020-12-11_09-00-11_Sample8.csv
Eukaryote Total RNA Nano_2020-12-11_09-00-11_Sample9.csv
Eukaryote Total RNA Nano_2020-12-11_09-00-11_Sample10.csv
Eukaryote Total RNA Nano_2020-12-11_09-00-11_Sample11.csv
Eukaryote Total RNA Nano_2020-12-11_09-00-11_Sample12.csv
-----------
printing filename and ladder fit:
Eukaryote Total RNA Nano_2020-12-11_09-26-01
          4           3         2
0

<Figure size 576x384 with 0 Axes>