In [3]:
ls

10-21-2020_analyzed_samples.csv
10-21-2020_analyzed_samples_doublenormalized.csv
10-21-2020_analyzed_samples_mindistance100_minheight_1.csv
10-21-2020_analyzed_samples_minheight_1.csv
102120_Bioanalyzer.xlsx
[1m[36mAnalysis_DSK[m[m/
Convert_ladders.ipynb
[1m[36mData[m[m/
Icon?
Samples_Runs_map.xlsx
analysis_10-21-20_BioAnalyzer.ipynb
[1m[36moutput[m[m/
[1m[36mpdf[m[m/
platenumber_filename.csv
[1m[36mplots[m[m/
[1m[36mprocessed_data[m[m/
[1m[36mraw_data[m[m/
sample_nucleotide_filename.csv
sample_nucleotide_filename.xlsx
[1m[36mxad[m[m/
[1m[36mzip[m[m/


In [4]:
import pandas as pd
import scipy as sp
from scipy.sparse import diags
import numpy as np
from numpy import linalg as LA
import sys

from os import path

import matplotlib.pyplot as plt

#importing seaborn for plotting
import seaborn as sns

#for plotting purposes
# NOTE(review): %pylab inline pulls numpy and matplotlib names into the
# notebook namespace (it prints "Populating the interactive namespace from
# numpy and matplotlib"). Any code that calls plot()/title()/savefig()/clf()
# without a `plt.` prefix depends on this line having been run.
%pylab inline
# seaborn appearance: 'ticks' axes style with the 'paper' scaling context
sns.set_style('ticks')
sns.set_context('paper')

# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

import matplotlib as mpl

from scipy.signal import find_peaks

# Notebook-wide matplotlib defaults: font sizes for labels, titles, ticks and
# legends, plus the default figure size (inches, 3:2 aspect ratio).
mpl.rcParams.update({
    'axes.labelsize': 14,
    'axes.titlesize': 16,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'legend.fontsize': 12,
    'figure.figsize': [8, 16/3],
})

Populating the interactive namespace from numpy and matplotlib


### Converting ladder elution times to nucleotides (nts)

In [5]:
# Input/output locations, relative to the notebook's working directory.
# Each ends with '/' because paths are built below by plain string concatenation.
# data_dir: where the '<run>_Ladder.csv' and '<run>_Sample<k>.csv' traces are read from
data_dir = './raw_data/'
# fig_dir: where the ladder-peak and ladder-fit PNG figures are saved
fig_dir = './output/ladder_convert/'
# processed_dir: where the 'nts-<run>_Sample<k>.csv' files are written
processed_dir = './processed_data/'

In [7]:
# Convert BioAnalyzer elution times to nucleotide lengths, one chip/run at a time.
#
# For every run listed in platenumber_filename.csv this cell:
#   1. finds the ladder peaks in '<run>_Ladder.csv' and saves a diagnostic plot,
#   2. fits a 4th-order polynomial mapping peak time -> ladder size (nts) and
#      saves a plot of the fit,
#   3. applies that fit to every '<run>_Sample<k>.csv' (k = 1..12) that exists and
#      writes it, with an added 'Nucleotides' column, to processed_dir.
# The calibration is redone per run because ladders may elute at slightly
# different times on each chip.

#list of filenames
filenames_df = pd.read_csv('platenumber_filename.csv').dropna(axis=0)
filenames = list(filenames_df['File_Name'])

#runs whose ladder is fitted with 5 sizes instead of 6 (no 200 nt entry)
incomplete_ladder = ["Eukaryote Total RNA Nano_2020-12-10_07-47-43",
                     "Eukaryote Total RNA Nano_2020-12-10_08-35-37",
                     "Eukaryote Total RNA Nano_2020-12-10_08-43-35"]

#ladder sizes in nucleotides; these are the y values of the polynomial fit
full_ladder_nts = np.array([25, 200, 500, 1000, 2000, 4000])
incomplete_ladder_nts = np.array([25, 500, 1000, 2000, 4000])

#sample wells looked for in each run (files that do not exist are skipped)
sample_numbers = np.arange(1,13)

for filename in filenames:

    # ------------------------------------------------------------------
    # Plotting and identifying peaks from the ladder trace.
    # As of 12-10-2020, should identify 6 peaks (5 for incomplete ladders).
    # ------------------------------------------------------------------

    #name kept as `peak_fu` for continuity; the values are nucleotide sizes
    if filename in incomplete_ladder:
        peak_fu = incomplete_ladder_nts
    else:
        peak_fu = full_ladder_nts
    peak_number = len(peak_fu)

    print('filename: ')
    print(filename)
    # NOTE(review): skiprows=17 and [:-1] presumably strip the export's header
    # block and a trailing non-data row -- confirm against a raw file.
    ladder_df = pd.read_csv(data_dir+filename+'_Ladder.csv', skiprows=17)[:-1]
    ladder_time = np.array(ladder_df['Time'].astype(float))
    ladder_value = np.array(ladder_df['Value'].astype(float))

    #first pass: peaks of height >= 2.3 that are at least 50 samples apart
    peaks,_ = find_peaks(ladder_value, height=2.3, distance=50)
    print('identified peaks: ')
    print(peaks)
    #only the earliest `peak_number` peaks are used for the calibration
    peak_indices = peaks[:peak_number]

    #plotting ladder trace + peak identification (all first-pass peaks are marked)
    fig, ax = plt.subplots()
    ax.plot(ladder_time, ladder_value, label='ladder trace')
    ax.plot(ladder_time[peaks], ladder_value[peaks], 'x', label='peaks', markersize=15)
    ax.set_title('Ladder Peaks: '+filename, fontsize=16)
    ax.legend(fontsize=14)
    ax.set_xlabel('Time', fontsize=14)
    ax.set_ylabel('FU', fontsize=14)
    ax.tick_params(axis='both', labelsize=14)
    fig.tight_layout()

    #saving figure, then closing it so figures do not accumulate over the loop
    fig.savefig(fig_dir+filename+'_ladder.png', dpi=300)
    plt.close(fig)

    # ------------------------------------------------------------------
    # Converting times (elution times from BioAnalyzer) to nucleotides using
    # a 4th order polynomial fit.
    # ------------------------------------------------------------------

    #returns the times when peaks occur, will serve as x on polynomial fit
    peak_times = ladder_time[peak_indices]
    print(len(peak_times))

    #fallback when the first pass found too few peaks: allow closer peaks
    #(distance=15) and lower the height threshold from 5 in steps of 0.1
    new_height = 5
    #once the threshold is below every value in the trace it no longer excludes
    #any peak, so lowering it further cannot help; inf makes an empty trace fail
    lowest_value = ladder_value.min() if ladder_value.size else np.inf
    while (len(peak_times) < peak_number):
        new_height = new_height-0.1
        peaks,_ = find_peaks(ladder_value, height=new_height, distance=15)
        #take `peak_number` peaks (not a fixed 6) so 5-peak ladders still match
        peak_indices = peaks[:peak_number]
        peak_times = ladder_time[peak_indices]
        #`not >=` (rather than `<`) also stops on a NaN minimum
        if len(peak_times) < peak_number and not new_height >= lowest_value:
            raise ValueError('could not find '+str(peak_number)+' ladder peaks for '
                             +filename+' (found '+str(len(peak_times))+')')
    print('peak times:')
    print(peak_times)
    #assert that the lengths of x and y match (cannot perform fit otherwise)
    assert (len(peak_times) == len(peak_fu)), 'ladder peak count does not match ladder sizes'

    #fitting a 4th order polynomial to peaks/nucleotides
    ladder_fit = np.polyfit(x=peak_times, y = peak_fu, deg = 4)
    lf = np.poly1d(ladder_fit)
    ladder_nts = lf

    #print filename and ladder fit
    print('printing filename and ladder fit:')
    print(filename)
    print(lf)
    print('-----------')
    print('-----------')
    print('-----------')

    #plotting the fitted sizes against the ladder sizes
    fig, ax = plt.subplots()
    ax.plot(peak_fu, lf(peak_times), 'o',label='calculated nts vs. ladder nts')
    ax.plot(peak_fu, peak_fu,label='perfect correlation')
    ax.set_ylabel('Calculated nts (from time)', fontsize=14)
    ax.set_xlabel('Ladder nts', fontsize=14)
    ax.tick_params(axis='both', labelsize=12)
    ax.set_title('Ladder polynomial fit: '+filename, fontsize=16)
    ax.legend(fontsize=12)
    fig.tight_layout()

    #saving figure
    fig.savefig(fig_dir+'ladder_fit-'+filename+'.png', dpi=300)
    plt.close(fig)

    # ------------------------------------------------------------------
    # Applying this run's fit to its samples.
    # ------------------------------------------------------------------

    for number in sample_numbers:
        sample_file = filename+'_Sample'+str(number)+".csv"
        #not every run has all 12 sample files; skip the missing ones
        if not path.exists(data_dir + sample_file):
            continue
        sample_df = pd.read_csv(data_dir + sample_file, skiprows=17)[:-1]
        times = np.array(sample_df['Time'].astype(float))
        sample_df['Nucleotides'] = ladder_nts(times)
        sample_df.to_csv(processed_dir+'nts-'+ sample_file)

filename: 
Eukaryote Total RNA Nano_2020-10-21_16-07-11
identified peaks: 
[110 234 318 423 549 718 858]
6
peak times:
[22.5  28.7  32.9  38.15 44.45 52.9 ]
printing filename and ladder fit:
Eukaryote Total RNA Nano_2020-10-21_16-07-11
           4           3          2
-5.62e-05 x + 0.05256 x - 0.8949 x - 23.94 x + 430.8
-----------
-----------
-----------
filename: 
Eukaryote Total RNA Nano_2020-10-21_16-14-00
identified peaks: 
[110 233 315 419 541 709 853 904]
6
peak times:
[22.5  28.65 32.75 37.95 44.05 52.45]
printing filename and ladder fit:
Eukaryote Total RNA Nano_2020-10-21_16-14-00
            4          3         2
-0.0009395 x + 0.1803 x - 7.441 x + 119.8 x - 717.7
-----------
-----------
-----------
filename: 
Eukaryote Total RNA Nano_2020-10-21_16-34-50
identified peaks: 
[110 233 316 421 544 711 855]
6
peak times:
[22.5  28.65 32.8  38.05 44.2  52.55]
printing filename and ladder fit:
Eukaryote Total RNA Nano_2020-10-21_16-34-50
            4         3         2
-0.000

<Figure size 576x384 with 0 Axes>

In [9]:
# Display the plate-number -> run-filename table read from platenumber_filename.csv.
# NOTE(review): in the output saved with this cell, plates 5 and 6 list the
# same File_Name -- confirm that this is intended.
filenames_df

Unnamed: 0,Plate_Number,File_Name
0,1,Eukaryote Total RNA Nano_2020-10-07_13-56-03
1,2,Eukaryote Total RNA Nano_2020-10-08_10-55-37
2,3,Eukaryote Total RNA Nano_2020-10-21_17-06-53
3,4,Eukaryote Total RNA Nano_2020-10-21_17-37-08
4,5,Eukaryote Total RNA Nano_2020-10-21_17-42-16
5,6,Eukaryote Total RNA Nano_2020-10-21_17-42-16
6,7,Eukaryote Total RNA Nano_2020-10-22_10-58-16
7,8,Eukaryote Total RNA Nano_2020-10-12_16-05-51


In [13]:
# Scratch check on variables left behind by the conversion loop: `sample_df`
# is whichever sample frame the loop processed last, so the value shown is
# the smallest computed nucleotide length in that one sample only.
# NOTE(review): the bare `ladder_nts` line displays nothing, because a cell
# only echoes its final expression.
ladder_nts
sample_df['Nucleotides'].min()

1.7152977779078817