In [4]:
import numpy as np
data_dir = './processed_data/'
output_dir = './output/'
norm_peak = 0
pk_nt_bounds = np.array([(20, 30), (150,300), (820,1000)])
output_file = '121020_Bioanalyzer'

In [5]:
#quickly making sure output dir exists, if not make the directory
if os.path.exists(output_dir):
    pass
else:
    os.mkdir(output_dir)

import pandas as pd
import scipy as sp
from scipy.signal import find_peaks
import numpy as np
from os import path
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

#for plotting purposes
%pylab inline
sns.set_style('ticks')
sns.set_context('paper')

# mpl.rcParams
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['axes.titlesize'] = 16
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['legend.fontsize'] = 12
mpl.rcParams['figure.figsize'] = [8, 16/3]

#reading in samples and the corresponding filenames
samples_df = pd.read_csv('sample_nucleotide_filename.csv').dropna(axis=1)
all_sample_names = samples_df['Sample']

plates_df = pd.read_csv('platenumber_filename.csv').dropna(axis=1)
filenames_dict = dict(zip(plates_df['Plate_Number'],plates_df['File_Name']))

filenames = []
for filename, filenum in zip(samples_df['Plate'], samples_df['FileNumber']):
    name = filenames_dict[filename]
    name = 'nts-'+name+'_Sample'+str(filenum)+'.csv'
    filenames.append(name)

samples_df['filename'] = filenames

#appending data to dataframe
times = []
nts = []
fus = []
for row in samples_df.itertuples():
    signal_df = pd.read_csv(data_dir+row.filename)

    time = np.array(signal_df['Time'])
    fu = np.array(signal_df['Value'])
    nt = np.array(signal_df['Nucleotides'])
    
    times.append(time)
    nts.append(nt)
    fus.append(fu)

samples_df = samples_df.assign(nts = nts, times = times, fu = fus)

#now identifying peaks and areas
pkwidth_below = 1
pkwidth_above = 2

peak_idxs = []
peak_nts = []
widths_list = []
peak_area_list = []
background_area_list = []
peak_area_backsub_list = []
total_area_list = []
total_mRNA_list = []

for i, row in enumerate(samples_df.itertuples()):
    clip_len = int(len(row.nts/2.2))
    nts = row.nts[:clip_len]
    fu  = row.fu[:clip_len]
    times = row.times[:clip_len]
    
    #findings peaks per row
    peak_idx, properties = find_peaks(fu, distance=50, prominence=2.5, width=0.5)
    widths = properties['widths']

    #after finding peaks, filter them for only those that appear within bounds provided
    peak_idx_curated = []
    widths_curated = []
    num_pks = len(pk_nt_bounds)
    for q, idx in enumerate(peak_idx):
        for r in np.arange(num_pks):
            if (nts[idx]>pk_nt_bounds[r][0] and nts[idx]<pk_nt_bounds[r][1] and widths[q]<20):
                peak_idx_curated.append(idx)
                widths_curated.append(widths[q])

    # assume that the last peak is the peak of interest--if not found, then use w, peak_idx from before
    if nts[peak_idx_curated[-1]]>pk_nt_bounds[-1][0] and nts[peak_idx_curated[-1]]<pk_nt_bounds[-1][1]:
        pass
    else:
        peak_idx_curated = peak_idxs[-1]
        widths_curated = widths_list[-1]
    
    peak_nts.append(nts[peak_idx_curated])
    peak_idxs.append(peak_idx_curated)
    widths_list.append(widths_curated)

    num_peaks = len(peak_idx_curated)
    peak_area = np.zeros(num_peaks)
    background_area=np.zeros(num_peaks)
    for i in np.arange(num_peaks):
        min_idx = int(floor(peak_idx_curated[i]-pkwidth_below*widths_curated[i]))
        max_idx = int(ceil(peak_idx_curated[i]+pkwidth_above*widths_curated[i]))

        # straight up integration
        peak_area[i] = sum(fu[min_idx:max_idx])
        # trapezoidal for the background area
        background_area[i] = (max_idx-min_idx)*(fu[min_idx]+fu[max_idx])/2

    # calculating total background area
    total_min_nt = 100
    total_max_nt = 1000
    fp = np.arange(len(nts))
    total_min_idx = int(np.interp(total_min_nt, nts, fp))
    total_max_idx = int(np.interp(total_max_nt, nts, fp))
    total_area = sum(fu[total_min_idx:total_max_idx])
    total_mRNA = total_area - np.sum(peak_area[:-1]-background_area[:-1])

    peak_area_list.append(peak_area)
    background_area_list.append(background_area)
    peak_area_backsub_list.append(peak_area-background_area)
    total_area_list.append(total_area)
    total_mRNA_list.append(total_mRNA)

samples_df = samples_df.assign(peak_index = peak_idxs, peak_nts = peak_nts, peak_widths = widths_list, peak_area = peak_area_list, background_area = background_area_list, peak_area_backsub = peak_area_backsub_list, total_area = total_area_list, total_mRNA = total_mRNA_list)


#now moving onto fits!
from scipy import optimize
#normalizing for exponential curve fitting, will need to do it by each sample
#let's first iterate through by sample
all_samples = samples_df['Sample']
seen = set()
unique_samples = [x for x in all_samples if not (x in seen or seen.add(x))]

sample_df_list = []
sample_fit_list = []
for sample in unique_samples:
    print(sample)
    sample_fit_dict = {}
    sample_df = samples_df[samples_df['Sample']==sample]
    peak_area_backsub = np.array(sample_df['peak_area_backsub'].to_list())
    total_mRNA = sample_df['total_mRNA']
    if norm_peak == 0:
        frac_intact = np.array(peak_area_backsub[:,-1]/total_mRNA)
    elif norm_peak<0:
        frac_intact = np.array(peak_area_backsub[:,-1])
    elif norm_peak>0:
        frac_intact = np.array(peak_area_backsub[:,-1]/peak_area_backsub[:,norm_peak-1])
    frac_norm = (frac_intact/frac_intact[0]).flatten()

    sample_df['fraction_intact'] = frac_intact
    sample_df['fraction_norm'] = frac_norm

    sample_df_list.append(sample_df)

    expfit = lambda p,t,y: (abs(np.exp(-1*p*t)-y)).sum()
    p0=1
    times = sample_df['Timepoint'].to_numpy().flatten()
    print(times)
    print(frac_norm)
    p = optimize.fmin(func=expfit, x0=p0, args=(times,frac_norm))
    print(p)

    n_bootstrap = 1000
    kdeg_fits = []
    for i in np.arange(n_bootstrap):
        bootstrap_inds = np.random.choice(a=len(times), size=len(times))
        fit_t = times[bootstrap_inds]
        fit_frac = frac_norm[bootstrap_inds]
        p = optimize.fmin(func=expfit, x0=p0, args=(fit_t,fit_frac))
        kdeg_fits.append(float(p))
    kdeg_mean = np.mean(kdeg_fits)
    kdeg_err = np.std(kdeg_fits)

    sample_fit_dict['kdeg_fits'] = kdeg_fits
    sample_fit_dict['kdeg'] = kdeg_mean
    sample_fit_dict['kdeg_err'] = kdeg_err
    sample_fit_dict['sample'] = sample

    sample_fit_list.append(sample_fit_dict) 

    sample_fit_df = pd.DataFrame(sample_fit_list)
    sample_fit_df.to_csv(output_file+'_expfits.csv')

    final_df = pd.concat(sample_df_list)
    final_df.to_csv(output_dir+output_file+'_summary.csv')


on terminated successfully.
         Current function value: 0.293994
         Iterations: 19
         Function evaluations: 38
Optimization terminated successfully.
         Current function value: 0.449135
         Iterations: 15
         Function evaluations: 30
Optimization terminated successfully.
         Current function value: 0.657586
         Iterations: 18
         Function evaluations: 36
Optimization terminated successfully.
         Current function value: 0.347766
         Iterations: 16
         Function evaluations: 32
Optimization terminated successfully.
         Current function value: 0.260488
         Iterations: 19
         Function evaluations: 38
Optimization terminated successfully.
         Current function value: 0.187915
         Iterations: 16
         Function evaluations: 32
Optimization terminated successfully.
         Current function value: 0.413568
         Iterations: 17
         Function evaluations: 34
Optimization terminated successfully.
      