In [17]:
from glob import glob
import numpy as np
import pandas as pd
from scipy import stats


In [18]:
# Get bioanalyzer ladder
# Sizes (nt) assigned to the ladder peaks, in ascending order.
nano_ladder = [25,200,500,1000,2000,4000,6000] # they just do linear fit
# File-name suffix stripped from each ladder file to form its ladder_dict key.
ladder_suffix = '_Ladder-peaks.csv'


def ladder_peak_times(peaks, n_peaks):
    """Return the aligned migration times of the ``n_peaks`` largest peaks.

    Args:
        peaks: Peak table with 'Time corrected area' and
            'Aligned Migration Time [s]' columns.
        n_peaks: Number of ladder peaks that must be present.

    Returns:
        list: Migration times of the ``n_peaks`` rows with the largest
        time-corrected area, sorted ascending.

    Raises:
        ValueError: If the table holds fewer than ``n_peaks`` rows. Pairing a
            short list with the ladder sizes would silently assign the wrong
            size to every peak after the missing one.
    """
    if len(peaks) < n_peaks:
        raise ValueError(
            f'expected at least {n_peaks} ladder peaks, found {len(peaks)}'
        )
    largest = peaks.sort_values('Time corrected area', ascending=False).head(n_peaks)
    return largest['Aligned Migration Time [s]'].sort_values().to_list()


# One row per (ladder file, ladder peak): inverse migration time and size.
ladder_data = []
for ladder_file in glob('bioanalyzer/data_export/*Ladder-peaks.csv'):
    ladder = pd.read_csv(ladder_file)
    try:
        ladder_times = ladder_peak_times(ladder, len(nano_ladder))
    except ValueError as err:
        # Re-raise with the offending file so the bad run can be identified.
        raise ValueError(f'{ladder_file}: {err}') from err
    # Ascending migration times are paired with ascending ladder sizes.
    for time,nt in zip(ladder_times,nano_ladder):
        ladder_data.append([ladder_file,1/time,nt])
ladder_data = pd.DataFrame(ladder_data,columns = ['ladder_file', 'inverse_migration_time_s', 'length_nt'])


# Per run: (1/slope, intercept) of the fit 1/time = slope*log10(nt) + intercept,
# so that log10(nt) = (1/slope) * ((1/time) - intercept).
ladder_dict = {}
for file,df in ladder_data.groupby('ladder_file'):
    print(file)
    slope, intercept, r_value, p_value, std_err = stats.linregress(np.log10(df['length_nt']),df['inverse_migration_time_s'])
    print(slope, intercept, r_value, p_value, std_err)
    print(f'log(nt) = {(1/slope).round(2)}((1/time)-{intercept.round(4)})')
    # Key is the ladder path with forward slashes and without the suffix.
    ladder_dict[file.replace('\\','/')[:-len(ladder_suffix)]] = (1/slope,intercept)


bioanalyzer/data_export/Eukaryote Total RNA Nano_2023-12-07_15-03-29_Ladder-peaks.csv
-0.011622864915385629 0.06121212439684324 -0.9990573119632992 5.237867043400352e-08 0.0002258574424183162
log(nt) = -86.04((1/time)-0.0612)
bioanalyzer/data_export/Eukaryote Total RNA Nano_2024-04-02_15-17-51_Ladder-peaks.csv
-0.01155777299192757 0.06104726925690548 -0.999342227953112 2.1305381774954705e-08 0.0001875668974930757
log(nt) = -86.52((1/time)-0.061)
bioanalyzer/data_export/Eukaryote Total RNA Nano_2024-04-17_12-58-50_Ladder-peaks.csv
-0.011604048015070596 0.061161197687038954 -0.9992236328273593 3.2243419789068766e-08 0.00020460963858015663
log(nt) = -86.18((1/time)-0.0612)


In [28]:
# Get bioanalyzer data, and match to linearly fit ladder

def load_calibrated_trace(csv_path, inv_slope, intercept, name):
    """Read one exported trace and add calibrated size columns.

    Args:
        csv_path: Path of the exported trace CSV. The column header is read
            from line index 13 and the final row is dropped (presumably a
            non-numeric footer -- confirm against the export format).
        inv_slope: Reciprocal slope of the ladder fit (first ladder_dict item).
        intercept: Intercept of the ladder fit (second ladder_dict item).
        name: Label written to the 'name' column of every row.

    Returns:
        pandas.DataFrame: The trace as floats, sorted by 'Time' descending,
        with 'log(nt)', 'nt' and 'name' columns added.
    """
    trace = (
        pd.read_csv(csv_path, header=13)[:-1]
        .astype(float)
        .sort_values('Time', ascending=False)
    )
    # Invert the ladder fit: log10(nt) = (1/slope) * ((1/time) - intercept).
    trace['log(nt)'] = inv_slope*((1/trace.Time)-intercept)
    trace['nt'] = 10**trace['log(nt)']
    trace['name'] = name
    return trace


bioanlyzer_data = pd.read_csv('bioanalyzer/bioanalyzer_locations_summary.csv')

for ind,row in bioanlyzer_data.iterrows():
    # Rows with no `length` value are skipped.
    if pd.isna(row.length):
        continue
    # ladder_dict stores (1/slope, intercept) for each run.
    inv_slope,intercept = ladder_dict['bioanalyzer/'+row.run_name]
    sample_trace = load_calibrated_trace(
        f'bioanalyzer/{row.run_name}_Sample{row.run_index}.csv',
        inv_slope, intercept, row.sample_name)
    ladder_trace = load_calibrated_trace(
        f'bioanalyzer/{row.run_name}_Ladder.csv',
        inv_slope, intercept, 'Ladder')
    # One row per size (nt), one column per trace; ladder column first.
    combined = pd.concat([ladder_trace,sample_trace]).reset_index()
    table = combined.pivot_table(index="nt", columns="name", values="Value",sort=False)
    table = table.reindex(columns=['Ladder',row.sample_name])
    table.to_csv(f'source_tables_for_supplement/bioanlyzer_{row.sample_name}__{row.run_name.split("Nano_")[1]}.csv')


In [49]:
# DLS

def save_data(sample,folder):
    """Collect the DLS acquisitions of one sample and save a radius-by-temperature table.

    Every acquisition workbook under ``folder`` is read, tagged with its
    sample, concentration, temperature, replicate and acquisition, and the
    mean 'Relative frequency [%]' per ('Radius [nm]', temperature) is written
    to ``source_tables_for_supplement/DLS_{sample}.csv``.

    Args:
        sample: Sample name used in the export folder and file names and in
            the output file name.
        folder: Root folder of the DLS export.

    Raises:
        FileNotFoundError: If an acquisition workbook exists under neither the
            'HEPES' nor the 'Hepes' spelling of the buffer name.
    """
    num_reps = 2
    num_aqs = 10
    # (concentration, temperature, replicate, acquisition) tuples to skip.
    outliers=[]
    concentrations=[0.03]
    # temperatures in C
    temperatures = [25,35,45,55,65,75]

    # read in all data
    frames = []
    for conc in concentrations:
        # Export folders are numbered (1), (2), ... in the order of
        # `temperatures`; the temperature comes from the folder number so the
        # lookup stays valid for every concentration.
        for Tn, temperature in enumerate(temperatures, start=1):
            for rep in range(1,num_reps+1):
                for aq in range(1,num_aqs+1):
                    if (conc,temperature,rep,aq) in outliers:
                        continue
                    try:
                        acq_df = pd.read_excel(f'{folder}/{sample}_{conc}_HEPES + MgCl2 ({Tn})/Replicate {rep}/Acquisition {aq}/{sample}_{conc}_HEPES + MgCl2 ({Tn}) - {rep}.xlsx')
                    except FileNotFoundError:
                        # Fall back to the 'Hepes' spelling of the buffer name.
                        # Only a missing file triggers this; any other read
                        # error is raised to the caller.
                        acq_df = pd.read_excel(f'{folder}/{sample}_{conc}_Hepes + MgCl2 ({Tn})/Replicate {rep}/Acquisition {aq}/{sample}_{conc}_Hepes + MgCl2 ({Tn}) - {rep}.xlsx')

                    acq_df['sample'] = sample
                    acq_df['concentration'] = conc
                    acq_df['temperature'] = temperature
                    acq_df['replicate'] = rep
                    acq_df['acquisition'] = aq
                    # Per-acquisition fraction; not used in the saved table.
                    acq_df['normalized_frequency']= acq_df['Relative frequency [%]']/acq_df['Relative frequency [%]'].sum()
                    frames.append(acq_df)
    df = pd.concat(frames)

    for conc,df_conc in df.groupby("concentration"):
        if folder=='Export_2024-04-19_09-26-36':
            # Add one zero-frequency row per temperature missing from this
            # export so the pivot still has a column for it. Rows are built by
            # column name so they stay valid whatever columns df_conc carries.
            missed_T = [20,30,70]
            dummy_rows = pd.DataFrame({
                "Radius [nm]": df_conc["Radius [nm]"].min(),
                "Relative frequency [%]": 0,
                'sample': 'dummy',
                'concentration': conc,
                'temperature': missed_T,
                'replicate': 1,
                'acquisition': 1,
            })
            df_conc = pd.concat([dummy_rows, df_conc], ignore_index=True)
        # Mean over replicates and acquisitions of the raw relative frequency.
        # Pivot on "normalized_frequency" instead to export the normalized form.
        piv = pd.pivot_table(df_conc, values="Relative frequency [%]", index="Radius [nm]", columns="temperature", aggfunc='mean', sort=True)
    # One file per sample: the table of the last concentration group (there is
    # only one concentration in the list above).
    piv.to_csv(f'source_tables_for_supplement/DLS_{sample}.csv')


# Export the DLS table for each (sample, export folder) pair, in this order.
for sample, folder in [
    ('GOLLDa_Full', 'dynamic_light_scattering/DLS_data/GOLLDa-Export_2024-03-09_18-20-01'),
    ('ROOL_120', 'dynamic_light_scattering/DLS_data/ROOL120_Export_2024-03-09_18-05-01'),
]:
    save_data(sample, folder)

In [56]:
# MP
# get data

def load_mp_events(data_dir, prefix_len=4):
    """Load the calibrated values of every '*events.csv' file in ``data_dir``.

    Args:
        data_dir: Directory holding the event files, without a trailing
            separator.
        prefix_len: Number of leading file-name characters dropped from the
            sample name. The default of 4 reproduces the fixed slice offsets
            previously hard-coded for both data directories (what that prefix
            contains is not visible here -- confirm against the file names).

    Returns:
        pandas.DataFrame: One row per non-NaN value of the
        'calibrated_values' column, with columns 'nt' (float) and 'name'
        (file name without the prefix and without its last 11 characters,
        i.e. the length of '_events.csv').
    """
    # Directory, one path separator, then the dropped file-name prefix.
    name_start = len(data_dir) + 1 + prefix_len
    values = []
    names = []
    for file in glob(f"{data_dir}/*events.csv"):
        vals = pd.read_csv(file).calibrated_values.to_numpy()
        vals = vals[~np.isnan(vals)]
        name = file[name_start:-11]
        values.extend(vals)
        names.extend([name]*len(vals))
    # Build the columns directly so the values stay floats throughout.
    return pd.DataFrame({'nt': np.asarray(values, dtype=float), 'name': names})


all_df_cage = load_mp_events("mass_photometry/rool_golld_mp_data")
print(all_df_cage.name.unique())

all_df = load_mp_events("mass_photometry/ole_raiA_mp_data")
print(all_df.name.unique())

['gollda-10ulbuffer-2ulblack-2weeks'
 'rool120-75-gollda-25-10ulbuffer-2ulbuffer-try2'
 'rool120-gollda-10ulbuffer-2ulbuffer'
 'rool120-25-gollda-75-10ulbuffer-2ulbuffer' 'sample38-10x'
 'rool120-10ulbuffer-2ulblack-2weeks'
 'rool120-25-gollda-75-10ulbuffer-2ulbuffer-try2'
 'rool120-75-gollda-25-10ulbuffer-2ulbuffer'
 'rool120-gollda-10ulbuffer-2ulbuffer_collect2' 'rnaladder_startround3'
 'sample28--10x']
['OLE_100mM-MgCl2' 'OLE_10mM-MgCl2_v1' 'RNAladder_10mM-MgCl2_200mM-NaCl'
 'RNAladder_0mM-MgCl2_200mM-NaCl' 'raiA_01_v2' 'raiA_02_v2'
 'RNAladder_100mM-MgCl2_v3' 'RNAladder_1percentEtOH' 'RNAladder_0mM-MgCl2'
 'RNAladder_10mM-MgCl2_200mM-KCl' 'OLE_0mM-MgCl2_200mM-KCl'
 'OLE_0mM-MgCl2' 'RNAladder_0mM-MgCl2_200mM-KCl' 'raiA_01_v1'
 'OLE_0mM-MgCl2_200mM-NaCl' 'RNAladder_10mM-MgCl2' 'raiA_02_v1'
 'RNAladder_5percentEtOH' 'OLE_10mM-MgCl2_200mM-KCl' 'OLE_10mM-MgCl2_v2'
 'OLE_10mM-MgCl2_200mM-NaCl' 'OLE_1mM-MgCl2' 'buffer_10mM-MgCl2'
 'OLE_1percentEtOH' 'RNAladder_100mM-MgCl2_v2' 'OLE_5percen

In [57]:
# Stack both mass photometry event tables and save them as one CSV without the index.
mp_events = pd.concat([all_df_cage, all_df])
mp_events.to_csv(
    f'source_tables_for_supplement/mass_photometry_events.csv',
    index=False,
)