In [3]:
import pandas as pd
import numpy as np
import os
import sys
import ast
import re
import json

In [4]:
# Append the directory two levels above the current working directory to the
# import path, so the `utils` package imported right after this can be found.
notebook_dir = os.getcwd()
project_root = os.path.abspath(
    os.path.join(notebook_dir, os.pardir, os.pardir)
)
sys.path.append(project_root)

from utils.generate_seq import Sample

In [5]:
# Number of samples requested from `Sample` for each row of the polymer table.
num_replicates = 200

# Columns that are parsed from their text form back into Python objects with
# `ast.literal_eval` (each cell is stringified first).
literal_columns = ['monomers', 'mon_mol_distribution']

df = pd.read_csv('polymer_db.csv')
df[literal_columns] = df[literal_columns].map(
    lambda cell: ast.literal_eval(str(cell))
)

In [6]:
def generate_samples_for_row(row):
    """Build a `Sample` for one polymer row and return its `.samples`.

    Reads the `monomers`, `mon_mol_distribution` and `DP` fields of `row` and
    passes them to `Sample` together with the mode string 'wo_replacement',
    the notebook-level `num_replicates`, and `batch=True`.

    NOTE(review): the meaning of these arguments is defined by
    `utils.generate_seq.Sample`, which is not visible here; presumably
    `.samples` holds `num_replicates` sequence strings -- confirm.
    """
    monomers, mol_dist, DP = (
        row['monomers'],
        row['mon_mol_distribution'],
        row['DP'],
    )
    sampler = Sample(
        monomers, DP, mol_dist, 'wo_replacement', num_replicates, batch=True
    )
    return sampler.samples

# One entry per polymer row: the value returned by `generate_samples_for_row`.
# NOTE(review): no random seed is set in the visible cells; if `Sample` draws
# randomly, the generated sequences will differ between runs -- confirm.
samples_list = df.apply(generate_samples_for_row, axis=1)

# Repeat every polymer's metadata `num_replicates` times, row by row.
repeated_index = df.index.repeat(num_replicates)
new_df = df.loc[repeated_index, ['poly_ID', 'monomers', 'MIC_ecoli']].reset_index(drop=True)

# Flatten the per-row samples in the same row order as the repeated metadata.
# This lines up only if every row produced exactly `num_replicates` samples.
flat_sequences = []
for row_samples in samples_list:
    flat_sequences.extend(row_samples)
new_df['sequence'] = flat_sequences

new_df

Unnamed: 0,poly_ID,monomers,MIC_ecoli,sequence
0,24,"[Tma, Do]",64,TmaDoTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaDoTma...
1,24,"[Tma, Do]",64,TmaTmaTmaTmaTmaTmaTmaDoTmaDoTmaTmaTmaTmaTmaTma...
2,24,"[Tma, Do]",64,DoTmaTmaTmaTmaTmaTmaDoDoDoTmaTmaTmaTmaTmaTmaTm...
3,24,"[Tma, Do]",64,TmaTmaDoTmaTmaTmaTmaTmaTmaTmaDoTmaTmaDoTmaTmaT...
4,24,"[Tma, Do]",64,TmaTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaDoTmaTmaTm...
...,...,...,...,...
3995,43,"[Tma, Mep, Phe, Olam, Bmam]",64,PheMepTmaTmaTmaOlamPheTmaTmaTmaOlamTmaTmaTmaTm...
3996,43,"[Tma, Mep, Phe, Olam, Bmam]",64,TmaPheTmaMepTmaTmaPheTmaOlamMepTmaBmamPheTmaTm...
3997,43,"[Tma, Mep, Phe, Olam, Bmam]",64,PheTmaBmamTmaPheOlamTmaTmaPheTmaTmaTmaBmamTmaT...
3998,43,"[Tma, Mep, Phe, Olam, Bmam]",64,TmaTmaMepMepTmaTmaTmaMepBmamBmamTmaPheTmaTmaTm...


In [7]:
# Per-monomer property records keyed by monomer name; the helper functions
# defined after this read each record's 'molar_mass' entry.
with open('monomer_data/monomer_properties.json') as properties_file:
    monomer_properties = json.load(properties_file)

def get_polymer_weights(seq, properties=None):
    """Accumulate the molar mass contributed by each monomer in a sequence.

    The sequence string is split with the regex ``[A-Z][^A-Z]*``: each token
    starts at an uppercase letter and extends up to (not including) the next
    uppercase letter, so ``'TmaDoTma'`` yields ``['Tma', 'Do', 'Tma']``.
    Characters before the first uppercase letter are not matched and are
    therefore ignored.

    Parameters
    ----------
    seq : str
        Concatenated monomer names.
    properties : mapping, optional
        Maps a monomer name to a record containing a ``'molar_mass'`` entry.
        When omitted, the notebook-level ``monomer_properties`` is used, which
        keeps existing one-argument calls working unchanged.

    Returns
    -------
    dict
        Monomer name -> sum of its ``'molar_mass'`` over every occurrence in
        ``seq``, with keys in order of first appearance. Empty for a sequence
        with no tokens.

    Raises
    ------
    KeyError
        If a token is missing from ``properties`` or its record has no
        ``'molar_mass'`` entry.
    """
    if properties is None:
        # Resolved at call time so the table loaded by the notebook is used.
        properties = monomer_properties

    mass_by_monomer = {}
    for mon in re.findall('[A-Z][^A-Z]*', seq):
        mass = properties[mon]['molar_mass']
        if mon in mass_by_monomer:
            mass_by_monomer[mon] += mass
        else:
            mass_by_monomer[mon] = mass

    return mass_by_monomer

def get_seq_dist(seq):
    """Return each monomer's share of the sequence's total molar mass.

    Takes the per-monomer mass totals from `get_polymer_weights(seq)` and
    divides each by their sum, giving a dict of monomer name -> fraction.
    These are mass-weighted fractions, not occurrence counts. An empty
    result from `get_polymer_weights` yields an empty dict.
    """
    mass_by_monomer = get_polymer_weights(seq)
    total_mass = sum(mass_by_monomer.values())

    fractions = {}
    for name, mass in mass_by_monomer.items():
        fractions[name] = mass / total_mass
    return fractions

In [8]:
# Per-sample composition as a dict (monomer name -> fraction), computed from
# the sampled sequence itself.
new_df['seq_mon_mol_dist'] = new_df['sequence'].map(get_seq_dist)

# Turn each dict into an array ordered like the row's `monomers` list.
# A monomer that does not occur in the sampled sequence contributes 0, so
# every array has one entry per listed monomer and position i always refers
# to monomers[i]. (Previously such monomers were skipped, which shortened the
# array and shifted the remaining entries out of position.)
new_df['seq_mon_mol_dist'] = new_df.apply(
    lambda row: np.array(
        [row['seq_mon_mol_dist'].get(mon, 0) for mon in row['monomers']]
    ),
    axis=1,
)

In [9]:
# Drop the `monomers` column from the table.
new_df = new_df.drop(columns='monomers')

# sample_ID counts 1..num_replicates and restarts for every consecutive block
# of `num_replicates` rows; it becomes the first column.
sample_ids = np.arange(len(new_df)) % num_replicates + 1
new_df.insert(0, 'sample_ID', sample_ids)
new_df

Unnamed: 0,sample_ID,poly_ID,MIC_ecoli,sequence,seq_mon_mol_dist
0,1,24,64,TmaDoTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaDoTma...,"[0.8067013951034785, 0.19329860489652143]"
1,2,24,64,TmaTmaTmaTmaTmaTmaTmaDoTmaDoTmaTmaTmaTmaTmaTma...,"[0.8067013951034785, 0.19329860489652143]"
2,3,24,64,DoTmaTmaTmaTmaTmaTmaDoDoDoTmaTmaTmaTmaTmaTmaTm...,"[0.6985847678156438, 0.3014152321843561]"
3,4,24,64,TmaTmaDoTmaTmaTmaTmaTmaTmaTmaDoTmaTmaDoTmaTmaT...,"[0.7138288179400879, 0.2861711820599121]"
4,5,24,64,TmaTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaDoTmaTmaTm...,"[0.7138288179400879, 0.2861711820599121]"
...,...,...,...,...,...
3995,196,43,64,PheMepTmaTmaTmaOlamPheTmaTmaTmaOlamTmaTmaTmaTm...,"[0.6480495967520711, 0.03060537421650058, 0.09..."
3996,197,43,64,TmaPheTmaMepTmaTmaPheTmaOlamMepTmaBmamPheTmaTm...,"[0.6351275297044727, 0.08583965017854536, 0.07..."
3997,198,43,64,PheTmaBmamTmaPheOlamTmaTmaPheTmaTmaTmaBmamTmaT...,"[0.6852770001473404, 0.05274053337262416, 0.08..."
3998,199,43,64,TmaTmaMepMepTmaTmaTmaMepBmamBmamTmaPheTmaTmaTm...,"[0.7185387777984039, 0.07412611927659254, 0.04..."


In [10]:
# Write `new_df` to CSV without the row index.
# NOTE(review): `seq_mon_mol_dist` holds NumPy arrays; these are presumably
# written via their string form (space-separated, at NumPy's print precision)
# rather than as comma-separated lists -- confirm downstream readers expect that.
new_df.to_csv('polymer_samples_db.csv', index=False)