In [11]:
import pandas as pd
import numpy as np
import os
import sys
import ast
import re
import json

In [12]:
# Put the directory two levels above the current working directory on the import
# path so the project-local `utils` package can be imported from this notebook.
notebook_dir = os.getcwd()
project_root = os.path.abspath(
    os.path.join(notebook_dir, os.pardir, os.pardir)
)
sys.path.append(project_root)

# Project-local import; it resolves through the path entry appended above.
from utils.generate_seq import Sample

In [13]:
# Number of sequences generated per polymer row.
num_replicates = 100

df = pd.read_csv('polymer_db.csv')

# These columns are parsed from their CSV text back into Python literals.
literal_cols = ['monomers', 'mon_mol_distribution']
df[literal_cols] = df[literal_cols].map(lambda cell: ast.literal_eval(str(cell)))

# Optional sanity check of the parsed column types:
# for col in ['monomers', 'mon_SMILES', 'mon_class_wt_%', 'mon_mol_distribution']:
#     print(f"Column: {col}")
#     print(df[col].apply(type).value_counts())

In [14]:
def generate_samples_for_row(row):
    """Build a ``Sample`` for one polymer row and return its ``.samples``.

    Reads ``monomers``, ``mon_mol_distribution`` and ``DP`` from ``row`` and
    requests ``num_replicates`` samples (notebook-level setting) in
    ``'wo_replacement'`` mode with ``batch=True``.

    NOTE(review): what ``Sample`` does with these arguments is defined in
    ``utils.generate_seq`` and is not visible here — presumably it draws
    monomer sequences of length ``DP``; confirm against that module.
    """
    sampler = Sample(
        row['monomers'],
        row['DP'],
        row['mon_mol_distribution'],
        'wo_replacement',
        num_replicates,
        batch=True,
    )
    return sampler.samples

# One entry per polymer row, each holding that row's collection of sampled sequences.
# NOTE(review): the displayed sequences differ between replicates of the same polymer,
# so sampling looks stochastic and no seed is set in the visible cells — confirm
# whether `Sample` needs one for reproducible output.
samples_list = df.apply(generate_samples_for_row, axis=1)

# Repeat every polymer row `num_replicates` times, keeping only the columns needed downstream.
kept_columns = ['poly_ID', 'monomers', 'MIC_ecoli']
repeated_index = df.index.repeat(num_replicates)
new_df = df.loc[repeated_index, kept_columns].reset_index(drop=True)

# Flatten the per-polymer sample collections in row order so they line up with the repeated rows.
flat_sequences = []
for samples in samples_list:
    flat_sequences.extend(samples)
new_df['sequence'] = flat_sequences

new_df

Unnamed: 0,poly_ID,monomers,MIC_ecoli,sequence
0,4,"[Tma, Phe, Mep]",256,MepTmaTmaTmaTmaPhePhePheTmaPhePheTmaTmaMepMepT...
1,4,"[Tma, Phe, Mep]",256,TmaTmaTmaMepTmaTmaPhePheTmaMepTmaPhePheMepPheT...
2,4,"[Tma, Phe, Mep]",256,PhePhePhePhePheTmaTmaMepTmaPheMepMepPhePhePheT...
3,4,"[Tma, Phe, Mep]",256,TmaTmaPheTmaTmaTmaTmaTmaTmaPheTmaPheTmaTmaMepM...
4,4,"[Tma, Phe, Mep]",256,TmaTmaTmaPheTmaPheTmaPhePheMepTmaTmaPheTmaTmaM...
...,...,...,...,...
2195,23,"[Tma, Olam, Mep]",128,MepTmaTmaTmaTmaTmaTmaTmaMepOlamOlamTmaTmaTmaTm...
2196,23,"[Tma, Olam, Mep]",128,TmaTmaOlamMepTmaTmaMepTmaTmaOlamTmaTmaTmaOlamM...
2197,23,"[Tma, Olam, Mep]",128,TmaTmaTmaTmaTmaMepOlamMepOlamTmaOlamMepTmaMepT...
2198,23,"[Tma, Olam, Mep]",128,OlamTmaTmaTmaTmaTmaMepMepOlamTmaTmaTmaMepMepTm...


In [15]:
# Per-monomer property table; the functions below look up entry[monomer]['molar_mass'].
with open('monomer_data/monomer_properties.json') as props_file:
    monomer_properties = json.load(props_file)

def get_polymer_weights(seq, properties=None):
    """Sum the molar mass contributed by each monomer type in a sequence string.

    The sequence is split into monomer tokens at every capital letter: each
    token is one capital followed by any run of non-capital characters, so
    ``'TmaPheTma'`` becomes ``['Tma', 'Phe', 'Tma']``. Characters that precede
    the first capital letter do not match and are ignored.

    Parameters
    ----------
    seq : str
        Concatenated monomer abbreviations.
    properties : dict, optional
        Mapping ``monomer -> {'molar_mass': number, ...}``. When omitted, the
        notebook-level ``monomer_properties`` table is used, which is the
        behaviour existing callers rely on.

    Returns
    -------
    dict
        ``monomer -> total molar mass`` over all occurrences of that monomer,
        with keys in order of first appearance in ``seq``.

    Raises
    ------
    KeyError
        If a token has no entry (or no ``'molar_mass'``) in the table.
    """
    if properties is None:
        # Resolved at call time so the table can be (re)loaded after this def runs.
        properties = monomer_properties

    mass_by_monomer = {}
    for mon in re.findall('[A-Z][^A-Z]*', seq):
        mass = properties[mon]['molar_mass']
        mass_by_monomer[mon] = mass_by_monomer.get(mon, 0) + mass

    return mass_by_monomer

def get_seq_dist(seq):
    """Return each monomer's share of the sequence's total molar mass.

    Takes the per-monomer mass totals from ``get_polymer_weights(seq)`` and
    divides each by their sum, giving a ``monomer -> fraction`` dict.
    """
    mass_by_monomer = get_polymer_weights(seq)
    total_mass = sum(mass_by_monomer.values())
    return {mon: mass / total_mass for mon, mass in mass_by_monomer.items()}

In [16]:
def _aligned_fractions(row):
    """Fraction vector for one sample, ordered like the row's ``monomers`` list.

    A monomer that never occurs in the sampled sequence gets 0 instead of being
    skipped, so the vector always has one slot per monomer and position ``i``
    always refers to ``row['monomers'][i]``.
    """
    dist = get_seq_dist(row['sequence'])
    return np.array([dist.get(mon, 0) for mon in row['monomers']])

# Computed in a single pass from `sequence` and `monomers`, so the result does not
# depend on any earlier contents of the `seq_mon_mol_dist` column.
# NOTE(review): the fractions come from summed molar masses (see get_seq_dist), so
# they look like mass fractions despite the "mol" in the column name — confirm the
# intended quantity.
new_df['seq_mon_mol_dist'] = new_df.apply(_aligned_fractions, axis=1)

In [17]:
# Drop the monomer-list column from the table.
new_df = new_df.drop(columns='monomers')

# Replicate counter that restarts at 1 every `num_replicates` rows (the rows were built
# in consecutive blocks of that size, one block per polymer).
# Alternative kept for reference — a single running ID over all rows:
# new_df.insert(0, 'sample_ID', range(1, len(new_df) + 1))
replicate_ids = np.arange(len(new_df)) % num_replicates + 1
new_df.insert(0, 'sample_ID', replicate_ids)
new_df

Unnamed: 0,sample_ID,poly_ID,MIC_ecoli,sequence,seq_mon_mol_dist
0,1,4,256,MepTmaTmaTmaTmaPhePhePheTmaPhePheTmaTmaMepMepT...,"[0.6004420069601168, 0.24937083803002713, 0.15..."
1,2,4,256,TmaTmaTmaMepTmaTmaPhePheTmaMepTmaPhePheMepPheT...,"[0.5441835708015673, 0.30268664476677276, 0.15..."
2,3,4,256,PhePhePhePhePheTmaTmaMepTmaPheMepMepPhePhePheT...,"[0.5434699912170217, 0.3506560942284577, 0.105..."
3,4,4,256,TmaTmaPheTmaTmaTmaTmaTmaTmaPheTmaPheTmaTmaMepM...,"[0.5850669383827234, 0.3570397506034766, 0.057..."
4,5,4,256,TmaTmaTmaPheTmaPheTmaPhePheMepTmaTmaPheTmaTmaM...,"[0.5434699912170217, 0.3506560942284577, 0.105..."
...,...,...,...,...,...
2195,196,23,128,MepTmaTmaTmaTmaTmaTmaTmaMepOlamOlamTmaTmaTmaTm...,"[0.6302198193071066, 0.30611972682649236, 0.06..."
2196,197,23,128,TmaTmaOlamMepTmaTmaMepTmaTmaOlamTmaTmaTmaOlamM...,"[0.6339941020890663, 0.2725711902908302, 0.093..."
2197,198,23,128,TmaTmaTmaTmaTmaMepOlamMepOlamTmaOlamMepTmaMepT...,"[0.6158993073233038, 0.29135948327121663, 0.09..."
2198,199,23,128,OlamTmaTmaTmaTmaTmaMepMepOlamTmaTmaTmaMepMepTm...,"[0.5739784777949709, 0.3446998477845459, 0.081..."


In [18]:
# Write the replicated samples table to disk; index=False leaves the row index out of the file.
# NOTE(review): `seq_mon_mol_dist` holds NumPy arrays, which are presumably written as
# their string representation — confirm the downstream reader can parse that format.
new_df.to_csv('polymer_samples_db.csv', index=False)