In [36]:
import pandas as pd
import numpy as np
import os
import sys
import ast
import re
import json

In [37]:
# Resolve the directory two levels above the notebook's working directory and
# put it on the import path (presumably where the `utils` package lives).
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, '..', '..'))
# Guard the append so re-running this cell does not pile up duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

from utils.generate_seq import Sample

In [38]:
# Per-polymer sample count used by the cells below.
num_replicates = 100

# Load the polymer table, then parse the two literal-valued columns from
# their text form back into Python objects via ast.literal_eval.
df = pd.read_csv('polymer_db.csv')
literal_columns = ['monomers', 'mon_mol_distribution']
df[literal_columns] = df[literal_columns].map(lambda cell: ast.literal_eval(str(cell)))

# for col in ['monomers', 'mon_SMILES', 'mon_class_wt_%', 'mon_mol_distribution']:
#     print(f"Column: {col}")
#     print(df[col].apply(type).value_counts())

In [39]:
def generate_samples_for_row(row):
    """Build a `Sample` from one row of the polymer table and return its `.samples`.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys 'monomers', 'mon_mol_distribution' and 'DP'.

    Returns
    -------
    The `samples` attribute of the constructed `Sample`
    (presumably `num_replicates` sequences — confirm against `Sample`).
    """
    monomer_names = row['monomers']
    distribution = row['mon_mol_distribution']
    dp = row['DP']

    # Positional order expected by Sample: monomers, DP, distribution, mode, count.
    sampler = Sample(monomer_names, dp, distribution, 'wo_replacement', num_replicates, batch=True)
    return sampler.samples

# One collection of sampled sequences per polymer (i.e. per row of df).
samples_list = df.apply(generate_samples_for_row, axis=1)

# Repeat each polymer's metadata num_replicates times so the rows line up
# one-to-one with the flattened sequences assigned below.
repeated_rows = df.index.repeat(num_replicates)
kept_columns = ['poly_ID', 'monomers', 'MIC_ecoli']
new_df = df.loc[repeated_rows, kept_columns].reset_index(drop=True)

# Flatten the per-polymer collections, preserving polymer order.
flat_sequences = []
for samples in samples_list:
    flat_sequences.extend(samples)
new_df['sequence'] = flat_sequences

new_df

Unnamed: 0,poly_ID,monomers,MIC_ecoli,sequence
0,1,"[Tma, Ni, Mo]",>512,NiMoNiTmaTmaNiTmaNiNiNiMoNiTmaMoTmaMoTmaTmaMoN...
1,1,"[Tma, Ni, Mo]",>512,NiTmaTmaTmaTmaMoTmaNiNiNiNiMoNiMoNiNiNiNiTmaTm...
2,1,"[Tma, Ni, Mo]",>512,TmaNiNiTmaNiNiTmaTmaNiTmaNiNiNiTmaTmaTmaTmaTma...
3,1,"[Tma, Ni, Mo]",>512,TmaMoNiTmaNiTmaMoNiMoTmaTmaNiTmaNiTmaTmaNiNiNi...
4,1,"[Tma, Ni, Mo]",>512,TmaNiNiNiNiNiTmaNiTmaNiTmaTmaMoTmaTmaTmaTmaTma...
...,...,...,...,...
4295,43,"[Tma, Mep, Phe, Olam, Bmam]",64,TmaTmaBmamTmaTmaTmaTmaPheTmaTmaTmaPheTmaTmaTma...
4296,43,"[Tma, Mep, Phe, Olam, Bmam]",64,TmaTmaTmaTmaTmaTmaPheTmaTmaMepBmamBmamTmaTmaTm...
4297,43,"[Tma, Mep, Phe, Olam, Bmam]",64,TmaTmaTmaOlamTmaTmaMepOlamPheMepTmaMepTmaTmaTm...
4298,43,"[Tma, Mep, Phe, Olam, Bmam]",64,TmaTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaBmamTma...


In [40]:
# Load the per-monomer property table (read below via its 'molar_mass' entries).
# JSON text is UTF-8 by specification, so state the encoding explicitly
# instead of relying on the platform's default locale encoding.
with open('monomer_data/monomer_properties.json', 'r', encoding='utf-8') as file:
    monomer_properties = json.load(file)

def get_polymer_weights(seq, properties=None):
    """Sum the molar mass contributed by each monomer type in a sequence.

    Parameters
    ----------
    seq : str
        Concatenated monomer names, each starting with an uppercase letter
        (e.g. 'NiMoTma'). The string is split in front of every uppercase
        letter; any characters before the first uppercase letter are ignored.
    properties : dict, optional
        Mapping ``monomer name -> {'molar_mass': number, ...}``. When omitted,
        the notebook-level ``monomer_properties`` table is used.

    Returns
    -------
    dict
        ``monomer name -> total molar mass`` of that monomer in `seq`
        (its molar mass added once per occurrence), with keys in order of
        first appearance.

    Raises
    ------
    KeyError
        If a monomer in `seq` has no entry in the property table.
    """
    if properties is None:
        # Fall back to the table loaded at notebook level.
        properties = monomer_properties

    totals = {}
    for mon in re.findall('[A-Z][^A-Z]*', seq):
        totals[mon] = totals.get(mon, 0) + properties[mon]['molar_mass']

    return totals

def get_seq_dist(seq):
    """Return each monomer's share of the summed molar mass of `seq`.

    The per-monomer totals from `get_polymer_weights` are divided by their
    sum, so the values add up to 1 (up to floating-point rounding) for a
    non-empty result. If `get_polymer_weights` returns an empty dict, an
    empty dict is returned.

    NOTE(review): these are mass-weighted fractions, while the column that
    stores them is named 'seq_mon_mol_dist' — confirm whether mole fractions
    were intended.
    """
    totals = get_polymer_weights(seq)
    grand_total = sum(totals.values())

    shares = {}
    for mon, mass in totals.items():
        shares[mon] = mass / grand_total
    return shares

In [41]:
def _dist_in_monomer_order(row):
    """Convert the row's {monomer: fraction} dict into an array ordered like row['monomers']."""
    dist = row['seq_mon_mol_dist']
    # NOTE(review): monomers that never occur in the sampled sequence are
    # skipped, so the 0 default of .get() is never used and the array can be
    # shorter than row['monomers'] — confirm whether a zero-padded,
    # fixed-length vector was intended.
    return np.array([dist.get(mon, 0) for mon in row['monomers'] if mon in dist])

# Step 1: per-sequence distribution as a dict; step 2: replace it by an array.
new_df['seq_mon_mol_dist'] = new_df['sequence'].map(get_seq_dist)
new_df['seq_mon_mol_dist'] = new_df.apply(_dist_in_monomer_order, axis=1)

In [42]:
# Drop the 'monomers' column and prepend a replicate counter.
new_df = new_df.drop(columns='monomers')
# sample_ID cycles 1..num_replicates, restarting every num_replicates rows,
# rather than being a running index over the whole table.
row_positions = np.array(range(len(new_df)))
new_df.insert(0, 'sample_ID', row_positions % num_replicates + 1)
new_df

Unnamed: 0,sample_ID,poly_ID,MIC_ecoli,sequence,seq_mon_mol_dist
0,1,1,>512,NiMoNiTmaTmaNiTmaNiNiNiMoNiTmaMoTmaMoTmaTmaMoN...,"[0.5080992060565839, 0.3851315707373519, 0.106..."
1,2,1,>512,NiTmaTmaTmaTmaMoTmaNiNiNiNiMoNiMoNiNiNiNiTmaTm...,"[0.5067572855446987, 0.37344457026907496, 0.11..."
2,3,1,>512,TmaNiNiTmaNiNiTmaTmaNiTmaNiNiNiTmaTmaTmaTmaTma...,"[0.563355075491924, 0.3083994985374007, 0.1282..."
3,4,1,>512,TmaMoNiTmaNiTmaMoNiMoTmaTmaNiTmaNiTmaTmaNiNiNi...,"[0.6304577994083072, 0.24652413174096224, 0.12..."
4,5,1,>512,TmaNiNiNiNiNiTmaNiTmaNiTmaTmaMoTmaTmaTmaTmaTma...,"[0.577227994793426, 0.29560729100514793, 0.127..."
...,...,...,...,...,...
4295,96,43,64,TmaTmaBmamTmaTmaTmaTmaPheTmaTmaTmaPheTmaTmaTma...,"[0.7061636874514257, 0.05095127068583161, 0.08..."
4296,97,43,64,TmaTmaTmaTmaTmaTmaPheTmaTmaMepBmamBmamTmaTmaTm...,"[0.7480315072371421, 0.03047837726801518, 0.05..."
4297,98,43,64,TmaTmaTmaOlamTmaTmaMepOlamPheMepTmaMepTmaTmaTm...,"[0.7522598055545445, 0.05010203767415694, 0.04..."
4298,99,43,64,TmaTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaBmamTma...,"[0.8245595040003646, 0.04079574206195441, 0.01..."


In [43]:
# Write the sampled table to disk; the DataFrame index is not written.
# NOTE(review): 'seq_mon_mol_dist' holds NumPy arrays, which end up as text in
# the CSV — confirm that whatever reads this file can parse that format.
output_csv = 'polymer_samples_db.csv'
new_df.to_csv(output_csv, index=False)