In [1]:
import pandas as pd
import numpy as np
import os
import sys
import ast
import re
import json

In [2]:
# Put the directory two levels above the current working directory on the
# import path so that the project-local `utils` package can be imported.
notebook_dir = os.getcwd()
project_root = os.path.abspath(
    os.path.join(notebook_dir, os.pardir, os.pardir)
)
sys.path.append(project_root)

# This import must stay below the sys.path change: it relies on project_root
# being importable.
from utils.generate_seq import Sample

In [3]:
# Number of sequences generated per polymer row; also used further down to
# repeat each row's metadata.
num_replicates = 200

df = pd.read_csv('polymer_db_rd1.csv')

# 'monomers' and 'mon_mol_distribution' are read from the CSV as text; evaluate
# each cell as a Python literal to recover the underlying object.
df = df.assign(**{
    column: df[column].map(lambda cell: ast.literal_eval(str(cell)))
    for column in ('monomers', 'mon_mol_distribution')
})

# Optional sanity check of the Python types stored in the list-like columns:
# for col in ['monomers', 'mon_SMILES', 'mon_class_wt_%', 'mon_mol_distribution']:
#     print(f"Column: {col}")
#     print(df[col].apply(type).value_counts())

In [4]:
def generate_samples_for_row(row):
    """Build a ``Sample`` for one polymer row and return its ``samples``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys ``'monomers'``, ``'mon_mol_distribution'`` and
        ``'DP'``.

    Returns
    -------
    The ``samples`` attribute of the constructed ``Sample`` object.
    NOTE(review): presumably a collection of ``num_replicates`` sequence
    strings -- confirm against ``utils.generate_seq.Sample``.

    Notes
    -----
    The number of replicates is read from the module-level ``num_replicates``.
    """
    monomers, mol_dist, degree = (
        row['monomers'],
        row['mon_mol_distribution'],
        row['DP'],
    )
    sampler = Sample(
        monomers,
        degree,
        mol_dist,
        'wo_replacement',
        num_replicates,
        batch=True,
    )
    return sampler.samples

# Generate the replicate sequences for every polymer row (one entry per row of df).
# NOTE(review): no random seed is set anywhere in the visible notebook; if
# Sample draws randomly, the generated sequences will differ between runs.
samples_list = df.apply(generate_samples_for_row, axis=1)

# Repeat each polymer's metadata num_replicates times so that every generated
# sequence gets its own row.
metadata_columns = ['poly_ID', 'monomers', 'MIC_ecoli']
repeated_labels = df.index.repeat(num_replicates)
new_df = df.loc[repeated_labels, metadata_columns].reset_index(drop=True)

# Flatten the per-row sample collections in row order to line up with the
# repeated metadata rows.
flat_sequences = []
for row_samples in samples_list:
    for sequence in row_samples:
        flat_sequences.append(sequence)
new_df['sequence'] = flat_sequences

new_df

Unnamed: 0,poly_ID,monomers,MIC_ecoli,sequence
0,1,"[Tma, Ni, Mo]",>512,MoTmaTmaNiNiTmaTmaNiTmaNiTmaMoTmaNiNiNiTmaNiNi...
1,1,"[Tma, Ni, Mo]",>512,TmaNiNiTmaNiTmaTmaMoTmaTmaTmaTmaNiTmaNiNiNiNiN...
2,1,"[Tma, Ni, Mo]",>512,NiTmaTmaMoNiTmaMoNiMoNiNiNiMoNiNiTmaNiTmaTmaTm...
3,1,"[Tma, Ni, Mo]",>512,TmaNiTmaNiTmaNiTmaMoNiNiNiMoTmaNiNiNiNiNiTmaTm...
4,1,"[Tma, Ni, Mo]",>512,MoTmaNiNiNiTmaNiTmaMoTmaNiTmaNiNiMoMoNiNiTmaTm...
...,...,...,...,...
4595,23,"[Tma, Olam, Mep]",128,TmaTmaTmaOlamOlamMepTmaTmaTmaOlamTmaTmaTmaMepT...
4596,23,"[Tma, Olam, Mep]",128,OlamTmaTmaOlamOlamTmaTmaTmaTmaTmaOlamOlamTmaTm...
4597,23,"[Tma, Olam, Mep]",128,MepTmaMepOlamTmaTmaTmaTmaTmaTmaOlamTmaOlamTmaT...
4598,23,"[Tma, Olam, Mep]",128,TmaTmaTmaMepTmaTmaTmaTmaTmaTmaTmaTmaMepTmaMepT...


In [5]:
# Per-monomer property table, keyed by monomer name; the functions below read
# each entry's 'molar_mass'.
with open('monomer_data/monomer_properties.json', 'r') as properties_file:
    monomer_properties = json.load(properties_file)

def get_polymer_weights(seq):
    """Sum the molar mass contributed by each monomer type in a sequence.

    The sequence string is split with the pattern ``[A-Z][^A-Z]*``: every token
    starts at an uppercase letter and runs up to (not including) the next
    uppercase letter. Any text before the first uppercase letter is ignored.

    Parameters
    ----------
    seq : str
        Concatenated monomer names, e.g. ``'TmaNiMo'``.

    Returns
    -------
    dict
        Maps each monomer token to the sum of its ``'molar_mass'`` over all of
        its occurrences, keyed in order of first appearance.

    Raises
    ------
    KeyError
        If a token is missing from the module-level ``monomer_properties`` or
        its entry has no ``'molar_mass'``.
    """
    mass_by_monomer = {}

    for token in re.findall('[A-Z][^A-Z]*', seq):
        unit_mass = monomer_properties[token]['molar_mass']
        if token not in mass_by_monomer:
            mass_by_monomer[token] = unit_mass
        else:
            mass_by_monomer[token] += unit_mass

    return mass_by_monomer

def get_seq_dist(seq):
    """Return each monomer's share of the sequence's total molar mass.

    Parameters
    ----------
    seq : str
        Sequence string, passed unchanged to ``get_polymer_weights``.

    Returns
    -------
    dict
        Maps each monomer to its summed molar mass divided by the total over
        all monomers in the sequence. An empty weight dict yields ``{}``.

    NOTE(review): the values are mass-weighted fractions, while the notebook
    stores them in a column named 'seq_mon_mol_dist' -- confirm which quantity
    is intended.
    """
    masses = get_polymer_weights(seq)
    total_mass = sum(masses.values())
    return {monomer: mass / total_mass for monomer, mass in masses.items()}

In [6]:
# Step 1: per-sequence composition as a dict {monomer: fraction}.
new_df['seq_mon_mol_dist'] = new_df['sequence'].map(get_seq_dist)

# Step 2: turn each dict into a vector ordered like that row's 'monomers' list.
# A monomer that does not occur in the sampled sequence contributes 0, so the
# vector always has one entry per monomer and position i always refers to
# monomers[i]. (Previously absent monomers were filtered out, which shortened
# the vector and shifted the remaining entries.)
new_df['seq_mon_mol_dist'] = new_df.apply(
    lambda row: np.array(
        [row['seq_mon_mol_dist'].get(mon, 0) for mon in row['monomers']]
    ),
    axis=1,
)

In [7]:
# Remove the 'monomers' column from the frame.
# errors='ignore' keeps this cell re-runnable once the column is already gone.
new_df = new_df.drop(columns='monomers', errors='ignore')

# sample_ID cycles 1..num_replicates down the rows, i.e. it numbers the
# replicates within each consecutive block of num_replicates rows.
# The guard avoids the "already exists" error from insert() on a re-run.
if 'sample_ID' not in new_df.columns:
    new_df.insert(0, 'sample_ID', np.arange(len(new_df)) % num_replicates + 1)
new_df

Unnamed: 0,sample_ID,poly_ID,MIC_ecoli,sequence,seq_mon_mol_dist
0,1,1,>512,MoTmaTmaNiNiTmaTmaNiTmaNiTmaMoTmaNiNiNiTmaNiNi...,"[0.5879270317449595, 0.2615039481215758, 0.150..."
1,2,1,>512,TmaNiNiTmaNiTmaTmaMoTmaTmaTmaTmaNiTmaNiNiNiNiN...,"[0.5348896097295, 0.3346474812165571, 0.130462..."
2,3,1,>512,NiTmaTmaMoNiTmaMoNiMoNiNiNiMoNiNiTmaNiTmaTmaTm...,"[0.563355075491924, 0.3083994985374007, 0.1282..."
3,4,1,>512,TmaNiTmaNiTmaNiTmaMoNiNiNiMoTmaNiNiNiNiNiTmaTm...,"[0.5923511380518659, 0.29387233150859365, 0.11..."
4,5,1,>512,MoTmaNiNiNiTmaNiTmaMoTmaNiTmaNiNiMoMoNiNiTmaTm...,"[0.5202845117295835, 0.3481148301272938, 0.131..."
...,...,...,...,...,...
4595,196,23,128,TmaTmaTmaOlamOlamMepTmaTmaTmaOlamTmaTmaTmaMepT...,"[0.6421989529518732, 0.1996405820554758, 0.158..."
4596,197,23,128,OlamTmaTmaOlamOlamTmaTmaTmaTmaTmaOlamOlamTmaTm...,"[0.6480662859702159, 0.2878065166274775, 0.064..."
4597,198,23,128,MepTmaMepOlamTmaTmaTmaTmaTmaTmaOlamTmaOlamTmaT...,"[0.6231695092815204, 0.3228748219440322, 0.053..."
4598,199,23,128,TmaTmaTmaMepTmaTmaTmaTmaTmaTmaTmaTmaMepTmaMepT...,"[0.6600919677261403, 0.2351284248398017, 0.104..."


In [8]:
# Write the sample table to disk without the positional index column.
# NOTE(review): 'seq_mon_mol_dist' holds NumPy arrays at this point; confirm
# that the text form these cells take in the CSV is what downstream readers
# expect to parse.
new_df.to_csv('polymer_samples_db.csv', index=False)