In [8]:
import pandas as pd
import numpy as np
import os
import sys
import ast
import re
import json

In [9]:
# Make the directory two levels above the current working directory importable,
# so that the project-local `utils` package can be found.
# NOTE(review): this relies on the kernel's working directory being the
# notebook's own folder -- confirm when launching from elsewhere.
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, '..', '..'))
# Guard the append so that re-running this cell does not keep adding duplicate
# entries to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

from utils.generate_seq import Sample

In [10]:
# Number of sequences generated for every polymer (row) of the input table.
num_replicates = 100

df = pd.read_csv('polymer_db.csv')

# These two columns come back from the CSV as strings; parse each cell into the
# Python literal it spells out.
df[['monomers', 'mon_mol_distribution']] = (
    df[['monomers', 'mon_mol_distribution']]
    .map(lambda cell: ast.literal_eval(str(cell)))
)

# Optional sanity check of the per-column Python types after parsing:
# for col in ['monomers', 'mon_SMILES', 'mon_class_wt_%', 'mon_mol_distribution']:
#     print(f"Column: {col}")
#     print(df[col].apply(type).value_counts())

In [11]:
def generate_samples_for_row(row):
    """Return the batch of sampled sequences for one polymer row.

    Reads the row's 'monomers', 'mon_mol_distribution' and 'DP' entries and
    asks ``Sample`` for ``num_replicates`` (module-level) sequences in
    'wo_replacement' mode with ``batch=True``; the ``samples`` attribute of
    the resulting object is returned unchanged.

    NOTE(review): the exact sampling semantics live in
    ``utils.generate_seq.Sample`` -- presumably sampling without replacement;
    confirm there.
    """
    monomers, mol_dist, DP = row['monomers'], row['mon_mol_distribution'], row['DP']
    sampler = Sample(
        monomers,
        DP,
        mol_dist,
        'wo_replacement',
        num_replicates,
        batch=True,
    )
    return sampler.samples

# One entry per polymer row: the batch of sequences sampled for that row.
samples_list = df.apply(generate_samples_for_row, axis=1)

# Repeat each polymer's identifying columns once per replicate, then renumber
# the rows from zero.
new_df = (
    df.loc[df.index.repeat(num_replicates), ['poly_ID', 'monomers', 'MIC_ecoli']]
    .reset_index(drop=True)
)
# Flatten the per-polymer batches in row order so that they line up with the
# repeated rows. The assignment raises if the total number of sequences does
# not equal the number of repeated rows.
new_df['sequence'] = [seq for batch in samples_list for seq in batch]

new_df

Unnamed: 0,poly_ID,monomers,MIC_ecoli,sequence
0,1,"[Tma, Ni, Mo]",>512,TmaTmaTmaTmaTmaTmaTmaTmaTmaNiMoNiTmaTmaMoNiNiN...
1,1,"[Tma, Ni, Mo]",>512,TmaTmaNiTmaNiTmaTmaNiMoTmaNiTmaTmaNiNiMoTmaNiN...
2,1,"[Tma, Ni, Mo]",>512,TmaTmaTmaTmaNiTmaNiTmaTmaTmaTmaNiTmaTmaNiNiTma...
3,1,"[Tma, Ni, Mo]",>512,NiMoTmaTmaNiNiTmaNiMoNiTmaMoNiNiNiNiNiTmaTmaNi...
4,1,"[Tma, Ni, Mo]",>512,TmaNiTmaNiTmaTmaTmaNiTmaMoNiNiTmaTmaTmaMoTmaTm...
...,...,...,...,...
6595,33,"[Tma, Aeg, Mo, Ni, Olam]",128,TmaAegNiTmaOlamNiTmaTmaNiNiOlamTmaOlamAegTmaNi...
6596,33,"[Tma, Aeg, Mo, Ni, Olam]",128,OlamTmaOlamNiNiAegOlamOlamAegNiAegAegTmaMoNiMo...
6597,33,"[Tma, Aeg, Mo, Ni, Olam]",128,AegNiAegAegOlamNiNiOlamNiTmaNiAegAegMoTmaOlamT...
6598,33,"[Tma, Aeg, Mo, Ni, Olam]",128,TmaOlamTmaTmaTmaTmaAegNiNiTmaOlamNiTmaNiTmaOla...


In [12]:
# Load the monomer property table: a mapping from monomer name to a dict of
# properties (the code below reads each entry's 'molar_mass').
# The encoding is given explicitly so that the read does not depend on the
# platform's locale default; JSON text is UTF-8.
with open('monomer_data/monomer_properties.json', 'r', encoding='utf-8') as file:
    monomer_properties = json.load(file)

def get_polymer_weights(seq, properties=None):
    """Sum the molar mass contributed by each monomer type in a sequence.

    The sequence is a plain concatenation of monomer names. It is split at
    every uppercase letter, so each token is one uppercase letter followed by
    any run of non-uppercase characters (e.g. 'TmaNiMo' -> 'Tma', 'Ni', 'Mo').
    Characters before the first uppercase letter are ignored.

    Parameters
    ----------
    seq : str
        Concatenated monomer names.
    properties : dict, optional
        Mapping ``monomer name -> {'molar_mass': number, ...}``. When omitted,
        the module-level ``monomer_properties`` table is used.

    Returns
    -------
    dict
        ``monomer name -> total molar mass`` of all occurrences of that
        monomer, with keys in order of first appearance. Empty for a sequence
        without any tokens.

    Raises
    ------
    KeyError
        If a token is missing from the property table or has no 'molar_mass'.
    """
    if properties is None:
        # Fall back to the table loaded at module level.
        properties = monomer_properties

    totals = {}
    for mon in re.findall('[A-Z][^A-Z]*', seq):
        totals[mon] = totals.get(mon, 0) + properties[mon]['molar_mass']

    return totals

def get_seq_dist(seq):
    """Return each monomer's share of a sequence's summed molar mass.

    The per-monomer totals come from ``get_polymer_weights(seq)``; every total
    is divided by the sum of all totals, so the returned values add up to 1
    for a non-empty result. Keys keep the order produced by
    ``get_polymer_weights``.
    """
    masses = get_polymer_weights(seq)
    total = sum(masses.values())
    return {mon: mass / total for mon, mass in masses.items()}

In [13]:
# Composition of every sampled sequence as a {monomer: fraction} dict.
new_df['seq_mon_mol_dist'] = new_df['sequence'].map(get_seq_dist)
# Turn each dict into a vector ordered like the row's `monomers` list.
# A monomer that does not occur in the sampled sequence contributes 0, so
# every vector has one entry per listed monomer and position i always refers
# to monomers[i]. (Filtering absent monomers out would shorten the vector and
# shift the remaining entries out of alignment.)
new_df['seq_mon_mol_dist'] = new_df.apply(
    lambda row: np.array([row['seq_mon_mol_dist'].get(mon, 0) for mon in row['monomers']]),
    axis=1,
)

In [14]:
# The monomer list is no longer needed once the composition vectors exist.
new_df = new_df.drop(columns='monomers')
# sample_ID cycles 1..num_replicates along the rows, i.e. it restarts at 1
# after every `num_replicates` rows (a single running ID over all rows would
# instead be range(1, len(new_df) + 1)).
new_df.insert(0, 'sample_ID', np.arange(len(new_df)) % num_replicates + 1)
new_df

Unnamed: 0,sample_ID,poly_ID,MIC_ecoli,sequence,seq_mon_mol_dist
0,1,1,>512,TmaTmaTmaTmaTmaTmaTmaTmaTmaNiMoNiTmaTmaMoNiNiN...,"[0.6649876145212331, 0.19159819238043316, 0.14..."
1,2,1,>512,TmaTmaNiTmaNiTmaTmaNiMoTmaNiTmaTmaNiNiMoTmaNiN...,"[0.5983545683568752, 0.3377956281632043, 0.063..."
2,3,1,>512,TmaTmaTmaTmaNiTmaNiTmaTmaTmaTmaNiTmaTmaNiNiTma...,"[0.7257626852271377, 0.22703250367403033, 0.04..."
3,4,1,>512,NiMoTmaTmaNiNiTmaNiMoNiTmaMoNiNiNiNiNiTmaTmaNi...,"[0.5216465996332416, 0.3596027344253836, 0.118..."
4,5,1,>512,TmaNiTmaNiTmaTmaTmaNiTmaMoNiNiTmaTmaTmaMoTmaTm...,"[0.6573705761415909, 0.23342707638568957, 0.10..."
...,...,...,...,...,...
6595,196,33,128,TmaAegNiTmaOlamNiTmaTmaNiNiOlamTmaOlamAegTmaNi...,"[0.2810484353415925, 0.22264227133798145, 0.00..."
6596,197,33,128,OlamTmaOlamNiNiAegOlamOlamAegNiAegAegTmaMoNiMo...,"[0.3283519025062823, 0.306018064040614, 0.0672..."
6597,198,33,128,AegNiAegAegOlamNiNiOlamNiTmaNiAegAegMoTmaOlamT...,"[0.20316510748293523, 0.2366827799609809, 0.12..."
6598,199,33,128,TmaOlamTmaTmaTmaTmaAegNiNiTmaOlamNiTmaNiTmaOla...,"[0.35753992903843923, 0.14487859978010642, 0.0..."


In [15]:
# Persist the expanded sample table without the row index.
# NOTE(review): 'seq_mon_mol_dist' holds NumPy arrays, which are written in
# their string form -- confirm that downstream readers parse that format.
new_df.to_csv(path_or_buf='polymer_samples_db.csv', index=False)