In [9]:
import pandas as pd
import numpy as np
import re
import math

In [10]:
# Read the peptide table and the polymer-sample table from their CSV files.
pepDB, polySampDB = (
    pd.read_csv(csv_path)
    for csv_path in (
        'peptide_data/peptide_db.csv',
        'polymer_data/polymer_samples_db.csv',
    )
)

In [11]:
# Build a string identifier for every record:
#   peptides        -> 'pepID<pep_ID>'
#   polymer samples -> 'polyID<poly_ID>_S<sample_ID>'
# The columns are converted with astype(str) and concatenated as whole Series
# instead of calling a lambda on every row: row-wise apply(axis=1) is slow and
# hands the lambda a per-row Series whose dtype may be upcast (an integer ID
# can come out as '24.0' when all columns of the frame are numeric).
pepDB['ID'] = 'pepID' + pepDB['pep_ID'].astype(str)
polySampDB['ID'] = (
    'polyID' + polySampDB['poly_ID'].astype(str)
    + '_S' + polySampDB['sample_ID'].astype(str)
)

## Polymer DB

In [12]:
def preprocess_MIC(value):
    """Convert one raw MIC entry into a single number.

    Three textual forms are recognised (surrounding whitespace is ignored):

    * ``'>N'``      -- any entry starting with ``'>'`` is mapped to 1024,
                       the same cap the notebook applies to ``MIC_ecoli``
                       after the tables are combined.
    * ``'low-high'`` -- a range; the upper bound is returned. Bounds may be
                       integers or decimals (e.g. ``'0.5-1'``).
    * ``'N'``       -- a plain number.

    Non-string input (a value that is already numeric, or NaN for a missing
    cell) is passed through ``float()`` unchanged instead of failing on the
    string methods.

    Parameters
    ----------
    value : str or number
        Raw cell content.

    Returns
    -------
    float
        The numeric MIC value.

    Raises
    ------
    ValueError
        If a string entry is not one of the forms above (including a range
        that does not split into exactly two numeric parts).
    """
    # Already-numeric cells have no string methods; return them as floats.
    if not isinstance(value, str):
        return float(value)

    text = value.strip()

    if text.startswith('>'):
        return 1024.0

    if '-' in text:
        # Unpacking enforces exactly two bounds; only the upper one is used.
        _low, high = (float(part) for part in text.split('-'))
        return high

    return float(text)

# Replace every raw MIC_ecoli entry of the polymer samples with its numeric value.
polySampDB['MIC_ecoli'] = polySampDB['MIC_ecoli'].map(preprocess_MIC)

## Combine

In [13]:
# Stack peptides and polymer samples into one table, drop every column that
# contains a missing value, and cap MIC_ecoli at 1024.
df = (
    pd.concat([pepDB, polySampDB], ignore_index=True)
    .dropna(axis=1)
    .assign(MIC_ecoli=lambda frame: frame['MIC_ecoli'].clip(upper=1024))
)
df

Unnamed: 0,sample_ID,poly_ID,MIC_ecoli,sequence,seq_mon_mol_dist,ID
0,1,24,64.0,TmaDoTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaDoTma...,[0.8067014 0.1932986],polyID24_S1
1,2,24,64.0,TmaTmaTmaTmaTmaTmaTmaDoTmaDoTmaTmaTmaTmaTmaTma...,[0.8067014 0.1932986],polyID24_S2
2,3,24,64.0,DoTmaTmaTmaTmaTmaTmaDoDoDoTmaTmaTmaTmaTmaTmaTm...,[0.69858477 0.30141523],polyID24_S3
3,4,24,64.0,TmaTmaDoTmaTmaTmaTmaTmaTmaTmaDoTmaTmaDoTmaTmaT...,[0.71382882 0.28617118],polyID24_S4
4,5,24,64.0,TmaTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaDoTmaTmaTm...,[0.71382882 0.28617118],polyID24_S5
...,...,...,...,...,...,...
3995,196,43,64.0,PheMepTmaTmaTmaOlamPheTmaTmaTmaOlamTmaTmaTmaTm...,[0.6480496 0.03060537 0.09437476 0.13735866 0...,polyID43_S196
3996,197,43,64.0,TmaPheTmaMepTmaTmaPheTmaOlamMepTmaBmamPheTmaTm...,[0.63512753 0.08583965 0.07720278 0.07223498 0...,polyID43_S197
3997,198,43,64.0,PheTmaBmamTmaPheOlamTmaTmaPheTmaTmaTmaBmamTmaT...,[0.685277 0.05274053 0.08673641 0.07101076 0...,polyID43_S198
3998,199,43,64.0,TmaTmaMepMepTmaTmaTmaMepBmamBmamTmaPheTmaTmaTm...,[0.71853878 0.07412612 0.04353817 0.04752604 0...,polyID43_S199


In [14]:
def bin(num_classes, max_value=1024.1):
    """Return geometrically spaced bin edges from 0 up to ``max_value``.

    The edges are ``[0, base**1, base**2, ..., base**num_classes]`` with
    ``base = max_value ** (1 / num_classes)``, so the last edge equals
    ``max_value`` (up to floating-point rounding) and every bin after the
    first spans the same ratio.

    Parameters
    ----------
    num_classes : int
        Number of bins to create; must be at least 1.
    max_value : float, optional
        Upper edge of the last bin. The default of 1024.1 is presumably chosen
        just above the notebook's MIC cap of 1024 so that a value of exactly
        1024 still falls inside the last right-closed bin -- confirm.

    Returns
    -------
    list
        ``num_classes + 1`` increasing edges, starting with 0.

    Raises
    ------
    ValueError
        If ``num_classes`` is smaller than 1 or ``max_value`` is not positive.

    Notes
    -----
    The name shadows Python's builtin ``bin``; it is kept unchanged because
    other cells call this function by that name.
    """
    if num_classes < 1:
        raise ValueError(f"num_classes must be at least 1, got {num_classes}")
    if max_value <= 0:
        raise ValueError(f"max_value must be positive, got {max_value}")

    # Common ratio between consecutive edges: the num_classes-th root of max_value.
    base = np.exp(np.log(max_value) / num_classes)

    return [0] + [base ** exponent for exponent in range(1, num_classes + 1)]

In [15]:
# Binary label: 1 when MIC_ecoli is below 512, otherwise 0.
df['binary_class'] = (df['MIC_ecoli'] < 512).astype(int)

# Multi-class labels: for every class count from 3 to 10, cut MIC_ecoli on the
# edges returned by bin() and store the result in the column '<n>_classes'.
for i in range(3, 11):
    bin_vals = bin(i)
    # labels=False makes pd.cut return the 0-based bin index (0 .. i-1)
    # directly instead of a categorical that has to be converted afterwards.
    df[f'{i}_classes'] = pd.cut(df['MIC_ecoli'], bins=bin_vals, labels=False, right=True)

# Every label column has 'class' in its name.
class_cols = df.columns[df.columns.str.contains('class')]

# Store all labels as plain integers. This raises if any MIC value fell outside
# the bins, because the resulting NaN cannot be cast to int.
df[class_cols] = df[class_cols].astype(int)

In [16]:
# Display the combined table, which now also holds the class-label columns.
df

Unnamed: 0,sample_ID,poly_ID,MIC_ecoli,sequence,seq_mon_mol_dist,ID,binary_class,3_classes,4_classes,5_classes,6_classes,7_classes,8_classes,9_classes,10_classes
0,1,24,64.0,TmaDoTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaDoTma...,[0.8067014 0.1932986],polyID24_S1,1,1,2,2,3,4,4,5,5
1,2,24,64.0,TmaTmaTmaTmaTmaTmaTmaDoTmaDoTmaTmaTmaTmaTmaTma...,[0.8067014 0.1932986],polyID24_S2,1,1,2,2,3,4,4,5,5
2,3,24,64.0,DoTmaTmaTmaTmaTmaTmaDoDoDoTmaTmaTmaTmaTmaTmaTm...,[0.69858477 0.30141523],polyID24_S3,1,1,2,2,3,4,4,5,5
3,4,24,64.0,TmaTmaDoTmaTmaTmaTmaTmaTmaTmaDoTmaTmaDoTmaTmaT...,[0.71382882 0.28617118],polyID24_S4,1,1,2,2,3,4,4,5,5
4,5,24,64.0,TmaTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaDoTmaTmaTm...,[0.71382882 0.28617118],polyID24_S5,1,1,2,2,3,4,4,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,196,43,64.0,PheMepTmaTmaTmaOlamPheTmaTmaTmaOlamTmaTmaTmaTm...,[0.6480496 0.03060537 0.09437476 0.13735866 0...,polyID43_S196,1,1,2,2,3,4,4,5,5
3996,197,43,64.0,TmaPheTmaMepTmaTmaPheTmaOlamMepTmaBmamPheTmaTm...,[0.63512753 0.08583965 0.07720278 0.07223498 0...,polyID43_S197,1,1,2,2,3,4,4,5,5
3997,198,43,64.0,PheTmaBmamTmaPheOlamTmaTmaPheTmaTmaTmaBmamTmaT...,[0.685277 0.05274053 0.08673641 0.07101076 0...,polyID43_S198,1,1,2,2,3,4,4,5,5
3998,199,43,64.0,TmaTmaMepMepTmaTmaTmaMepBmamBmamTmaPheTmaTmaTm...,[0.71853878 0.07412612 0.04353817 0.04752604 0...,polyID43_S199,1,1,2,2,3,4,4,5,5


In [17]:
# Write the combined, labelled table to 'db.csv' without the DataFrame index.
df.to_csv('db.csv', index=False)