In [1]:
import pandas as pd
import numpy as np
import re
import math

In [2]:
# Load the peptide table and the polymer-sample table from their CSV files.
pepDB, polySampDB = map(
    pd.read_csv,
    ('peptide_data/peptide_db.csv', 'polymer_data/polymer_samples_db.csv'),
)

In [3]:
# Give every row a string identifier that stays unique once both tables are
# stacked: 'pepID<pep_ID>' for peptides and 'polyID<poly_ID>_S<sample_ID>' for
# polymer samples. Built with vectorized string concatenation rather than a
# row-wise apply, which is much faster and also works on an empty frame.
pepDB['ID'] = 'pepID' + pepDB['pep_ID'].astype(str)
polySampDB['ID'] = (
    'polyID' + polySampDB['poly_ID'].astype(str)
    + '_S' + polySampDB['sample_ID'].astype(str)
)

## Polymer DB

In [4]:
def preprocess_MIC(value):
    """Convert one raw MIC entry into a single number.

    Recognised forms:

    * censored value, e.g. ``'>512'``  -> ``1024`` (fixed ceiling)
    * plain number, e.g. ``'64'``, ``'1e-3'`` -> that number as a float
    * range, e.g. ``'16-32'``, ``'0.5-1.5'`` -> the upper bound as a float

    Parameters
    ----------
    value : str or number
        Raw cell content. Non-string input (an already-numeric value or NaN)
        is passed straight through ``float()``, so re-applying this function
        to an already-processed column is harmless.

    Returns
    -------
    int or float
        ``1024`` for censored entries, otherwise a float.

    Raises
    ------
    ValueError
        If the text is neither a number nor a range ending in a number.
    """
    if not isinstance(value, str):
        return float(value)

    text = value.strip()
    if text.startswith('>'):
        return 1024

    # Try a plain number first so that negative values and scientific
    # notation (which also contain '-') are not mistaken for a range.
    try:
        return float(text)
    except ValueError:
        if '-' not in text:
            raise

    # Range "low-high": keep the upper bound; float() accepts decimal bounds.
    upper_bound = text.rpartition('-')[2]
    return float(upper_bound)

# Replace each raw MIC entry of the polymer samples with its numeric value.
polySampDB['MIC_ecoli'] = polySampDB['MIC_ecoli'].map(preprocess_MIC)

## Combine

In [5]:
# Stack the peptide rows on top of the polymer-sample rows with a fresh
# 0..n-1 index, then discard every column that has at least one missing value.
df = (
    pd.concat(objs=[pepDB, polySampDB], ignore_index=True)
    .dropna(axis='columns')
)
# Cap MIC values at 1024.
df.loc[df['MIC_ecoli'].gt(1024), 'MIC_ecoli'] = 1024
df

Unnamed: 0,sequence,MIC_ecoli,ID
0,AAAAAAAAAAGIGKFLHSAKKFGKAFVGEIMNS,125.878150,pepID1
1,AAAAAAAIKMLMDLVNERIMALNKKAKK_amd,10.000000,pepID2
2,AAAAGSVWGAVNYTSDCNGECKRRGYKGGYCGSFANVNCWCET,100.000000,pepID3
3,AAAKAALNAVLVGANA,80.000000,pepID4
4,AACSDRAHGHICESFKSFCKDSGRNGVKLRANCKKTCGLC,1.780176,pepID5
...,...,...,...
17598,TmaAegNiTmaOlamNiTmaTmaNiNiOlamTmaOlamAegTmaNi...,128.000000,polyID33_S196
17599,OlamTmaOlamNiNiAegOlamOlamAegNiAegAegTmaMoNiMo...,128.000000,polyID33_S197
17600,AegNiAegAegOlamNiNiOlamNiTmaNiAegAegMoTmaOlamT...,128.000000,polyID33_S198
17601,TmaOlamTmaTmaTmaTmaAegNiNiTmaOlamNiTmaNiTmaOla...,128.000000,polyID33_S199


In [6]:
def bin(num_classes, upper=1024.1):
    """Return geometrically spaced bin edges covering ``(0, upper]``.

    The edges are ``[0, b, b**2, ..., b**num_classes]`` with
    ``b = upper ** (1 / num_classes)``, so the last edge equals ``upper`` up
    to floating-point rounding and every bin after the first spans the same
    ratio.

    NOTE: this function shadows Python's builtin ``bin``. The name is kept
    because other cells call it.

    Parameters
    ----------
    num_classes : int
        Number of bins to create; must be at least 1.
    upper : float, default 1024.1
        Right edge of the last bin; must be greater than 1 so that the edges
        are strictly increasing.

    Returns
    -------
    list
        ``num_classes + 1`` increasing edges, starting with ``0``.

    Raises
    ------
    ValueError
        If ``num_classes < 1`` or ``upper <= 1``.
    """
    if num_classes < 1:
        raise ValueError("num_classes must be at least 1")
    if upper <= 1:
        raise ValueError("upper must be greater than 1")

    base = np.exp(np.log(upper) / num_classes)
    return [0] + [base ** power for power in range(1, num_classes + 1)]

In [7]:
# Binary label: 1 when the MIC is below 512, otherwise 0.
df['binary_class'] = (df['MIC_ecoli'] < 512).astype(int)

# Multi-class labels: for each class count n in 3..10, cut the MIC values
# into n geometrically spaced bins and store the bin index in '<n>_classes'.
class_cols = ['binary_class']
for n_classes in range(3, 11):
    bin_vals = bin(n_classes)
    col_name = str(n_classes) + '_classes'
    df[col_name] = pd.cut(
        df['MIC_ecoli'],
        bins=bin_vals,
        labels=range(n_classes),  # labels 0 .. n_classes-1
        right=True,
        include_lowest=True,  # a MIC of exactly 0 belongs to the first bin
    )
    class_cols.append(col_name)

# Values outside the bin range come back as NaN; stop with a clear message
# instead of letting the integer conversion fail cryptically.
unbinned = df[class_cols].isna().any(axis=1)
if unbinned.any():
    raise ValueError(
        str(int(unbinned.sum())) + " rows have a MIC_ecoli value outside the bin range"
    )

# Store every label column created above as plain integers.
df[class_cols] = df[class_cols].astype(int)

In [8]:
# Show the frame so the class-label columns can be inspected.
df

Unnamed: 0,sequence,MIC_ecoli,ID,binary_class,3_classes,4_classes,5_classes,6_classes,7_classes,8_classes,9_classes,10_classes
0,AAAAAAAAAAGIGKFLHSAKKFGKAFVGEIMNS,125.878150,pepID1,1,2,2,3,4,4,5,6,6
1,AAAAAAAIKMLMDLVNERIMALNKKAKK_amd,10.000000,pepID2,1,0,1,1,1,2,2,2,3
2,AAAAGSVWGAVNYTSDCNGECKRRGYKGGYCGSFANVNCWCET,100.000000,pepID3,1,1,2,3,3,4,5,5,6
3,AAAKAALNAVLVGANA,80.000000,pepID4,1,1,2,3,3,4,5,5,6
4,AACSDRAHGHICESFKSFCKDSGRNGVKLRANCKKTCGLC,1.780176,pepID5,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
17598,TmaAegNiTmaOlamNiTmaTmaNiNiOlamTmaOlamAegTmaNi...,128.000000,polyID33_S196,1,2,2,3,4,4,5,6,6
17599,OlamTmaOlamNiNiAegOlamOlamAegNiAegAegTmaMoNiMo...,128.000000,polyID33_S197,1,2,2,3,4,4,5,6,6
17600,AegNiAegAegOlamNiNiOlamNiTmaNiAegAegMoTmaOlamT...,128.000000,polyID33_S198,1,2,2,3,4,4,5,6,6
17601,TmaOlamTmaTmaTmaTmaAegNiNiTmaOlamNiTmaNiTmaOla...,128.000000,polyID33_S199,1,2,2,3,4,4,5,6,6


In [9]:
# Write the labelled table to disk, leaving out the row index.
df.to_csv(path_or_buf='db.csv', index=False)