In [181]:
import pandas as pd
import numpy as np
import re

In [182]:
# Load the peptide table and the polymer-sample table from CSV.
# Both paths are relative to the notebook's working directory.
pepDB = pd.read_csv('peptide_data/peptide_db.csv')
polySampDB = pd.read_csv('polymer_data/polymer_samples_db.csv')

In [183]:
# Build a string ID per row: 'pepID<pep_ID>' for peptides and
# 'polyID<poly_ID>_S<sample_ID>' for polymer samples.
# Vectorized concatenation avoids one Python-level call per row and formats
# each ID column from its own dtype instead of going through a per-row Series
# (where integer IDs can be upcast to float when every column is numeric).
pepDB['ID'] = 'pepID' + pepDB['pep_ID'].astype(str)
polySampDB['ID'] = (
    'polyID' + polySampDB['poly_ID'].astype(str)
    + '_S' + polySampDB['sample_ID'].astype(str)
)

## Polymer DB

In [184]:
def preprocess_MIC(value):
    """Convert one raw MIC entry to a single float.

    Accepted forms:

    * a number or numeric string (``64``, ``'12.5'``, ``'1e-5'``): returned
      as a float; NaN passes through unchanged;
    * a lower bound ``'>N'``: returned as ``N + 1``, i.e. a value strictly
      above the tested maximum;
    * a range ``'low-high'``: the upper end ``high`` is returned.

    Surrounding whitespace is ignored. The result is always a float, so the
    converted column has one consistent dtype.

    Raises
    ------
    ValueError
        If the text matches none of the forms above.
    """
    # Already numeric (or NaN): nothing to parse. This also makes it safe to
    # apply the function again to a column that was already converted.
    if not isinstance(value, str):
        return float(value)

    text = value.strip()

    if text.startswith('>'):
        return float(text[1:]) + 1

    # Try a plain number before looking for a range, so that negative numbers
    # and exponents such as '1e-5' are not mistaken for 'low-high'.
    try:
        return float(text)
    except ValueError:
        pass

    parts = text.split('-')
    if len(parts) != 2:
        raise ValueError(f"Unrecognized MIC value: {value!r}")
    # Parse both ends so a malformed lower end is reported too; only the
    # upper end is used.
    _low, high = (float(part) for part in parts)
    return high

# Replace the raw MIC entries with their numeric form, one element at a time.
polySampDB['MIC_ecoli'] = polySampDB['MIC_ecoli'].map(preprocess_MIC)

In [185]:
# The polymer ID serves as the grouping key from here on, so expose it under
# the column name 'group'. Reassigning the result (rather than inplace=True)
# keeps the step a plain expression that can be chained.
polySampDB = polySampDB.rename(columns={"poly_ID": "group"})
polySampDB

Unnamed: 0,sample_ID,group,MIC_ecoli,sequence,seq_mon_mol_dist,ID
0,1,1,513.0,NiMoNiTmaTmaNiTmaNiNiNiMoNiTmaMoTmaMoTmaTmaMoN...,[0.50809921 0.38513157 0.10676922],polyID1_S1
1,2,1,513.0,NiTmaTmaTmaTmaMoTmaNiNiNiNiMoNiMoNiNiNiNiTmaTm...,[0.50675729 0.37344457 0.11979814],polyID1_S2
2,3,1,513.0,TmaNiNiTmaNiNiTmaTmaNiTmaNiNiNiTmaTmaTmaTmaTma...,[0.56335508 0.3083995 0.12824543],polyID1_S3
3,4,1,513.0,TmaMoNiTmaNiTmaMoNiMoTmaTmaNiTmaNiTmaTmaNiNiNi...,[0.6304578 0.24652413 0.12301807],polyID1_S4
4,5,1,513.0,TmaNiNiNiNiNiTmaNiTmaNiTmaTmaMoTmaTmaTmaTmaTma...,[0.57722799 0.29560729 0.12716471],polyID1_S5
...,...,...,...,...,...,...
4295,96,43,64.0,TmaTmaBmamTmaTmaTmaTmaPheTmaTmaTmaPheTmaTmaTma...,[0.70616369 0.05095127 0.08379381 0.1143361 0...,polyID43_S96
4296,97,43,64.0,TmaTmaTmaTmaTmaTmaPheTmaTmaMepBmamBmamTmaTmaTm...,[0.74803151 0.03047838 0.05221286 0.09119246 0...,polyID43_S97
4297,98,43,64.0,TmaTmaTmaOlamTmaTmaMepOlamPheMepTmaMepTmaTmaTm...,[0.75225981 0.05010204 0.04119859 0.1124304 0...,polyID43_S98
4298,99,43,64.0,TmaTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaBmamTma...,[0.8245595 0.04079574 0.01048315 0.0457734 0...,polyID43_S99


In [186]:
# Number of samples in each polymer group, computed once.
group_sizes = polySampDB['group'].value_counts()
vals = group_sizes.to_list()

# k is the common polymer group size, so it only exists when every group has
# the same number of samples. Fail here with a clear message instead of
# leaving k undefined (or stale from an earlier kernel run) for later cells.
if np.unique(vals).size != 1:
    raise ValueError(
        "Expected every polymer group to have the same number of samples, "
        f"got sizes {sorted(set(vals))}"
    )
k = vals[0]

# Largest polymer group id; peptide group ids are numbered from start_idx + 1.
start_idx = int(group_sizes.index.max())

## Peptide DB

In [187]:
# Randomly partition all peptides into groups of k rows (the last group may be
# smaller). Group ids continue from start_idx + 1. The ids are laid out along
# a seeded shuffle of the row index and then aligned back onto pepDB by label.
pepDB['group'] = pd.Series(
    np.arange(len(pepDB)) // k + start_idx + 1,
    index=pepDB.sample(frac=1, random_state=42).index,
)

## Combine

In [188]:
# Stack peptides and polymer samples, keeping only the columns both tables
# share. join='inner' states that intent directly; dropping every column that
# contains a NaN would also discard a shared column as soon as one of its
# values is missing.
df = pd.concat([pepDB, polySampDB], join='inner', ignore_index=True)
df['group'] = df['group'].astype(int)
df

Unnamed: 0,sequence,MIC_ecoli,ID,group
0,AAAAAAAAAAGIGKFLHSAKKFGKAFVGEIMNS,125.878150,pepID1,61
1,AAAAAAAIKMLMDLVNERIMALNKKAKK_amd,10.000000,pepID2,137
2,AAAAGSVWGAVNYTSDCNGECKRRGYKGGYCGSFANVNCWCET,100.000000,pepID3,110
3,AAAKAALNAVLVGANA,80.000000,pepID4,50
4,AACSDRAHGHICESFKSFCKDSGRNGVKLRANCKKTCGLC,1.780176,pepID5,150
...,...,...,...,...
15298,TmaTmaBmamTmaTmaTmaTmaPheTmaTmaTmaPheTmaTmaTma...,64.000000,polyID43_S96,43
15299,TmaTmaTmaTmaTmaTmaPheTmaTmaMepBmamBmamTmaTmaTm...,64.000000,polyID43_S97,43
15300,TmaTmaTmaOlamTmaTmaMepOlamPheMepTmaMepTmaTmaTm...,64.000000,polyID43_S98,43
15301,TmaTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaTmaBmamTma...,64.000000,polyID43_S99,43


In [189]:
# Class edges: 0, then the powers of two 2**0 .. 2**9 (1 .. 512), then +inf.
bins = [0] + [2 ** exponent for exponent in range(10)] + [np.inf]
# One integer label per interval: 1 through 11.
labels = range(1, len(bins))

# Right-closed intervals: (0, 1] -> 1, (1, 2] -> 2, ..., (512, inf] -> 11.
df['MIC_ecoli_class'] = pd.cut(
    df['MIC_ecoli'],
    bins=bins,
    labels=labels,
    right=True,
)

In [190]:
# Tally how many rows fall into each MIC class (largest class first).
df['MIC_ecoli_class'].value_counts()

MIC_ecoli_class
11    7251
8     1625
9     1495
7     1003
5      812
6      747
3      606
4      604
2      400
10     381
1      379
Name: count, dtype: int64

In [191]:
def shuffle_groups(df, group_column, random_state=None):
    """Return a new DataFrame with whole groups placed in random order.

    Rows that share a value in ``group_column`` stay contiguous and keep their
    original relative order; only the order of the groups is randomized. Rows
    whose group value is missing are left out, and the result gets a fresh
    0..n-1 index. An empty frame yields an empty frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to reorder; it is not modified.
    group_column : column label
        Column whose values define the groups.
    random_state : int or None, default None
        Seed for a private ``numpy.random.RandomState``. With ``None`` the
        permutation is drawn from NumPy's global RNG, so ``np.random.seed``
        still controls it.

    Returns
    -------
    pandas.DataFrame
        The reordered rows with the index reset.
    """
    # codes[i] is the position of row i's group among the distinct groups
    # (in order of first appearance); missing group values get code -1.
    codes, uniques = pd.factorize(df[group_column])
    n_groups = len(uniques)

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    # shuffled[p] is the group code that lands at output position p.
    shuffled = rng.permutation(n_groups)

    # Invert the permutation: rank[c] is the output position of group code c.
    rank = np.empty(n_groups, dtype=np.intp)
    rank[shuffled] = np.arange(n_groups)

    # Skip rows without a group, then stable-sort the rest by group rank so
    # rows inside a group keep their original order. This replaces one
    # full-frame boolean scan per group.
    positions = np.flatnonzero(codes >= 0)
    order = np.argsort(rank[codes[positions]], kind='stable')
    return df.iloc[positions[order]].reset_index(drop=True)

# shuffle_groups draws its permutation from NumPy's global RNG here, so seed
# it first: the group order (and the file written from it) is then the same
# on every Restart & Run All, matching the random_state=42 used earlier.
np.random.seed(42)
df_shuffled = shuffle_groups(df, 'group')
df_shuffled

Unnamed: 0,sequence,MIC_ecoli,ID,group,MIC_ecoli_class
0,AIHDILKYGKPS_amd,50.000000,pepID82,79,7
1,ALWKNMLKGI_amd,14.000000,pepID182,79,5
2,DDALHHLLHHLLHHL,100.000000,pepID351,79,8
3,FFWHHIGHALDAAKRVHGMLSG,12.500000,pepID565,79,5
4,FLGKVFKGAVKVFPAVFGKV,59.851199,pepID750,79,7
...,...,...,...,...,...
15298,APVAQEGN,9999.000000,pepID10553,65,11
15299,EVLYIPVTTDA,9999.000000,pepID10748,65,11
15300,CISARYPCSNSKDCCSGSCGIFWTCYLRKDPCSKECLAP,9999.000000,pepID10907,65,11
15301,MYVLQEINPGITS,9999.000000,pepID10993,65,11


In [192]:
# Write the group-shuffled table to db.csv; index=False leaves out the row index.
df_shuffled.to_csv('db.csv', index=False)