In [7]:
import pandas as pd
import json
from rdkit.Chem import rdMolDescriptors, Descriptors
from rdkit import Chem
import numpy as np

from utils.util_functions import get_unscaled_features

In [8]:
SMILES = 'data_preprocessing/SMILES.txt'
DESCRIPTORS = 'monomer_data/unique_descriptors.json'

unscaled_features = get_unscaled_features(SMILES, DESCRIPTORS)

In [9]:
df = pd.read_csv(SMILES)
df

Unnamed: 0,type,molecule,SMILES,description
0,monomer,G,C(C(=O)O)N,Glycine
1,monomer,A,C[C@@H](C(=O)O)N,Alanine
2,monomer,L,CC(C)C[C@@H](C(=O)O)N,Leucine
3,monomer,M,CSCC[C@@H](C(=O)O)N,Methionine
4,monomer,F,C1=CC=C(C=C1)C[C@@H](C(=O)O)N,Phenylalanine
5,monomer,W,C1=CC=C2C(=C1)C(=CN2)C[C@@H](C(=O)O)N,Tryptophan
6,monomer,K,C(CCN)C[C@@H](C(=O)O)N,Lysine
7,monomer,Q,C(CC(=O)N)[C@@H](C(=O)O)N,Glutamine
8,monomer,E,C(CC(=O)O)[C@@H](C(=O)O)N,Glutamic Acid
9,monomer,S,C([C@@H](C(=O)O)N)O,Serine


In [10]:
pep = list(set((df['molecule'].tolist())[:40]))
poly = list(set((df['molecule'].tolist())[40:52]))

unscaled_train_feats = pd.DataFrame(np.vstack([unscaled_features['monomer'][mon] for mon in pep]), index=pep)
unscaled_train_feats

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,94,95,96,97,98,99,100,101,102,103
F,0.690463,3.0,0.0,2.103857,8.543755,0.955337,-0.959395,2.5,3.722225,-1.35,...,154.104,0.0,2.634453,30.331835,-2.316395,6.604021,17.117674,5.730125,10.076364,6.041841
D,0.452021,5.0,0.0,2.138971,-1.289352,0.275438,-2.49787,2.0,2.239272,-1.1,...,126.047,0.0,1.542593,0.0,-2.318388,4.572731,28.193506,5.796664,10.219121,6.041841
H,0.541194,5.0,6.32732,2.114647,-0.851435,0.720551,-1.00037,3.090909,3.155289,-1.36,...,146.085,18.311593,2.225701,0.0,-2.318022,5.819183,27.085631,5.728837,10.169528,6.041841
W,0.700584,4.0,0.0,2.137162,6.913184,1.624731,-0.971962,2.933333,4.716242,-1.81,...,192.133,23.52059,3.508352,30.462312,-2.316122,8.104021,22.101652,5.838006,10.061026,6.041841
E,0.485976,5.0,0.0,2.115643,-1.059167,0.402453,-2.196343,2.2,2.739272,-1.1,...,138.058,6.420822,1.877144,0.0,-2.299667,5.279838,28.193506,5.74181,10.159875,6.041841
D_amd,0.423939,5.0,0.0,2.117992,-1.06713,0.28872,-1.91787,2.111111,2.30434,-1.1,...,124.055,0.0,1.606725,0.0,-2.375338,4.702868,23.024853,5.838956,10.203733,0.0
P,0.498209,3.0,0.0,2.195555,-0.268519,0.895327,-0.719907,2.75,2.766883,-0.57,...,106.06,0.0,1.987176,0.0,-2.294649,4.554132,23.66243,5.731893,10.225316,6.041841
G,0.421171,3.0,0.0,1.805032,0.0,0.0,-0.967593,2.2,1.189533,-0.57,...,70.027,0.0,0.597863,0.0,-2.104229,2.639919,17.620589,5.680937,10.7175,0.0
K_amd,0.429639,4.0,0.0,2.062408,-0.493935,0.650847,-0.433313,2.4,3.431304,-0.61,...,130.086,0.0,2.293319,0.0,-2.357355,6.046076,18.493777,5.789012,10.115315,0.0
S_amd,0.370821,4.0,0.0,2.036605,-0.902778,0.089972,-0.678241,2.285714,1.839284,-0.61,...,96.045,0.0,1.191436,0.0,-2.386531,3.794619,23.66243,5.790633,10.324988,6.041841


In [11]:
# verify rows are the same as in dictionary

for i in range(len(pep)):
    print(((unscaled_train_feats.iloc[i,:]).to_numpy() == unscaled_features['monomer'][pep[i]]).all())

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [12]:
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer, StandardScaler

scaler = MinMaxScaler(feature_range=(0, 1))
scaled_trained_feats = scaler.fit_transform(unscaled_train_feats)

train_scaled = pd.DataFrame(
    scaled_trained_feats,
    index=unscaled_train_feats.index
)

In [13]:
train_scaled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,94,95,96,97,98,99,100,101,102,103
F,0.979195,0.0,0.0,0.785468,0.956591,0.583229,0.516628,0.514019,0.705136,0.289308,...,0.693557,0.0,0.684656,0.995717,0.365387,0.708604,0.292461,0.021386,0.107218,0.512396
D,0.489061,0.666667,0.0,0.867618,0.0,0.168153,0.0,0.102804,0.292262,0.446541,...,0.467469,0.0,0.317597,0.0,0.359425,0.345501,0.919172,0.050315,0.306006,0.512396
H,0.672363,0.666667,0.521508,0.810711,0.042602,0.439893,0.502868,1.0,0.547294,0.283019,...,0.628938,0.778535,0.547242,0.0,0.36052,0.568311,0.856484,0.020826,0.236949,0.512396
W,1.0,0.333333,0.0,0.863385,0.797965,0.991891,0.512408,0.870405,0.981884,0.0,...,1.0,1.0,0.978441,1.0,0.366205,0.976737,0.574472,0.06829,0.085859,0.512396
E,0.558856,0.666667,0.0,0.813041,0.022393,0.245696,0.101255,0.26729,0.431469,0.446541,...,0.564256,0.272987,0.430065,0.0,0.415424,0.4719,0.919172,0.026466,0.223507,0.512396
D_amd,0.431337,0.666667,0.0,0.818538,0.021618,0.176262,0.194767,0.194185,0.310378,0.446541,...,0.451417,0.0,0.339157,0.0,0.189083,0.368763,0.626711,0.068703,0.284579,0.0
P,0.584003,0.0,0.0,1.0,0.099309,0.546593,0.597049,0.719626,0.439156,0.779874,...,0.306411,0.0,0.467056,0.0,0.430432,0.342176,0.662787,0.022154,0.314634,0.512396
G,0.425645,0.0,0.0,0.086351,0.125432,0.0,0.513875,0.26729,0.0,0.779874,...,0.016052,0.0,0.0,0.0,1.0,0.0,0.320918,0.0,1.0,0.0
K_amd,0.443053,0.333333,0.0,0.688496,0.07738,0.397339,0.693289,0.431776,0.62414,0.754717,...,0.500016,0.0,0.569974,0.0,0.242871,0.608869,0.370326,0.046989,0.161457,0.0
S_amd,0.322147,0.333333,0.0,0.628128,0.037607,0.054928,0.611041,0.337784,0.1809,0.754717,...,0.225709,0.0,0.199546,0.0,0.155602,0.206409,0.662787,0.047693,0.453427,0.512396


In [14]:
scaled_train_feats = dict(zip(train_scaled.index, train_scaled.to_numpy().tolist()))
scaled_train_feats

{'F': [0.9791949183589342,
  0.0,
  0.0,
  0.7854681164336288,
  0.9565909170394542,
  0.5832294596887978,
  0.5166279748222222,
  0.514018691588785,
  0.7051361564156295,
  0.28930817610062887,
  0.5,
  0.4969220678844657,
  0.2869770350159163,
  0.20554457552872307,
  0.7051361564156295,
  0.0,
  0.3216076757700032,
  0.0,
  0.0,
  0.7000000000000002,
  0.3007697276426854,
  0.0,
  0.0,
  0.0,
  0.1996799784890304,
  0.0,
  0.33120237580274847,
  0.6250137632560572,
  0.16619149441672598,
  0.9963782445931717,
  0.9744781797765141,
  0.0,
  0.48213139137719485,
  0.0,
  0.5832294596887978,
  0.8003200215109696,
  0.860407784448505,
  0.20530855991218436,
  0.03332004482978124,
  0.5913656802365317,
  0.31157060703774697,
  0.1118866762471772,
  0.7086043551370869,
  0.47368292010175,
  0.0,
  0.0,
  0.7159194116304705,
  0.7000127682850625,
  0.5,
  0.6980214200764633,
  0.023662259152921195,
  0.7902717121000711,
  0.0,
  0.34026294183447336,
  0.6250137632560572,
  0.70728334146728

In [15]:
nontrain_keys = list(set(unscaled_features['monomer'].keys()) - set(scaled_train_feats.keys()))
unscaled_nontrain_feats = pd.DataFrame(np.vstack([unscaled_features['monomer'][mon] for mon in nontrain_keys]), index=nontrain_keys)
unscaled_nontrain_feats

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,94,95,96,97,98,99,100,101,102,103
Oct,0.45468,2.0,38.180128,1.922979,0.0,1.189087,-0.06152,2.461538,5.211708,-0.79,...,162.127,6.544756,3.262919,0.0,-2.211313,8.642453,12.451936,5.862217,10.121098,0.0
Aeg,0.151122,5.0,6.07602,1.944911,0.0,0.464706,-0.055329,2.545455,3.185505,-1.52,...,144.093,6.544756,1.87727,0.0,-2.552529,6.261619,24.956247,5.863181,10.512856,0.0
Ni,0.524524,2.0,6.07602,1.931947,0.208796,0.343443,-0.111111,2.375,2.594423,-0.79,...,102.072,6.041841,1.867413,13.847474,-2.274461,5.270056,11.949021,5.864597,10.372714,0.0
Phe,0.634893,2.0,6.07602,1.921953,9.25374,0.714776,-0.184074,2.545455,3.311731,-1.57,...,138.105,5.687386,2.0594,30.331835,-2.059928,6.079457,5.90718,5.982585,10.29869,0.0
Mo,0.482094,3.0,6.07602,2.122518,0.0,0.912837,0.003056,2.6,3.33446,-0.83,...,130.082,13.089513,2.141782,0.0,-2.391606,5.876594,37.110366,5.86511,10.380311,0.0
Tma,0.361624,3.0,6.07602,2.047494,0.0,0.837156,-0.086658,2.5,4.16247,-0.83,...,152.112,12.965578,3.862636,0.0,-2.368076,8.261239,44.622739,5.862228,10.342409,0.0
Do,0.400991,2.0,63.863415,1.939251,0.0,1.896194,-0.0547,1.882353,7.211708,-0.79,...,210.171,6.544756,4.677132,0.0,-2.211332,11.47088,12.451936,5.862227,10.066238,0.0
Olam,0.189297,2.0,89.546701,1.936193,0.0,2.656273,-0.057642,1.782609,9.861538,-1.05,...,282.237,6.544756,6.432994,0.0,-2.211329,15.454008,12.451936,5.862248,10.077276,0.0
Tmb,0.66916,2.0,6.07602,2.254546,0.0,1.754565,-0.102546,2.076923,4.608154,-0.79,...,162.127,5.538925,5.314695,13.847474,-2.445852,9.399812,11.446105,5.869078,9.815912,0.0
Bmam,0.35551,3.0,6.07602,1.902071,0.0,0.512898,-0.193057,3.0,3.789058,-0.83,...,142.093,6.606882,2.151291,0.0,-2.234975,6.929381,19.244878,5.861287,10.361347,6.730817


In [16]:
scaled_nontrain_feats = scaler.transform(unscaled_nontrain_feats)

nontrain_scaled = pd.DataFrame(
    scaled_nontrain_feats,
    index=unscaled_nontrain_feats.index,
)

In [17]:
nontrain_scaled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,94,95,96,97,98,99,100,101,102,103
Oct,0.494526,-0.333333,3.146869,0.362292,0.125432,0.725933,0.818139,0.482387,1.119829,0.641509,...,0.758207,0.278256,0.895931,0.0,0.6797,1.072985,0.028457,0.078816,0.16951,0.0
Aeg,-0.129461,0.666667,0.500796,0.413606,0.125432,0.283701,0.820218,0.551402,0.555706,0.18239,...,0.612887,0.278256,0.430108,0.0,-0.340915,0.647398,0.735996,0.079235,0.715033,0.0
Ni,0.638095,-0.333333,0.500796,0.383276,0.145744,0.209671,0.801486,0.411215,0.391141,0.641509,...,0.274275,0.256875,0.426794,0.454577,0.490818,0.470151,0.0,0.079851,0.519885,0.0
Phe,0.864967,-0.333333,0.500796,0.359894,1.02566,0.436368,0.776985,0.551402,0.590849,0.150943,...,0.564634,0.241805,0.491336,0.995717,1.132509,0.614836,-0.341869,0.131149,0.416806,0.0
Mo,0.550877,0.0,0.500796,0.829125,0.125432,0.557283,0.839824,0.596262,0.597177,0.616352,...,0.499984,0.556513,0.519031,0.0,0.140424,0.578573,1.42372,0.080074,0.530464,0.0
Tma,0.303243,0.0,0.500796,0.653604,0.125432,0.51108,0.809698,0.514019,0.827706,0.616352,...,0.677505,0.551244,1.097543,0.0,0.210805,1.004841,1.848797,0.078821,0.477685,0.0
Do,0.384163,-0.333333,5.263728,0.400364,0.125432,1.157618,0.820429,0.006047,1.676656,0.641509,...,1.145353,0.278256,1.371358,0.0,0.679644,1.578581,0.028457,0.078821,0.093117,0.0
Olam,-0.050988,-0.333333,7.380587,0.393209,0.125432,1.621644,0.819441,-0.075985,2.414405,0.477987,...,1.726071,0.278256,1.961639,0.0,0.679654,2.290587,0.028457,0.07883,0.108488,0.0
Tmb,0.935406,-0.333333,0.500796,1.138013,0.125432,1.071155,0.804362,0.166068,0.951791,0.641509,...,0.758207,0.235493,1.585692,0.454577,-0.021834,1.208367,-0.028457,0.081799,-0.255462,0.0
Bmam,0.290675,0.0,0.500796,0.313378,0.125432,0.313122,0.773968,0.925234,0.723743,0.616352,...,0.59677,0.280898,0.522227,0.0,0.608925,0.766764,0.412826,0.078412,0.504057,0.570827


In [18]:
scaled_nontrain_feats = dict(zip(nontrain_scaled.index, nontrain_scaled.to_numpy().tolist()))
scaled_nontrain_feats

{'Oct': [0.49452599973962574,
  -0.33333333333333337,
  3.1468692704038888,
  0.3622924756984762,
  0.12543159189478362,
  0.7259325973088935,
  0.8181391389114243,
  0.48238677210639835,
  1.1198286346172621,
  0.6415094339622641,
  1.75,
  0.5181754767253999,
  0.6687976241972515,
  0.28675346264506985,
  1.1198286346172621,
  5.0,
  -0.286430703080013,
  -1.0,
  0.0,
  0.8,
  2.1680514691227897,
  0.5450047453586356,
  0.0,
  0.0,
  1.1125907833046975,
  -0.5,
  0.0,
  0.7982107048967976,
  -0.2403183654074602,
  0.3512751602789834,
  0.13285893813666982,
  0.0,
  0.2238985867137595,
  0.0,
  0.7259325973088935,
  -0.11259078330469752,
  1.4428202674091481,
  0.0,
  -0.40815146239609834,
  2.0807265050321955,
  1.1474512469700693,
  1.5285131998696173,
  1.072985018316771,
  1.7274203929020198,
  0.0,
  0.4600018561689591,
  0.8989275103070254,
  0.8390705688337001,
  0.0,
  0.8221555907276646,
  0.0,
  0.8211645886760102,
  1.0,
  0.0,
  0.7982107048967976,
  0.9185371530917861,
  

In [19]:
all_scaled_features = {**scaled_train_feats, **scaled_nontrain_feats}
all_scaled_features

{'F': [0.9791949183589342,
  0.0,
  0.0,
  0.7854681164336288,
  0.9565909170394542,
  0.5832294596887978,
  0.5166279748222222,
  0.514018691588785,
  0.7051361564156295,
  0.28930817610062887,
  0.5,
  0.4969220678844657,
  0.2869770350159163,
  0.20554457552872307,
  0.7051361564156295,
  0.0,
  0.3216076757700032,
  0.0,
  0.0,
  0.7000000000000002,
  0.3007697276426854,
  0.0,
  0.0,
  0.0,
  0.1996799784890304,
  0.0,
  0.33120237580274847,
  0.6250137632560572,
  0.16619149441672598,
  0.9963782445931717,
  0.9744781797765141,
  0.0,
  0.48213139137719485,
  0.0,
  0.5832294596887978,
  0.8003200215109696,
  0.860407784448505,
  0.20530855991218436,
  0.03332004482978124,
  0.5913656802365317,
  0.31157060703774697,
  0.1118866762471772,
  0.7086043551370869,
  0.47368292010175,
  0.0,
  0.0,
  0.7159194116304705,
  0.7000127682850625,
  0.5,
  0.6980214200764633,
  0.023662259152921195,
  0.7902717121000711,
  0.0,
  0.34026294183447336,
  0.6250137632560572,
  0.70728334146728

In [20]:
set(all_scaled_features.keys()) - set(unscaled_features['monomer'].keys())

set()

In [21]:
a = {'c': 1}

for key,val in a.items():
    print(key)
    print(val)

c
1
