In [1]:
# autoreload in mode 2 re-imports all loaded modules before each cell runs,
# so edits to the local `utils` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
    
import pandas as pd
import numpy as np
import json
import gc

from utils.preprocess_data import preprocess
from utils.infer import inference
from utils.load_data import load_data

# import sys
# sys.path.append('../')  # Add the parent directory 'gv7' to the path
# from utils.load_networkx import get_unscaled_features
# from utils.inference import infer

In [2]:
# Read one sampled replicate from disk and keep its first 25 rows as a small
# input for a quick run of the inference pipeline.
df = pd.read_parquet(
    './samples_one_replicate/0df1d0c39c1d4684a616a57b0851ab1c_rows_642032.parquet',
    engine='pyarrow',
)
df_temp = df.head(n=25)
df_temp

Unnamed: 0,ID,mol_distribution,monomers,sequence
0,000000002516045_00001,"[0.06880176793451682, 0.036911532441588304, 0....","[Tma, Aeg, Mep, Bam]",TmaMepMepBamBamBamMepBamBamMepMepMepMepBamBamB...
1,000000002516047_00001,"[0.06923435300951733, 0.0371436104609326, 0.49...","[Tma, Aeg, Mep, Bam]",BamBamAegBamBamMepBamMepTmaMepMepMepBamMepMepT...
2,000000002516049_00001,"[0.0696724121837914, 0.03737862528552172, 0.55...","[Tma, Aeg, Mep, Bam]",MepMepMepMepAegMepBamMepBamMepBamMepBamMepMepB...
3,000000002516051_00001,"[0.0701160500260276, 0.037616633015520796, 0.6...","[Tma, Aeg, Mep, Bam]",MepMepMepMepMepBamBamBamBamMepMepMepMepBamMepM...
4,000000002516053_00001,"[0.07056537378533904, 0.0378576911891187, 0.66...","[Tma, Aeg, Mep, Bam]",BamMepMepMepMepBamAegMepBamMepBamMepTmaBamMepM...
5,000000002516055_00001,"[0.07102049347770208, 0.03810185882890162, 0.7...","[Tma, Aeg, Mep, Bam]",MepBamAegMepMepMepMepBamMepMepMepBamMepBamMepM...
6,000000002516057_00001,"[0.07148152197576138, 0.038349196490032794, 0....","[Tma, Aeg, Mep, Bam]",MepMepMepMepMepMepMepBamMepMepBamMepMepAegMepM...
7,000000002516059_00001,"[0.06969055049291474, 0.07477671265190972, 0.4...","[Tma, Aeg, Mep, Bam]",MepBamBamMepMepBamMepBamMepMepMepMepTmaBamBamM...
8,000000002516060_00001,"[0.10480234660410795, 0.03748368036427919, 0.4...","[Tma, Aeg, Mep, Bam]",AegBamBamMepBamBamMepBamMepBamMepMepMepMepMepM...
9,000000002516062_00001,"[0.0701344200917373, 0.07525297678257471, 0.45...","[Tma, Aeg, Mep, Bam]",BamBamMepMepMepMepMepBamTmaBamMepMepMepMepBamM...


In [3]:
# Input artefacts handed to preprocess() and inference() in the cells below.
MON_SMILES_POLY = '../tables_poly/SMILES_polymers_monomer.txt'
BOND_SMILES_POLY = '../tables_poly/SMILES_polymers_bond.txt'
DESCRIPTORS = '../unique_descriptors.json'
SCALERS = '../scalers.pkl'
# Kept as a string with a trailing slash because file names are appended by concatenation.
MODEL_PATH = './model/'
# Forwarded unchanged to load_data(NUM_WORKERS=...).
NUM_WORKERS = 0

# Load the model configuration; later cells read its 'model' and 'batch_size' entries.
# The context manager closes the file handle deterministically (the previous
# json.load(open(...)) form left it open), and the encoding is pinned so the
# result does not depend on the platform default.
with open(MODEL_PATH + 'configure.json', encoding='utf-8') as config_file:
    hyperparameters = json.load(config_file)

In [4]:
%%capture
run_preprocess = preprocess(df_temp, MON_SMILES_POLY, BOND_SMILES_POLY, DESCRIPTORS, SCALERS, MODEL = hyperparameters['model'])
data = run_preprocess.main()

del run_preprocess
gc.collect()

infer_loader = load_data(DATA = data, BATCH_SIZE = hyperparameters['batch_size'], NUM_WORKERS = NUM_WORKERS)

run_inference = inference(GPU = 0, DATALOADER = infer_loader, HYPERPARAMETERS = hyperparameters, MODEL_PATH = MODEL_PATH)
run_inference.main()

## Comparing Results

In [5]:
# Load the round-1 "uniform" table. The original "ID" column is kept as "poly_ID",
# and the "Unnamed: 0" column (presumably the row index saved with the CSV — confirm)
# takes over the "ID" name.
df = pd.read_csv('../shoshana_polymers/round1/uniform.csv').rename(
    columns={"ID": "poly_ID", "Unnamed: 0": "ID"}
)
# Turn each ID into a string: add one to the stored value and append "_1".
df['ID'] = df['ID'].map(lambda row_number: f"{row_number + 1}_1")
df

Unnamed: 0,ID,poly_ID,sequence,seq_mol_dist,monomers,mon_SMILES,wt_%,mol_distribution,DP,MIC_ecoli,MIC_saureus,HC50,immunogenic,poly_SMILES
0,1_1,0,MoTmaNiTmaNiNiNiNiTmaNiMoTmaTmaTmaTmaTmaNiTmaM...,"[0.6190017782295035, 0.26909659071392517, 0.11...","['Tma', 'Ni', 'Mo']","['C[N+](C)(C)CCCNC(=O)C=C', 'CC(C)NC(=O)C=C', ...","[59.1, 30.7, 10.2]","[0.4542123849646637, 0.43100113585911337, 0.11...",70,>512,>512,>2000,No,CC(C)NC(=O)C(CC(CC(CC(CC(CC(CC(CC(CC(CC(CC(CC(...
1,2_1,0,MoTmaNiTmaTmaNiMoNiNiNiNiTmaTmaMoTmaTmaNiMoTma...,"[0.6479333604409899, 0.2660253703286092, 0.086...","['Tma', 'Ni', 'Mo']","['C[N+](C)(C)CCCNC(=O)C=C', 'CC(C)NC(=O)C=C', ...","[59.1, 30.7, 10.2]","[0.4542123849646637, 0.43100113585911337, 0.11...",70,>512,>512,>2000,No,CC(C)NC(=O)C(C)CC(CC(CC(CC(CC(CC(CC(CC(CC(CC(C...
2,3_1,0,NiNiTmaTmaTmaTmaNiMoTmaNiNiTmaNiTmaNiMoNiNiNiM...,"[0.6042840248218312, 0.2706589478626383, 0.125...","['Tma', 'Ni', 'Mo']","['C[N+](C)(C)CCCNC(=O)C=C', 'CC(C)NC(=O)C=C', ...","[59.1, 30.7, 10.2]","[0.4542123849646637, 0.43100113585911337, 0.11...",70,>512,>512,>2000,No,CC(C)NC(=O)CCC(CC(CC(CC(CC(CC(CC(CC(CC(CC(CC(C...
3,4_1,0,TmaNiTmaMoNiNiTmaTmaNiTmaNiTmaNiNiNiNiTmaTmaNi...,"[0.6013000166600996, 0.2493726018051085, 0.149...","['Tma', 'Ni', 'Mo']","['C[N+](C)(C)CCCNC(=O)C=C', 'CC(C)NC(=O)C=C', ...","[59.1, 30.7, 10.2]","[0.4542123849646637, 0.43100113585911337, 0.11...",70,>512,>512,>2000,No,CC(C)NC(=O)C(CCC(=O)NCCC[N+](C)(C)C)CC(CC(CC(C...
4,5_1,0,TmaNiNiNiTmaTmaTmaTmaTmaTmaNiTmaNiNiNiTmaTmaNi...,"[0.6118752466721683, 0.3248107351727601, 0.063...","['Tma', 'Ni', 'Mo']","['C[N+](C)(C)CCCNC(=O)C=C', 'CC(C)NC(=O)C=C', ...","[59.1, 30.7, 10.2]","[0.4542123849646637, 0.43100113585911337, 0.11...",70,>512,>512,>2000,No,CC(C)NC(=O)C(C)CC(CC(CC(CC(CC(CC(CC(CC(CC(CC(C...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4595,4596_1,22,OlamMepOlamTmaTmaMepMepTmaTmaTmaTmaTmaOlamTmaO...,"[0.5191608101589935, 0.3517510920377041, 0.129...","['Tma', 'Olam', 'Mep']","['C[N+](C)(C)CCCNC(=O)C=C', 'CCCCCCCC/C=C\\CCC...","[59.1, 30.7, 10.2]","[0.6315666917027046, 0.21106726063929626, 0.15...",70,128,64-32,>8000,Yes,CCCCCCCC/C=C\CCCCCCCCNC(=O)CCC(CC(CC(CC(CC(CC(...
4596,4597_1,22,OlamTmaTmaTmaOlamTmaTmaTmaTmaMepTmaTmaTmaTmaTm...,"[0.7177630911538243, 0.17500488507644904, 0.10...","['Tma', 'Olam', 'Mep']","['C[N+](C)(C)CCCNC(=O)C=C', 'CCCCCCCC/C=C\\CCC...","[59.1, 30.7, 10.2]","[0.6315666917027046, 0.21106726063929626, 0.15...",70,128,64-32,>8000,Yes,CCCCCCCC/C=C\CCCCCCCCNC(=O)CCC(CC(CC(CC(CC(CC(...
4597,4598_1,22,TmaTmaTmaTmaTmaOlamOlamTmaTmaTmaTmaTmaTmaTmaTm...,"[0.6162751986325532, 0.33925918380529807, 0.04...","['Tma', 'Olam', 'Mep']","['C[N+](C)(C)CCCNC(=O)C=C', 'CCCCCCCC/C=C\\CCC...","[59.1, 30.7, 10.2]","[0.6315666917027046, 0.21106726063929626, 0.15...",70,128,64-32,>8000,Yes,CCCCCCCC/C=C\CCCCCCCCNC(=O)C(CC(CC(C)C(=O)NCCC...
4598,4599_1,22,TmaOlamTmaTmaMepOlamTmaTmaTmaTmaTmaTmaTmaOlamT...,"[0.4910003160989371, 0.4305157894159844, 0.078...","['Tma', 'Olam', 'Mep']","['C[N+](C)(C)CCCNC(=O)C=C', 'CCCCCCCC/C=C\\CCC...","[59.1, 30.7, 10.2]","[0.6315666917027046, 0.21106726063929626, 0.15...",70,128,64-32,>8000,Yes,CCCCCCCC/C=C\CCCCCCCCNC(=O)C(C)CC(CC(CC(CC(CC(...


In [6]:
%%time
run_preprocess = preprocess(df, MON_SMILES_POLY, BOND_SMILES_POLY, DESCRIPTORS, SCALERS, MODEL = hyperparameters['model'])
data = run_preprocess.main()

del run_preprocess
gc.collect()

infer_loader = load_data(DATA = data, BATCH_SIZE = hyperparameters['batch_size'], NUM_WORKERS = NUM_WORKERS)

run_inference = inference(GPU = 0, DATALOADER = infer_loader, HYPERPARAMETERS = hyperparameters, MODEL_PATH = MODEL_PATH)
run_inference.main()

CPU times: user 7.89 s, sys: 1.81 s, total: 9.69 s
Wall time: 6.3 s


In [7]:
# Predictions to check, ordered by ID with a fresh 0..n-1 index.
# NOTE(review): results.txt is presumably written by the inference run above — confirm in utils.infer.
infer = pd.read_csv('results.txt').sort_values(by=['ID']).reset_index(drop=True)

# Reference predictions: keep only ID and y_pred, renaming y_pred to match `infer`.
from_training = pd.read_csv('model/val_model_on_infer_set/results.txt')[['ID', 'y_pred']].rename(columns={"y_pred": "pred"})
# Bring reference IDs into the same format as `infer`: remove the literal text 'SID'
# (regex=False makes the literal match explicit on every pandas version) and append '_1'.
from_training['ID'] = from_training['ID'].str.replace('SID', '', regex=False) + '_1'
from_training = from_training.sort_values(by=['ID']).reset_index(drop=True)

# The following cells subtract predictions by index position, so both frames must list
# exactly the same IDs in the same order. Fail loudly instead of only displaying the
# check, then still show the result as the cell output.
ids_match = list(infer['ID']) == list(from_training['ID'])
assert ids_match, "ID columns of infer and from_training differ; row-wise comparison would be invalid"
ids_match

True

In [8]:
# Per-row gap between the two sets of predictions. Rows are paired by index label,
# which lines up because both frames were sorted by ID and re-indexed.
infer['diff'] = infer['pred'].sub(from_training['pred'])

In [9]:
# Worst-case disagreement between the two prediction sets. The magnitude is taken first
# so that a large negative difference is not hidden by a signed maximum, and the pandas
# reduction skips NaN instead of returning an order-dependent result.
infer['diff'].abs().max()

7.271766662597656e-06

In [10]:
# How often each distinct difference value occurs, listed from rarest to most frequent.
(
    infer['diff']
    .value_counts()
    .sort_values(ascending=True)
)

diff
5.364418e-06       1
4.410744e-06       1
7.376075e-07       1
4.172325e-07       1
7.748604e-07       1
                ... 
2.652407e-06      22
2.682209e-06      23
2.413988e-06      24
3.159046e-06      29
0.000000e+00    1605
Name: count, Length: 1027, dtype: int64