In [None]:
import os
import random
import subprocess

import matplotlib.pyplot as plt
import pandas as pd

In [None]:
def draw_curve(filename: str, sep=';') -> None:
    """Plot train and test loss against epoch from a delimited text file.

    Args:
        filename: Path to a delimited file containing the columns
            ``epoch``, ``train_loss`` and ``test_loss``.
        sep: Column separator forwarded to ``pandas.read_csv``.

    Raises:
        KeyError: If any of the required columns is missing from the file.
    """
    df = pd.read_csv(filename, sep=sep)

    # Fail early with a readable message: a wrong `sep` usually collapses the
    # file into a single column, which would otherwise show up as a bare KeyError.
    required = ('epoch', 'train_loss', 'test_loss')
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(
            f"{filename}: missing column(s) {missing}; found {list(df.columns)}"
        )

    # Explicit Axes interface instead of the implicit pyplot state machine.
    _, ax = plt.subplots(figsize=(10, 4))
    ax.set_title("Loss vs Epoch")
    ax.plot(df['epoch'], df['train_loss'], label="Train loss")
    ax.plot(df['epoch'], df['test_loss'], label="Test loss")
    ax.set_xlabel("Epoch")
    ax.set_ylabel("Loss")
    ax.legend()

## Train the model

In [None]:
# Argument lists for the two initial training runs. They are handed to
# subprocess as lists (no shell), so every element reaches the interpreter
# verbatim: stray whitespace inside an element becomes part of the script
# name or option value.

# Training on COCONUT_CHO.txt with --save_dir=./save_2.
cmd = [
    'python',
    '-u',
    'train.py',
    '--smiles_data=COCONUT_CHO.txt',
    '--vocab_from=COCONUT_CHO.txt',
    '--save_dir=./save_2',
    '--lr=1e-4',
]

# Training on COCONUT_hepato_CHO.txt with --save_dir=./save_3.
cmd_with_hepato = [
    'python',
    '-u',
    'train.py',
    '--smiles_data=COCONUT_hepato_CHO.txt',
    '--vocab_from=COCONUT_hepato_CHO.txt',
    '--save_dir=./save_3',
    '--lr=1e-4',
]


#### training without hepato

In [None]:
# Run the training command with its stdout redirected to a log file.
# check_call raises CalledProcessError on a non-zero exit status, so a failed
# run stops the notebook here instead of passing unnoticed.
with open('training_logs.txt', 'w') as out:
    return_code = subprocess.check_call(cmd, stdout=out)

#### training with hepato

In [None]:
# Run the hepato training command with its stdout redirected to a log file.
# check_call raises CalledProcessError on a non-zero exit status, so a failed
# run stops the notebook here instead of passing unnoticed.
with open('hepato_training_logs.txt', 'w') as out:
    return_code = subprocess.check_call(cmd_with_hepato, stdout=out)

### Retrain the model from a pretrained checkpoint with a low learning rate

### Retraining from different checkpoints

In [None]:
# Retrain for 50 epochs at lr=1e-5, starting from ./save_2/model_99.ckpt-99.
retrain_cmd_100_50 = [
    'python',
    '-u',
    'train.py',
    '--smiles_data=chembl_hepato.txt',
    '--vocab_from=COCONUT_CHO.txt',
    '--save_dir=./save_2_2000_50',
    '--lr=1e-5',
    '--num_epochs=50',
    '--pretrained=./save_2/model_99.ckpt-99',
]

# stdout goes to the log file; check_call raises CalledProcessError on a
# non-zero exit status so a failed run is not silently ignored.
with open('retraining_100_50_logs.txt', 'w') as out:
    return_code = subprocess.check_call(retrain_cmd_100_50, stdout=out)

In [None]:
# Retrain for 50 epochs at lr=1e-5, starting from ./save_2/model_44.ckpt-44.
retrain_cmd_45_50 = [
    'python',
    '-u',
    'train.py',
    '--smiles_data=2000_smiles.txt',
    '--vocab_from=COCONUT_CHO_drop_bad_stuff.txt',
    '--save_dir=./save_2_2000_45_50',
    '--lr=1e-5',
    '--num_epochs=50',
    '--pretrained=./save_2/model_44.ckpt-44',
]

# stdout goes to the log file; check_call raises CalledProcessError on a
# non-zero exit status so a failed run is not silently ignored.
with open('retrain_cmd_45_50.txt', 'w') as out:
    return_code = subprocess.check_call(retrain_cmd_45_50, stdout=out)

In [None]:
# Retrain for 50 epochs at lr=1e-5, starting from ./save_2/model_39.ckpt-39.
# The arguments are passed without a shell, so each value must be free of
# stray whitespace (a trailing space would become part of the file name).
retrain_cmd_40_50 = [
    'python',
    '-u',
    'train.py',
    '--smiles_data=2000_smiles.txt',
    '--vocab_from=COCONUT_CHO_drop_bad_stuff.txt',
    '--save_dir=./save_2_2000_39_50',
    '--lr=1e-5',
    '--num_epochs=50',
    '--pretrained=./save_2/model_39.ckpt-39',
]

# stdout goes to the log file; check_call raises CalledProcessError on a
# non-zero exit status so a failed run is not silently ignored.
with open('retrain_cmd_40_50.txt', 'w') as out:
    return_code = subprocess.check_call(retrain_cmd_40_50, stdout=out)

### Extracting train and test loss from the logs and saving it to .csv

### Plotting the results

In [None]:
# Loss curves read from the tab-separated file retrain_cmd_100_50.csv.
draw_curve(filename="retrain_cmd_100_50.csv", sep='\t')

In [None]:
# Loss curves read from the tab-separated file retrain_cmd_45_50.csv.
draw_curve(filename="retrain_cmd_45_50.csv", sep='\t')

In [None]:
# Loss curves read from the tab-separated file retrain_cmd_40_50.csv.
draw_curve(filename="retrain_cmd_40_50.csv", sep='\t')

### SMILES sampling

In [None]:
# Three sampling runs from the same checkpoint, 500000 iterations each.
# NOTE(review): the checkpoint directory save_2_2000_44_50 does not match any
# --save_dir used by the retraining commands in this notebook
# (save_2_2000_50, save_2_2000_45_50, save_2_2000_39_50) -- confirm the path.
# NOTE(review): sampling_cmd reads its vocabulary from COCONUT_CHO.txt while
# the other two use COCONUT_CHO_drop_bad_stuff.txt -- confirm this is intended.
sampling_cmd = [
    'python',
    'sample.py',
    '--vocab_from=COCONUT_CHO.txt',
    '--save_file=save_2_2000_44_50/model_49.ckpt-49',
    '--result_filename=result_COCONUT_99_9_2000_44_50.txt',
    '--num_iteration=500000',
]

# sampling_cmd_a and sampling_cmd_b differ only in the result file name.
sampling_cmd_a = [
    'python',
    'sample.py',
    '--vocab_from=COCONUT_CHO_drop_bad_stuff.txt',
    '--save_file=save_2_2000_44_50/model_49.ckpt-49',
    '--result_filename=result_COCONUT_99_9_2000_44_50a.txt',
    '--num_iteration=500000',
]

sampling_cmd_b = [
    'python',
    'sample.py',
    '--vocab_from=COCONUT_CHO_drop_bad_stuff.txt',
    '--save_file=save_2_2000_44_50/model_49.ckpt-49',
    '--result_filename=result_COCONUT_99_9_2000_44_50b.txt',
    '--num_iteration=500000',
]

# Run the commands one after another, each with stdout sent to its own log
# file. check_call raises CalledProcessError on a non-zero exit status, so a
# failed sampling run stops the cell instead of being silently ignored.
for log_name, command in (
    ('sampling_cmd_44_50.txt', sampling_cmd),
    ('sampling_cmd_a.txt', sampling_cmd_a),
    ('sampling_cmd_b.txt', sampling_cmd_b),
):
    with open(log_name, 'w') as out:
        return_code = subprocess.check_call(command, stdout=out)

### Drawing SMILES

In [None]:
from rdkit import Chem
from rdkit.Chem import Draw, AllChem

In [None]:
def draw_random(smiles, x=4, y=4):
    """Draw a grid of molecules picked at random from ``smiles``.

    Args:
        smiles: Non-empty sequence of SMILES strings to pick from.
        x: Number of molecules per row.
        y: Number of rows.

    Returns:
        The result of ``Draw.MolsToGridImage`` for ``x * y`` molecules,
        picked from ``smiles`` with replacement.

    Raises:
        ValueError: If ``smiles`` is empty.
    """
    if len(smiles) == 0:
        raise ValueError("smiles must contain at least one entry")

    # Independent picks, so the same SMILES may appear more than once.
    picked = [random.choice(smiles) for _ in range(x * y)]

    # NOTE(review): Chem.MolFromSmiles returns None for strings it cannot
    # parse; confirm the installed RDKit's MolsToGridImage accepts None entries.
    mols = [Chem.MolFromSmiles(entry) for entry in picked]
    return Draw.MolsToGridImage(mols, molsPerRow=x, subImgSize=(200, 200))

In [None]:
# Read the sampled SMILES, one per line, stripping surrounding whitespace.
# The context manager closes the file handle as soon as reading is done.
with open("result_COCONUT_99_9_2000_44_50.txt") as smiles_file:
    smiles = [smile.strip() for smile in smiles_file]

In [None]:
# Display a 6 x 6 grid of randomly picked entries from `smiles`.
draw_random(smiles=smiles, x=6, y=6)

### Analysing the results

In [None]:
# Compare the two result files: total and unique entries per file, and the
# number of unique entries the two files have in common.
a = pd.read_csv("result_COCONUT_99_9_2000_44_50a.txt", header=None)[0]
b = pd.read_csv("result_COCONUT_99_9_2000_44_50b.txt", header=None)[0]

# Build each set once; both are reused for the counts and the intersection.
unique_a, unique_b = set(a), set(b)
{
    "iterations": 500000,
    "len(a)": len(a),
    "len(set(a))": len(unique_a),
    "len(b)": len(b),
    "len(set(b))": len(unique_b),
    "len(a&b)": len(unique_a & unique_b)
}