In [1]:
import sys
import os

# Restrict this process to a single GPU (device "0"). Per the original note,
# the machine has four GPUs ("0"-"3") and one is enough because the
# fine-tuning dataset is small. This runs before `import torch` below so the
# restriction is already in place when CUDA is initialised.
os.environ.update(CUDA_VISIBLE_DEVICES="0")


import torch
import numpy as np
import pandas as pd
import warnings
import lightning as L
# Select the "high" float32 matmul precision setting (see the PyTorch docs
# for the precision/speed trade-off this implies).
torch.set_float32_matmul_precision("high")

# Ignore warnings attributed to pl_bolts modules (the original note names
# FutureWarning and UnderReviewWarning).
# NOTE(review): the saved output of this cell still echoes pl_bolts source
# lines, so this filter may not catch everything - confirm.
warnings.filterwarnings(action="ignore", module="pl_bolts")

# Put the parent directory on the import path.
sys.path.append("../")

from lightning.pytorch.loggers import CSVLogger
from lightning.pytorch.callbacks import ModelCheckpoint
import tokenizer_sol, datamodule_finetune_sol, model_finetune_sol, chemllama_mtr, utils_sol
import auto_evaluator_sol

# print(os.path.dirname(__file__))


# Seed every random number generator this notebook can reach so that reruns
# are comparable. Python's built-in `random` is seeded as well, in case any
# imported code draws from it; a single SEED constant keeps the generators
# in sync.
import random

SEED = 1004
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Show the working directory: the relative paths used in this notebook
# ("../", "./") are resolved against it.
print(os.getcwd())

2024-06-17 14:03:11.986303: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-17 14:03:12.022183: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-17 14:03:12.022211: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-17 14:03:12.022232: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-17 14:03:12.028812: I tensorflow/core/platform/cpu_feature_g

/home/ylee/SolLlama


  "lr_options": generate_power_seq(LEARNING_RATE_CIFAR, 11),
  contrastive_task: Union[FeatureMapContrastiveTask] = FeatureMapContrastiveTask("01, 02, 11"),
  self.nce_loss = AmdimNCELoss(tclip)


In [None]:
# Fetch the pretrained model through the project helper (per the original
# note, it is cloned from a git repo).
# NOTE(review): presumably this provides the checkpoint that a later cell
# loads from the `model_mtr/` directory - confirm against utils_sol.
utils_sol.get_pretrained_model()

In [2]:
#### Hyper Parameters ##### <- tune these values freely
# Which dataset to fine-tune on: 'solute' or 'solvent'.
solute_or_solvent = 'solute'

# Version tag for the fine-tuned model and evaluation data; per the original
# note, reusing a value overwrites the earlier models and results.
ver_ft = 0

# Batch sizes laid out as [train, valid(test)].
# NOTE(review): the original remark said 'solute' is a very small dataset and
# that 10/10 should be the maximum, yet the code gives 'solute' 64/64 and the
# other case 10/10 - confirm which one is intended.
if solute_or_solvent == 'solute':
    batch_size_pair = [64, 64]
else:
    batch_size_pair = [10, 10]

lr = 1e-4  # learning rate
epochs = 7  # number of training epochs
use_freeze = False  # whether to freeze the model; False means no freezing
overwrite_level_2 = True  # set to False to avoid overwriting the models and csv files
###########################

In [3]:
# Sequence-length limit shared by the tokenizer and the data module
# (`max_length` is the alias the data-module cell reads).
max_seq_length = 512
max_length = max_seq_length

# Value later handed to the data module's `num_device` argument.
num_workers = 2

# Load the project's Llama tokenizer with that length limit.
tokenizer = tokenizer_sol.fn_load_tokenizer_llama(max_seq_length=max_seq_length)

In [4]:
################################
# Target epoch whose fine-tuned checkpoint is reloaded for prediction.
ep = 7
# Project root; every path below is built from it. Point this at an absolute
# path if the notebook is not started from the project directory.
dir_main = "./"
#################################

# File name of the pretrained checkpoint that fine-tuning starts from.
name_model_mtr = "ChemLlama_Medium_30m_vloss_val_loss=0.029_ep_epoch=04.ckpt"

# os.path.join avoids the doubled separator (".//model_mtr/...") that plain
# f-string concatenation produced, because dir_main already ends in "/".
dir_model_mtr = os.path.join(dir_main, "model_mtr", name_model_mtr)

# `ver_ft` comes from the hyper-parameter cell. It is deliberately not
# reassigned here, so the value chosen there actually takes effect.
dir_model_ft_to_save = os.path.join(dir_main, "save_models_ft", f"ft_version_{ver_ft}")
name_model_ft = f'AbraLlama_{solute_or_solvent}'

In [5]:
# Build the fine-tuning data module.
# `batch_size_pair` is indexed as [0] -> training, [1] -> validation.
batch_size_for_train, batch_size_for_valid = batch_size_pair[0], batch_size_pair[1]

data_module = datamodule_finetune_sol.CustomFinetuneDataModule(
    tokenizer=tokenizer,
    solute_or_solvent=solute_or_solvent,
    max_seq_length=max_length,
    batch_size_train=batch_size_for_train,
    batch_size_valid=batch_size_for_valid,
    # NOTE(review): `num_device` is fed the `num_workers` value - confirm
    # this is the meaning the data module expects.
    num_device=num_workers,
)

# Prepare and set up the datasets right away so that the number of training
# batches per epoch is known before the model is constructed.
data_module.prepare_data()
data_module.setup()
steps_per_epoch = len(data_module.train_dataloader())

# Restore the pretrained model from its checkpoint file.
model_mtr = chemllama_mtr.ChemLlama.load_from_checkpoint(dir_model_mtr)

learning_rate = lr

# Wrap the pretrained model for fine-tuning.
# NOTE(review): steps_per_epoch / warmup_epochs / max_epochs are presumably
# consumed by the learning-rate schedule - confirm in model_finetune_sol.
model_ft = model_finetune_sol.CustomFinetuneModel(
    model_mtr=model_mtr,
    learning_rate=learning_rate,
    use_freeze=use_freeze,
    steps_per_epoch=steps_per_epoch,
    warmup_epochs=1,
    max_epochs=epochs,
)

# Checkpointing: monitor 'val_loss', checkpoint every epoch, keep all of
# them (save_top_k=-1), and store weights only. File names follow
# "<name_model_ft>_{epoch:02d}".
checkpoint_callback = ModelCheckpoint(
    filename=f"{name_model_ft}_{{epoch:02d}}",
    monitor='val_loss',
    save_top_k=-1,
    every_n_epochs=1,
    save_weights_only=True,
    enable_version_counter=False,  # keep the version == 0
)
# Override the extension used for the checkpoint files.
checkpoint_callback.FILE_EXTENSION = ".pt"

# CSV metrics logger rooted at the fine-tuning output directory.
# NOTE(review): `version` is pinned to 0, so reruns reuse the same log
# directory; the saved output shows a Lightning message about that
# directory - confirm overwriting is acceptable.
csv_logger = CSVLogger(
    name=name_model_ft,
    save_dir=dir_model_ft_to_save,
    version=0,
)

# Trainer: automatic accelerator/device selection, full 32-bit precision,
# between 1 and `epochs` epochs, logging to CSV and checkpointing through
# the callback configured earlier in this cell.
trainer = L.Trainer(
    accelerator='auto',
    devices='auto',
    precision=32,
    min_epochs=1,
    max_epochs=epochs,
    logger=csv_logger,
    callbacks=[checkpoint_callback],
    default_root_dir=dir_model_ft_to_save,
)

# Fine-tune, then run one more validation pass; the value returned by
# `validate` is what this cell displays.
trainer.fit(model_ft, data_module)
trainer.validate(model_ft, data_module)

/home/ylee/chemllm/lib/python3.11/site-packages/lightning/fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3 /opt/software/jupyter-server/2.7.2-GCCcore-12.3.0/l ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/ylee/chemllm/lib/python3.11/site-packages/lightning/fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3 /opt/software/jupyter-server/2.7.2-GCCcore-12.3.0/l ...
/home/ylee/chemllm/lib/python3.11/site-packages/lightning/fabric/loggers/csv_logs.py:268: Experiment logs directory .//save_models_ft/ft_version_0/AbraLlama_solute/version_0 ex

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

  "labels": torch.tensor(self.labels.iloc[idx]),
  "labels": torch.tensor(self.labels.iloc[idx]),


Training: |          | 0/? [00:00<?, ?it/s]

  "labels": torch.tensor(self.labels.iloc[idx]),
  "labels": torch.tensor(self.labels.iloc[idx]),
  "labels": torch.tensor(self.labels.iloc[idx]),


Validation: |          | 0/? [00:00<?, ?it/s]

  "labels": torch.tensor(self.labels.iloc[idx]),
  "labels": torch.tensor(self.labels.iloc[idx]),
  "labels": torch.tensor(self.labels.iloc[idx]),
  "labels": torch.tensor(self.labels.iloc[idx]),
  "labels": torch.tensor(self.labels.iloc[idx]),


Validation: |          | 0/? [00:00<?, ?it/s]

  "labels": torch.tensor(self.labels.iloc[idx]),
  "labels": torch.tensor(self.labels.iloc[idx]),
  "labels": torch.tensor(self.labels.iloc[idx]),


Validation: |          | 0/? [00:00<?, ?it/s]

  "labels": torch.tensor(self.labels.iloc[idx]),
  "labels": torch.tensor(self.labels.iloc[idx]),
  "labels": torch.tensor(self.labels.iloc[idx]),
  "labels": torch.tensor(self.labels.iloc[idx]),


Validation: |          | 0/? [00:00<?, ?it/s]

  "labels": torch.tensor(self.labels.iloc[idx]),
  "labels": torch.tensor(self.labels.iloc[idx]),
  "labels": torch.tensor(self.labels.iloc[idx]),
  "labels": torch.tensor(self.labels.iloc[idx]),
  "labels": torch.tensor(self.labels.iloc[idx]),


Validation: |          | 0/? [00:00<?, ?it/s]

  "labels": torch.tensor(self.labels.iloc[idx]),
  "labels": torch.tensor(self.labels.iloc[idx]),
  "labels": torch.tensor(self.labels.iloc[idx]),
  "labels": torch.tensor(self.labels.iloc[idx]),


Validation: |          | 0/? [00:00<?, ?it/s]

  "labels": torch.tensor(self.labels.iloc[idx]),
  "labels": torch.tensor(self.labels.iloc[idx]),
  "labels": torch.tensor(self.labels.iloc[idx]),
  "labels": torch.tensor(self.labels.iloc[idx]),


Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=7` reached.
/home/ylee/chemllm/lib/python3.11/site-packages/lightning/fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3 /opt/software/jupyter-server/2.7.2-GCCcore-12.3.0/l ...
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  "labels": torch.tensor(self.labels.iloc[idx]),


Validation: |          | 0/? [00:00<?, ?it/s]

  "labels": torch.tensor(self.labels.iloc[idx]),


[{'val_loss': 0.07019355148077011}]

In [6]:

# Reload the fine-tuned weights saved for the target epoch `ep`.
local_model_ft = utils_sol.load_model_ft_with_epoch(
    class_model_ft=model_ft,
    target_epoch=ep,
    dir_model_ft=dir_model_ft_to_save,
    name_model_ft=name_model_ft,
)

# Run inference. The loop below treats `result` as one entry per batch, each
# indexable as (prediction, label).
result = trainer.predict(local_model_ft, data_module)

result_pred = []
result_label = []
for batch_idx, batch_output in enumerate(result):
    # NOTE(review): the saved run of this cell ended with
    # "IndexError: list index out of range". The root cause is not visible
    # from this notebook; this check makes a malformed batch output report
    # what was actually received instead of failing on a bare index.
    if len(batch_output) < 2:
        raise IndexError(
            f"Prediction batch {batch_idx} has {len(batch_output)} element(s); "
            "expected a (prediction, label) pair."
        )
    # NOTE(review): `.squeeze()` presumably also drops the batch axis when a
    # batch holds a single sample - confirm downstream code tolerates that.
    result_pred.append(batch_output[0].squeeze())
    result_label.append(batch_output[1])

Loaded model with epoch 7


IndexError: list index out of range

In [None]:
# Inspect the predictions collected for the first batch.
result_pred[0]

In [None]:
# Inspect the labels collected for the first batch.
result_label[0]

# Collect predictions and labels into DataFrames

In [None]:
# SMILES column of the data module's `test_df`, re-indexed from 0.
data_module.test_df['SMILES'].reset_index(drop=True)

In [None]:
# One DataFrame per batch of predictions, stacked into a single table.
list_df_predict = [pd.DataFrame(np.array(batch_pred)) for batch_pred in result_pred]

# ignore_index=True matters here: without it every batch keeps its own
# 0..batch_size-1 index, and `insert` (which aligns a Series on index labels)
# would assign the first batch's SMILES to the rows of every later batch.
df_predict = pd.concat(list_df_predict, ignore_index=True)

# Attach the SMILES strings as the first column, matched by row position.
# NOTE(review): this assumes the predictions come from `test_df` in its
# original order - confirm against the data module's predict dataloader.
df_predict.insert(0, "SMILES", data_module.test_df['SMILES'].reset_index(drop=True))
df_predict


In [None]:
# Copy of the data module's `test_df` with a fresh 0-based index, displayed
# as this cell's output.
df_label = data_module.test_df.reset_index(drop=True)
df_label