In [1]:
import pandas as pd
import glob

# big_smiles 폴더 안의 polyBERT_len85_로 시작하는 모든 csv 경로 가져오기
paths = sorted(glob.glob("big_smiles/polyBERT_len85_*.csv"))

# 각 파일을 읽어서 리스트로 만든 다음
dfs = [pd.read_csv(p) for p in paths]

# 하나의 DataFrame으로 합치기
df = pd.concat(dfs, ignore_index=True)

print(len(paths), "files loaded")
print(df.shape)


50 files loaded
(4926212, 2)


In [2]:
big_smiles=df['0'].tolist()

In [3]:
# bigsmiles_chemberta_mlm_with_val.py

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)
import torch

# ---------------------------------------------------------
# 0. BigSMILES 리스트 준비
# ---------------------------------------------------------
# 여기만 네 데이터로 바꾸면 됨
big_smiles = [
    "{$C(=O)c1cc(c2ccc(c3ccccc3)cc2)ccc1C1CCC(CC1)C(=O)$}",
    "{[*]CC(=O)O[*]}",
    "{[*]CCO[*]}",
] * 1000

dataset = Dataset.from_dict({"text": big_smiles})

# ---------------------------------------------------------
# 1. train / validation split
# ---------------------------------------------------------
split = dataset.train_test_split(test_size=0.1, seed=42)
raw_train = split["train"]
raw_val   = split["test"]

print(raw_train, raw_val)

# ---------------------------------------------------------
# 2. 토크나이저 & 모델 (ChemBERTa → BigSMILES 도메인 적응)
# ---------------------------------------------------------
base_model_name = "DeepChem/ChemBERTa-77M-MTR"

tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# BigSMILES에서 자주 나오는 특수 토큰 추가
special_bigsmiles_tokens = ["{", "}", "$", "[*]"]
new_tokens = []
for tok in special_bigsmiles_tokens:
    if tok not in tokenizer.vocab:
        new_tokens.append(tok)

if new_tokens:
    num_added = tokenizer.add_tokens(new_tokens)
    print(f"Added {num_added} new tokens: {new_tokens}")
else:
    num_added = 0
    print("No new tokens added.")

model = AutoModelForMaskedLM.from_pretrained(base_model_name)

if num_added > 0:
    model.resize_token_embeddings(len(tokenizer))

# ---------------------------------------------------------
# 3. 토크나이즈 함수
# ---------------------------------------------------------
def tokenize_fn(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=256,
    )

tokenized_train = raw_train.map(
    tokenize_fn,
    batched=True,
    remove_columns=["text"],
)
tokenized_val = raw_val.map(
    tokenize_fn,
    batched=True,
    remove_columns=["text"],
)

# ---------------------------------------------------------
# 4. MLM용 data collator (BERT-style 마스킹 자동)
# ---------------------------------------------------------
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# ---------------------------------------------------------
# 5. TrainingArguments (validation + TensorBoard)
# ---------------------------------------------------------
training_args = TrainingArguments(
    output_dir="bigsmiles-mlm",
    per_device_train_batch_size=32,
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_ratio=0.06,

    logging_strategy="steps",
    logging_steps=100,

    evaluation_strategy="epoch",   # 매 epoch 끝날 때 val loss 계산
    save_strategy="epoch",         # 매 epoch 체크포인트 저장
    save_total_limit=2,
    load_best_model_at_end=True,   # 가장 낮은 eval_loss 모델로 롤백
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    prediction_loss_only=True,

    # TensorBoard 설정
    logging_dir="runs/bigsmiles_mlm",
    report_to=["tensorboard"],     # wandb 안 쓰고 tensorboard만
)

# ---------------------------------------------------------
# 6. Trainer
# ---------------------------------------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
)

# ---------------------------------------------------------
# 7. 학습
# ---------------------------------------------------------
trainer.train()

# ---------------------------------------------------------
# 8. 최종 모델/토크나이저 저장
# ---------------------------------------------------------
save_dir = "bigsmiles-mlm-final"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"Saved best model & tokenizer to {save_dir}")



Added 4 new tokens: ['{', '}', '$', '[*]']


Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MTR and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4926212 [00:00<?, ? examples/s]

Step,Training Loss
100,15.263
200,14.8954
300,13.9713
400,12.7889
500,11.367
600,9.7776
700,8.256
800,7.166
900,6.1719
1000,5.5959


Saved to bigsmiles-mlm/
