In [1]:
import pandas as pd
import torch
from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration
from transformers import Trainer, TrainingArguments
from datasets import Dataset
import warnings
# NOTE(review): this suppresses every warning for the whole session, including
# deprecation and numerical warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')




In [2]:
# 1. Read the preprocessed training split from CSV and report its row count
train_df = pd.read_csv('../data/mentalchat16k_train.csv')
print(f"Training samples: {train_df.shape[0]}")

Training samples: 11186


In [3]:
# Summarize the frame: row count, column names, non-null counts and dtypes.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11186 entries, 0 to 11185
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    11186 non-null  object
 1   target  11186 non-null  object
dtypes: object(2)
memory usage: 174.9+ KB


In [4]:
# 2. Load the pretrained model and its matching tokenizer.
# The distilled 400M checkpoint is the small variant, chosen for speed.
model_name = "facebook/blenderbot-400M-distill"

model = BlenderbotForConditionalGeneration.from_pretrained(model_name)
tokenizer = BlenderbotTokenizer.from_pretrained(model_name)


In [5]:
# 3. Tokenize inputs and targets
def tokenize(batch):
    """Encode a batch of text/target pairs into model inputs and labels.

    Reads ``batch['text']`` and ``batch['target']``, runs each through the
    module-level ``tokenizer`` (padded and truncated to 128 tokens), and adds
    ``input_ids``, ``attention_mask`` and ``labels`` to ``batch`` in place.

    Padding positions in ``labels`` are replaced with -100 so that the loss
    ignores them.

    Returns:
        The same ``batch`` object, with the three new keys set.
    """
    encode_opts = {'padding': 'max_length', 'truncation': True, 'max_length': 128}

    source_enc = tokenizer(batch['text'], **encode_opts)
    target_enc = tokenizer(batch['target'], **encode_opts)

    batch['input_ids'] = source_enc['input_ids']
    batch['attention_mask'] = source_enc['attention_mask']

    # Mask out padding in the targets: every pad token id becomes -100.
    pad_id = tokenizer.pad_token_id
    batch['labels'] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in target_enc['input_ids']
    ]
    return batch

# Wrap the pandas frame in a Hugging Face Dataset.
train_dataset = Dataset.from_pandas(train_df)

# Tokenize in batches of 8 rows.
train_dataset = train_dataset.map(tokenize, batch_size=8, batched=True)

# Drop the raw string columns (plus the pandas index column, if one was
# carried over) to avoid issues during training.
train_dataset = train_dataset.remove_columns(
    ['text', 'target']
    + (['__index_level_0__'] if '__index_level_0__' in train_dataset.column_names else [])
)


Map:   0%|          | 0/11186 [00:00<?, ? examples/s]

In [None]:
# 4. Configure the training run
training_args = TrainingArguments(
    # Output location; overwriting an existing directory is allowed.
    output_dir='./blenderbot_mentalhealth_finetuned',
    overwrite_output_dir=True,
    push_to_hub=False,
    # Schedule, batch size and precision.
    num_train_epochs=2,
    per_device_train_batch_size=8,
    fp16=False,
    # Checkpointing: save every 500 steps, with the checkpoint limit set to 1.
    save_steps=500,
    save_total_limit=1,
    # Logging: every 100 steps, with no reporting integration.
    logging_steps=100,
    report_to="none",
    # Dataset columns are passed through as-is rather than pruned automatically.
    remove_unused_columns=False,
)


In [10]:
# 5. Build the Trainer from the model, the training arguments and the tokenized dataset
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)


In [None]:
# 6. Train
trainer.train()

# 7. Save model and tokenizer
# Reuse the output directory the Trainer was configured with instead of
# repeating the path literal, so the final model, the tokenizer and the
# intermediate checkpoints always end up in the same place.
output_dir = trainer.args.output_dir
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print("Training complete and model saved.")


Step,Training Loss
100,2.4077
200,2.0076
300,1.8691
400,1.7987
500,1.7476
600,1.7055
700,1.6514
800,1.6387
900,1.6318
1000,1.5873


Training complete and model saved.
