🚀 Step 2: Incremental Training Loop (Run Anytime)

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import load_from_disk
import torch
import os
import time

In [None]:
# ✅ **1. Load Preprocessed Dataset**
# Reads the dataset back from the local "./processed_data" directory.
# NOTE(review): presumably written by an earlier preprocessing step
# (this notebook is labelled "Step 2") — confirm the directory exists first.
print("📥 Loading preprocessed dataset...")
dataset = load_from_disk(dataset_path="./processed_data")


In [None]:
# ✅ **2. Tokenizer Initialization**
# The tokenizer vocabulary comes from the same "bert-base-uncased" checkpoint
# that is used for a freshly initialized model further down.
print("📝 Initializing tokenizer...")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# The collator is handed the tokenizer so it can pad the examples of a batch.
data_collator = DataCollatorWithPadding(tokenizer)


In [None]:
# ✅ **3. Chunk the Dataset** — Adjust the size as needed
# CHUNK_SIZE is the number of examples trained per chunk; the final chunk may
# be smaller when the dataset length is not a multiple of CHUNK_SIZE.
# NOTE(review): assumes `dataset` supports len() as a row count (a single
# split rather than a dict of splits) — TODO confirm.
CHUNK_SIZE = 10000

# Ceiling division: one chunk per CHUNK_SIZE-wide stride over the dataset.
num_chunks = len(range(0, len(dataset), CHUNK_SIZE))
print(f"🔀 Splitting dataset into {num_chunks} chunks of size {CHUNK_SIZE}.")


In [None]:
# ✅ **4. Initialize or Load Model**
# Directory where the model is saved after every chunk and reloaded from on
# the next run.
checkpoint_path = "./goemotions_ekman_model"

# Resume only when the directory actually holds a saved model. Checking for
# the directory alone is not enough: an empty or partially created directory
# would make `from_pretrained` fail, so look for the saved config instead.
if os.path.isfile(os.path.join(checkpoint_path, "config.json")):
    print("🔄 Loading model from checkpoint...")
    model = BertForSequenceClassification.from_pretrained(checkpoint_path)
else:
    print("🚀 Initializing a new model...")
    # NOTE(review): 7 labels presumably means the six Ekman emotions plus
    # neutral — confirm against the label mapping used in preprocessing.
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=7)


In [None]:
# ✅ **5. Training Arguments (Shared across all chunks)**
# One TrainingArguments object is built here and reused for every chunk.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    # --- Optimization: a single pass over whatever train dataset is set ---
    num_train_epochs=1,
    per_device_train_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    # --- Evaluation: switched off, so there is no "best" model to reload ---
    evaluation_strategy="no",
    load_best_model_at_end=False,
    # --- Intermediate checkpoints: every 500 steps, keep at most 2 ---
    output_dir="./results",
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    # --- Logging: every 50 steps, plus the very first step ---
    logging_dir="./logs",
    logging_steps=50,
    logging_first_step=True,
)

In [None]:
# ✅ **6. Trainer Initialization**
# No train_dataset is passed here: the training loop further down assigns
# `trainer.train_dataset` for each chunk before calling `trainer.train()`.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
# ✅ **7. Resume State (which chunk to train next)**
# `last_checkpoint` is the number of chunks already completed, which is also
# the zero-based index of the next chunk to train. 0 means "start from the
# first chunk".
last_checkpoint = 0
checkpoint_file = os.path.join(checkpoint_path, "last_checkpoint.txt")

# Try loading the saved index. A missing file is the normal first-run case.
# An empty, non-numeric or negative value (e.g. left behind by an interrupted
# write) is reported and ignored instead of aborting with a bare ValueError.
try:
    with open(checkpoint_file, "r") as f:
        saved_index = int(f.read().strip())
    if saved_index < 0:
        raise ValueError(f"negative chunk index: {saved_index}")
except FileNotFoundError:
    pass  # Nothing saved yet; keep the default of 0.
except ValueError as err:
    print(f"⚠️ Ignoring unreadable checkpoint index in {checkpoint_file} ({err}); starting from chunk 1.")
else:
    last_checkpoint = saved_index
    print(f"🔄 Resuming from checkpoint {last_checkpoint + 1}")


In [None]:
# ✅ **8. Training Loop with Time Estimation**
# Trains on one chunk at a time, starting at the chunk index stored in
# `last_checkpoint`. After every chunk the model is saved to `checkpoint_path`
# and the index of the next chunk is written to `checkpoint_file`, so the
# notebook can be stopped and resumed later.
# NOTE(review): `trainer.train()` is called repeatedly on the same Trainer;
# whether optimizer / LR-scheduler state is reset between calls depends on
# the installed transformers version — confirm this is the intended behavior.
chunk_durations = []  # Seconds spent on each chunk trained in this session

for i in range(last_checkpoint, num_chunks):
    print(f"\n📝 Training on chunk {i + 1} of {num_chunks}...")
    start_idx = i * CHUNK_SIZE
    end_idx = min(start_idx + CHUNK_SIZE, len(dataset))  # Last chunk may be short
    chunk_data = dataset.select(range(start_idx, end_idx))

    # 🔄 **Track time per chunk and calculate remaining time**
    # perf_counter is monotonic, so the measurement is unaffected by system
    # clock adjustments during a long run.
    start_time = time.perf_counter()
    trainer.train_dataset = chunk_data
    trainer.train()
    end_time = time.perf_counter()

    time_taken_for_chunk = end_time - start_time
    chunk_durations.append(time_taken_for_chunk)
    print(f"⏳ Time taken for chunk {i + 1}: {time_taken_for_chunk:.2f} seconds")

    # Estimate the remaining time from the mean duration of all chunks
    # trained so far in this session, not only the most recent one.
    average_time_per_chunk = sum(chunk_durations) / len(chunk_durations)
    remaining_chunks = num_chunks - (i + 1)
    estimated_remaining_time = average_time_per_chunk * remaining_chunks
    print(f"⏳ Estimated remaining time: {estimated_remaining_time / 60:.2f} minutes")

    # 🔒 **Save Checkpoint after each chunk**
    print(f"💾 Saving checkpoint for chunk {i + 1}...")
    trainer.save_model(checkpoint_path)

    # Record the index of the next chunk to train. Write to a temporary file
    # and swap it into place, so an interruption cannot leave an empty or
    # half-written index file behind.
    tmp_checkpoint_file = checkpoint_file + ".tmp"
    with open(tmp_checkpoint_file, "w") as f:
        f.write(str(i + 1))
    os.replace(tmp_checkpoint_file, checkpoint_file)


print("\n✅ Training complete for all chunks!")