In [1]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Print GPU info if available
if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# Load TweetEval Sentiment Dataset
dataset = load_dataset("tweet_eval", "sentiment")

# Extract texts and labels from train split
train_texts_full = dataset['train']['text']
train_labels_full = dataset['train']['label']

# Split into training and validation (90/10 split)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts_full, train_labels_full, test_size=0.1, random_state=42
)

# Test data
test_texts = dataset['test']['text']
test_labels = dataset['test']['label']

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3  # 0: negative, 1: neutral, 2: positive
)

# Move model to GPU
model.to(device)
print(f"Model moved to: {next(model.parameters()).device}")

# Custom dataset class
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)

# Create torch datasets
train_dataset = SentimentDataset(train_texts, train_labels, tokenizer)
val_dataset = SentimentDataset(val_texts, val_labels, tokenizer)
test_dataset = SentimentDataset(test_texts, test_labels, tokenizer)

# Check actual GPU memory for better batch size determination
if torch.cuda.is_available():
    free_gpu_memory = torch.cuda.get_device_properties(0).total_memory - torch.cuda.memory_allocated(0)
    print(f"Free GPU memory: {free_gpu_memory / 1e9:.2f} GB")

# Training arguments - optimized for P100 GPU (16GB, not 30GB as we see from actual output)
training_args = TrainingArguments(
    output_dir='../data/models/bert_sentiment',
    run_name="bert_sentiment_run",  # Separate run_name from output_dir
    num_train_epochs=3,
    
    # Adjusted batch sizes for 16GB P100 GPU
    per_device_train_batch_size=24,  # Reduced from 32 to be safer for 16GB
    per_device_eval_batch_size=64,   # Reduced from 128
    
    # Enable gradient accumulation for larger effective batch size
    gradient_accumulation_steps=2,
    
    # Enable fp16 mixed precision training for P100 GPU
    fp16=True,
    
    # Memory optimizations
    optim="adamw_torch",  # Memory-efficient optimizer
    
    # Training hyperparameters
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    
    # Enable gradient checkpointing to save memory
    gradient_checkpointing=True,
    
    # Logging and evaluation
    logging_dir='../logs',
    logging_steps=50,
    eval_strategy="epoch",  # Using the new parameter name
    save_strategy="epoch",
    
    # Enable progress bar
    disable_tqdm=False,
    
    # Report metrics
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    
    # Disable wandb to prevent hanging
    report_to=["none"],
)

# Print training configuration
print(f"\nTraining configuration:")
print(f"Batch size: {training_args.per_device_train_batch_size} (effective: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps})")
print(f"FP16: {training_args.fp16}")
print(f"Epochs: {training_args.num_train_epochs}")
print(f"Gradient accumulation steps: {training_args.gradient_accumulation_steps}")
print(f"Gradient checkpointing: {training_args.gradient_checkpointing}\n")

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Print dataset sizes
print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print(f"Test samples: {len(test_dataset)}")

# Train the model
print("\nStarting training...")
trainer.train()

# Evaluate on test set
print("\nEvaluating on test set...")
test_results = trainer.evaluate(test_dataset)
print(f"Test Results: {test_results}")

# Save the model and tokenizer
model.save_pretrained("/kaggle/working/bert_sentiment")
tokenizer.save_pretrained("/kaggle/working/bert_sentiment")

print("Training completed and model saved!")

Using device: cuda
GPU Name: Tesla P100-PCIE-16GB
GPU Memory: 17.06 GB


README.md:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.78M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/901k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/167k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/45615 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/12284 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model moved to: cuda:0
Free GPU memory: 16.62 GB

Training configuration:
Batch size: 24 (effective: 48)
FP16: True
Epochs: 3
Gradient accumulation steps: 2
Gradient checkpointing: True

Training samples: 41053
Validation samples: 4562
Test samples: 12284

Starting training...


Epoch,Training Loss,Validation Loss
1,0.6431,0.612507
2,0.2177,0.809726



Evaluating on test set...


Test Results: {'eval_loss': 0.6833956241607666, 'eval_runtime': 27.6064, 'eval_samples_per_second': 444.97, 'eval_steps_per_second': 6.955, 'epoch': 2.9970777323202804}
Training completed and model saved!
