In [None]:

# Dependencies for loading the data and tokenizing it.
from datasets import load_dataset
from transformers import AutoTokenizer

# Load the IMDb dataset with 🤗 Datasets.
imdb = load_dataset("imdb")

# Tokenizer belonging to the "distilbert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples``.

    ``truncation=True`` is passed so that sequences are truncated to be no
    longer than DistilBERT's maximum input length.

    Args:
        examples: Mapping with a ``"text"`` entry (a batch of texts when the
            function is applied with ``batched=True``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)


# Apply the preprocessing function to the whole dataset with 🤗 Datasets' map.
# batched=True hands the function several examples at once, which speeds up
# the tokenization.
tokenized_imdb = imdb.map(preprocess_function, batched=True)

# DataCollatorWithPadding builds the batches and pads every text dynamically to
# the length of the longest element in its batch, so each batch has a uniform
# length. Padding inside the tokenizer call (padding=True) would also work, but
# dynamic padding is more efficient.
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load DistilBERT with AutoModelForSequenceClassification along with the number
# of expected labels.
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Hyper-parameters of the fine-tuning run; checkpoints are written to ./results.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
)

# The data collator is passed explicitly, so batching/padding does not depend
# on Trainer's defaults. The tokenizer is handed over through
# `processing_class`, the replacement for the deprecated `tokenizer` argument
# (requires transformers >= 4.46; the old keyword triggers a FutureWarning and
# is scheduled for removal).
# NOTE: no evaluation strategy is configured, so `eval_dataset` is only used if
# `trainer.evaluate()` is called afterwards.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
)
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


KeyboardInterrupt: 

# Speed-up: train on a small subset of the data


In [1]:
import os
# Set before torch is imported, presumably so the value is already in the
# environment when PyTorch's MPS backend initialises.
# NOTE(review): per PyTorch's MPS environment-variable documentation a ratio of
# 0.0 removes the allocator's upper memory limit, which may destabilise the
# system on out-of-memory — confirm this is intended.
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
from datasets import load_dataset
import torch
# Load the IMDb dataset with 🤗 Datasets.
imdb = load_dataset("imdb")

from transformers import AutoTokenizer

# Load the tokenizer of the "distilbert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    """Tokenize and truncate the ``"text"`` field of ``examples``.

    Args:
        examples: Mapping with a ``"text"`` entry (a batch of texts when the
            function is applied with ``batched=True``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts when
        called with ``truncation=True``.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

# Remaining dependencies for batching, modelling, training and evaluation.
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

# Tokenize the dataset in batches.
tokenized_imdb = imdb.map(preprocess_function, batched=True)

# Collator used for dynamic padding of each batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# DistilBERT with a sequence-classification head for two labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

# OPTIONAL: shuffled subsets (500 train / 200 test examples, fixed seed) for
# faster experimentation.
small_train_dataset = tokenized_imdb["train"].shuffle(seed=42).select(range(500))
small_eval_dataset = tokenized_imdb["test"].shuffle(seed=42).select(range(200))

def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall for binary predictions.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` must support
            ``.argmax(-1)``; the class with the highest score along the last
            axis is taken as the prediction.

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``.
    """
    logits, labels = eval_pred
    predictions = logits.argmax(-1)
    # zero_division=0 makes the undefined case explicit: when no positive class
    # is predicted (or present), the affected scores are reported as 0 instead
    # of relying on the warn-and-return-0 default.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary', zero_division=0
    )
    return {
        'accuracy': accuracy_score(labels, predictions),
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

# Use Apple's Metal backend (MPS) when available, otherwise fall back to the CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=2,  # Small batches — presumably to limit MPS memory use
    per_device_eval_batch_size=2,   # (a smaller batch size does not by itself speed training up)
    num_train_epochs=3,             # Reduced for faster testing
    weight_decay=0.01,
    fp16=False,                     # Explicitly disable mixed precision
    bf16=False,
)

# Trainer object. The tokenizer is passed through `processing_class`, the
# replacement for the deprecated `tokenizer` argument (requires
# transformers >= 4.46; the old keyword triggers a FutureWarning and is
# scheduled for removal).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()

# Evaluate on the small evaluation subset after training and show the metrics.
metrics = trainer.evaluate()
print(metrics)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
500,0.4067


{'eval_loss': 0.6897389888763428, 'eval_accuracy': 0.835, 'eval_f1': 0.8374384236453202, 'eval_precision': 0.794392523364486, 'eval_recall': 0.8854166666666666, 'eval_runtime': 12.8305, 'eval_samples_per_second': 15.588, 'eval_steps_per_second': 7.794, 'epoch': 3.0}
