In [8]:
from datasets import load_dataset

In [9]:
# Load the IMDB dataset; the result is a DatasetDict keyed by split name.
dataset = load_dataset(path="imdb")

In [10]:
# Display the loaded DatasetDict to inspect its splits, features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [11]:
from transformers import BertTokenizer

# Instantiate the tokenizer that belongs to the 'bert-base-uncased' checkpoint,
# so the text is encoded with the same vocabulary the model checkpoint uses.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased",
)

# Define tokenization function
def tokenize_function(example):
    """Encode the ``"text"`` field of ``example`` with the notebook-level tokenizer.

    Args:
        example: A mapping with a ``"text"`` entry. Because this function is
            applied with ``batched=True``, that entry is a batch of texts.

    Returns:
        Whatever ``tokenizer(...)`` returns for the given text, with every
        sequence truncated and padded to a fixed length of 256 tokens.
    """
    encoding_options = {
        "padding": "max_length",  # pad every sequence up to max_length
        "truncation": True,       # cut reviews longer than max_length
        "max_length": 256,        # fixed sequence length (128/512 also possible)
    }
    return tokenizer(example["text"], **encoding_options)

# Run the tokenizer over every split of the DatasetDict, passing batches of
# examples to `tokenize_function` rather than one example at a time.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
)


In [12]:
from transformers import BertForSequenceClassification

# Build BERT with a freshly initialised sequence-classification head.
# The task is binary sentiment (Positive/Negative), so the head must have
# exactly two outputs. The previous value of 5 created three classes that never
# occur in the labels and would have been baked into the saved model config.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,  # binary classification: Positive/Negative
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments, Trainer
import evaluate
# Accuracy metric object shared by every evaluation pass.
accuracy_metric = evaluate.load("accuracy")


def compute_metrics(p):
    """Compute accuracy for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores; the arg-max over
            the last axis is taken as the predicted class) and ``label_ids``
            (the reference labels).

    Returns:
        The result of ``accuracy_metric.compute`` for the predicted and
        reference labels.
    """
    predicted_labels = p.predictions.argmax(-1)
    reference_labels = p.label_ids
    return accuracy_metric.compute(
        predictions=predicted_labels,
        references=reference_labels,
    )

# Define training parameters
training_args = TrainingArguments(
    output_dir="./results",
    # `evaluation_strategy` is the deprecated spelling of this option; recent
    # transformers releases expect `eval_strategy` and reject the old name.
    eval_strategy="epoch",           # evaluate at the end of every epoch
    save_strategy="epoch",           # checkpoint at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    # Remove tensorboard logging to avoid tf conflict
    # logging_dir='./logs',
    logging_steps=100,
    load_best_model_at_end=True,     # reload the best checkpoint when training ends
    report_to="none",                # disables logging to TensorBoard
)


# Wire the model, hyper-parameters, tokenized splits and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    # `tokenizer=` is deprecated on Trainer and emits a FutureWarning at this
    # call; `processing_class=` is the replacement keyword for the same object.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [None]:
# Fine-tune the model with the Trainer configured above; as the last expression
# of the cell, the value returned by `train()` is displayed as the cell output.
trainer.train()


In [None]:
# Evaluate on the Trainer's eval dataset and keep the metrics. Previously the
# return value was discarded and the cell displayed the save call's result.
eval_metrics = trainer.evaluate()

# Save the model for later use (model and tokenizer go to the same directory
# so the pair can be reloaded together).
FINETUNED_MODEL_DIR = "bert-finetuned-imdb"
trainer.save_model(FINETUNED_MODEL_DIR)
tokenizer.save_pretrained(FINETUNED_MODEL_DIR)

# Last expression of the cell: show the evaluation metrics as the cell output.
eval_metrics
