In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the IMDb movie-review dataset through Hugging Face `datasets`
# (a locally cached copy is reused when one is present).
dataset = load_dataset(path="imdb")

Found cached dataset imdb (C:/Users/matth/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
100%|██████████| 3/3 [00:00<00:00, 95.19it/s]


In [3]:
# Print the loaded dataset object to inspect its splits, features and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [4]:
# Instantiate the pretrained uncased BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    The texts are passed to the module-level ``tokenizer`` with truncation
    enabled and padding to the fixed maximum length of 256 tokens.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw review text(s).

    Returns:
        Whatever ``tokenizer`` returns for those texts.
    """
    texts = examples["text"]
    encoding_options = dict(padding="max_length", truncation=True, max_length=256)
    return tokenizer(texts, **encoding_options)

In [5]:
# Run the tokenizer over every split of the dataset, feeding it batches of examples.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)

                                                                  

In [None]:
# Load the BERT model for sequence classification (two output labels)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Define training arguments and set up Trainer.
# Only keyword arguments that TrainingArguments actually declares are passed:
# the former `logging_steps_per_epoch` / `eval_steps_per_epoch` entries are not
# TrainingArguments fields and made the constructor raise TypeError.
training_args = TrainingArguments(
    # Where checkpoints are written. Given explicitly because the per-epoch
    # save strategy below needs a checkpoint directory and older transformers
    # releases require this argument.
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    # Evaluate and save once per epoch; the two strategies must match for
    # load_best_model_at_end to work.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Logging: TensorBoard event files under ./logs, every 500 steps plus the
    # very first step.
    logging_dir="./logs",
    logging_steps=500,
    logging_first_step=True,
    report_to="tensorboard",
    do_train=True,
    do_eval=True,
    no_cuda=False,
    push_to_hub=False,
)

# Train on the tokenized "train" split and evaluate on the tokenized "test" split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
)

In [None]:
# Train the model
# Runs the training loop of the `trainer` object; the call's return value is
# left as the cell's last expression so the notebook displays it.
trainer.train()

In [None]:
# Evaluate the model
# NOTE(review): presumably this scores the `eval_dataset` handed to the Trainer
# (the tokenized "test" split) — confirm against the Trainer setup.
results = trainer.evaluate()

print(results)

# Predict sentiment for a given sentence
def predict_sentiment(sentence):
    """Classify a single sentence as ``"Positive"`` or ``"Negative"``.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        sentence: The text of one review to classify.

    Returns:
        ``"Positive"`` when the highest-scoring class index is 1,
        otherwise ``"Negative"``.
    """
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=256)
    # The Trainer may have moved the model to an accelerator; the inputs must
    # live on the same device or the forward pass raises a device-mismatch error.
    inputs = inputs.to(model.device)

    # Inference only: switch off dropout and skip building the autograd graph.
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits

    # Softmax is monotonic, so the argmax of the raw logits is the predicted class.
    predicted_class = torch.argmax(logits, dim=-1).item()
    return "Positive" if predicted_class == 1 else "Negative"

In [None]:
# Try the classifier on one positive-sounding and one negative-sounding review.
for sentence in ("This movie was fantastic!", "This movie was terrible."):
    print(f"Sentiment for '{sentence}': {predict_sentiment(sentence)}")
