In [1]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m547.8/547.8 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3

**Import Necessary Libraries**

In [2]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, load_metric

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

**Create a Sample Dataset**

In [3]:
data = {
    "text": [
        "I love this movie, it's fantastic!",
        "This film was terrible and boring.",
        "Amazing movie! I would watch it again.",
        "I did not like the movie. It was too slow.",
        "The plot was very interesting and well-paced."
    ],
    "label": [1, 0, 1, 0, 1]  # 1 for positive, 0 for negative
}

# Convert the data into a Dataset object
dataset = Dataset.from_dict(data)

# Split the dataset into train and test sets (80% train, 20% test)
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

**Tokenize the Dataset**

In [6]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

**Prepare the Data for Training**

In [7]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

**Define the Training Arguments**

In [12]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
)

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    return metric.compute(predictions=predictions, references=labels)



**Initialize Trainer and Train the Model**

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.659136,1.0
2,No log,0.616168,1.0
3,No log,0.577753,1.0


TrainOutput(global_step=6, training_loss=0.38681594530741376, metrics={'train_runtime': 98.4084, 'train_samples_per_second': 0.122, 'train_steps_per_second': 0.061, 'total_flos': 3157332664320.0, 'train_loss': 0.38681594530741376, 'epoch': 3.0})

**Evaluate the Model**

In [14]:
results = trainer.evaluate()
print(f"Test Accuracy: {results['eval_accuracy']}")

Test Accuracy: 1.0


**Save the Trained Model**

In [15]:
model.save_pretrained("./fine_tuned_bert_sample")
tokenizer.save_pretrained("./fine_tuned_bert_sample")

('./fine_tuned_bert_sample/tokenizer_config.json',
 './fine_tuned_bert_sample/special_tokens_map.json',
 './fine_tuned_bert_sample/vocab.txt',
 './fine_tuned_bert_sample/added_tokens.json')

**Load and Use the Model for Inference**

In [16]:
model = BertForSequenceClassification.from_pretrained("./fine_tuned_bert_sample")
tokenizer = BertTokenizer.from_pretrained("./fine_tuned_bert_sample")

text = "This movie was absolutely fantastic!"

inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
inputs = {key: val.to(device) for key, val in inputs.items()}

with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    prediction = torch.argmax(logits, dim=-1)

labels = ["Negative", "Positive"]
print(f"Prediction: {labels[prediction.item()]}")

Prediction: Positive
