In [None]:
# Set WANDB_DISABLED=true in the kernel's environment before any training code runs
# (presumably to keep the Trainer from logging to Weights & Biases -- TODO confirm).
%env WANDB_DISABLED=true
# Install the wandb package into the kernel's environment.
%pip install wandb

In [None]:
%pip install --upgrade transformers datasets
import transformers
%pip install transformers[torch]
%pip install 'accelerate>=0.26.0


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Tokenizer for the same "bert-base-uncased" checkpoint that is fine-tuned later on.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# IMDb dataset, loaded through the Hugging Face `datasets` library.
dataset = load_dataset("imdb")


In [None]:
def tokenize(data):
  """Tokenize the ``"text"`` field of ``data`` with the module-level ``tokenizer``.

  Padding and truncation are both enabled and ``max_length`` is set to 256.
  Returns whatever the tokenizer call returns.
  """
  texts = data["text"]
  # The original inline note read "#512" -- presumably the alternative max_length.
  encoding_options = {"padding": True, "truncation": True, "max_length": 256}
  return tokenizer(texts, **encoding_options)

In [None]:
# Run `tokenize` over `dataset` in batched mode.
tokenized_text = dataset.map(function=tokenize, batched=True)

In [None]:
!pip install evaluate


In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate
# Pre-trained "bert-base-uncased" checkpoint loaded for sequence classification
# with two output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)



In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class is the
    argmax over the last axis of the logits; accuracy and binary-averaged F1 are
    then computed by the module-level ``accuracy_metric`` and ``f1_metric``
    objects (they must exist by the time this function is called).

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)

    accuracy_result = accuracy_metric.compute(predictions=predicted, references=references)
    f1_result = f1_metric.compute(predictions=predicted, references=references, average="binary")
    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}

In [None]:

import torch

# Training configuration: evaluate once per epoch, one epoch, no external
# experiment reporting (report_to is an empty list).
# fp16 mixed precision needs a GPU; requesting it unconditionally makes
# TrainingArguments fail on CPU-only machines, so enable it only when CUDA
# is actually available.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    fp16=torch.cuda.is_available(),
    num_train_epochs=1,
    report_to=[]
)


In [None]:

# Metric objects used by `compute_metrics`; loaded in the same order as before
# (accuracy first, then F1).
accuracy_metric, f1_metric = (
    evaluate.load(metric_name) for metric_name in ("accuracy", "f1")
)


In [None]:
# Shuffle each tokenized split with a fixed seed, then keep only the first
# 5000 training rows and 1000 test rows.
small_train = (
    tokenized_text["train"]
    .shuffle(seed=42)
    .select(range(5000))
)
small_test = (
    tokenized_text["test"]
    .shuffle(seed=42)
    .select(range(1000))
)

In [None]:



import inspect

from transformers import Trainer

# Newer transformers releases renamed Trainer's `tokenizer` argument to
# `processing_class` and deprecated the old name. Pick whichever keyword the
# installed version accepts so this cell works both before and after the
# `--upgrade` install.
trainer_params = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train,
    eval_dataset=small_test,
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)

# Fine-tune the model; the returned training summary is shown as the cell output.
trainer.train()


In [None]:
# Evaluate with the trainer and report the two metrics produced by
# `compute_metrics`, one per line, formatted to four decimals.
eval_metrics = trainer.evaluate()
print(
    f"Eval Accuracy: {eval_metrics['eval_accuracy']:.4f}",
    f"Eval F1:       {eval_metrics['eval_f1']:.4f}",
    sep="\n",
)


In [None]:
# Write the model and the tokenizer to the same directory so both can be
# loaded back from one path.
trainer.save_model(output_dir="fine_tuned_bert_imdb")
tokenizer.save_pretrained(save_directory="fine_tuned_bert_imdb")

In [None]:
from transformers import pipeline


# Build a text-classification pipeline from the directory the model and
# tokenizer were saved to, then classify one sample review.
sentiment = pipeline(
    task="text-classification",
    model="fine_tuned_bert_imdb",
    tokenizer="fine_tuned_bert_imdb",
)

print(sentiment("What an amazing film—truly a masterpiece!"))



Pipeline Overview-

The machine learning pipeline starts by loading the IMDb movie-review dataset through the Hugging Face datasets library. We use the dataset's existing training and test splits, shuffle each with a fixed random seed (42) for the sake of reproducibility, and keep 5,000 training reviews and 1,000 test reviews.

Components and Rationale
Tokenizer-

We used AutoTokenizer.from_pretrained("bert-base-uncased") to convert raw text into token IDs. We applied padding and truncation with a maximum length of 256 tokens to balance expressive capacity against computational cost.

Model-

We used "AutoModelForSequenceClassification" (with the num_labels=2 argument) to obtain a pre-trained BERT encoder with a lightweight classification head. This choice takes advantage of transfer learning, so the model generalizes better when trained on only a limited amount of data.

Training & Evaluation-

The pipeline sets the learning rate (2e-5) and the batch size (8), and trains for one epoch (sufficient for a proof of concept while keeping GPU runtime manageable). The Trainer class optimises the model and provides logging and checkpointing. The evaluate library is used to compute accuracy and the F1 score (binary average).

Inference-

Once the fine-tuned model and tokenizer are saved, we can wrap them in a Hugging Face pipeline for end-to-end text classification, which makes deployment easy and allows rapid prototyping.

Challenges faced-

The main challenge was the training phase: training took a significant amount of time, and transformers raised some errors about the training arguments, so I had to install a few additional libraries.
To speed up training, I used only a small portion of the training and test datasets.
Overall, the worksheet was great fun.