In [None]:
# Install the following libraries if not done already. 
#!pip install transformers datasets torch 

In [None]:
# Install PyTorch for CPU-only support
#!pip install torch torchvision torchaudio
#!pip install --user "accelerate>=0.26.0"

In [None]:
# For GPU Support (NVIDIA CUDA)
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
#!pip install --user "accelerate>=0.26.0"

In [None]:
# This program demonstrates how to fine-tune and use the DistilBERT model for sentiment analysis on the IMDb dataset.
# It involves:
# 1. Loading the IMDb dataset and preparing it.
# 2. Tokenizing the text data using DistilBERT's tokenizer.
# 3. Fine-tuning the pre-trained DistilBERT model for binary classification.
# 4. Evaluating the fine-tuned model and making predictions on new text.

# Explanation of imports:
# - DistilBertTokenizer: Used to tokenize input text into the format required by the DistilBERT model.
# - DistilBertForSequenceClassification: Pre-trained DistilBERT model tailored for sequence classification tasks.
# - Trainer: High-level API for training and evaluating Hugging Face models.
# - TrainingArguments: Configuration for the Trainer, including batch size, epochs, learning rate, etc.
# - load_dataset: Part of the `datasets` library, used to load and manage datasets like IMDb for training and evaluation.

In [1]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

In [6]:
# Final code follows

In [5]:
# Step 1: Load the IMDb dataset
from datasets import load_dataset
dataset = load_dataset("imdb")

# Build small demo subsets: shuffle each split with a fixed seed, then keep
# the first 2000 "train" reviews for training and the first 500 "test"
# reviews for validation.
shuffled_train = dataset["train"].shuffle(seed=42)
shuffled_test = dataset["test"].shuffle(seed=42)
train_data = shuffled_train.select(range(2000))
val_data = shuffled_test.select(range(500))

# Step 2: Load the tokenizer published with the DistilBERT checkpoint
from transformers import DistilBertTokenizer
tokenizer_checkpoint = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_checkpoint)

# Tokenization helper (passed to `Dataset.map` with batched=True in this notebook)
def tokenize_function(examples):
    """Tokenize the "text" entry of `examples` with the notebook-level `tokenizer`.

    The tokenizer is called with padding="max_length" and truncation=True and
    no explicit `max_length`, so the length limit is whatever the tokenizer
    itself defaults to.

    Args:
        examples: A mapping that provides the raw review text under "text".

    Returns:
        The tokenizer's output for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

# Tokenize both splits in batches
train_data = train_data.map(tokenize_function, batched=True)
val_data = val_data.map(tokenize_function, batched=True)

# Expose only the columns listed below, formatted as PyTorch tensors
model_columns = ["input_ids", "attention_mask", "label"]
for split in (train_data, val_data):
    # set_format changes the dataset in place; each split gets its own list copy
    split.set_format(type="torch", columns=list(model_columns))

# Step 3: Load the DistilBERT checkpoint with a 2-label classification head
from transformers import DistilBertForSequenceClassification
model_checkpoint = "distilbert-base-uncased"
model = DistilBertForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

# Step 4: Define Training Arguments
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",
    # `eval_strategy` replaces the deprecated `evaluation_strategy` keyword.
    # The Trainer below already passes `processing_class`, which only exists in
    # transformers releases that accept `eval_strategy`, and newer releases
    # reject the old name, so the new spelling is the one that works here.
    eval_strategy="epoch",  # run evaluation at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,  # log the training loss every 10 steps
    save_strategy="no"  # do not write intermediate checkpoints
)

# Step 5: Build the Trainer from the model, arguments and prepared datasets
from transformers import Trainer, TrainingArguments
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    # `processing_class` is used instead of the older `tokenizer` keyword
    # to avoid a deprecation warning.
    processing_class=tokenizer,
)
trainer = Trainer(**trainer_kwargs)

# Step 6: Train and Evaluate
trainer.train()

# A notebook only auto-displays a cell's last expression, and more statements
# follow this call, so the metrics returned by evaluate() would otherwise be
# silently discarded. Keep them in a variable and print them explicitly.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Step 7: Inference
def predict(text):
    """Classify the sentiment of a single review with the fine-tuned `model`.

    Uses the notebook-level `tokenizer` and `model`.

    Args:
        text: The review to classify, as one string.

    Returns:
        "Positive" when the highest-scoring logit is class 1, otherwise "Negative".
    """
    import torch  # local import: torch is not imported elsewhere in this notebook

    # padding=True pads only to the longest sequence in the call, so a single
    # short review is no longer padded out to 512 tokens.
    tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # Training may have moved the model to a GPU; the inputs must be on the
    # same device or the forward pass raises a device-mismatch error.
    tokens = tokens.to(model.device)

    model.eval()  # make sure dropout is disabled for inference
    with torch.no_grad():  # no gradients are needed for a prediction
        outputs = model(**tokens)

    prediction = outputs.logits.argmax(-1).item()
    label = "Positive" if prediction == 1 else "Negative"
    return label

# Example prediction 1: a review with positive wording.
# `predict(text)` is evaluated inside the f-string, so the model runs when the line is printed.
text = "The movie was fantastic! The plot and acting were top-notch."
print(f"Review: '{text}'\nPrediction: {predict(text)}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.2596,0.334589
2,0.1919,0.313853
3,0.0926,0.318552


Review: 'The movie was fantastic! The plot and acting were top-notch.'
Prediction: Positive


In [7]:
# Example prediction 2: a review with negative wording.
# Reuses the notebook-level `text` variable and the `predict` helper defined earlier.
text = "The movie was super boring! The plot and acting were terrible."
print(f"Review: '{text}'\nPrediction: {predict(text)}")

Review: 'The movie was super boring! The plot and acting were terrible.'
Prediction: Negative


In [8]:
# Example prediction 3: a sentence that is not a movie review.
# `predict` can only return "Positive" or "Negative", so this input is still mapped to one of the two.
text = "This morning I am not able to decide what to do with the day."
print(f"Review: '{text}'\nPrediction: {predict(text)}")

Review: 'This morning I am not able to decide what to do with the day.'
Prediction: Negative


In [11]:
# Example prediction 4: a review that is neither positive nor negative.
# NOTE: the misspelling "neighther" is part of the input string passed to the model and is left as-is.
text = "The movie is neighther good nor bad."
print(f"Review: '{text}'\nPrediction: {predict(text)}")

Review: 'The movie is neighther good nor bad.'
Prediction: Negative


In [12]:
# Example prediction 5: an explicitly "neutral" review.
# `predict` has no neutral label; it returns either "Positive" or "Negative".
text = "The movie was neutral."
print(f"Review: '{text}'\nPrediction: {predict(text)}")

Review: 'The movie was neutral.'
Prediction: Negative


In [13]:
# Step 8: Save the Model and Tokenizer
# NOTE(review): this is an absolute, machine-specific Windows path; the loading
# cell further down spells out the same directory again.
output_dir = "C://AA SK 53//A INDUS//Papers with Murugan///NLP Book//Python Tutorials//Chapter 8//saved_model_8.1"

# Write the model first, then the tokenizer, into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")

Model and tokenizer saved to C://AA SK 53//A INDUS//Papers with Murugan///NLP Book//Python Tutorials//Chapter 8//saved_model_8.1


In [20]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

# Step 1: Load the saved tokenizer and model
# Same directory the save cell wrote to, here written with a trailing "//".
output_dir = "C://AA SK 53//A INDUS//Papers with Murugan///NLP Book//Python Tutorials//Chapter 8//saved_model_8.1//"
loaded_tokenizer = DistilBertTokenizer.from_pretrained(output_dir)
loaded_model = DistilBertForSequenceClassification.from_pretrained(output_dir)

# Step 2: Define the prediction function
def predict_with_loaded_model(text):
    """Classify the sentiment of a single review with the reloaded model.

    Uses the notebook-level `loaded_tokenizer` and `loaded_model`.

    Args:
        text: The review to classify, as one string.

    Returns:
        "Positive" when the highest-scoring logit is class 1, otherwise "Negative".
    """
    import torch  # local import: torch is not imported elsewhere in this notebook

    # padding=True pads only to the longest sequence in the call, so a single
    # short review is no longer padded out to 512 tokens.
    tokens = loaded_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # Keep the inputs on the same device as the model.
    tokens = tokens.to(loaded_model.device)

    loaded_model.eval()  # make sure dropout is disabled for inference
    with torch.no_grad():  # no gradients are needed for a prediction
        outputs = loaded_model(**tokens)

    prediction = outputs.logits.argmax(-1).item()
    label = "Positive" if prediction == 1 else "Negative"
    return label

# Step 3: Make a prediction
# Runs the reloaded model on one sample review; the call happens inside the f-string.
example_text = "The movie was boring and lacked excitement."
print(f"Review: '{example_text}'\nPrediction: {predict_with_loaded_model(example_text)}")

Review: 'The movie was boring and lacked excitement.'
Prediction: Negative
