# Import necessary libraries  




In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig
from datasets import load_dataset

# Model loading

In [None]:
# Checkpoint shared by the tokenizer and the classification model
name = "distilbert-base-uncased"

# Tokenizer that matches the checkpoint's vocabulary
tokenizer = DistilBertTokenizer.from_pretrained(name)

# DistilBERT with a two-class sequence-classification head on top
pretrained_model = DistilBertForSequenceClassification.from_pretrained(
    name,
    num_labels=2,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

# Loading the Data

In [None]:
# Loading the IMDB dataset and using a random sample of 500 reviews for training.
# The train split is stored sorted by label, so slicing "train[:500]" yields only
# negative reviews (label 0) and the classifier never sees a positive example.
# Shuffling with a fixed seed before selecting keeps both classes in the sample
# and keeps the run reproducible.
data = load_dataset("imdb", split="train").shuffle(seed=42).select(range(500))

Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

# Peeking at the data

In [None]:
# Show the first four examples; slicing the dataset returns a dict of column name -> list of values
data[:4]

{'text': ['I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far b

# Tokenizing the data

In [None]:
# Convert raw review text into fixed-length model inputs
def tokenize_function(reviews):
    """Tokenize a batch of reviews.

    Args:
        reviews: a batch from the dataset; its "text" entry holds the review strings.

    Returns:
        The tokenizer output for the batch, with each review truncated where
        needed and padded with padding="max_length".
    """
    texts = reviews["text"]
    return tokenizer(texts, truncation=True, padding="max_length")


# Apply the tokenizer over the dataset in batches
tokenized_data = data.map(tokenize_function, batched=True)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

# Formatting the tokenized data

In [None]:
# Expose only the columns the model consumes, returned as torch tensors
tokenized_data.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
)

In [None]:
# Inspect the first four formatted rows (label, input_ids and attention_mask tensors)
tokenized_data[:4]

{'label': tensor([0, 0, 0, 0]),
 'input_ids': tensor([[  101,  1045, 12524,  ...,     0,     0,     0],
         [  101,  1000,  1045,  ...,     0,     0,     0],
         [  101,  2065,  2069,  ...,     0,     0,     0],
         [  101,  2023,  2143,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

# Defining and applying the LoRA config



In [None]:
# Defining LoRA config
config = LoraConfig(
    # Declare the task so PEFT wraps the model for sequence classification
    task_type="SEQ_CLS",
    r=8,
    lora_alpha=32,
    # LoRA adapters are injected into the modules named q_lin and v_lin
    target_modules=["q_lin", "v_lin"],
    lora_dropout=0.1,
    bias="none",
    # The classification head is newly initialized when the checkpoint is loaded.
    # Listing it here keeps it trainable and stores it with the adapter;
    # otherwise it would stay frozen at random values and be missing from the
    # saved weights.
    modules_to_save=["pre_classifier", "classifier"],
)

# Applying the defined config to the model
peft_model = get_peft_model(pretrained_model, config)

# Defining training arguments

In [None]:
# Settings for the fine-tuning run
args = TrainingArguments(
    # Where the run writes its outputs and logs
    output_dir="./results",
    logging_dir="./logs",
    # Report the training loss every 10 steps
    logging_steps=10,
    # Training schedule
    per_device_train_batch_size=8,
    num_train_epochs=3,
)

# Initializing the trainer

In [None]:
# Bundle the LoRA-wrapped model, the run settings and the tokenized reviews into a Trainer
trainer = Trainer(model=peft_model, args=args, train_dataset=tokenized_data)

# Training and saving the model

In [None]:
# Run fine-tuning on the tokenized reviews
trainer.train()

# Write the trained PEFT weights to disk
peft_model.save_pretrained(save_directory="./peft-distilbert-imdb")

Step,Training Loss
10,0.7035
20,0.6669
30,0.6502
40,0.6242
50,0.5856
60,0.5394
70,0.4789
80,0.4092
90,0.3385
100,0.2825


# Saving the tokenizer

In [None]:
# Store the tokenizer files in the same directory as the saved model weights;
# the returned tuple lists the files that were written
tokenizer.save_pretrained("./peft-distilbert-imdb")

('./peft-distilbert-imdb/tokenizer_config.json',
 './peft-distilbert-imdb/special_tokens_map.json',
 './peft-distilbert-imdb/vocab.txt',
 './peft-distilbert-imdb/added_tokens.json')

# Inference

In [None]:
# Example review to classify
test_sentence = "I love this horrible shite movie!"

# Encode the sentence as PyTorch tensors.
# NOTE: the name `input` shadows the Python builtin; it is kept because the
# inference cell below reads it.
input = tokenizer(
    test_sentence,
    truncation=True,
    padding=True,
    return_tensors="pt",
)


In [None]:

# Running inference
# The model has just been trained and may still be in training mode; switch to
# evaluation mode so dropout is disabled and the prediction is deterministic.
peft_model.eval()

with torch.no_grad():
    # Move the tokenized input to the same device as the model. A separate name
    # is used so the `input` produced by the tokenization cell is not overwritten.
    model_inputs = {k: v.to(peft_model.device) for k, v in input.items()}
    output = peft_model(**model_inputs)
    logits = output.logits
    # Index of the highest-scoring class for the single test sentence
    predicted_class = torch.argmax(logits, dim=-1).item()

# Mapping the predicted label to the actual label
label_mapping = {0: "negative", 1: "positive"}
predicted_review = label_mapping[predicted_class]

print(f"Test sentence is: {test_sentence}")
print(f"Predicted sentiment is: {predicted_review}")

Test sentence is: I love this horrible shite movie!
Predicted sentiment is: negative
