In [4]:
# Optional setup: uncomment the lines below to install these packages on a fresh runtime.
# !pip install transformers
# !pip install accelerate
# !pip install peft

In [5]:
import numpy as np
import pandas as pd
import time
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from transformers.optimization import get_linear_schedule_with_warmup
from transformers import Trainer, TrainingArguments
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig

In [6]:
# Mount Google Drive at /content/drive; the review CSV read later in this notebook
# lives under that mount point.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [7]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cuda


In [8]:
# Read the labelled movie reviews from the mounted Drive folder, then preview the first rows.
reviews_csv_path = "/content/drive/MyDrive/Transformers/Sentiment_Analysis/Movies_Dataset/movie_data.csv"
reviews = pd.read_csv(reviews_csv_path)
reviews.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [9]:
# Sequential (unshuffled) split of the review table by row position:
#   rows [0, TRAIN_END)        -> training
#   rows [TRAIN_END, VAL_END)  -> validation
#   rows [VAL_END, end)        -> test
TRAIN_END = 35_000
VAL_END = 40_000

# Select the columns by name instead of by position so the split does not silently
# pick up the wrong column if the CSV's column order changes (for example when a
# saved index column ends up in front).
review_text = reviews["review"]
review_label = reviews["sentiment"]

# Texts become plain Python lists (the tokenizer input); labels stay NumPy arrays.
train_text = review_text.iloc[:TRAIN_END].tolist()
train_label = review_label.iloc[:TRAIN_END].values

val_text = review_text.iloc[TRAIN_END:VAL_END].tolist()
val_label = review_label.iloc[TRAIN_END:VAL_END].values

test_text = review_text.iloc[VAL_END:].tolist()
test_label = review_label.iloc[VAL_END:].values

In [10]:
# Hugging Face checkpoint identifier; the tokenizer and the classification model
# in this notebook are both loaded from it.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

In [11]:
# Load the fast DistilBERT tokenizer that belongs to `model_name`
# (its files are downloaded on first use, as the progress output shows).
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

In [12]:
# Tokenize every split with the same settings: padding enabled, and truncation
# enabled with a cap of 512 tokens per review.
tokenizer_options = {"padding": True, "truncation": True, "max_length": 512}

train_embedding = tokenizer(train_text, **tokenizer_options)
val_embedding = tokenizer(val_text, **tokenizer_options)
test_embedding = tokenizer(test_text, **tokenizer_options)

In [13]:
class ReviewsDataset(Dataset):
    """Map-style dataset pairing tokenizer output with one label per example.

    ``embedding`` is a mapping from field name (e.g. ``"input_ids"``) to a
    per-example sequence; ``label`` is an indexable collection of labels.
    """

    def __init__(self, embedding, label):
        """Keep references to the tokenized fields and their labels."""
        self.embedding, self.label = embedding, label

    def __len__(self):
        """Number of examples, defined by the number of labels."""
        return len(self.label)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under ``"label"``."""
        item = {}
        for field_name, field_values in self.embedding.items():
            item[field_name] = torch.tensor(field_values[idx])
        item["label"] = torch.tensor(self.label[idx])
        return item

In [14]:
# Pair each split's tokenizer output with its labels, in train / val / test order.
train_dataset, val_dataset, test_dataset = (
    ReviewsDataset(split_embedding, split_label)
    for split_embedding, split_label in (
        (train_embedding, train_label),
        (val_embedding, val_label),
        (test_embedding, test_label),
    )
)

In [15]:
# Training batches are shuffled, and the final incomplete batch is dropped.
train_loader = DataLoader(train_dataset, batch_size = 16, shuffle = True, drop_last = True)

# Evaluation loaders must visit every sample exactly once: no shuffling, and the
# final partial batch is kept. With drop_last=True, any split whose size is not a
# multiple of the batch size would be scored on a subset only.
val_loader = DataLoader(val_dataset, batch_size = 16, shuffle = False, drop_last = False)
test_loader = DataLoader(test_dataset, batch_size = 16, shuffle = False, drop_last = False)

In [None]:
# Leftover setting, kept commented out: the number of training epochs is configured
# through `num_train_epochs` in the TrainingArguments cell.
# epochs = 3

In [16]:
# Load the pretrained DistilBERT sequence classifier from the same checkpoint as the tokenizer.
model = DistilBertForSequenceClassification.from_pretrained(model_name)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [17]:
# LoRA adapter settings for the sequence-classification task. Only modules named
# `q_lin` are targeted; with r=4 the printed model shows a 768 -> 4 `lora_A` layer
# and a Dropout(p=0.01) attached to each of them.
lora_settings = {
    "task_type": "SEQ_CLS",
    "r": 4,
    "lora_alpha": 32,
    "lora_dropout": 0.01,
    "target_modules": ['q_lin'],
}
peft_config = LoraConfig(**lora_settings)

In [18]:
# Wrap the base model according to `peft_config` and print the trainable-parameter summary.
# NOTE(review): this rebinds `model` to the wrapped model, so re-running this cell
# without first reloading the base model passes an already-wrapped model to
# get_peft_model again — re-run the model-loading cell first.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 1,221,124 || all params: 67,584,004 || trainable%: 1.8068239934408148


In [19]:
# Move the model to the selected device, then switch it to training mode.
# model.train() is the cell's last expression, so its return value (the model)
# is what the notebook displays below.
model.to(device)
model.train()

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): MultiHeadSelfAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): Linear(
                  in_features=768, out_features=768, bias=True
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.01, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=768, out_features=4, bias=Fal

In [20]:
def check_accuracy(model, data_loader, device):
    """Return the classification accuracy of ``model`` over ``data_loader`` in percent.

    Args:
        model: Callable invoked as ``model(input_ids, attention_mask=...)`` whose
            result exposes the class scores under the ``"logits"`` key.
        data_loader: Iterable of batches; each batch is a mapping holding
            ``"input_ids"``, ``"attention_mask"`` and ``"label"`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        A 0-dim float tensor equal to ``100 * correct / total``.

    Raises:
        ValueError: If ``data_loader`` yields no samples (accuracy is undefined).
    """
    # Score in evaluation mode so train-only layers such as dropout are disabled,
    # and remember the caller's mode so it can be restored afterwards. Objects
    # without a `training` flag are called as-is.
    was_training = getattr(model, "training", None)
    if was_training:
        model.eval()

    # Accumulate on `device` so no host/device sync is needed per batch.
    correct = torch.zeros((), dtype = torch.long, device = device)
    total = 0

    try:
        with torch.no_grad():
            for batch in data_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["label"].to(device)

                # Only the logits are needed, so the labels are not passed to the
                # model and no loss is computed.
                logits = model(input_ids, attention_mask = attention_mask)["logits"]

                predicted_labels = logits.argmax(dim = 1)
                correct += (predicted_labels == labels).sum()
                total += labels.size(0)
    finally:
        if was_training:
            model.train()

    if total == 0:
        raise ValueError("data_loader yielded no samples; accuracy is undefined")

    return (correct.float() / total) * 100

In [21]:
# Hyperparameters for the Trainer run, collected in one mapping and unpacked
# into TrainingArguments.
training_options = {
    "output_dir": "DistilBERT3_LoRA_Result",
    "learning_rate": 1e-3,
    "num_train_epochs": 3,
    "per_device_train_batch_size": 32,
    "per_device_eval_batch_size": 32,
    "warmup_steps": 500,
    "weight_decay": 0.01,
    "logging_dir": "logs_LoRA",
    "logging_steps": 10,
}
trainer_args = TrainingArguments(**training_options)

In [22]:
# Build the Trainer from the model, the training arguments and the Dataset objects.
# The DataLoader objects created earlier in the notebook are not passed here.
# NOTE(review): trainer_args sets no evaluation schedule, so eval_dataset is
# presumably only used if trainer.evaluate() is called explicitly — confirm.
trainer = Trainer(
    model=model,
    args=trainer_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [23]:
# Run fine-tuning. The recorded run below reports global_step=3282 and a
# train_runtime of about 3772 seconds.
trainer.train()

Step,Training Loss
10,0.4391
20,0.4917
30,0.3213
40,0.3211
50,0.2743
60,0.2611
70,0.2715
80,0.2484
90,0.2532
100,0.2507


TrainOutput(global_step=3282, training_loss=0.21491651469472703, metrics={'train_runtime': 3771.9517, 'train_samples_per_second': 27.837, 'train_steps_per_second': 0.87, 'total_flos': 1.411196516352e+16, 'train_loss': 0.21491651469472703, 'epoch': 3.0})

In [24]:
# Put the model in evaluation mode on the target device, then score the test split.
model.eval()
model.to(device)

test_accuracy = check_accuracy(model, test_loader, device)
print(f'Test accuracy: {test_accuracy:.2f}%')

Test accuracy: 92.60%
