In [1]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    BertTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    BertForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ds = load_dataset("SetFit/tweet_sentiment_extraction")
ds

Repo card metadata block was not found. Setting CardData to empty.


DatasetDict({
    train: Dataset({
        features: ['textID', 'text', 'label', 'label_text'],
        num_rows: 27481
    })
    test: Dataset({
        features: ['textID', 'text', 'label', 'label_text'],
        num_rows: 3534
    })
})

In [4]:
print(ds["train"].features)

{'textID': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None), 'label_text': Value(dtype='string', id=None)}


In [35]:
import pandas as pd
train_df = pd.DataFrame(ds["train"])

label_mapping = train_df[["label", "label_text"]].drop_duplicates().sort_values("label")
print("Label mapping:")
print(label_mapping)

label_names = label_mapping["label_text"].tolist()
label_names

Label mapping:
   label label_text
1      0   negative
0      1    neutral
6      2   positive


['negative', 'neutral', 'positive']

In [5]:
set(ds['train']['label'])

{0, 1, 2}

In [23]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
text = "puppy is cute"
input = tokenizer(text,return_tensors='pt',padding=True, truncation=True,max_length =512)
output = model(**input)
predicted_class = torch.argmax(output.logits).item()
print(f"Predicted Sentiment: {model.config.id2label[predicted_class]}")

Predicted Sentiment: LABEL_0


In [8]:
for name, module in model.named_modules():
    print(name)


bert
bert.embeddings
bert.embeddings.word_embeddings
bert.embeddings.position_embeddings
bert.embeddings.token_type_embeddings
bert.embeddings.LayerNorm
bert.embeddings.dropout
bert.encoder
bert.encoder.layer
bert.encoder.layer.0
bert.encoder.layer.0.attention
bert.encoder.layer.0.attention.self
bert.encoder.layer.0.attention.self.query
bert.encoder.layer.0.attention.self.key
bert.encoder.layer.0.attention.self.value
bert.encoder.layer.0.attention.self.dropout
bert.encoder.layer.0.attention.output
bert.encoder.layer.0.attention.output.dense
bert.encoder.layer.0.attention.output.LayerNorm
bert.encoder.layer.0.attention.output.dropout
bert.encoder.layer.0.intermediate
bert.encoder.layer.0.intermediate.dense
bert.encoder.layer.0.intermediate.intermediate_act_fn
bert.encoder.layer.0.output
bert.encoder.layer.0.output.dense
bert.encoder.layer.0.output.LayerNorm
bert.encoder.layer.0.output.dropout
bert.encoder.layer.1
bert.encoder.layer.1.attention
bert.encoder.layer.1.attention.self
bert.e

In [5]:
peft_config = LoraConfig(
                    task_type = 'SEQ_CLS',
                    r = 3,
                    lora_alpha = 42,
                    lora_dropout = 0.01,
                    target_modules = ["query", "value"])
model = get_peft_model(model,peft_config)

In [10]:
help(LoraConfig)

Help on class LoraConfig in module peft.tuners.lora.config:

class LoraConfig(peft.config.PeftConfig)
 |  LoraConfig(task_type: Union[str, peft.utils.peft_types.TaskType, NoneType] = None, peft_type: Union[str, peft.utils.peft_types.PeftType, NoneType] = None, auto_mapping: Optional[dict] = None, base_model_name_or_path: Optional[str] = None, revision: Optional[str] = None, inference_mode: bool = False, r: 'int' = 8, target_modules: 'Optional[Union[list[str], str]]' = None, exclude_modules: 'Optional[Union[list[str], str]]' = None, lora_alpha: 'int' = 8, lora_dropout: 'float' = 0.0, fan_in_fan_out: 'bool' = False, bias: "Literal['none', 'all', 'lora_only']" = 'none', use_rslora: 'bool' = False, modules_to_save: 'Optional[list[str]]' = None, init_lora_weights: "bool | Literal['gaussian', 'eva', 'olora', 'pissa', 'pissa_niter_[number of iters]', 'corda', 'loftq']" = True, layers_to_transform: 'Optional[Union[list[int], int]]' = None, layers_pattern: 'Optional[Union[list[str], str]]' = No

In [6]:
model.print_trainable_parameters()

trainable params: 112,899 || all params: 109,597,446 || trainable%: 0.1030


In [7]:
from sklearn.model_selection import train_test_split
train = ds['train'].train_test_split(test_size=0.1, seed=42)['test']  # 10% sample
test = ds['test'].train_test_split(test_size=0.1, seed=42)['test']

In [13]:
print(train,test)

Dataset({
    features: ['textID', 'text', 'label', 'label_text'],
    num_rows: 2749
}) Dataset({
    features: ['textID', 'text', 'label', 'label_text'],
    num_rows: 354
})


In [8]:
tokenized_train = tokenizer(train['text'],return_tensors='pt',padding=True, truncation=True,max_length =512)
tokenized_test = tokenizer(test['text'],return_tensors='pt',padding=True, truncation=True,max_length =512)

In [9]:
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item
    
    def __len__(self):
        return len(self.labels)

In [10]:
train_dataset = SentimentDataset(tokenized_train, train['label'])
val_dataset = SentimentDataset(tokenized_test, test['label'])

In [11]:
train_dataset[0]

{'input_ids': tensor([  101,  2054,  4148,  2000,  1996, 29229,  1997,  6737,  1029,  8046,
          2320,  2153,  1029,  1012,  1012,  1012,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [12]:
from sklearn.metrics import accuracy_score
def compute_metrics(p):
    preds, labels = p
    preds = np.argmax(preds, axis=1)
    return {"accuracy": accuracy_score(labels, preds)}


In [19]:
len(train['text'])

2749

In [20]:
len(train['label'])

2749

In [14]:
training_args = TrainingArguments(
    output_dir= "lora-text-classification",
    learning_rate=1e-3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    weight_decay=0.01,
    save_strategy="epoch"
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [15]:
try:
    trainer.train()
except Exception as e:
    print(f"Error during raining: {e}")
    raise

Step,Training Loss
500,0.89


In [16]:
trainer.evaluate()

{'eval_loss': 0.6954167485237122,
 'eval_accuracy': 0.7175141242937854,
 'eval_runtime': 0.8248,
 'eval_samples_per_second': 429.196,
 'eval_steps_per_second': 107.905,
 'epoch': 1.0}

In [19]:
trainer.save_model('lora_model')

In [25]:
lora_classification = BertForSequenceClassification.from_pretrained("./lora_model", num_labels=3 )
lora_classification.to('cuda')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): lora.Linear(
                (base_layer): Linear(in_features=768, out_features=768, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.01, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=3, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_featu

In [50]:
texts = ["maybe someday I`ll find a book of yours on the bestsellers list? lol,awesome!","can`t go to bed  An am sooooo tired!","just woke up, no school today, we are free"]
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to("cuda")
with torch.no_grad():
    outputs = lora_classification(**inputs)
    probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()

for text, prob in zip(texts, probs):
    pred_idx = np.argmax(prob)
    print(f"Text: {text}")
    print(f"Predicted: {label_names[pred_idx]} ({prob[pred_idx]:.2f})")

Text: maybe someday I`ll find a book of yours on the bestsellers list? lol,awesome!
Predicted: positive (0.98)
Text: can`t go to bed  An am sooooo tired!
Predicted: negative (0.85)
Text: just woke up, no school today, we are free
Predicted: neutral (0.90)
