In [1]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    BertTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    BertForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ds = load_dataset("SetFit/tweet_sentiment_extraction")
ds

Repo card metadata block was not found. Setting CardData to empty.


DatasetDict({
    train: Dataset({
        features: ['textID', 'text', 'label', 'label_text'],
        num_rows: 27481
    })
    test: Dataset({
        features: ['textID', 'text', 'label', 'label_text'],
        num_rows: 3534
    })
})

In [3]:
import pandas as pd
train_df = pd.DataFrame(ds["train"])

label_mapping = train_df[["label", "label_text"]].drop_duplicates().sort_values("label")
print("Label mapping:")
print(label_mapping)

label_names = label_mapping["label_text"].tolist()
label_names

Label mapping:
   label label_text
1      0   negative
0      1    neutral
6      2   positive


['negative', 'neutral', 'positive']

In [4]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
for name, param in model.named_parameters():
    print(name,param.requires_grad)

bert.embeddings.word_embeddings.weight True
bert.embeddings.position_embeddings.weight True
bert.embeddings.token_type_embeddings.weight True
bert.embeddings.LayerNorm.weight True
bert.embeddings.LayerNorm.bias True
bert.encoder.layer.0.attention.self.query.weight True
bert.encoder.layer.0.attention.self.query.bias True
bert.encoder.layer.0.attention.self.key.weight True
bert.encoder.layer.0.attention.self.key.bias True
bert.encoder.layer.0.attention.self.value.weight True
bert.encoder.layer.0.attention.self.value.bias True
bert.encoder.layer.0.attention.output.dense.weight True
bert.encoder.layer.0.attention.output.dense.bias True
bert.encoder.layer.0.attention.output.LayerNorm.weight True
bert.encoder.layer.0.attention.output.LayerNorm.bias True
bert.encoder.layer.0.intermediate.dense.weight True
bert.encoder.layer.0.intermediate.dense.bias True
bert.encoder.layer.0.output.dense.weight True
bert.encoder.layer.0.output.dense.bias True
bert.encoder.layer.0.output.LayerNorm.weight True


In [12]:
for name, param in model.base_model.named_parameters():
    param.requires_grad = True if "pooler" in name else False
    print(name,param.requires_grad)

embeddings.word_embeddings.weight False
embeddings.position_embeddings.weight False
embeddings.token_type_embeddings.weight False
embeddings.LayerNorm.weight False
embeddings.LayerNorm.bias False
encoder.layer.0.attention.self.query.weight False
encoder.layer.0.attention.self.query.bias False
encoder.layer.0.attention.self.key.weight False
encoder.layer.0.attention.self.key.bias False
encoder.layer.0.attention.self.value.weight False
encoder.layer.0.attention.self.value.bias False
encoder.layer.0.attention.output.dense.weight False
encoder.layer.0.attention.output.dense.bias False
encoder.layer.0.attention.output.LayerNorm.weight False
encoder.layer.0.attention.output.LayerNorm.bias False
encoder.layer.0.intermediate.dense.weight False
encoder.layer.0.intermediate.dense.bias False
encoder.layer.0.output.dense.weight False
encoder.layer.0.output.dense.bias False
encoder.layer.0.output.LayerNorm.weight False
encoder.layer.0.output.LayerNorm.bias False
encoder.layer.1.attention.self.query

In [31]:
def tokenize(dataset):
    return tokenizer(dataset['text'],truncation=True)

In [34]:
tokenized_ds = ds.map(tokenize,batched=True)

Map: 100%|██████████| 27481/27481 [00:04<00:00, 5717.31 examples/s]
Map: 100%|██████████| 3534/3534 [00:00<00:00, 5679.20 examples/s]


In [35]:
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['textID', 'text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27481
    })
    test: Dataset({
        features: ['textID', 'text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3534
    })
})

In [19]:
from sklearn.metrics import accuracy_score
def compute_metrics(p):
    preds, labels = p
    preds = np.argmax(preds, axis=1)
    return {"accuracy": accuracy_score(labels, preds)}


In [36]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [39]:
args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=100,
    save_steps=100,
    eval_steps=100
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['test'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(


In [40]:
try:
    trainer.train()
except Exception as e:
    print(f"Error during raining: {e}")
    raise

Step,Training Loss
100,0.8765
200,0.8526
300,0.8579
400,0.8514
500,0.8553
600,0.8402
700,0.8461
800,0.8289
900,0.8457
1000,0.8352


In [41]:

predictions = trainer.predict(tokenized_ds["test"])

logits = predictions.predictions
labels = predictions.label_ids

metrics = compute_metrics((logits, labels))
print(metrics)

{'accuracy': 0.6397849462365591}


In [42]:
trainer.save_model('transfer_learning_model')

In [45]:
model1 = BertForSequenceClassification.from_pretrained("./transfer_learning_model")
model1.to('cuda')

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
texts = ["maybe someday I`ll find a book of yours on the bestsellers list? lol,awesome!","can`t go to bed  An am sooooo tired!","just woke up, no school today, we are free"]
inputs = tokenizer(texts,padding=True,truncation=True, return_tensors="pt").to('cuda')

with torch.no_grad():
    outputs = model1(**inputs)
    logits  = outputs.logits
logits

tensor([[-1.7622, -0.1409,  1.8675],
        [ 1.5190, -0.0333, -1.8211],
        [-0.6380,  0.5900, -0.1299]], device='cuda:0')

In [59]:

probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()
probs

array([[0.02285041, 0.11561893, 0.86153066],
       [0.8017957 , 0.16979383, 0.02841046],
       [0.164569  , 0.56190395, 0.27352706]], dtype=float32)

In [65]:
predictions = torch.argmax(logits, dim=-1).cpu().numpy()
predictions

array([2, 0, 1])

In [66]:
for text, pred_idx, prob_dist in zip(texts, predictions, probs):
    label = label_names[pred_idx]
    prob = prob_dist.max()
    print(f'{text}: {label} ({prob:.2f})')

maybe someday I`ll find a book of yours on the bestsellers list? lol,awesome!: positive (0.86)
can`t go to bed  An am sooooo tired!: negative (0.80)
just woke up, no school today, we are free: neutral (0.56)
