In [1]:
!pip install datasets peft evaluate transformers

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading peft-0.13.2-py3-none-any.whl (320 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-an

In [2]:
from datasets import load_dataset,Dataset,DatasetDict

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)

from peft import PeftModel, PeftConfig, LoraConfig, get_peft_model
import evaluate
import torch
import numpy as np


#Load Dataset and Trucating it

In [6]:
spam_dataset = load_dataset("SetFit/enron_spam")

#modifying dataset with only 1500 samples
N = 1500

#generate random indexes for random subsamples
rand_idx_train = np.random.randint(23999,size=N)
rand_idx_test = np.random.randint(2000,size=N)

#form train and test data
x_train = spam_dataset['train'][rand_idx_train]['text']
y_train = spam_dataset['train'][rand_idx_train]['label']

x_test = spam_dataset['test'][rand_idx_test]['text']
y_test = spam_dataset['test'][rand_idx_test]['label']


#new dataset

dataset = DatasetDict()
dataset['train'] = Dataset.from_dict({'text':x_train,'label':y_train})
dataset['validation'] = Dataset.from_dict({'text':x_test,'label':y_test})

Repo card metadata block was not found. Setting CardData to empty.


In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1500
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1500
    })
})

In [8]:
set(dataset['train']['label'])

{0, 1}

#Define the model - RoBERTa base

In [9]:
from re import I
id2label = {0: "ham", 1: "spam"}
label2id = {label: idx for idx, label in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base", num_labels=2, id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True)

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
#create a tokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base",add_prefix_space=True)

#Adding pad token if it does'nt exist

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



In [11]:
#define tokenizer function
def tokenize_function(examples):
    text = examples["text"]
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512,
    )

    return tokenized_inputs

#Applying tokenization to my truncated dataset

In [12]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1500
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1500
    })
})

In [13]:
#applying datacollator

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [15]:
#define computer/performance metrics

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"Accuracy":accuracy.compute(predictions=predictions, references=labels)}
#

In [16]:
text_list = ["100 dollar credit message me.","Can you please send me the report by end of day?","Get rich quick! Click here for a limited-time offer!","Reminder: Meeting at 10 AM tomorrow in conference room B","Congrate you are a millionaire now"]


print("Untrained Model Predictions")
print("-------------------------------------")

for text in text_list:
  #tokenize
  inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
  #compute logits
  logits = model(**inputs).logits
  #take argmax
  predictions = torch.argmax(logits)

  print(text + " - " + id2label[predictions.tolist()])

Untrained Model Predictions
-------------------------------------
100 dollar credit message me. - spam
Can you please send me the report by end of day? - spam
Get rich quick! Click here for a limited-time offer! - spam
Reminder: Meeting at 10 AM tomorrow in conference room B - spam
Congrate you are a millionaire now - spam


In [17]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

#define config for Parameter Efficient Finetunning

In [18]:
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    r=5,
    lora_alpha=33,
    lora_dropout=0.03,
    target_modules=["query", "value"],
)
#

In [19]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 776,450 || all params: 125,423,620 || trainable%: 0.6191


In [22]:
lr = 1e-3
batch_size = 4
num_epochs = 10

#Define training arguements and model trainer

In [25]:
training_args = TrainingArguments(
    output_dir="RoBERTa-base"+"text_classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)



In [26]:
trainer = Trainer (
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: [32m[41mERROR[0m API key must be 40 characters long, yours was 37


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.200391,{'accuracy': 0.9653333333333334}
2,0.324100,0.222317,{'accuracy': 0.954}
3,0.130500,0.11225,{'accuracy': 0.9746666666666667}
4,0.081100,0.15936,{'accuracy': 0.978}
5,0.081100,0.204141,{'accuracy': 0.9753333333333334}
6,0.035900,0.353647,{'accuracy': 0.9626666666666667}
7,0.011500,0.30289,{'accuracy': 0.9733333333333334}
8,0.005800,0.31386,{'accuracy': 0.9713333333333334}
9,0.005800,0.250255,{'accuracy': 0.978}
10,0.000000,0.267402,{'accuracy': 0.974}


Trainer is attempting to log a value of "{'accuracy': 0.9653333333333334}" of type <class 'dict'> for key "eval/Accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.954}" of type <class 'dict'> for key "eval/Accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9746666666666667}" of type <class 'dict'> for key "eval/Accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.978}" of type <class 'dict'> for key "eval/Accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9753333333333334}" of type <class 'dict'> for 

TrainOutput(global_step=3750, training_loss=0.07851869401255002, metrics={'train_runtime': 1655.8853, 'train_samples_per_second': 9.059, 'train_steps_per_second': 2.265, 'total_flos': 3388556365935840.0, 'train_loss': 0.07851869401255002, 'epoch': 10.0})

In [27]:
model.to('cpu') # moving to mps for Mac (can alternatively do 'cpu')

print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("cpu") # moving to mps for Mac (can alternatively do 'cpu')

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
100 dollar credit message me. - spam
Can you please send me the report by end of day? - ham
Get rich quick! Click here for a limited-time offer! - spam
Reminder: Meeting at 10 AM tomorrow in conference room B - ham
Congrate you are a millionaire now - spam


In [32]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [33]:
#Pusing the model to HuggingFace

hf_name = 'darshanluffy'
model_id = hf_name + "/" + "roberta-base" + "-lora-text-classification" # you can name the model whatever you want
print(model_id)


darshanluffy/roberta-base-lora-text-classification


In [34]:
model.push_to_hub(model_id)

adapter_model.safetensors:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/darshanluffy/roberta-base-lora-text-classification/commit/622150b06bc72d9309ef5122c989044d20491f44', commit_message='Upload model', commit_description='', oid='622150b06bc72d9309ef5122c989044d20491f44', pr_url=None, pr_revision=None, pr_num=None)