# Download model

In [1]:
from transformers import AutoTokenizer, AutoModel 
from pathlib import Path
roberta_model = "roberta-large"



# Download dataset

In [2]:
from datasets import load_dataset,Dataset, load_from_disk ,concatenate_datasets, DatasetDict , Sequence , Value , Features , ClassLabel
conll = load_dataset("conll2003")


In [3]:
conll['train'].features['ner_tags'].feature.names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [4]:
# convert each label to it's number
tags = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
tags_names = ClassLabel(num_classes=len(tags) , names=tags)
def create_tag_names(batch):
    return {'ner_tags_names' : [tags_names.int2str(idx) for idx in batch['ner_tags']]}

In [5]:
conll = conll.map(create_tag_names)

In [6]:
# remove additional columns
conll['train'] = conll['train'].remove_columns(['pos_tags','chunk_tags','id'])
conll['test'] = conll['test'].remove_columns(['pos_tags','chunk_tags','id'])
conll['validation'] = conll['validation'].remove_columns(['pos_tags','chunk_tags','id'])
conll.save_to_disk("Conell_en.hf")


Saving the dataset (0/1 shards):   0%|          | 0/14041 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3250 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3453 [00:00<?, ? examples/s]

In [7]:
conll

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'ner_tags_names'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'ner_tags_names'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'ner_tags_names'],
        num_rows: 3453
    })
})

# Training without Lora

In [8]:
from transformers.models.roberta.modeling_roberta import RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel
from transformers import AutoTokenizer , RobertaConfig , AutoConfig , TrainingArguments , DataCollatorForTokenClassification , Trainer
from transformers.modeling_outputs import TokenClassifierOutput

class Roberta(RobertaPreTrainedModel):
    
    class_config = RobertaConfig
    
    def __init__(self , config):
        
        super().__init__(config)
        
        self.num_labels = config.num_labels
        
        self.roberta = RobertaModel(config , add_pooling_layer=False)
        
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        
        self.classifier = nn.Linear(config.hidden_size , config.num_labels)
        
        self.init_weights()
        
    def forward(self , input_ids=None , attention_mask=None , token_type_ids=None , labels=None , **kwargs):

        outputs = self.roberta(input_ids , attention_mask=attention_mask , token_type_ids=token_type_ids , **kwargs)

        sequence_output = self.dropout(outputs[0])

        logits = self.classifier(sequence_output)

        loss = None

        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1 , self.num_labels) , labels.view(-1))

        return TokenClassifierOutput(
            loss = loss,
            logits = logits,
            hidden_states = outputs.hidden_states,
            attentions = outputs.attentions
        )

In [9]:
from transformers import RobertaTokenizerFast

roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_model, add_prefix_space=True)


In [10]:

def tokenize_and_align_labels(examples):
    tokenized_inputs = roberta_tokenizer(examples["tokens"], truncation=True,
    is_split_into_words=True)
    labels = []
    for idx, label in enumerate(examples["ner_tags_names"]):
        word_ids = tokenized_inputs.word_ids(batch_index=idx)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                #  if a word convert more than one toekn, ignore except first token
                label_ids.append(-100)
            else:
                label_token = label[word_idx]
                # Use the label map to get the numerical value for each entity
                label_ids.append(tag2index[label_token])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [11]:
ner_tags = conll['train']['ner_tags_names']
ner_tag_names = set(tag for tags in ner_tags for tag in tags)
                
index2tag = {idx: tag for idx, tag in enumerate(ner_tag_names)}
tag2index = {tag: idx for idx, tag in enumerate(ner_tag_names)}

In [12]:
ner_tag_names

{'B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O'}

In [13]:
def encode(corpus):
    return corpus.map(tokenize_and_align_labels,batched=True,
                      remove_columns=['tokens' , 'ner_tags' , 'ner_tags_names'])

In [14]:
encoded_data = encode(conll)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [15]:
import numpy as np

def align_predictions(predictions, label_ids):
        preds = np.argmax(predictions, axis=2)
        batch_size, seq_len = preds.shape
        labels_list, preds_list = [], []
        for batch_idx in range(batch_size):
            example_labels, example_preds = [], []
            for seq_idx in range(seq_len):
                # Ignore label IDs = -100
                if label_ids[batch_idx, seq_idx] != -100:
                    example_labels.append(index2tag[label_ids[batch_idx][seq_idx]])
                    example_preds.append(index2tag[preds[batch_idx][seq_idx]])
            labels_list.append(example_labels)
            preds_list.append(example_preds)
        return preds_list, labels_list


In [16]:
from transformers import TrainingArguments
import torch

# init training args
num_epochs = 4
batch_size = 24
logging_steps = len(encoded_data['train']) // batch_size
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

training_args = TrainingArguments(
    output_dir=roberta_model, log_level="error", num_train_epochs=num_epochs,
    gradient_checkpointing=True,
    eval_accumulation_steps=10,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    seed=42,
    logging_strategy="steps", evaluation_strategy="epoch",
    save_steps=1e6, weight_decay=0.01, disable_tqdm=False,
    logging_steps=logging_steps, push_to_hub=False)

In [17]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=roberta_tokenizer)

In [18]:
# function for model
import torch.nn as nn
from transformers import RobertaModel, RobertaConfig
import torch


def model_init():
    roberta_config  = AutoConfig.from_pretrained(
    "roberta-large",
    num_labels = len(index2tag),
    id2label = index2tag,
    label2id = tag2index
    )
    return Roberta.from_pretrained(roberta_model, config=roberta_config,cache_dir=Path.cwd()).to(device)


In [19]:
from seqeval.metrics import f1_score,recall_score,precision_score,accuracy_score
import wandb

def compute_metrics(eval_pred):
    y_pred, y_true = align_predictions(eval_pred.predictions,
    eval_pred.label_ids)
    wandb.log({"f1": f1_score(y_true, y_pred),"Recall":recall_score(y_true, y_pred),"Precision":precision_score(y_true, y_pred),"Accuracy":accuracy_score(y_true, y_pred)})
    return {"f1": f1_score(y_true, y_pred),"Recall":recall_score(y_true, y_pred),"Precision":precision_score(y_true, y_pred),"Accuracy":accuracy_score(y_true,y_pred)}

In [20]:
trainer = Trainer(model_init=model_init, args=training_args,
                    data_collator=data_collator, compute_metrics=compute_metrics,
                    train_dataset=encoded_data['train'],
                    eval_dataset=encoded_data['validation'],
                    tokenizer=roberta_tokenizer)

In [21]:
import wandb

# monitoring
wandb.init(project="Roberta",name='without-lora')
result = trainer.train()
wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mali-fartout[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016933333333327028, max=1.0…



  0%|          | 0/2344 [00:00<?, ?it/s]

{'loss': 0.0955, 'learning_rate': 3.7521331058020484e-05, 'epoch': 1.0}


  0%|          | 0/136 [00:00<?, ?it/s]

{'eval_loss': 0.04328341409564018, 'eval_f1': 0.9372141372141373, 'eval_Recall': 0.9483338943116796, 'eval_Precision': 0.9263521288837745, 'eval_Accuracy': 0.9903819944706203, 'eval_runtime': 9.0545, 'eval_samples_per_second': 358.937, 'eval_steps_per_second': 15.02, 'epoch': 1.0}
{'loss': 0.0338, 'learning_rate': 2.5042662116040955e-05, 'epoch': 2.0}


  0%|          | 0/136 [00:00<?, ?it/s]

{'eval_loss': 0.03254313766956329, 'eval_f1': 0.9524687735769972, 'eval_Recall': 0.9560753954897341, 'eval_Precision': 0.9488892600634709, 'eval_Accuracy': 0.9918811572758071, 'eval_runtime': 9.0805, 'eval_samples_per_second': 357.908, 'eval_steps_per_second': 14.977, 'epoch': 2.0}
{'loss': 0.0173, 'learning_rate': 1.2563993174061433e-05, 'epoch': 2.99}


  0%|          | 0/136 [00:00<?, ?it/s]

{'eval_loss': 0.031557872891426086, 'eval_f1': 0.9570243779844183, 'eval_Recall': 0.9612924941097274, 'eval_Precision': 0.9527939949958298, 'eval_Accuracy': 0.9922900198590398, 'eval_runtime': 9.097, 'eval_samples_per_second': 357.259, 'eval_steps_per_second': 14.95, 'epoch': 3.0}
{'loss': 0.0085, 'learning_rate': 8.532423208191127e-08, 'epoch': 3.99}


  0%|          | 0/136 [00:00<?, ?it/s]

{'eval_loss': 0.029325664043426514, 'eval_f1': 0.9638574423480084, 'eval_Recall': 0.9671827667452036, 'eval_Precision': 0.9605549055657697, 'eval_Accuracy': 0.9935750165491998, 'eval_runtime': 9.1179, 'eval_samples_per_second': 356.441, 'eval_steps_per_second': 14.916, 'epoch': 4.0}
{'train_runtime': 719.6786, 'train_samples_per_second': 78.04, 'train_steps_per_second': 3.257, 'train_loss': 0.03870046309015869, 'epoch': 4.0}


VBox(children=(Label(value='0.000 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.0, max…

0,1
Accuracy,▁▄▅█
Precision,▁▆▆█
Recall,▁▄▆█
eval/Accuracy,▁▄▅█
eval/Precision,▁▆▆█
eval/Recall,▁▄▆█
eval/f1,▁▅▆█
eval/loss,█▃▂▁
eval/runtime,▁▄▆█
eval/samples_per_second,█▅▃▁

0,1
Accuracy,0.99358
Precision,0.96055
Recall,0.96718
eval/Accuracy,0.99358
eval/Precision,0.96055
eval/Recall,0.96718
eval/f1,0.96386
eval/loss,0.02933
eval/runtime,9.1179
eval/samples_per_second,356.441


In [22]:
trainer.model.num_parameters()

354319369

# Training with lora

In [101]:
from peft import LoraConfig, get_peft_model, TaskType
from transformers import AutoConfig, AutoModel 
import torch.nn as nn
import torch

def model_init():
    roberta_config  = AutoConfig.from_pretrained(
    "roberta-large",
    num_labels = len(index2tag),
    id2label = index2tag,
    label2id = tag2index
    )
    
    lora_config = LoraConfig(
        task_type=TaskType.TOKEN_CLS, inference_mode=False, r=16, lora_alpha=16, lora_dropout=0.1, bias="all"
    )
    original_model = Roberta.from_pretrained(roberta_model, config=roberta_config).to(device)
    return  get_peft_model(original_model, lora_config)

In [102]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_lora = model_init()
model_lora.print_trainable_parameters()

trainable params: 1,853,449 || all params: 355,901,458 || trainable%: 0.5207758940959438


In [103]:
from transformers import TrainingArguments
import torch
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
num_epochs = 4
batch_size = 32
logging_steps = len(encoded_data['train']) // batch_size
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

training_args = TrainingArguments(
    output_dir="./lorass", log_level="error", num_train_epochs=num_epochs,
    gradient_checkpointing=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=1e-3,
    seed=42,
    logging_strategy="steps", evaluation_strategy="epoch",
    save_steps=1e6, weight_decay=0.01, disable_tqdm=False,
    logging_steps=logging_steps, push_to_hub=False)

In [104]:
logging_steps

438

In [105]:
from transformers import Trainer, TrainingArguments, AdamW, get_linear_schedule_with_warmup

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=encoded_data['train'],
    eval_dataset=encoded_data['validation'],
    tokenizer=roberta_tokenizer
)



In [106]:
wandb.init(project="Roberta",name='lora')
result = trainer.train()

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016933333333327028, max=1.0…

  0%|          | 0/1756 [00:00<?, ?it/s]

{'loss': 0.1108, 'learning_rate': 0.0007505694760820045, 'epoch': 1.0}


  0%|          | 0/102 [00:00<?, ?it/s]

{'eval_loss': 0.04405393451452255, 'eval_f1': 0.9235382308845577, 'eval_Recall': 0.9330191854594413, 'eval_Precision': 0.9142480211081794, 'eval_Accuracy': 0.987773061796659, 'eval_runtime': 9.1971, 'eval_samples_per_second': 353.37, 'eval_steps_per_second': 11.09, 'epoch': 1.0}
{'loss': 0.0431, 'learning_rate': 0.0005011389521640091, 'epoch': 2.0}


  0%|          | 0/102 [00:00<?, ?it/s]

{'eval_loss': 0.032071053981781006, 'eval_f1': 0.952969676718737, 'eval_Recall': 0.9599461460787614, 'eval_Precision': 0.9460938795820202, 'eval_Accuracy': 0.9923289591526809, 'eval_runtime': 9.1989, 'eval_samples_per_second': 353.304, 'eval_steps_per_second': 11.088, 'epoch': 2.0}
{'loss': 0.0302, 'learning_rate': 0.0002517084282460137, 'epoch': 2.99}


  0%|          | 0/102 [00:00<?, ?it/s]

{'eval_loss': 0.029818153008818626, 'eval_f1': 0.9573007367716009, 'eval_Recall': 0.9621339616290812, 'eval_Precision': 0.9525158280573143, 'eval_Accuracy': 0.9925431252677076, 'eval_runtime': 9.1969, 'eval_samples_per_second': 353.38, 'eval_steps_per_second': 11.091, 'epoch': 3.0}
{'loss': 0.0217, 'learning_rate': 2.2779043280182233e-06, 'epoch': 3.99}


  0%|          | 0/102 [00:00<?, ?it/s]

{'eval_loss': 0.026289232075214386, 'eval_f1': 0.9632235905168803, 'eval_Recall': 0.9675193537529452, 'eval_Precision': 0.9589658048373645, 'eval_Accuracy': 0.993613955842841, 'eval_runtime': 9.315, 'eval_samples_per_second': 348.901, 'eval_steps_per_second': 10.95, 'epoch': 4.0}
{'train_runtime': 555.7154, 'train_samples_per_second': 101.066, 'train_steps_per_second': 3.16, 'train_loss': 0.051398200680902714, 'epoch': 4.0}
