### Installing all necessary dependencies and logging in to the Hugging Face Hub

In [None]:
# Install the PyPI packages used by this notebook.
!pip install -q bitsandbytes datasets accelerate loralib
# transformers and peft are installed straight from their git repositories
# (transformers from the `main` branch), so no release version is pinned here.
# NOTE(review): consider pinning tags/commits to keep the notebook reproducible.
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git
# Download the Spanish spaCy pipeline that is loaded later with spacy.load().
# NOTE(review): spaCy itself is not installed above — assumes the runtime already provides it.
!python -m spacy download es_core_news_lg

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face login. The push_to_hub calls at the end of the
# notebook pass use_auth_token=True and presumably rely on this login — confirm.
notebook_login()

In [None]:
# List the GPUs available to this runtime before loading the model.
!nvidia-smi -L

### Set up the model

In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

# The model and its tokenizer come from the same Hub checkpoint, so name it once.
base_checkpoint = "dquisi/story_spanish_gpt2_v2"

# device_map='auto' leaves device placement of the weights to the library.
model = AutoModelForCausalLM.from_pretrained(base_checkpoint, device_map='auto')

tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

### Freezing the original weights


In [None]:
# Freeze every weight of the loaded model so that none of them receives gradients.
for frozen in model.parameters():
  frozen.requires_grad_(False)
  # One-dimensional parameters (presumably biases / layer-norm weights) are
  # stored as float32.
  if frozen.ndim == 1:
    frozen.data = frozen.data.float()

# Enable gradient checkpointing, and make the inputs require gradients so the
# backward pass still works with every base weight frozen.
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  """Sequential container whose forward result is always returned as float32."""

  def forward(self, x):
    # Run the wrapped modules as usual, then cast whatever dtype they produced.
    wrapped_output = super().forward(x)
    return wrapped_output.to(torch.float32)
# Wrap the LM head so that its output is always returned as float32.
model.lm_head = CastOutputToFloat(model.lm_head)

### Setting up the LoRA adapters

In [None]:
def print_trainable_parameters(model):
    """Print how many of ``model``'s parameters are trainable.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and a ``requires_grad`` flag.

    Prints one line with the trainable count, the total count and the
    trainable percentage. A model without parameters reports ``0.0`` percent
    instead of raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio: a parameter-free model would otherwise divide by zero.
    percentage = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percentage}"
    )

In [None]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16, # rank of the LoRA update matrices (not the number of attention heads)
    lora_alpha=32, #alpha scaling
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM" # set this for CLM or Seq2Seq
)

# Wrap the model with the LoRA configuration and report the trainable share.
# NOTE(review): `model` is rebound here, so re-running this cell wraps the
# already-wrapped model again — restart from the model-loading cell instead.
model = get_peft_model(model, config)
print_trainable_parameters(model)

### Preprocessing the news dataset

In [None]:
import transformers
import spacy
from datasets import load_dataset

In [None]:
# Load the news dataset and print a summary of what was loaded.
# Later cells index data['train'] and data['test'].
data = load_dataset("BrauuHdzM/noticias-en-espanol")
print(data)

In [None]:
# Load the spaCy pipeline downloaded in the install cell; the helper functions
# below read it through the module-level name `nlp`.
nlp = spacy.load("es_core_news_lg")

In [None]:
def extract_first_loc(text):
    """Return the text of the first entity labelled ``LOC`` in ``text``.

    Runs the module-level spaCy pipeline ``nlp`` over ``text`` and scans the
    resulting entities in order. Returns ``None`` if no ``LOC`` entity exists.
    """
    for entity in nlp(text).ents:
        if entity.label_ == "LOC":
            return entity.text
    return None

In [None]:
def extract_all_ner(text):
    """Return every named entity found in ``text`` as one space-separated string.

    Entities of all labels are kept (there is no filtering by type), in the
    order reported by the module-level spaCy pipeline ``nlp``. The result is
    the empty string when no entity is found.
    """
    return ' '.join(entity.text for entity in nlp(text).ents)

In [None]:
def merge_columns(example):
    """Add the ``entities`` and ``text`` fields to one dataset row.

    Args:
        example: Mapping with the string fields ``"Título"``, ``"Contenido"``
            and ``"Fecha"``.

    Returns:
        The same ``example`` object, with two added keys:
        ``"entities"`` — one ``"<entity text> (<label>)"`` line per entity
        found in ``"Contenido"``; and ``"text"`` — the training string built
        from the title, the space-joined entity texts, the date and the content.

    The article is parsed by spaCy only once; the parse is reused for both
    fields, giving the same entity string as ``extract_all_ner`` without
    running the pipeline a second time on the same text.
    """
    entities = list(nlp(example["Contenido"]).ents)

    example["entities"] = "\n".join(f"{ent.text} ({ent.label_})" for ent in entities)

    # Same value extract_all_ner(example["Contenido"]) would produce.
    all_ner = ' '.join(ent.text for ent in entities)

    # NOTE(review): the "Lugar: " field is emitted without a value, and
    # extract_first_loc is defined in this notebook but never called — confirm
    # whether the location was meant to be inserted here.
    example["text"] = (
        example["Título"] + ". " + all_ner
        + ". Fecha: " + example["Fecha"]
        + ". Lugar: " + "->: "
        + "El " + example["Fecha"] + ". " + example["Contenido"]
    )
    return example

# Build the 'entities' and 'text' columns for both splits and drop the link column.
for split_name in ('train', 'test'):
    data[split_name] = data[split_name].map(merge_columns, remove_columns= 'Vínculo' )

In [None]:
# Tokenize the 'text' column of every split in batches. Sequences are truncated
# to the model's maximum number of positions: without truncation a long article
# yields more tokens than the model has position embeddings for, and training
# fails on that batch.
data = data.map(
    lambda samples: tokenizer(
        samples['text'],
        truncation=True,
        max_length=model.config.max_position_embeddings,
    ),
    batched=True,
) # mapping all the dataset

### Training

In [None]:
# Hyperparameters for the run.
# NOTE(review): both num_train_epochs and max_steps are set; per the
# TrainingArguments documentation a positive max_steps takes precedence —
# confirm which limit is intended.
training_args = transformers.TrainingArguments(
    output_dir='outputs',
    num_train_epochs=1,
    max_steps=10000,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    warmup_steps=100,
    fp16=False,
    logging_steps=100,
)

# mlm=False: build causal (next-token) language-modeling batches, not masked-LM ones.
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data['train'],
    eval_dataset=data['test'],
    data_collator=causal_lm_collator,
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!


In [None]:
# Run the fine-tuning loop with the trainer configured above.
trainer.train()

### Push the model adapters and tokenizer to your repository on the Hugging Face Hub

In [None]:
# Upload the (PEFT-wrapped) model to the Hub.
# The repository id and the commit message below are blank placeholders:
# fill both in before running this cell.
model.push_to_hub(" ", #Your repository in HuggingFace
                  use_auth_token=True,
                  commit_message=" ")

In [None]:
# Upload the tokenizer to the Hub.
# The repository id below is an empty placeholder: fill it in (presumably with
# the same repository used for the model) before running this cell.
tokenizer.push_to_hub("", #Your repository in HuggingFace
                      use_auth_token=True,
                      commit_message="con NER")