In [1]:
import torch
from torch import nn
from torch.nn import functional as F

# Hugging Face Hub checkpoint id. Reused further down to load the model and the
# tokenizer, and to build the Trainer output directory name.
model_name = "google/electra-large-discriminator"

In [2]:
import numpy as np
import evaluate

# Accuracy metric object from the `evaluate` library; loaded once and reused by
# compute_metrics on every evaluation pass.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``accuracy`` metric.

    Args:
        eval_pred: A two-item pair ``(predictions, labels)``. The predictions
            are reduced with an argmax over axis 1, so they are presumably
            per-class scores of shape (n_examples, n_classes) - TODO confirm
            against the caller.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted class ids
        compared against ``labels``.
    """
    scores, references = eval_pred
    # Collapse the per-class scores to a single predicted class id per row.
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

# Class index -> sentiment label. The reverse lookup is derived from it so the
# two mappings cannot drift apart.
id2label = {
    0: "NEGATIVE",
    1: "POSITIVE",
}
label2id = {label: index for index, label in id2label.items()}

bin c:\Users\Abstract\mambaforge\envs\sentenv2\lib\site-packages\bitsandbytes\libbitsandbytes_cuda121.dll


In [3]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType


# Load the pretrained checkpoint with a 2-way sequence-classification head whose
# label names come from id2label / label2id.
# `trust_remote_code` is deliberately NOT passed: the resolved class is the
# built-in ElectraForSequenceClassification, so nothing from the Hub repository
# needs to be executed, and leaving the flag off keeps a later change of
# `model_name` from silently running repository-supplied Python.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# LoRA adapter settings for the sequence-classification task. No
# `target_modules` are listed, so the choice of adapted layers is left to PEFT.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_CLS,
)

# Wrap the base classifier with the adapters, then report how many parameters
# are still trainable.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Echo the wrapped module tree as the cell's output.
model

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 3676164 || all params: 337768452 || trainable%: 1.0883680752991105


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): ElectraForSequenceClassification(
      (electra): ElectraModel(
        (embeddings): ElectraEmbeddings(
          (word_embeddings): Embedding(30522, 1024, padding_idx=0)
          (position_embeddings): Embedding(512, 1024)
          (token_type_embeddings): Embedding(2, 1024)
          (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): ElectraEncoder(
          (layer): ModuleList(
            (0-23): 24 x ElectraLayer(
              (attention): ElectraAttention(
                (self): ElectraSelfAttention(
                  (query): Linear(
                    in_features=1024, out_features=1024, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default

In [4]:
from datasets import load_dataset

# Load the "imdb" dataset by name; its "train" and "test" splits are used below.
imdb = load_dataset("imdb")
# Peek at the first record of the test split as a quick sanity check.
imdb["test"][0]

{'text': 'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\'s rubbish as 

In [5]:
from transformers import DataCollatorWithPadding, AutoTokenizer, LlamaTokenizerFast

# Tokenizer belonging to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Batch collator built around that tokenizer; handed to the Trainer later on.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with truncation enabled.

    Args:
        examples: Mapping that exposes the raw review text(s) under the
            ``"text"`` key.

    Returns:
        The tokenizer's output for those texts.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

In [6]:
# Run preprocess_function over the dataset; batched=True passes it a batch of
# examples per call. The "train" and "test" splits of the result feed the Trainer.
tokenized_imdb = imdb.map(preprocess_function, batched=True)

In [7]:
# Hyperparameters for the LoRA fine-tuning run.
# The Hub id in `model_name` contains "/", which would otherwise be treated as a
# path separator and nest the run under a "my_awesome_lora_google/" directory,
# so it is flattened before being used in the output directory name.
training_args = TrainingArguments(
    output_dir="my_awesome_lora_" + model_name.replace("/", "_"),
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,  # 16 x 2 = 32 examples per optimizer step per device
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",  # evaluate once per epoch ...
    save_strategy="epoch",  # ... and checkpoint on the same schedule
    logging_steps=100,
    warmup_steps=100,
    fp16=True,
    # load_best_model_at_end=True,
    # push_to_hub=True,
)

# Assemble the Trainer from the pieces built in the earlier cells.
# NOTE(review): the IMDB test split is what gets evaluated during training; keep
# that in mind before enabling any checkpoint selection based on eval metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
)

# Kick off fine-tuning; the returned training summary is shown as the cell output.
trainer.train()



  0%|          | 0/1562 [00:00<?, ?it/s]

You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.6999, 'learning_rate': 2e-05, 'epoch': 0.13}
{'loss': 0.6805, 'learning_rate': 1.863201094391245e-05, 'epoch': 0.26}
{'loss': 0.4501, 'learning_rate': 1.729138166894665e-05, 'epoch': 0.38}
{'loss': 0.1921, 'learning_rate': 1.59233926128591e-05, 'epoch': 0.51}
{'loss': 0.185, 'learning_rate': 1.4555403556771547e-05, 'epoch': 0.64}
{'loss': 0.1705, 'learning_rate': 1.320109439124487e-05, 'epoch': 0.77}
{'loss': 0.1657, 'learning_rate': 1.183310533515732e-05, 'epoch': 0.9}


  0%|          | 0/1563 [00:00<?, ?it/s]

{'eval_loss': 0.14086991548538208, 'eval_accuracy': 0.9532, 'eval_runtime': 342.701, 'eval_samples_per_second': 72.95, 'eval_steps_per_second': 4.561, 'epoch': 1.0}
{'loss': 0.1593, 'learning_rate': 1.0478796169630643e-05, 'epoch': 1.02}
{'loss': 0.1535, 'learning_rate': 9.110807113543091e-06, 'epoch': 1.15}
{'loss': 0.158, 'learning_rate': 7.742818057455541e-06, 'epoch': 1.28}
{'loss': 0.1351, 'learning_rate': 6.388508891928865e-06, 'epoch': 1.41}
{'loss': 0.151, 'learning_rate': 5.0205198358413135e-06, 'epoch': 1.54}
{'loss': 0.139, 'learning_rate': 3.6525307797537625e-06, 'epoch': 1.66}
{'loss': 0.1581, 'learning_rate': 2.284541723666211e-06, 'epoch': 1.79}
{'loss': 0.1478, 'learning_rate': 9.165526675786594e-07, 'epoch': 1.92}


  0%|          | 0/1563 [00:00<?, ?it/s]

{'eval_loss': 0.1308840662240982, 'eval_accuracy': 0.9562, 'eval_runtime': 324.5554, 'eval_samples_per_second': 77.028, 'eval_steps_per_second': 4.816, 'epoch': 2.0}
{'train_runtime': 2473.7207, 'train_samples_per_second': 20.212, 'train_steps_per_second': 0.631, 'train_loss': 0.2453161360512317, 'epoch': 2.0}


TrainOutput(global_step=1562, training_loss=0.2453161360512317, metrics={'train_runtime': 2473.7207, 'train_samples_per_second': 20.212, 'train_steps_per_second': 0.631, 'train_loss': 0.2453161360512317, 'epoch': 2.0})