In [1]:
import torch
from torch import nn
from torch.nn import functional as F

model_name = "google/electra-large-discriminator"
# model_name = "google/electra-base-discriminator"

In [2]:
import numpy as np
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, ratings = eval_pred
    # predictions = np.argmax(predictions, axis=1)
    pred_pos = predictions > 5
    label_pos = ratings > 5
    # return accuracy.compute(predictions=pred_pos, references=label_pos)
    return {"accuracy": accuracy.compute(predictions=pred_pos, references=label_pos)["accuracy"],
            "rmse": np.mean((predictions - ratings)**2)**0.5}

# id2label = {0: "NEGATIVE", 1: "POSITIVE"}
# label2id = {"NEGATIVE": 0, "POSITIVE": 1}

bin c:\Users\Abstract\mambaforge\envs\sentenv2\lib\site-packages\bitsandbytes\libbitsandbytes_cuda121.dll


In [3]:
from datasets import load_dataset

imdb = load_dataset("./imdb_reg.py")
imdb.pop("unsupervised")
imdb, imdb["test"][0]

(DatasetDict({
     train: Dataset({
         features: ['text', 'label'],
         num_rows: 25000
     })
     test: Dataset({
         features: ['text', 'label'],
         num_rows: 25000
     })
 }),
 {'text': 'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care abou

In [4]:
imdb["test"][12500]

{'text': "Previous reviewer Claudio Carvalho gave a much better recap of the film's plot details than I could. What I recall mostly is that it was just so beautiful, in every sense - emotionally, visually, editorially - just gorgeous.<br /><br />If you like movies that are wonderful to look at, and also have emotional content to which that beauty is relevant, I think you will be glad to have seen this extraordinary and unusual work of art.<br /><br />On a scale of 1 to 10, I'd give it about an 8.75. The only reason I shy away from 9 is that it is a mood piece. If you are in the mood for a really artistic, very romantic film, then it's a 10. I definitely think it's a must-see, but none of us can be in that mood all the time, so, overall, 8.75.",
 'label': 10.0}

In [5]:
from transformers import DataCollatorWithPadding, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def preprocess_function(examples):
    tokens = tokenizer(examples["text"], truncation=False)
    if type(tokens['input_ids'][0]) == list:
        for i in range(len(tokens['input_ids'])):
            if len(tokens['input_ids'][i]) > 512:
                tokens['input_ids'][i] = tokens['input_ids'][i][:129] + \
                    [102] + tokens['input_ids'][i][-382:]
                tokens['token_type_ids'][i] = [0]*512
                tokens['attention_mask'][i] = [1]*512
    elif len(tokens['input_ids']) > 512:
        tokens['input_ids'] = tokens['input_ids'][:129] + \
            [102] + tokens['input_ids'][-382:]
        tokens['token_type_ids'] = [0]*512
        tokens['attention_mask'] = [1]*512
    return tokens

In [6]:
tokenized_imdb = imdb.map(preprocess_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (720 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [7]:
tokenized_imdb

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
})

In [8]:
import torch
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

model

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-23): 24 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (L

In [9]:
training_args = TrainingArguments(
    output_dir="electra_large_imdb_reg_spliced_fix10",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    warmup_steps=100,
    # torch_compile=True,
    # fp16=True,
    # load_best_model_at_end=True,
    # push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()



  0%|          | 0/2343 [00:00<?, ?it/s]

You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 36.9779, 'learning_rate': 1e-05, 'epoch': 0.06}
{'loss': 16.6502, 'learning_rate': 2e-05, 'epoch': 0.13}
{'loss': 13.7007, 'learning_rate': 1.9554168524297815e-05, 'epoch': 0.19}
{'loss': 6.0772, 'learning_rate': 1.9108337048595635e-05, 'epoch': 0.26}
{'loss': 4.2448, 'learning_rate': 1.8662505572893448e-05, 'epoch': 0.32}
{'loss': 5.0142, 'learning_rate': 1.8216674097191264e-05, 'epoch': 0.38}
{'loss': 3.3927, 'learning_rate': 1.777084262148908e-05, 'epoch': 0.45}
{'loss': 3.4273, 'learning_rate': 1.7325011145786894e-05, 'epoch': 0.51}
{'loss': 2.906, 'learning_rate': 1.687917967008471e-05, 'epoch': 0.58}
{'loss': 2.6533, 'learning_rate': 1.6433348194382527e-05, 'epoch': 0.64}
{'loss': 2.5707, 'learning_rate': 1.598751671868034e-05, 'epoch': 0.7}
{'loss': 2.5857, 'learning_rate': 1.5541685242978156e-05, 'epoch': 0.77}
{'loss': 2.5658, 'learning_rate': 1.5095853767275971e-05, 'epoch': 0.83}
{'loss': 2.7225, 'learning_rate': 1.4650022291573786e-05, 'epoch': 0.9}
{'loss': 2.0151

  0%|          | 0/1563 [00:00<?, ?it/s]

{'eval_loss': 3.9117867946624756, 'eval_accuracy': 0.88208, 'eval_rmse': 4.946967789532184, 'eval_runtime': 621.0961, 'eval_samples_per_second': 40.251, 'eval_steps_per_second': 2.517, 'epoch': 1.0}
{'loss': 2.4671, 'learning_rate': 1.3758359340169416e-05, 'epoch': 1.02}
{'loss': 1.8725, 'learning_rate': 1.3312527864467232e-05, 'epoch': 1.09}
{'loss': 1.9538, 'learning_rate': 1.2866696388765047e-05, 'epoch': 1.15}
{'loss': 1.7274, 'learning_rate': 1.2420864913062862e-05, 'epoch': 1.22}
{'loss': 1.9257, 'learning_rate': 1.197503343736068e-05, 'epoch': 1.28}
{'loss': 1.8435, 'learning_rate': 1.1529201961658493e-05, 'epoch': 1.34}
{'loss': 1.6858, 'learning_rate': 1.1083370485956308e-05, 'epoch': 1.41}
{'loss': 1.9954, 'learning_rate': 1.0637539010254126e-05, 'epoch': 1.47}
{'loss': 1.7033, 'learning_rate': 1.019170753455194e-05, 'epoch': 1.54}
{'loss': 1.4969, 'learning_rate': 9.745876058849756e-06, 'epoch': 1.6}
{'loss': 1.5146, 'learning_rate': 9.30004458314757e-06, 'epoch': 1.66}
{'lo

  0%|          | 0/1563 [00:00<?, ?it/s]

{'eval_loss': 1.6247199773788452, 'eval_accuracy': 0.95872, 'eval_rmse': 4.865195563248444, 'eval_runtime': 619.8743, 'eval_samples_per_second': 40.331, 'eval_steps_per_second': 2.521, 'epoch': 2.0}
{'loss': 1.4602, 'learning_rate': 6.625055728934463e-06, 'epoch': 2.05}
{'loss': 1.2891, 'learning_rate': 6.179224253232279e-06, 'epoch': 2.11}
{'loss': 1.2627, 'learning_rate': 5.733392777530095e-06, 'epoch': 2.18}
{'loss': 1.4664, 'learning_rate': 5.28756130182791e-06, 'epoch': 2.24}
{'loss': 1.2305, 'learning_rate': 4.841729826125725e-06, 'epoch': 2.3}
{'loss': 1.2872, 'learning_rate': 4.39589835042354e-06, 'epoch': 2.37}
{'loss': 1.2644, 'learning_rate': 3.950066874721356e-06, 'epoch': 2.43}
{'loss': 1.3754, 'learning_rate': 3.5042353990191713e-06, 'epoch': 2.5}
{'loss': 1.1153, 'learning_rate': 3.0584039233169866e-06, 'epoch': 2.56}
{'loss': 1.3498, 'learning_rate': 2.612572447614802e-06, 'epoch': 2.62}
{'loss': 1.2545, 'learning_rate': 2.1667409719126175e-06, 'epoch': 2.69}
{'loss': 1

  0%|          | 0/1563 [00:00<?, ?it/s]

{'eval_loss': 1.7369074821472168, 'eval_accuracy': 0.96212, 'eval_rmse': 5.055060677999024, 'eval_runtime': 635.5792, 'eval_samples_per_second': 39.334, 'eval_steps_per_second': 2.459, 'epoch': 3.0}
{'train_runtime': 7900.3535, 'train_samples_per_second': 9.493, 'train_steps_per_second': 0.297, 'train_loss': 3.3375680559575938, 'epoch': 3.0}


TrainOutput(global_step=2343, training_loss=3.3375680559575938, metrics={'train_runtime': 7900.3535, 'train_samples_per_second': 9.493, 'train_steps_per_second': 0.297, 'train_loss': 3.3375680559575938, 'epoch': 3.0})

In [10]:
model(**(tokenizer("""When I was a kid I watched this many times over, and I remember whistling the "Happy Cat" song quite often. All the songs are great, and actually memorable, unlike many children's musicals, where the songs are just stuck in for no real reason. The scenes and costumes are lavish, and the acting is very well-done, which isn't surprising, considering the cast. Christopher Walken is very catlike, and doesn't need stupid make-up, or a cat costume for the viewer to believe he's a cat transformed to a human. And Jason Connery's so cute, as the shy and awkward miller's son, Corin, who falls in love with beautiful and the bold Princess Vera. This is a really fun, enjoyable, feature-length movie, where unlike most fairytales, the characters are given personalities. Some of my favourite parts are when Puss makes Corin pretend he's drowning; at the ball when everybody starts dancing a country dance, as it's "all the rage abroad"; when Walken is in the kitchen, dancing on the table (he's a pretty good dancer, too!); and when Vera tells Corin all the things she used to do when she was young, like pretending she was a miller's daughter. I'd recommend this film to children and parents alike, who love magic and fairytales. And it actually IS a movie you can watch together, as it won't drive adults up the wall.""", return_tensors="pt").to('cuda')))

SequenceClassifierOutput(loss=None, logits=tensor([[9.1055]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [11]:
model(**(tokenizer("It was a bit boring. Kinda drawn out but fine ig.", return_tensors="pt").to('cuda'))).logits

tensor([[3.9524]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [60]:
torch.cuda.empty_cache()