In [2]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.nn import functional as F
from matplotlib import pyplot as plt
%matplotlib inline

from transformers import DataCollatorWithPadding, AutoTokenizer
from transformers import AutoModel, ElectraForSequenceClassification

# The tokenizer is loaded from the base ELECTRA-large discriminator checkpoint,
# while the classification weights below come from a separate fine-tuned checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google/electra-large-discriminator")
# Collator used later to turn lists of tokenized examples into padded tensor batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Local checkpoint alternative: ("my_awesome_electra_large_spliced/checkpoint-1562")
electra = ElectraForSequenceClassification.from_pretrained(
    "AbstractQbit/electra_large_imdb_htsplice"
)
electra

bin c:\Users\Abstract\mambaforge\envs\sentenv2\lib\site-packages\bitsandbytes\libbitsandbytes_cuda121.dll


ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-23): 24 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (L

In [2]:
# Smoke test on a few hand-written reviews: the probability at logit index 1 is
# rescaled from [0, 1] to a 1-10 rating via 1 + 9*p.
with torch.no_grad():
    inp = tokenizer(
        [
            "I enjoyed this movie.",
            "this was an absolute blast!",
            "This was a fine movie, but I wouldn't watch it again to be honest, a bit too drawn out for my liking. My friends liked it though, so if you like detectives, you may want to watch it, if you have some spare time.",
            "This was an absolutely horrible movie!",
        ],
        padding=True,
        return_tensors="pt",
    )
    print(1 + 9 * electra(**inp).logits.softmax(dim=-1)[..., 1])

tensor([9.9518, 9.9582, 6.4365, 1.0593])


In [3]:
from datasets import load_dataset

# Build the dataset from the local loading script, then drop the split that is
# not used anywhere below.
imdb = load_dataset("./imdb_reg.py")
del imdb["unsupervised"]
# Show the remaining splits plus one example from the start and one from the
# middle of the test split.
imdb, imdb["test"][0], imdb["test"][12500]

(DatasetDict({
     train: Dataset({
         features: ['text', 'label'],
         num_rows: 25000
     })
     test: Dataset({
         features: ['text', 'label'],
         num_rows: 25000
     })
 }),
 {'text': 'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care abou

In [4]:
def preprocess_tokenize_with_splicing(examples, *, max_length=512, head_length=129, sep_token_id=102):
    """Tokenize ``examples["text"]`` and splice over-long sequences to ``max_length``.

    Sequences longer than ``max_length`` tokens are not simply truncated.
    Instead the first ``head_length`` token ids are kept, followed by a single
    ``sep_token_id``, followed by the last ``max_length - head_length - 1``
    token ids, so the result is exactly ``max_length`` ids long. For those
    sequences ``token_type_ids`` is reset to all zeros and ``attention_mask``
    to all ones. Shorter sequences are returned exactly as the tokenizer
    produced them.

    Args:
        examples: Mapping with a ``"text"`` entry holding either one string or
            a list of strings (the batched form used by ``Dataset.map``).
        max_length: Maximum number of token ids per sequence after splicing.
        head_length: Number of leading token ids kept from a long sequence.
        sep_token_id: Id inserted between the kept head and tail. The default
            102 is presumably the tokenizer's [SEP] id -- verify against
            ``tokenizer.sep_token_id`` if the tokenizer changes.

    Returns:
        The tokenizer output, modified in place for the spliced sequences.

    Raises:
        ValueError: If the lengths leave no room for at least one head token
            and one tail token.
    """
    # One slot of the budget is taken by the separator between head and tail.
    tail_length = max_length - head_length - 1
    if head_length < 1 or tail_length < 1:
        # A zero tail would make `ids[-0:]` return the whole sequence.
        raise ValueError(
            "max_length must leave room for head_length tokens, one separator "
            "and at least one tail token"
        )

    def _splice(ids):
        # Head + separator + tail, always exactly `max_length` ids.
        return ids[:head_length] + [sep_token_id] + ids[-tail_length:]

    tokens = tokenizer(examples["text"], truncation=False)
    input_ids = tokens['input_ids']
    if not input_ids:
        # Empty batch: nothing to splice, and indexing [0] below would fail.
        return tokens

    if isinstance(input_ids[0], list):
        # Batched call: one list of ids per example.
        for i, ids in enumerate(input_ids):
            if len(ids) > max_length:
                tokens['input_ids'][i] = _splice(ids)
                tokens['token_type_ids'][i] = [0] * max_length
                tokens['attention_mask'][i] = [1] * max_length
    elif len(input_ids) > max_length:
        # Single example: a flat list of ids.
        tokens['input_ids'] = _splice(input_ids)
        tokens['token_type_ids'] = [0] * max_length
        tokens['attention_mask'] = [1] * max_length
    return tokens

In [5]:
# Tokenize every split in batches; over-long reviews are spliced rather than truncated.
tokenized_imdb = imdb.map(
    function=preprocess_tokenize_with_splicing,
    batched=True,
)
tokenized_imdb

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
})

In [6]:
# Move the classifier to the GPU; `Module.to` returns the module itself, so
# rebinding keeps the same object and leaves the cell without output.
electra = electra.to('cuda')

In [7]:
def pred_with_electra(examples):
    """Predict a 1-10 rating for a batch using the polarity classifier ``electra``.

    The three tokenizer columns are padded into tensors by ``data_collator``,
    moved to whatever device ``electra`` currently lives on, and run through
    the model without gradient tracking. The softmax probability at logit
    index 1 is rescaled to the rating range with ``1 + 9 * p``.

    Args:
        examples: Batch mapping that provides ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` (as passed by
            ``Dataset.map(..., batched=True)``).

    Returns:
        dict with a single key ``"predicted"`` holding a CPU tensor with one
        rating per example.
    """
    features = {
        key: examples[key]
        for key in ('input_ids', 'token_type_ids', 'attention_mask')
    }
    # Follow the model's device instead of assuming 'cuda', so the inputs can
    # never end up on a different device than the weights.
    batch = data_collator(features).to(electra.device)

    with torch.no_grad():
        logits = electra(**batch).logits
        conf_based_rating = 1 + 9*F.softmax(logits, dim=-1)[..., 1]
    return {"predicted": conf_based_rating.to('cpu')}


# This function was originally defined as `embed_with_electra`, while the map
# cell calls `pred_with_electra`; keep the old name working as an alias.
embed_with_electra = pred_with_electra

In [8]:
# Score every review in every split, 256 examples per call.
imdb_with_predicts = tokenized_imdb.map(
    pred_with_electra,
    batched=True,
    batch_size=256,
)
imdb_with_predicts

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask', 'predicted'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask', 'predicted'],
        num_rows: 25000
    })
})

In [10]:
# Persist the dataset including the 'predicted' column so the classifier
# inference above does not have to be repeated.
imdb_with_predicts.save_to_disk('./imdb_with_predicts')

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

In [13]:
# Side-by-side view of the test-split ratings and the classifier-derived predictions.
tuple(
    np.array(imdb_with_predicts['test'][column])
    for column in ('label', 'predicted')
)

(array([3., 4., 3., ..., 7., 9., 7.]),
 array([1.01970351, 1.88304043, 1.0522753 , ..., 9.85711288, 9.97132874,
        9.75077724]))

In [37]:
# Root-mean-squared error of the classifier-derived ratings on the test split.
ratings = np.array(imdb_with_predicts['test']['label'])
predictions = np.array(imdb_with_predicts['test']['predicted'])
np.square(predictions - ratings).mean() ** 0.5

1.9215112316539822

In [24]:
from sklearn.metrics import mean_squared_error, accuracy_score

# RMSE is taken as sqrt(MSE) rather than via `squared=False`: that keyword is
# deprecated in scikit-learn and rejected by newer releases, while this form
# works on every version. Polarity counts a rating above 5.5 as positive.
np.sqrt(mean_squared_error(ratings, predictions)), accuracy_score(ratings > 5.5, predictions > 5.5)

(1.9215112316539822, 0.96628)

In [25]:
# Drop the notebook's reference to the polarity classifier first, then ask
# PyTorch to release its cached GPU memory before the next model is loaded.
del electra
torch.cuda.empty_cache()

In [27]:
# Second model: same architecture class, loaded from the regression fine-tune checkpoint.
# Local checkpoint alternative: ("./electra_large_imdb_reg_spliced_fix10/checkpoint-2343")
electra_reg = ElectraForSequenceClassification.from_pretrained(
    "AbstractQbit/electra_large_imdb_regression_htsplice"
)

In [32]:
def pred_with_electra_reg(examples):
    """Predict a rating for a batch using the regression model ``electra_reg``.

    The three tokenizer columns are padded into tensors by ``data_collator``,
    moved to whatever device ``electra_reg`` currently lives on, and run
    through the model without gradient tracking. The value at logit index 0
    is used directly as the predicted rating.

    Args:
        examples: Batch mapping that provides ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` (as passed by
            ``Dataset.map(..., batched=True)``).

    Returns:
        dict with a single key ``"predicted_reg"`` holding a CPU tensor with
        one value per example.
    """
    features = {
        key: examples[key]
        for key in ('input_ids', 'token_type_ids', 'attention_mask')
    }
    # Follow the model's device instead of assuming 'cuda', so the inputs can
    # never end up on a different device than the weights.
    batch = data_collator(features).to(electra_reg.device)

    with torch.no_grad():
        reg_rating = electra_reg(**batch).logits[..., 0]
    return {"predicted_reg": reg_rating.to('cpu')}

In [38]:
# Move the regression model to the GPU; `Module.to` returns the module itself,
# so rebinding keeps the same object and leaves the cell without output.
electra_reg = electra_reg.to('cuda')

In [35]:
# Add the regression model's output as a second prediction column, 256 examples per call.
imdb_with_predicts2 = imdb_with_predicts.map(
    pred_with_electra_reg,
    batched=True,
    batch_size=256,
)
imdb_with_predicts2

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask', 'predicted', 'predicted_reg'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask', 'predicted', 'predicted_reg'],
        num_rows: 25000
    })
})

In [36]:
# Persist the dataset with both prediction columns ('predicted' and
# 'predicted_reg') so the evaluation cells can run without repeating inference.
imdb_with_predicts2.save_to_disk('./imdb_with_predicts2')

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

In [4]:
# from datasets import load_from_disk
# imdb_with_predicts2 = load_from_disk('./imdb_with_predicts2')

In [7]:
from sklearn.metrics import mean_squared_error, accuracy_score

def print_metrics(label, target, pred, threshold=5.5):
    """Print rating RMSE and polarity accuracy for one set of predictions.

    Both metrics are computed with numpy so the function does not depend on
    the ``squared=False`` keyword of ``sklearn.metrics.mean_squared_error``,
    which newer scikit-learn releases no longer accept.

    Args:
        label: Heading printed before the metrics.
        target: True ratings (array-like of numbers).
        pred: Predicted ratings, same shape as ``target``.
        threshold: A rating strictly above this value counts as positive
            polarity, for both ``target`` and ``pred``.

    Raises:
        ValueError: If ``target`` and ``pred`` differ in shape or are empty.
    """
    target = np.asarray(target, dtype=float)
    pred = np.asarray(pred, dtype=float)
    if target.shape != pred.shape:
        # Without this check numpy broadcasting could silently pair up
        # mismatched inputs.
        raise ValueError(
            f"target and pred must have the same shape, got {target.shape} and {pred.shape}"
        )
    if target.size == 0:
        raise ValueError("target and pred must not be empty")

    rmse = np.sqrt(np.mean((pred - target) ** 2))
    # Fraction of examples where prediction and truth fall on the same side of the threshold.
    polarity_accuracy = np.mean((target > threshold) == (pred > threshold))

    print(label)
    print(f'Rating RMSE: {rmse:0.3f}')
    print(f'Polarity accuracy: {polarity_accuracy:0.3f}\n')

In [8]:
# Report both models on each split. The train header keeps its leading newline
# so the printed output has the same blank line between the two sections.
for split_name, header in (('test', 'On Test split:'), ('train', '\nOn Train split:')):
    print(header)
    split = imdb_with_predicts2[split_name]
    ratings = np.array(split['label'])
    predictions_cls = np.array(split['predicted'])
    predictions_reg = np.array(split['predicted_reg'])
    print_metrics('ELECTRA tuned with polarity:', ratings, predictions_cls)
    print_metrics('ELECTRA tuned with regression:', ratings, predictions_reg)

On Test split:
ELECTRA tuned with polarity:
Rating RMSE: 1.922
Polarity accuracy: 0.966

ELECTRA tuned with regression:
Rating RMSE: 1.318
Polarity accuracy: 0.964


On Train split:
ELECTRA tuned with polarity:
Rating RMSE: 1.790
Polarity accuracy: 0.987

ELECTRA tuned with regression:
Rating RMSE: 1.070
Polarity accuracy: 0.983



In [9]:
# ratings = np.array(imdb_with_predicts2['test']['label'])
# predictions_cls = np.array(imdb_with_predicts2['test']['predicted'])
# predictions_reg = np.array(imdb_with_predicts2['test']['predicted_reg'])
# ratings, predictions_cls, predictions_reg

In [10]:
# print(f'ELECTRA tuned with polarity:\nRating RMSE: {mean_squared_error(ratings, predictions_cls, squared=False):0.3f}\nPolarity accuracy: {accuracy_score(ratings>5.5, predictions_cls>5.5):0.3f}\n')
# print(f'ELECTRA tuned with regression:\nRating RMSE: {mean_squared_error(ratings, predictions_reg, squared=False):0.3f}\nPolarity accuracy: {accuracy_score(ratings>5.5, predictions_reg>5.5):0.3f}')