In [1]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from transformers import pipeline
from adapters import AutoAdapterModel
from transformers import TrainingArguments
from transformers import Trainer
from sklearn.model_selection import train_test_split
from scipy.special import softmax
from datasets import Dataset




In [2]:
def conv_logits(predictions):
    logits = np.array(predictions)
    probabilities = softmax(logits, axis=-1)
    predicted_classes = np.argmax(probabilities, axis=-1)

    return predicted_classes

In [3]:
tokeniser = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-2022-154m")
model = AutoAdapterModel.from_pretrained("cardiffnlp/twitter-roberta-base-2022-154m")
adapter = model.load_adapter("SOUMYADEEPSAR/text_level_bias1", set_active=True)
classifier = pipeline('text-classification', model=model, tokenizer=tokeniser)

RobertaAdapterModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of RobertaAdapterModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-2022-154m and are newly initialized: ['heads.default.3.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

  state_dict = torch.load(weights_file, map_location="cpu")
The model 'RobertaAdapterModel' is not supported for text-classification. Supported models are ['AlbertForSequenceClassification', 'BartForSequenceClassification', 'BertForSequenceClassification', 'BigBirdForSequenceClassification', 'BigBirdPegasusForSequenceClassification', 'BioGptForSequenceClassification', 'BloomForSequenceClassification', 'CamembertForSequenceClassification', 'CanineForSequenceClassification', 'LlamaForSequenceClassification', 'ConvBertForSequenceClassification', 'CTRLForSequenceClassification', 'Data2VecTextForSequenceClassification', 'DebertaForSequenceClassification', 'DebertaV2ForSequenceClassification', 'DistilBertForSequenceClassification', 'ElectraForSequenceClassification', 'ErnieForSequenceClassification', 'ErnieMForSequenceClassification', 'EsmForSequenceClassification', 'FalconForSequenceClassification', 'FlaubertForSequenceClassification', 'FNetForSequenceClassification', 'FunnelForSequenceClas

In [4]:
test = True
args = TrainingArguments(
    "baseline-bias-classifier",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate = 2e-5,
    num_train_epochs = 3,
    weight_decay = 0.01,
)
baseline_tokeniser = tokeniser



In [5]:
annoNotes = pd.read_csv("data/annotation.csv")

agreedNotes = []
agreedLabels = []

for index, note in enumerate(annoNotes.loc[:, "summary"]):
    label1 = annoNotes.loc[index, "label1"]
    label2 = annoNotes.loc[index, "label2"]

    if label1 == label2:
        agreedNotes.append(note)
        agreedLabels.append(label1)

In [6]:
cn_X_train, cn_X_test, cn_y_train, cn_y_test = train_test_split(agreedNotes, agreedLabels, test_size = 0.1, shuffle = True, random_state=42)
cn_X_train, cn_X_val, cn_y_train, cn_y_val = train_test_split(cn_X_train, cn_y_train, test_size = 0.1, shuffle = True, random_state=42)

cn_X_train_embed = baseline_tokeniser(cn_X_train)
cn_X_val_embed =  baseline_tokeniser(cn_X_val)
cn_X_test_embed = baseline_tokeniser(cn_X_test)

cn_train_dataset = Dataset.from_dict({
    'input_ids': cn_X_train_embed['input_ids'],
    'label': cn_y_train
})

cn_eval_dataset = Dataset.from_dict({
    'input_ids': cn_X_val_embed['input_ids'],
    'label': cn_y_val
})

cn_test_dataset = Dataset.from_dict({
    'input_ids': cn_X_test_embed['input_ids'],
    'label': cn_y_test
})

In [7]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=cn_train_dataset,
    eval_dataset=cn_eval_dataset,
    tokenizer=baseline_tokeniser
)

In [9]:
trainer._load_from_checkpoint(resume_from_checkpoint="data/fine-tuned_adapter_model")


There were missing keys in the checkpoint model loaded: ['roberta.prompt_tuning.base_model_embeddings.weight', 'heads.default.3.weight'].


In [16]:
data = pd.read_csv("data/analysis.csv")
data = data.astype(str)

# for index in range(0, 1000, 100):
for index in range(1):
    summaries = list(data.loc[index:index+99, "summary"])

    summary_embeds = baseline_tokeniser(summaries)

    summary_dataset = Dataset.from_dict({
        'input_ids': summary_embeds["input_ids"],
        #'label': [i % 2 for i in range(100)]
    })

    print(summary_dataset["input_ids"])
    logits = trainer.predict(summary_dataset)
    predictions = conv_logits(logits[0])
    print(len(predictions))

    print("Saving predictions...")
    for predIndex, prediction in enumerate(predictions):

        data.loc[index+predIndex, "label"] = str(prediction)

    data.to_csv("data/analysis.csv",
                columns=["noteId", "status", "summary", "label"])

[[0, 133, 2274, 473, 45, 311, 10, 588, 3286, 2058, 4, 85, 16, 10, 2125, 9, 1778, 1808, 4, 1437, 1437, 1437, 1205, 640, 24905, 15954, 4, 2001, 642, 4, 175, 73, 37447, 4, 2001, 642, 4, 175, 4, 3367, 530, 1000, 36491, 2], [0, 9682, 261, 7315, 222, 45, 3545, 42, 4, 1437, 1437, 1437, 1205, 640, 1178, 4, 175, 73, 523, 261, 13792, 330, 116, 29, 5214, 2146, 2], [0, 133, 2026, 2624, 1870, 10183, 168, 3413, 11, 5, 3085, 812, 24018, 485, 5126, 189, 33, 103, 5377, 6, 53, 24, 2092, 14, 24, 965, 75, 2024, 3307, 7, 13524, 1076, 12, 15844, 18, 5824, 50, 10, 739, 831, 13446, 25, 2528, 4, 1205, 640, 1401, 4, 34929, 1173, 13447, 4, 175, 73, 2926, 73, 29917, 12, 16843, 73, 8628, 6374, 73, 354, 763, 2507, 12, 4540, 2586, 12, 25667, 12, 261, 12, 8628, 6374, 12, 179, 12, 844, 1978, 12, 330, 5622, 12, 463, 12, 179, 267, 4123, 12, 406, 12, 4950, 4235, 12, 463, 12, 853, 26106, 12, 119, 12427, 5003, 12, 179, 12, 15177, 47362, 73, 844, 1978, 73, 3570, 73, 1570, 73, 2], [0, 713, 16, 6, 11, 754, 6, 10, 17190, 5212,