## Data exploration

In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
news_df = pd.read_csv("data/Sora_LREC2020_biasedsentences.csv")

Seems that each instance consists of an article with sentences and a bias score between 1 and 4 is assigned to the article as a whole, the title, and each sentence.

In [2]:
news_test_sentence = news_df.iloc[0] # visual inspection
print(f"sentence: {news_test_sentence['s0']}", f"score: {news_test_sentence['0']}")
print(f"dataset score range: {news_df['article_bias'].min(),news_df['article_bias'].max()}")
# pd.DataFrame(news_test_sentence)

sentence: [0]: LOUISVILLE - Dan Johnson posted a final message on Facebook to his friends and family on Wednesday afternoon. score: 3
dataset score range: (np.int64(1), np.int64(4))


In [3]:
# model source: https://huggingface.co/cardiffnlp/twitter-roberta-base-2022-
# adapter source: https://huggingface.co/SOUMYADEEPSAR/text_level_bias_roberta-twitter
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from transformers import pipeline
from adapters import AutoAdapterModel
tokeniser = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-2022-154m")
model = AutoAdapterModel.from_pretrained("cardiffnlp/twitter-roberta-base-2022-154m")
adapter = model.load_adapter("SOUMYADEEPSAR/text_level_bias1", set_active=True)
classifier = pipeline('text-classification', model=model, tokenizer=tokeniser) # cuda = 0,1 based on gpu availability

RobertaAdapterModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of RobertaAdapterModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-2022-154m and are newly initialized: ['heads.default.3.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

  state_dict = torch.load(weights_file, map_location="cpu")
The model 'RobertaAdapterModel' is not supported for text-classification. Supported models are ['AlbertForSequenceClassification', 'BartForSequenceClassification', 'BertForSequenceClassification', 'BigBirdForSequenceClassification', 'BigBirdPegasusForSequenceClassification', 'BioGptForSequenceClassification', 'BloomForSequenceClassification', 'CamembertForSequenceClassification', 'CanineForSequenceClassification', 'LlamaForSequenceClassification', 'ConvBertForSequenceClassification', 'CTRLForSequenceClassification', 'Data2VecTextForSequenceClassification', 'DebertaForSequenceClassification', 'DebertaV2ForSequenceClassification', 'DistilBertForSequenceClassification', 'ElectraForSequenceClassification', 'ErnieForSequenceClassification', 'ErnieMForSequenceClassification', 'EsmForSequenceClassification', 'FalconForSequenceClassification', 'FlaubertForSequenceClassification', 'FNetForSequenceClassification', 'FunnelForSequenceClas

In [4]:
#preprocess news data
def labels_to_binary(label: int):
    if label == 1 or label == 2:
        return 0
    elif label == 3 or label == 4:
        return 1
    else:
        raise Exception(f"expected values 1,2,3 or 4, got {label}")

def extract_sentences_and_labels(df):
    sentences_list = []
    labels_list = []
    for _, row in df.iterrows():
        for i in range(0, 20):  # Sentences are named f"s0" to f"s19"
            if type(row[f"s{i}"]) == str: # skipping nan sentences
                sentences_list.append(row[f"s{i}"])
                labels_list.append(labels_to_binary(row[f"{i}"]))
            
    return sentences_list, labels_list

sentences, labels = extract_sentences_and_labels(news_df)

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(sentences, labels, test_size = 0.1, shuffle = True, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.1, shuffle = True, random_state=42)

In [6]:
# Evaluate baseline

def eval_classifier(sentences, labels, classifier):
    pred = []
    y = []
    count = 0
    for index in tqdm(range(len(sentences))):
            try:
                pred.append(classifier(sentences[index])[-1]["label"])
            except ValueError:
                # Todo: add "if" statement checking if nan
                print('g')
                pred.append(0) # seems (?) to be the case that for NaNs, minimum bias is assigned
            y.append(labels[index])
    return pred, y

pred, y = eval_classifier(X_test, y_test, classifier=classifier)

100%|██████████| 393/393 [00:43<00:00,  9.13it/s]


In [7]:
def calc_accuracy(pred, y):
    pred_copy = [np.int64(label) for label in pred]
    y_copy = [np.int64(label) for label in y]
    count = 0
    for index,label in enumerate(pred_copy):
        if y_copy[index] == label:
            count += 1
    return count/len(pred_copy)
calc_accuracy(pred, y)

0.6030534351145038

## Fine tune classifier and compare to baseline

In [8]:
baseline_tokeniser = tokeniser
X_train_embed = baseline_tokeniser(X_train)
X_val_embed =  baseline_tokeniser(X_val)
X_test_embed = baseline_tokeniser(X_test)

In [10]:
X_train_embed

{'input_ids': [[0, 10975, 996, 42645, 374, 378, 662, 6, 3691, 511, 10, 2063, 359, 7837, 2835, 15, 5, 3274, 331, 4223, 108, 999, 6, 3259, 8320, 4, 2], [0, 10975, 1366, 42645, 305, 4360, 324, 115, 45, 1320, 28, 2034, 4, 2], [0, 10975, 1092, 42645, 178, 5, 2971, 9, 194, 2442, 699, 59, 5, 724, 35, 14, 5, 382, 3284, 115, 45, 3264, 10, 1748, 12, 17651, 369, 1101, 4, 2], [0, 10975, 245, 42645, 20, 3097, 6, 10, 1083, 470, 1440, 8445, 2258, 36326, 229, 17302, 6, 341, 5, 1553, 7, 2364, 899, 7, 15343, 335, 111, 217, 5, 2523, 9, 1434, 6, 49, 22, 462, 13349, 60, 1441, 8204, 6, 8, 97, 414, 4, 2], [0, 10975, 246, 42645, 12542, 18, 3795, 6, 17869, 6, 54, 34, 393, 1481, 2550, 409, 31, 11509, 69, 9734, 15, 599, 12, 20345, 77, 24, 606, 7, 69, 1159, 12, 36797, 196, 11, 5, 144, 3901, 169, 678, 35, 2], [0, 10975, 1558, 42645, 20, 394, 34, 1433, 4768, 8760, 18, 1920, 13, 1431, 19, 5, 369, 6, 15096, 11, 779, 14, 39, 299, 12176, 21, 22, 605, 15374, 39, 86, 667, 7, 8654, 19, 4046, 21599, 1554, 113, 111, 39, 233

In [9]:
from datasets import Dataset
train_dataset = Dataset.from_dict({
    'input_ids': X_train_embed['input_ids'],
    'label': y_train
})

eval_dataset = Dataset.from_dict({
    'input_ids':X_val_embed['input_ids'],
    'label': y_val
})

test_dataset = Dataset.from_dict({
    'input_ids':X_test_embed['input_ids'],
    'label':y_test
})

In [11]:
def compute_metrics(pred, y):
    return calc_accuracy(pred,y)

In [12]:
from transformers import TrainingArguments
from transformers import Trainer
test = True
args = TrainingArguments(
    "baseline-bias-classifier",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate = 2e-5,
    num_train_epochs = 3,
    weight_decay = 0.01,
)

trainer = Trainer(
    model = model,
    args = args,
    train_dataset=train_dataset,
    eval_dataset= eval_dataset,
    tokenizer= baseline_tokeniser
)






In [13]:
# this seems like it will take very long (3hrs or so). Maybe we should get this to work with GPUs. I think that would involve using a data collator so that everything is the same shape.

# fine-tuning
trainer.train()

  0%|          | 0/1194 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]

{'eval_runtime': 24.5159, 'eval_samples_per_second': 14.44, 'eval_steps_per_second': 1.836, 'epoch': 1.0}
{'loss': 0.6655, 'grad_norm': 3.352205276489258, 'learning_rate': 1.1624790619765495e-05, 'epoch': 1.26}


  0%|          | 0/45 [00:00<?, ?it/s]

{'eval_runtime': 25.1438, 'eval_samples_per_second': 14.079, 'eval_steps_per_second': 1.79, 'epoch': 2.0}
{'loss': 0.6252, 'grad_norm': 3.4330527782440186, 'learning_rate': 3.2495812395309884e-06, 'epoch': 2.51}


  0%|          | 0/45 [00:00<?, ?it/s]

{'eval_runtime': 27.5974, 'eval_samples_per_second': 12.827, 'eval_steps_per_second': 1.631, 'epoch': 3.0}
{'train_runtime': 2736.3189, 'train_samples_per_second': 3.484, 'train_steps_per_second': 0.436, 'train_loss': 0.6413694090978983, 'epoch': 3.0}


TrainOutput(global_step=1194, training_loss=0.6413694090978983, metrics={'train_runtime': 2736.3189, 'train_samples_per_second': 3.484, 'train_steps_per_second': 0.436, 'total_flos': 255432387668580.0, 'train_loss': 0.6413694090978983, 'epoch': 3.0})

In [14]:
# Saving the model
trainer.save_model("data/fine-tuned_adapter_model")

In [15]:
# Loading model from save
trainer._load_from_checkpoint(resume_from_checkpoint="data/fine-tuned_adapter_model")

There were missing keys in the checkpoint model loaded: ['roberta.prompt_tuning.base_model_embeddings.weight', 'heads.default.3.weight'].


In [16]:
# obtaining predictions from fine-tuned model as logits
testPredictions = trainer.predict(test_dataset)

  0%|          | 0/50 [00:00<?, ?it/s]

In [23]:
conv_logits(testPredictions[0][-1])

104


np.int64(104)

In [24]:
from scipy.special import softmax
 
# converting logits into predicted labels
def conv_logits(predictions):
    logits = np.array(predictions)
    probabilities = softmax(logits, axis = -1)
    predicted_classes = np.argmax(probabilities, axis = -1)
    print(predicted_classes)
    test_dataset["label"]

    return predicted_classes

compute_metrics(conv_logits(testPredictions[0][-1]), test_dataset["label"])

[0 1 0 0 1 0 0 1 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 1 0 1 0
 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 1 0 0
 0 0 0 1 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0
 0 0 0 0 1 0 1 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 1 0
 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0]


0.5903307888040712

## Make predictions for X notes data

In [17]:
print(test_dataset)

Dataset({
    features: ['input_ids', 'label'],
    num_rows: 393
})


In [None]:
annoNotes = pd.read_csv("data/annotation.csv")

tokens = baseline_tokeniser(list(annoNotes.loc[:, "summary"]))
dataset = Dataset.from_dict({
    "input_ids": tokens["input_ids"],
    "labels": annoNotes.loc[:, "label1"]
})
predictions = trainer.predict(dataset)
print(predictions)

  0%|          | 0/25 [00:00<?, ?it/s]

PredictionOutput(predictions=array([[ 0.34620348, -0.2979768 ],
       [ 0.5537176 , -0.52134496],
       [-0.21525861,  0.2220012 ],
       [ 0.32885295, -0.2924871 ],
       [ 0.39996213, -0.35059458],
       [ 0.18768539, -0.16080087],
       [-0.22029711,  0.22896886],
       [-0.34772477,  0.35249346],
       [ 0.01292546,  0.0027965 ],
       [-0.48634216,  0.4721828 ],
       [ 0.49034876, -0.44860524],
       [ 0.32648107, -0.30675718],
       [-0.4940798 ,  0.49413982],
       [ 0.09959418, -0.05490456],
       [ 0.12447432, -0.0668625 ],
       [-0.3665264 ,  0.3537118 ],
       [ 0.07211012, -0.01336409],
       [ 0.11071128, -0.06568698],
       [ 0.21493815, -0.15154035],
       [-0.08464696,  0.11310592],
       [-0.22921884,  0.22524486],
       [-0.67228365,  0.67454153],
       [ 0.58478385, -0.5304098 ],
       [-0.48857275,  0.475034  ],
       [ 0.04446379, -0.0112771 ],
       [ 0.00825835,  0.01982651],
       [ 0.19107667, -0.1450686 ],
       [ 0.3036787 , -0.26

In [40]:
print(len(tokens))

2


In [33]:
predictions[0]

array([[ 0.34620348, -0.2979768 ],
       [ 0.5537176 , -0.52134496],
       [-0.21525861,  0.2220012 ],
       [ 0.32885295, -0.2924871 ],
       [ 0.39996213, -0.35059458],
       [ 0.18768539, -0.16080087],
       [-0.22029711,  0.22896886],
       [-0.34772477,  0.35249346],
       [ 0.01292546,  0.0027965 ],
       [-0.48634216,  0.4721828 ],
       [ 0.49034876, -0.44860524],
       [ 0.32648107, -0.30675718],
       [-0.4940798 ,  0.49413982],
       [ 0.09959418, -0.05490456],
       [ 0.12447432, -0.0668625 ],
       [-0.3665264 ,  0.3537118 ],
       [ 0.07211012, -0.01336409],
       [ 0.11071128, -0.06568698],
       [ 0.21493815, -0.15154035],
       [-0.08464696,  0.11310592],
       [-0.22921884,  0.22524486],
       [-0.67228365,  0.67454153],
       [ 0.58478385, -0.5304098 ],
       [-0.48857275,  0.475034  ],
       [ 0.04446379, -0.0112771 ],
       [ 0.00825835,  0.01982651],
       [ 0.19107667, -0.1450686 ],
       [ 0.3036787 , -0.26469794],
       [ 0.16120611,

In [34]:
compute_metrics(conv_logits(predictions[0]), dataset["labels"])

AxisError: axis 1 is out of bounds for array of dimension 0

## Compare classifier predictions to human predictions

In [None]:
# my idea for how to evaluate:
# 1) we set some guidelines for how we will evaluate notes 
# 2) we individually make labels for one days worth of notes (that's about 200 notes; this would be the train set)
# 3) we calculate out interrater agreement (if it's very bad, we revise our guidelines and re-label)
# 4) we average our labels and use that to further fine-tune the classifier
# 5) we repeat step 2 on a new set of notes (this would be the test set)
# 6) we make predictions using the classifier obtained from step 4
# 7) either we calculate the MSE using our labels as ground truth, or we calculate three interrater agreements (chico-andrew, chico-classifier, andrew-classifier)
# and see if the human-human agreement is better than the human-classifier agreement.
...