In [7]:
import pandas as pd
import numpy as np
import transformers
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from collections import defaultdict
from tqdm import tqdm
from joblib import Parallel, delayed
from multiprocessing import cpu_count
#to output plots within the notebook
%matplotlib inline

import importlib
import inspect
import os
import sys

%cd /content/
!ls -R
repo_name = "MindTheGap"
!rm -rf {repo_name}
if not os.path.exists(repo_name):
    print(f"Directory {repo_name} does not exist, proceeding with clone.")
    !git clone https://github.com/Abudo-S/MindTheGap.git

%cd MindTheGap


import LightTransformerModel as LightTransformerModel_Module
import intersentence_loader as intersentence_loader_Module
import dataloader
importlib.reload(LightTransformerModel_Module) # in case of updates
importlib.reload(intersentence_loader_Module) # in case of updates
from LightTransformerModel import LightTransformerModel
from intersentence_loader import IntersentenceDataset

/content
.:
MindTheGap  sample_data

./MindTheGap:
data	       intersentence_loader.py	 MindTheGap.ipynb  README.md
dataloader.py  LightTransformerModel.py  __pycache__	   requirements.txt

./MindTheGap/data:
stereo_dataset.json  test_terms.txt

./MindTheGap/__pycache__:
intersentence_loader.cpython-312.pyc  LightTransformerModel.cpython-312.pyc

./sample_data:
anscombe.json		      mnist_test.csv
california_housing_test.csv   mnist_train_small.csv
california_housing_train.csv  README.md
Directory MindTheGap does not exist, proceeding with clone.
Cloning into 'MindTheGap'...
remote: Enumerating objects: 30, done.[K
remote: Counting objects: 100% (30/30), done.[K
remote: Compressing objects: 100% (22/22), done.[K
remote: Total 30 (delta 10), reused 23 (delta 7), pack-reused 0 (from 0)[K
Receiving objects: 100% (30/30), 1.29 MiB | 7.28 MiB/s, done.
Resolving deltas: 100% (10/10), done.
/content/MindTheGap


In [9]:

class BiasEvaluator():
    """Scores the examples of a StereoSet-style JSON file with a LightTransformerModel.

    Two evaluations are available and can be skipped independently:

    * intrasentence: probability the model assigns to the target token at the
      mask position, averaged per sentence id.
    * intersentence: softmax score of one output column per sentence id.

    Attributes set by ``__init__``:
        input_file: absolute path of the dataset file.
        dataloader: ``dataloader.StereoSet`` built from ``input_file``.
        cuda / device: whether CUDA is used, and the matching device string.
        tokenizer: tokenizer taken from ``LightTransformerModel(model_name=pretrained_class)``.
        MASK_TOKEN / MASK_TOKEN_IDX: the tokenizer's mask token and its single id.
        batch_size / max_seq_length: batching options for the intrasentence loader.
    """

    def __init__(self, pretrained_class="roberta-base", no_cuda=False,
                 input_file="/content/MindTheGap/data/stereo_dataset.json", tokenizer="roberta-base",
                 intersentence_load_path=None, intrasentence_load_path=None, skip_intrasentence=False,
                 skip_intersentence=False, batch_size=1, max_seq_length=128,
                 output_dir="predictions/", output_file="predictions.json"):
        """Load the dataset and the tokenizer and record the evaluation options.

        Args:
            pretrained_class: model name passed to ``LightTransformerModel``.
            no_cuda: force CPU execution even when a GPU is available.
            input_file: path of the StereoSet JSON file.
            tokenizer: stored as ``self.TOKENIZER``; the tokenizer object itself
                is taken from the model built for ``pretrained_class``.
            intersentence_load_path: optional checkpoint for the intersentence model.
            intrasentence_load_path: optional checkpoint for the intrasentence model.
            skip_intrasentence: skip the intrasentence evaluation in ``evaluate``.
            skip_intersentence: skip the intersentence evaluation in ``evaluate``.
            batch_size: intrasentence batch size; 1 disables padding/truncation.
            max_seq_length: sequence length used when ``batch_size`` > 1.
            output_dir: accepted for compatibility; not used by this class.
            output_file: accepted for compatibility; not used by this class.

        Raises:
            AssertionError: if the mask token does not encode to exactly one id.
        """
        print(f"Loading {input_file}...")
        filename = os.path.abspath(input_file)
        # Kept so that evaluate_intrasentence reads the same file as StereoSet.
        self.input_file = filename
        self.dataloader = dataloader.StereoSet(filename)
        # Fall back to the CPU when CUDA was requested but is not available,
        # instead of failing later on `.to("cuda")`.
        self.cuda = (not no_cuda) and torch.cuda.is_available()
        self.device = "cuda" if self.cuda else "cpu"

        self.INTRASENTENCE_LOAD_PATH = intrasentence_load_path
        self.INTERSENTENCE_LOAD_PATH = intersentence_load_path
        self.SKIP_INTERSENTENCE = skip_intersentence
        self.SKIP_INTRASENTENCE = skip_intrasentence

        self.PRETRAINED_CLASS = pretrained_class
        self.TOKENIZER = tokenizer
        self.tokenizer = LightTransformerModel(model_name=self.PRETRAINED_CLASS).tokenizer

        # to keep padding consistent with the other models -> improves LM score.
        if self.tokenizer.__class__.__name__ == "XLNetTokenizer":
            self.tokenizer.padding_side = "right"
        self.MASK_TOKEN = self.tokenizer.mask_token

        # With batch_size == 1 items are not batched together, so no maximum
        # sequence length is imposed.
        self.batch_size = batch_size
        self.max_seq_length = None if self.batch_size == 1 else max_seq_length

        # The mask token must map to exactly one vocabulary id.
        mask_token_ids = self.tokenizer.encode(
            self.MASK_TOKEN, add_special_tokens=False)
        assert len(mask_token_ids) == 1
        self.MASK_TOKEN_IDX = mask_token_ids[0]

    def evaluate_intrasentence(self):
        """Score every intrasentence example.

        Returns:
            list of ``{'id': sentence_id, 'score': mean_probability}`` dicts, one
            per sentence id, where the score is the mean of the probabilities
            collected for that id at the mask position.
        """
        model = LightTransformerModel(model_name=self.PRETRAINED_CLASS).to(self.device)

        # Only replicate across GPUs when the model actually lives on a GPU.
        if self.cuda and torch.cuda.device_count() > 1:
            print("Let's use", torch.cuda.device_count(), "GPUs!")
            model = nn.DataParallel(model)
        model.eval()

        print()
        if self.INTRASENTENCE_LOAD_PATH:
            # map_location lets a checkpoint saved on a GPU load on the CPU too.
            state_dict = torch.load(self.INTRASENTENCE_LOAD_PATH, map_location=self.device)
            model.load_state_dict(state_dict)

        pad_to_max_length = self.batch_size > 1
        # NOTE(review): the recorded run of this notebook fails while iterating
        # this loader with "TypeError: ... unexpected keyword argument
        # 'pad_to_max_length'"; presumably dataloader.IntrasentenceLoader forwards
        # this flag to the tokenizer - confirm and fix in dataloader.py.
        dataset = dataloader.IntrasentenceLoader(self.tokenizer, max_seq_length=self.max_seq_length,
                                                 pad_to_max_length=pad_to_max_length,
                                                 input_file=self.input_file)

        loader = DataLoader(dataset, batch_size=self.batch_size)
        word_probabilities = defaultdict(list)

        # calculate the probability of the target token for each example
        with torch.no_grad():  # inference only: no autograd graph is needed
            for sentence_id, next_token, input_ids, attention_mask, token_type_ids in tqdm(loader, total=len(loader)):
                # the loader yields per-position lists; stack and transpose them
                # so that the batch dimension comes first
                input_ids = torch.stack(input_ids).to(self.device).transpose(0, 1)
                attention_mask = torch.stack(attention_mask).to(
                    self.device).transpose(0, 1)
                next_token = next_token.to(self.device)
                token_type_ids = torch.stack(token_type_ids).to(
                    self.device).transpose(0, 1)

                mask_idxs = (input_ids == self.MASK_TOKEN_IDX)

                # get the probabilities
                output = model(input_ids, attention_mask=attention_mask,
                               token_type_ids=token_type_ids)[0].softmax(dim=-1)

                # keep the distribution at the mask position of each row, then
                # pick the probability of that row's target token
                output = output[mask_idxs]
                output = output.gather(1, next_token.unsqueeze(1)).squeeze(1)
                for idx, item in enumerate(output):
                    word_probabilities[sentence_id[idx]].append(item.item())

        # now reconcile the probabilities into sentences
        # score = np.sum([np.log2(i) for i in v]) + np.log2(len(v))
        return [{'id': sent_id, 'score': np.mean(probs)}
                for sent_id, probs in word_probabilities.items()]

    def count_parameters(self, model):
        """Return the number of trainable (``requires_grad``) parameters of ``model``."""
        return sum(p.numel() for p in model.parameters() if p.requires_grad)

    def evaluate_intersentence(self):
        """Score every intersentence example.

        Returns:
            list of ``{'id': sentence_id, 'score': probability}`` dicts, where the
            probability is column 0 of the softmaxed model output for
            ``bert*``/``roberta-base`` models and column 1 otherwise.
        """
        print()
        # Use the same model name as the tokenizer so vocabulary and model match.
        model = LightTransformerModel(model_name=self.PRETRAINED_CLASS).to(self.device)

        print(f"Number of parameters: {self.count_parameters(model):,}")
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        # Always wrapped, so a loaded checkpoint is matched against the wrapped
        # model's state-dict keys.
        model = torch.nn.DataParallel(model)

        if self.INTERSENTENCE_LOAD_PATH:
            model.load_state_dict(torch.load(self.INTERSENTENCE_LOAD_PATH, map_location=self.device))

        model.eval()
        # On the CPU call the wrapped module directly; DataParallel only helps on GPUs.
        scorer = model if self.cuda else model.module

        dataset = IntersentenceDataset(self.tokenizer)
        # TODO: test this on larger batch sizes.
        # Evaluation does not need shuffling; a fixed order keeps runs reproducible.
        loader = DataLoader(dataset, shuffle=False, num_workers=0)

        first_column = ("bert" == self.PRETRAINED_CLASS[:4]
                        or "roberta-base" == self.PRETRAINED_CLASS)
        score_column = 0 if first_column else 1

        predictions = []
        with torch.no_grad():  # inference only: no autograd graph is needed
            for batch in tqdm(loader, total=len(loader)):
                input_ids, token_type_ids, attention_mask, sentence_id = batch
                input_ids = input_ids.to(self.device)
                token_type_ids = token_type_ids.to(self.device)
                attention_mask = attention_mask.to(self.device)
                outputs = scorer(input_ids, attention_mask=attention_mask,
                                 token_type_ids=token_type_ids)
                if isinstance(outputs, tuple):
                    outputs = outputs[0]
                outputs = torch.softmax(outputs, dim=1)

                for idx in range(input_ids.shape[0]):
                    predictions.append({
                        'id': sentence_id[idx],
                        'score': outputs[idx, score_column].item(),
                    })

        return predictions

    def evaluate(self):
        """Run the evaluations that are not skipped.

        Returns:
            dict with an ``'intersentence'`` and/or ``'intrasentence'`` key holding
            the corresponding prediction list; empty when both are skipped.
        """
        bias = {}
        if not self.SKIP_INTERSENTENCE:
            bias['intersentence'] = self.evaluate_intersentence()

        if not self.SKIP_INTRASENTENCE:
            bias['intrasentence'] = self.evaluate_intrasentence()
        return bias


def process_job(batch, model, pretrained_class):
    """Score the first example of an intersentence batch.

    Args:
        batch: either ``(input_ids, token_type_ids, sentence_id)`` or
            ``(input_ids, token_type_ids, attention_mask, sentence_id)``; the
            four-item form is what the intersentence loader in this file yields.
        model: callable taking ``input_ids`` plus ``token_type_ids`` (and
            ``attention_mask`` when the batch provides one) and returning either
            the output tensor or a tuple whose first item is that tensor.
        pretrained_class: model name; selects which output column is the score.

    Returns:
        ``(sentence_id, score)`` for the first example, where the score is
        column 0 of the softmaxed output when ``"bert"`` occurs in
        ``pretrained_class`` and column 1 otherwise.
    """
    model_kwargs = {}
    if len(batch) == 4:
        input_ids, token_type_ids, attention_mask, sentence_id = batch
        model_kwargs["attention_mask"] = attention_mask
    else:
        input_ids, token_type_ids, sentence_id = batch
    model_kwargs["token_type_ids"] = token_type_ids

    with torch.no_grad():  # inference only: no autograd graph is needed
        outputs = model(input_ids, **model_kwargs)
    if isinstance(outputs, tuple):
        outputs = outputs[0]
    outputs = torch.softmax(outputs, dim=1)

    pid = sentence_id[0]
    # if "bert"==self.PRETRAINED_CLASS[:4]:
    if "bert" in pretrained_class:
        pscore = outputs[0, 0].item()
    else:
        pscore = outputs[0, 1].item()
    return (pid, pscore)


# Run the evaluation with roberta-base; skip_intersentence=True means only the
# intrasentence evaluation is executed by evaluate().
evaluator = BiasEvaluator(pretrained_class="roberta-base", skip_intersentence=True)
results = evaluator.evaluate()
# Bare expression: the notebook renders the returned dict as the cell output.
results
# Leftover from a command-line version of this script: it refers to `args`
# (and needs `json`), neither of which is defined in this notebook, so it is
# kept disabled.
# if args.output_file is not None:
#     output_file = args.output_file
# else:
#     output_file = f"predictions_{args.pretrained_class}_{args.intersentence_model}_{args.intrasentence_model}.json"

# output_file = os.path.join(args.output_dir, output_file)
# with open(output_file, "w+") as f:
#     json.dump(results, f, indent=2)


Loading /content/MindTheGap/data/stereo_dataset.json...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.





  0%|          | 0/11017 [00:00<?, ?it/s]


TypeError: PreTrainedTokenizerFast._batch_encode_plus() got an unexpected keyword argument 'pad_to_max_length'