In [1]:
# Attach Google Drive to the Colab runtime so the project folder is reachable.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd drive/MyDrive/Colab Notebooks/cs566-project

/content/drive/MyDrive/Colab Notebooks/cs566-project


In [3]:
!pip install datasets
!pip install accelerate -U
!pip install transformers[torch] -U

Collecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.27.2
Collecting transformers[torch]
  Downloading transformers-4.38.1-py3-none-any.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.37.2
    Uninstalling transformers-4.37.2:
      Successfully uninstalled transformers-4.37.2
Successfully installed transformers-4.38.1


In [4]:
import json
import argparse
from itertools import chain
import pandas as pd
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import Dataset
import numpy as np

In [5]:
# Upper bound handed to the tokenizer as `max_length` when truncating.
INFERENCE_MAX_LENGTH = 2500


def tokenize(example, tokenizer):
    """Rebuild a document's text from its word tokens and tokenize it.

    Args:
        example: Mapping with a "tokens" sequence of strings and a parallel
            "trailing_whitespace" sequence of truthy/falsy flags.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_offsets_mapping=True, truncation=True,
            max_length=INFERENCE_MAX_LENGTH)`` and returning a mapping.

    Returns:
        A dict holding every key/value produced by ``tokenizer`` plus
        "token_map": a list with one entry per character of the rebuilt text,
        giving the index of the word token that character came from, or -1
        for a space inserted because of trailing whitespace.
    """
    pieces = []
    char_to_token = []

    for position, (word, has_space) in enumerate(
        zip(example["tokens"], example["trailing_whitespace"])
    ):
        pieces.append(word)
        # Every character of this word points back at the word's index.
        char_to_token += [position] * len(word)

        if has_space:
            # The inserted separator belongs to no word token.
            pieces.append(" ")
            char_to_token.append(-1)

    encoded = tokenizer(
        "".join(pieces),
        return_offsets_mapping=True,
        truncation=True,
        max_length=INFERENCE_MAX_LENGTH,
    )

    return {**encoded, "token_map": char_to_token}

In [6]:
import gc

import torch
from scipy.special import softmax

# Load the test documents. The context manager closes the file handle as soon
# as the JSON has been parsed instead of leaving it to the garbage collector.
with open("data/test.json", encoding="utf-8") as test_file:
    data = json.load(test_file)

# Build a dataset holding only the fields used for inference and alignment.
ds = Dataset.from_dict({
    "full_text": [x["full_text"] for x in data],
    "document": [x["document"] for x in data],
    "tokens": [x["tokens"] for x in data],
    "trailing_whitespace": [x["trailing_whitespace"] for x in data],
})

# Model directory -> blending weight. Add more entries to blend several models,
# e.g. the three-model Kaggle ensemble kept below for reference.
# model_paths = {'/kaggle/input/pii-deberta-models/cola-de-piiranha' : 1/3,
#               '/kaggle/input/pii-deberta-models/cuerpo-de-piiranha' : 1/3,
#               '/kaggle/input/pii-deberta-models/cabeza-de-piiranha' : 1/3}
model_paths = {'model/deberta3base_1024' : 1}

# The dataset is tokenized once, using the tokenizer of the first listed model.
first_model_path = list(model_paths.keys())[0]
tokenizer = AutoTokenizer.from_pretrained(first_model_path)
ds = ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc = 2)

# One weighted probability array per model.
all_preds = []

# Sum of all weights, used to normalise the blended probabilities.
total_weight = sum(model_paths.values())

for model_path, weight in model_paths.items():
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model = AutoModelForTokenClassification.from_pretrained(model_path)
    collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of = 16)
    args = TrainingArguments(
        ".",
        per_device_eval_batch_size=1,
        report_to="none",
    )
    trainer = Trainer(
        model=model,
        args=args,
        data_collator=collator,
        tokenizer=tokenizer,
    )
    predictions = trainer.predict(ds).predictions
    # Blend in probability space: softmax over the class axis, scaled by the
    # model's weight.
    # This idea from this notebook: https://www.kaggle.com/code/olyatsimboy/912-blending-0-903-0-854-deberta3base
    weighted_predictions = softmax(predictions, axis = -1) * weight
    all_preds.append(weighted_predictions)

    # Release the model before loading the next one.
    del model, trainer
    torch.cuda.empty_cache()
    gc.collect()

# Weighted average of the per-model probabilities.
weighted_average_predictions = np.sum(all_preds, axis=0) / total_weight

Map (num_proc=2):   0%|          | 0/10 [00:00<?, ? examples/s]

In [7]:
# NOTE: `model_path` is the variable left behind by the inference loop, i.e.
# the last key of `model_paths`; its config supplies the label names.
with open(Path(model_path) / "config.json", encoding="utf-8") as config_file:
    config = json.load(config_file)
id2label = config["id2label"]

# Find the class index of the "O" (outside) label from the config rather than
# hard-coding it; id2label keys are strings (see the str() lookup downstream).
o_indices = [int(idx) for idx, name in id2label.items() if name == "O"]
if len(o_indices) != 1:
    raise ValueError(f'expected exactly one "O" label in id2label, found {len(o_indices)}')
o_index = o_indices[0]

# Plain arg-max over every class, "O" included.
preds = weighted_average_predictions.argmax(-1)

# Best class when "O" is excluded: mask the "O" column so it can never win.
# With "O" as the last class this equals an arg-max over the leading columns.
non_o_scores = weighted_average_predictions.copy()
non_o_scores[:, :, o_index] = -np.inf
preds_without_O = non_o_scores.argmax(-1)

# Blended probability assigned to "O" for every position.
O_preds = weighted_average_predictions[:, :, o_index]

# Change this threshold to "manually" adjust for the FBeta metric
threshold = 0.9875

# Where "O" is not confident enough, fall back to the best non-"O" class.
preds_final = np.where(O_preds < threshold, preds_without_O , preds)

In [8]:
# Show the raw text of the third-from-last document as a quick sanity check.
print(ds[-3]["full_text"])

Storytelling  The Path to Innovation

Dr Sakir Ahmad

Challenge & Selection

Be it any organization, team or a government venture, I have often observed that they experience  failures because they couldn’t connect with their audience aptly. In such scenarios, storytelling  can turn the table significantly. I have used the tool ‘storytelling’ in my organization to convey a  complicated message as simply as possible or to generate ideas from people around me. The idea  is to implement this innovative tool to make the intricate issues understandable and engage the  targeted audience. I relayed the context of the story to captivate the attention of the audience. I  have used the technique in front of my team-members of the organization to impart its essence,  brainstorm notions, make productive connections and develop appropriate strategies. The crucial  element in storytelling is to engagingly answer the questions raised in the story to its targeted  audience and allow them to resonate wi

In [9]:
# Convert per-subword predictions into (document, word-token index, label) rows.
triplets = []
document, token, label, token_str = [], [], [], []

# Keys already emitted. The document id is part of the key so that an identical
# (label, token index, token text) in a *different* document is not discarded,
# and the set makes the membership test constant-time.
seen = set()

for p, token_map, offsets, tokens, doc in zip(preds_final, ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"]):

    # Walk the subword predictions together with their character offsets.
    for token_pred, (start_idx, end_idx) in zip(p, offsets):
        label_pred = id2label[str(token_pred)]

        # Offsets of (0, 0) carry no text; skip them.
        if start_idx + end_idx == 0:
            continue

        # A subword that starts on an inserted space really starts one
        # character later.
        if token_map[start_idx] == -1:
            start_idx += 1

        # Skip over word tokens that are pure whitespace (e.g. "\n\n").
        # NOTE(review): if token_map[start_idx] is -1 here, tokens[-1] is the
        # token that gets inspected - confirm this is intended.
        while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
            start_idx += 1

        # Ran past the end of the text: nothing left to align in this document.
        if start_idx >= len(token_map):
            break

        token_id = token_map[start_idx]

        # Ignore "O" predictions and positions that map to an inserted space.
        if label_pred == "O" or token_id == -1:
            continue

        triplet = (label_pred, token_id, tokens[token_id])
        key = (doc, *triplet)
        if key in seen:
            continue
        seen.add(key)

        document.append(doc)
        token.append(token_id)
        label.append(label_pred)
        token_str.append(tokens[token_id])
        triplets.append(triplet)


In [12]:
# Assemble the collected predictions into a table, one row per labelled token.
df = pd.DataFrame(
    dict(document=document, token=token, label=label, token_str=token_str)
)

# Append a sequential row identifier as the last column.
df.insert(df.shape[1], "row_id", list(range(df.shape[0])))

# Preview at most the first 100 rows.
display(df.head(100))

# Write the submission file; token_str is only for inspection and is left out.
df.to_csv(
    "submission.csv",
    columns=["row_id", "document", "token", "label"],
    index=False,
)

Unnamed: 0,document,token,label,token_str,row_id
0,7,9,B-NAME_STUDENT,Nathalie,0
1,7,10,I-NAME_STUDENT,Sylla,1
2,7,482,B-NAME_STUDENT,Nathalie,2
3,7,483,I-NAME_STUDENT,Sylla,3
4,7,741,B-NAME_STUDENT,Nathalie,4
5,7,742,I-NAME_STUDENT,Sylla,5
6,10,0,B-NAME_STUDENT,Diego,6
7,10,1,I-NAME_STUDENT,Estrada,7
8,10,464,B-NAME_STUDENT,Diego,8
9,10,465,I-NAME_STUDENT,Estrada,9
