In [None]:
# Handle warnings: install a global "ignore" filter so library warnings
# do not clutter the cell outputs below
import warnings
warnings.simplefilter('ignore')

In [None]:
# Workflow essentials
import gc
import json
import torch
import random
import argparse
from pathlib import Path
from itertools import chain
from functools import partial
from datasets import Dataset, features

# Data preprocessing and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Model development
from sklearn.metrics import f1_score
from transformers import AutoModel, AutoTokenizer
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification

2024-05-10 02:38:47.673038: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-10 02:38:47.673169: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-10 02:38:47.805558: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
class CFG:
    """Single place for every path and tunable value used by this notebook."""

    # ---- Inputs ----
    # JSON file with the test essays
    test_data = '/kaggle/input/pii-detection-removal-from-educational-data/test.json'
    # Directory of the saved model checkpoint (also read for tokenizer and config.json)
    checkpoint = '/kaggle/input/piidd-train-deberta-with-hugging-face/deberta3base'

    # ---- Tokenization ----
    # Maximum number of tokens per chunk handed to the tokenizer
    max_len = 3072
    # Number of overlapping tokens between consecutive chunks of a long essay
    stride = 256
    # Number of processes used by Dataset.map
    workers = 4

    # ---- Inference ----
    # Per-device evaluation batch size given to TrainingArguments
    batch_size = 4
    # "O" is only kept when its probability reaches this value
    threshold = 0.84
    # Seed passed to global_seed
    seed = 457

In [None]:
def global_seed(seed):
    """Seed Python's ``random``, NumPy's global RNG and PyTorch with ``seed``.

    Args:
        seed: integer seed handed unchanged to each library's seeding function.
    """
    # Same order as before: random, then NumPy, then PyTorch
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

In [None]:
# Set seed for reproducibility across multiple libraries
# (global_seed seeds random, NumPy and PyTorch with CFG.seed)
global_seed(CFG.seed)

In [None]:
# Force a full garbage-collection pass before the heavy work starts;
# the value echoed by the cell is gc.collect()'s return value
gc.collect()

23

In [None]:
class DataPreprocessor:
    """Rebuilds essay text from word-level tokens and runs the subword tokenizer on it."""

    def tokenize(self, example, tokenizer):
        """Tokenize one example and attach a character-to-token lookup.

        Args:
            example: mapping with "tokens" (list of strings) and
                "trailing_whitespace" (one flag per token).
            tokenizer: callable invoked on the rebuilt text with offset mapping,
                truncation to CFG.max_len, CFG.stride overlap and overflowing
                chunks enabled.

        Returns:
            A dict with every key produced by the tokenizer plus "token_map":
            one entry per character of the rebuilt text, holding the index of
            the token that character belongs to, or -1 for an inserted space.
        """
        pieces = []
        char_to_token = []

        for position, (word, has_space) in enumerate(
            zip(example["tokens"], example["trailing_whitespace"])
        ):
            # Every character of the word points back at the word's index
            pieces.append(word)
            char_to_token += [position] * len(word)

            # A trailing space is re-inserted as one character owned by no token
            if has_space:
                pieces.append(" ")
                char_to_token.append(-1)

        encoding = tokenizer(
            "".join(pieces),
            return_offsets_mapping=True,
            truncation=True,
            max_length=CFG.max_len,
            stride=CFG.stride,
            return_overflowing_tokens=True,
        )

        # Copy the tokenizer output and add the lookup table next to it
        result = dict(encoding)
        result["token_map"] = char_to_token
        return result

# Initialize DataPreprocessor class
dp = DataPreprocessor()

In [None]:
# Load test data; the context manager closes the file handle, and the JSON
# is read explicitly as UTF-8 instead of the platform's default encoding
with open(CFG.test_data, encoding="utf-8") as test_file:
    data = json.load(test_file)

# Create the dataset: one column per field, one row per essay
ds = Dataset.from_dict({
    column: [x[column] for x in data]
    for column in ("full_text", "document", "tokens", "trailing_whitespace")
})

# Initialize the tokenizer for the pre-trained model checkpoint
tokenizer = AutoTokenizer.from_pretrained(CFG.checkpoint)

# Tokenize the test essays (adds the tokenizer outputs and "token_map" to each row)
ds = ds.map(dp.tokenize,
            fn_kwargs={"tokenizer": tokenizer},
            num_proc=CFG.workers)

      

#0:   0%|          | 0/3 [00:00<?, ?ex/s]

#1:   0%|          | 0/3 [00:00<?, ?ex/s]

  

#2:   0%|          | 0/2 [00:00<?, ?ex/s]

#3:   0%|          | 0/2 [00:00<?, ?ex/s]

In [None]:
class ModelInference:
    """Runs the token classifier over tokenized essays and builds the prediction table.

    Long essays arrive split into overlapping chunks (see CFG.stride). Each chunk is
    predicted separately, the special and overlapping positions are removed, and the
    chunks are stitched back into one sequence per document.
    """

    def __init__(self):
        pass

    @staticmethod
    def _chunk_bounds(chunk_idx, n_chunks, stride):
        """Return the (start, stop) slice bounds that keep a chunk's own positions.

        First chunk: drop only the trailing SEP. Last chunk: drop CLS plus the
        ``stride`` overlapping positions. Middle chunks: drop both.
        """
        start = 0 if chunk_idx == 0 else 1 + stride
        stop = None if chunk_idx == n_chunks - 1 else -1
        return start, stop

    @staticmethod
    def _apply_threshold(logits, o_idx, threshold):
        """Turn logits into label ids, preferring a non-"O" label when "O" is unsure.

        Args:
            logits: float array whose last axis indexes the labels.
            o_idx: index of the "O" label on the last axis.
            threshold: minimum "O" probability required to keep the plain argmax.

        Returns:
            Integer array of label ids with the last axis removed.
        """
        # Max-shifted softmax: same result, but exp() cannot overflow on large logits
        shifted = logits - logits.max(axis=-1, keepdims=True)
        exp = np.exp(shifted)
        probs = exp / exp.sum(axis=-1, keepdims=True)

        best_overall = logits.argmax(-1)

        # Best label when "O" is excluded from the competition
        masked = probs.copy()
        masked[..., o_idx] = -1.0
        best_non_o = masked.argmax(-1)

        return np.where(probs[..., o_idx] < threshold, best_non_o, best_overall)

    def backwards_map_preds(self, sub_predictions, max_len, CFG, chunk_idx=0):
        """Remove special and overlapping positions from one chunk's predictions.

        Args:
            sub_predictions: array indexed [batch, position, label] for one chunk.
            max_len: number of chunks the document was split into.
            CFG: configuration object providing ``stride``.
            chunk_idx: zero-based index of this chunk within its document. It must
                be passed for multi-chunk documents; the trim depends on whether
                the chunk is first, middle or last.

        Returns:
            The sliced array; unchanged when the document has a single chunk.
        """
        # nothing to map backwards if sequence is too short to be split in the first place
        if max_len == 1:
            return sub_predictions

        start, stop = self._chunk_bounds(chunk_idx, max_len, CFG.stride)
        return sub_predictions[:, start:stop, :]

    def backwards_map_(self, row_attribute, max_len, CFG, chunk_idx=0):
        """List counterpart of ``backwards_map_preds`` (same trimming rules).

        Args:
            row_attribute: per-position list for one chunk (e.g. its offsets).
            max_len: number of chunks the document was split into.
            CFG: configuration object providing ``stride``.
            chunk_idx: zero-based index of this chunk within its document.

        Returns:
            The sliced list; unchanged when the document has a single chunk.
        """
        if max_len == 1:
            return row_attribute

        start, stop = self._chunk_bounds(chunk_idx, max_len, CFG.stride)
        return row_attribute[start:stop]

    def process_dataset(self, ds, tokenizer, CFG):
        """Predict labels for every document and return them as a DataFrame.

        Args:
            ds: iterable of rows with "document", "tokens", "token_map" and the
                per-chunk lists "input_ids", "token_type_ids", "attention_mask"
                and "offset_mapping".
            tokenizer: tokenizer handed to the data collator and the Trainer.
            CFG: configuration object providing ``checkpoint``, ``batch_size``,
                ``stride`` and ``threshold``.

        Returns:
            DataFrame with columns "document", "token", "label", "token_str" and
            "row_id": one row per (document, token) pair with a non-"O" label.

        Raises:
            ValueError: if the checkpoint's id2label mapping has no "O" label.
        """
        # Initialize model and collator
        model = AutoModelForTokenClassification.from_pretrained(CFG.checkpoint)
        collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

        # Trainer is used only as a convenient prediction loop
        args = TrainingArguments(
            ".",
            per_device_eval_batch_size=CFG.batch_size,
            report_to="none",
        )
        trainer = Trainer(
            model=model,
            args=args,
            data_collator=collator,
            tokenizer=tokenizer,
        )

        # Per-document stitched predictions and the metadata needed to decode them
        preds = []
        ds_dict = {
            "document": [],
            "token_map": [],
            "offset_mapping": [],
            "tokens": [],
        }

        for row in ds:
            n_chunks = len(row["offset_mapping"])
            row_preds = []
            row_offset = []

            for i, chunk_offsets in enumerate(row["offset_mapping"]):
                # Create a new dataset for each split of the document
                x = Dataset.from_dict({
                    "token_type_ids": [row["token_type_ids"][i]],
                    "input_ids": [row["input_ids"][i]],
                    "attention_mask": [row["attention_mask"][i]],
                    "offset_mapping": [row["offset_mapping"][i]],
                })
                pred = trainer.predict(x).predictions

                # The collator pads to a multiple of 16; drop the padded positions so
                # that trimming "the last position" really removes SEP
                # NOTE(review): assumes right-side padding - confirm tokenizer.padding_side
                pred = pred[:, :len(row["input_ids"][i]), :]

                # Remove stride and additional CLS & SEP tokens for THIS chunk only
                row_preds.append(self.backwards_map_preds(pred, n_chunks, CFG, i))
                row_offset += self.backwards_map_(chunk_offsets, n_chunks, CFG, i)

            # Finalize row
            ds_dict["document"].append(row["document"])
            ds_dict["tokens"].append(row["tokens"])
            ds_dict["token_map"].append(row["token_map"])
            ds_dict["offset_mapping"].append(row_offset)

            # Stitch the chunks back together along the position axis
            preds.append(np.concatenate(row_preds, axis=1))

        # Load the label names saved with the checkpoint (keys are string ids)
        with open(Path(CFG.checkpoint) / "config.json") as config_file:
            id2label = json.load(config_file)["id2label"]

        # Locate the "O" class from the config instead of assuming its position
        o_idx = next((int(k) for k, v in id2label.items() if v == "O"), None)
        if o_idx is None:
            raise ValueError('id2label in config.json does not contain an "O" label')

        # Finalize predictions
        preds_final = [
            self._apply_threshold(predictions, o_idx, CFG.threshold)
            for predictions in preds
        ]

        # A set gives O(1) duplicate checks for (document, token) pairs
        seen_pairs = set()
        document, token, label, token_str = [], [], [], []

        # Iterate over predictions, token maps, offsets, tokens, and documents
        for p, token_map, offsets, tokens, doc in zip(
            preds_final,
            ds_dict["token_map"],
            ds_dict["offset_mapping"],
            ds_dict["tokens"],
            ds_dict["document"],
        ):
            # p has a leading batch axis of size 1
            for token_pred, (start_idx, end_idx) in zip(p[0], offsets):
                label_pred = id2label[str(token_pred)]

                # Special tokens carry the offset (0, 0)
                if start_idx + end_idx == 0:
                    continue

                # Skip a leading inserted space
                if token_map[start_idx] == -1:
                    start_idx += 1

                # Ignore "\n\n"
                while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
                    start_idx += 1

                if start_idx >= len(token_map):
                    break

                token_id = token_map[start_idx]

                # Ignore "O" predictions and whitespace predictions
                if label_pred != "O" and token_id != -1:
                    pair = (doc, token_id)

                    # Keep only the first label seen for each (document, token) pair
                    if pair not in seen_pairs:
                        seen_pairs.add(pair)
                        document.append(doc)
                        token.append(token_id)
                        label.append(label_pred)
                        token_str.append(tokens[token_id])

        # Create DataFrame
        df = pd.DataFrame({
            "document": document,
            "token": token,
            "label": label,
            "token_str": token_str,
        })
        df["row_id"] = list(range(len(df)))

        return df

# Initialize ModelInference class
mi = ModelInference()

In [None]:
# Process dataset and create DataFrame
# (one row per predicted non-"O" token: document, token, label, token_str, row_id)
df = mi.process_dataset(ds, tokenizer, CFG)

In [None]:
# Display DataFrame of predicted tokens using the notebook's rich renderer
display(df)

Unnamed: 0,document,token,label,token_str,row_id
0,7,9,B-NAME_STUDENT,Nathalie,0
1,7,10,I-NAME_STUDENT,Sylla,1
2,7,482,B-NAME_STUDENT,Nathalie,2
3,7,483,I-NAME_STUDENT,Sylla,3
4,7,741,B-NAME_STUDENT,Nathalie,4
5,7,742,I-NAME_STUDENT,Sylla,5
6,10,0,B-NAME_STUDENT,Diego,6
7,10,1,I-NAME_STUDENT,Estrada,7
8,10,464,B-NAME_STUDENT,Diego,8
9,10,465,I-NAME_STUDENT,Estrada,9


In [None]:
# Load the comparison table that df is scored against in the cells below
# (those cells read its "Token Position" and "label" columns)
comp = pd.read_csv("/kaggle/input/output/output.csv")

In [None]:
# Show the comparison table
comp

Unnamed: 0,Token Position,label
0,9,B-NAME_STUDENT
1,10,I-NAME_STUDENT
2,482,B-NAME_STUDENT
3,483,I-NAME_STUDENT
4,741,B-NAME_STUDENT
5,742,I-NAME_STUDENT
6,0,B-NAME_STUDENT
7,1,I-NAME_STUDENT
8,464,B-NAME_STUDENT
9,465,I-NAME_STUDENT


In [None]:
import pandas as pd

total_rows_df = len(df)
total_rows_comp = len(comp)

# Build the (token position, label) pairs of comp once so each lookup is O(1)
# instead of rescanning comp for every row of df.
# Note: comp is only read for "Token Position" and "label", so the match
# cannot take the document id into account.
comp_pairs = set(zip(comp["Token Position"], comp["label"]))

# Count the rows of df whose (token, label) pair also appears in comp
matching_rows = sum(
    pair in comp_pairs for pair in zip(df["token"], df["label"])
)

# Calculate accuracy; an empty df yields 0 instead of a ZeroDivisionError
accuracy = (matching_rows / total_rows_df) * 100 if total_rows_df else 0.0

print(f"Accuracy: {accuracy:.2f}%")


Accuracy: 89.66%


In [None]:
import pandas as pd

# df holds the model's predictions (it is what gets written to submission.csv).
# NOTE(review): comp is treated as the reference labels - confirm its origin.
# Rows are matched on (token position, label) only, because comp is only read
# for those two columns.
pred_pairs = list(zip(df["token"], df["label"]))
ref_pairs = list(zip(comp["Token Position"], comp["label"]))
pred_lookup = set(pred_pairs)
ref_lookup = set(ref_pairs)

# TP: predicted rows that appear in the reference
TP = sum(pair in ref_lookup for pair in pred_pairs)
# FP: predicted rows that do NOT appear in the reference
FP = len(pred_pairs) - TP
# FN: reference rows that no prediction matched
FN = sum(pair not in pred_lookup for pair in ref_pairs)

# Calculate precision and recall; empty denominators yield 0 instead of raising
precision = TP / (TP + FP) if (TP + FP) else 0.0
recall = TP / (TP + FN) if (TP + FN) else 0.0

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")


Precision: 1.00
Recall: 0.90


In [None]:
# Write the submission file: token_str is dropped and the DataFrame index is not saved
df[["row_id", "document", "token", "label"]].to_csv("submission.csv", index=False)