# Setup

## Imports

In [1]:
import json
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from collections import Counter

2024-02-28 14:37:03.635907: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-28 14:37:03.636057: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-28 14:37:03.833826: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Get data

In [2]:
# Load the raw train and test documents from the competition input directory.
# Context managers make sure both file handles are closed once parsing is done.
with open('/kaggle/input/pii-detection-removal-from-educational-data/train.json') as train_file:
    train_data = json.load(train_file)
with open('/kaggle/input/pii-detection-removal-from-educational-data/test.json') as test_file:
    test_data = json.load(test_file)

In [3]:
# Flatten the parsed JSON records into DataFrames (train first, then test)
train_df, test_df = (
    pd.json_normalize(records) for records in (train_data, test_data)
)

# Modelling

## Helper functions

In [4]:
# Function to get token index and predicted label
def get_token_predictions(predictions: list, token_list: list, whitespace_list: list, label_list: list = None) -> dict:
    """Map character-level person predictions onto token indexes.

    The character offset of every token is rebuilt by summing token lengths,
    plus one character for each token flagged as followed by whitespace, and
    compared with the ``start``/``end`` offsets of the predicted entities.

    Args:
        predictions: Dicts with at least the keys ``'entity'``, ``'start'`` and
            ``'end'``. Only entities tagged ``"B-PER"`` or ``"I-PER"`` are used.
        token_list: Token strings of the document, in order.
        whitespace_list: One flag per token; a truthy flag means one whitespace
            character follows the token.
        label_list: Optional true labels, one per token. When omitted or empty,
            the true-label slot of every output entry is ``''``.

    Returns:
        Dict mapping token index to
        ``[predicted_label, true_label, token, char_offset]``. Only tokens
        matched to a person entity are present.
    """
    # Keep only person entities once instead of re-testing them for every token.
    person_spans = [
        (entity['start'], entity['end'])
        for entity in predictions
        if entity['entity'] in ("B-PER", "I-PER")
    ]

    tok_preds = {}  # token index -> [predicted label, true label, token, offset]
    offset = 0  # character position of the current token in the full text

    for i, token in enumerate(token_list):
        prefix = None
        for start, end in person_spans:
            if offset == start:
                # Token opens an entity: continue the previous tagged token if
                # there is one, otherwise begin a new name.
                prefix = "I-" if (i - 1) in tok_preds else "B-"
            elif start < offset < end:
                # Token starts strictly inside an entity span.
                prefix = "I-"
            # When several spans match, the last one decides (same as before).

        if prefix is not None:
            true_label = label_list[i] if label_list else ''
            tok_preds[i] = [prefix + "NAME_STUDENT", true_label, token, offset]

        offset += len(token)  # advance by the length of the token
        if whitespace_list[i]:  # account for the trailing whitespace, if any
            offset += 1

    return tok_preds

In [5]:
#nlp(train_df["full_text"][0].replace('\n', " "))

In [6]:
# Function to format submission
def get_submission(token_predictions: dict) -> pd.DataFrame:
    """Flatten per-document token predictions into a submission table.

    Args:
        token_predictions: Dict mapping a document id to a dict of
            ``token index -> [predicted_label, true_label, ...]``.

    Returns:
        DataFrame with columns ``row_id``, ``document``, ``token``, ``label``
        and ``true label``, one row per predicted token. ``row_id`` is a
        running counter starting at 0. When there are no predictions at all,
        an empty DataFrame with the same columns is returned.
    """
    columns = ['row_id', 'document', 'token', 'label', 'true label']

    rows = []
    for document, token_preds in token_predictions.items():
        for token_index, values in token_preds.items():
            # len(rows) is the running row id across all documents
            rows.append([len(rows), document, token_index, values[0], values[1]])

    # Passing the column names to the constructor keeps the frame well-formed
    # even with zero rows; assigning them afterwards fails on an empty frame.
    return pd.DataFrame(rows, columns=columns)

## Apply model on formatted text

In [None]:
# Single source of truth for the checkpoint id, so tokenizer and model always match
MODEL_NAME = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

In [None]:
# NER pipeline used by get_total_predictions; get_token_predictions reads the
# 'entity', 'start' and 'end' keys of each prediction it returns.
nlp = pipeline(task="ner", tokenizer=tokenizer, model=model)

In [None]:
# Function to get predictions for each document
def get_total_predictions(data: pd.DataFrame, type: str = "test") -> dict:
    """Run the NER pipeline on every document and collect token-level predictions.

    Args:
        data: DataFrame with the columns ``document``, ``full_text``, ``tokens``
            and ``trailing_whitespace``; ``labels`` is also read when
            ``type == "train"``.
        type: ``"train"`` forwards the true labels to ``get_token_predictions``;
            any other value runs without labels.

    Returns:
        Dict mapping ``str(document id)`` to the dict returned by
        ``get_token_predictions`` for that document.
    """
    # NOTE(review): the whole text goes to the pipeline in one call; confirm how
    # it handles texts longer than the model's maximum input length.
    with_labels = type == "train"
    predictions = {}

    for line in data.index:  # one iteration per document
        doc_id = data["document"][line]
        # Newlines become single spaces, so character offsets are unchanged
        text = data['full_text'][line].replace('\n', " ")

        token_args = [data['tokens'][line], data['trailing_whitespace'][line]]
        if with_labels:
            token_args.append(data['labels'][line])

        predictions[str(doc_id)] = get_token_predictions(nlp(text), *token_args)

    return predictions

# Evaluation

In [None]:
#sample_df = train_df.sample(n=100, random_state=42).sort_values("document", ignore_index=True)

In [None]:
#sample_df

## Make predictions on sample training data

In [None]:
#result_eval = get_total_predictions(sample_df, "train")

In [None]:
#eval_df = pd.DataFrame(get_submission(result_eval)).rename(columns={"label": "pred"})

In [None]:
#eval_df["document"] = eval_df["document"].astype(int)

In [None]:
#eval_df

## True labels

In [None]:
#true_labels = sample_df[['document','tokens', 'labels']]

In [None]:
#true_labels.head()

In [None]:
#labels_df = true_labels.apply(lambda x: x.explode()).reset_index(drop=True)

In [None]:
#labels_df['token_id'] = labels_df.groupby('document').cumcount()

In [None]:
#labels_df

## Merge both

In [None]:
#merged_df = pd.merge(labels_df, eval_df[['token', 'pred', 'document']], left_on=["document", "token_id"], right_on=["document", "token"], how='left').fillna({'pred': 'O'}).drop(["token", "document"], axis=1)

In [None]:
#merged_df

## Metrics time

In [None]:
# Compute rate
# NOTE: everything below is wrapped in a triple-quoted string, so this cell is a
# bare string expression and none of the code inside it executes. It also reads
# `merged_df`, which is only built by a cell that is currently commented out.
# Intended logic: tally token-level TN / FN / FP / TP by comparing the `labels`
# and `pred` columns; when both are non-'O' but differ, the token counts as
# both a false negative and a false positive and is recorded in `missclass`.
'''
TP, FP, TN, FN = 0, 0, 0, 0
tp, fp, fn, missclass = [],[],[],[]

for index, row in merged_df.iterrows():
    if (merged_df["labels"][index] == 'O') & (merged_df["pred"][index] == 'O'):
        TN += 1
    if (merged_df["labels"][index] != 'O') & (merged_df["pred"][index] == 'O'):
        FN += 1
        fn.append([merged_df["tokens"][index], merged_df["labels"][index], merged_df["pred"][index]])
    if (merged_df["labels"][index] == 'O') & (merged_df["pred"][index] != 'O'):
        FP += 1
        fp.append([merged_df["tokens"][index], merged_df["labels"][index], merged_df["pred"][index]])
    if (merged_df["labels"][index] != 'O') & (merged_df["pred"][index] != 'O'):
        if merged_df["labels"][index] == merged_df["pred"][index]:
            TP += 1
            tp.append([merged_df["tokens"][index], merged_df["labels"][index], merged_df["pred"][index]])
        else:
            FN, FP = FN + 1, FP + 1
            missclass.append([merged_df["tokens"][index], merged_df["labels"][index], merged_df["pred"][index]])

print(TP, FP, TN, FN)
'''

In [None]:
# F-beta score from raw true-positive / false-positive / false-negative counts
def calculate_f_beta_score(TP, FP, FN, beta=5):
    """Compute the F-beta score from confusion counts.

    Args:
        TP: Number of true positives.
        FP: Number of false positives.
        FN: Number of false negatives.
        beta: Weight of recall relative to precision (default 5).

    Returns:
        ``(1 + beta^2) * precision * recall / (beta^2 * precision + recall)``,
        or 0 when that denominator is zero. Precision and recall are each
        taken as 0 when their own denominator is zero.
    """
    predicted_positives = TP + FP
    actual_positives = TP + FN

    precision = TP / predicted_positives if predicted_positives != 0 else 0
    recall = TP / actual_positives if actual_positives != 0 else 0

    beta_squared = beta ** 2
    denominator = (beta_squared * precision) + recall
    if denominator == 0:
        return 0

    return (1 + beta_squared) * (precision * recall) / denominator

In [None]:
#print(calculate_f_beta_score(TP, FP, FN))

## Errors analysis

### Analyze errors for false negatives

In [None]:
#fn_df = pd.DataFrame(fn, columns=["tokens", "labels", "preds"])

In [None]:
#fn

In [None]:
#fn_counts = Counter(fn_df["labels"])

In [None]:
#fn_counts.most_common()

### Same for false positives

In [None]:
#fp_df = pd.DataFrame(fp, columns=["tokens", "labels", "preds"])

In [None]:
#fp_counts = Counter(fp_df["preds"])

In [None]:
#fp_counts.most_common()

# Submission

In [None]:
# Predict on the test documents (no true labels are available for them)
results = get_total_predictions(data=test_df, type="test")

In [None]:
# Flatten the per-document predictions into the submission table
submission = get_submission(token_predictions=results)

In [None]:
# `submission` is already a DataFrame, so preview it directly. Show the same
# number of rows as the sample-submission preview for easy comparison.
submission.head(10)

In [None]:
# Reference format provided with the competition data, for comparison
sample_submission = pd.read_csv('/kaggle/input/pii-detection-removal-from-educational-data/sample_submission.csv')
sample_submission.head(10)

In [None]:
# Write the submission file without the DataFrame index
submission.to_csv(path_or_buf='submission.csv', index=False)