## Named Entity Recognition with BERT

In [None]:
import torch
import numpy as np
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import json
import zipfile

# Configuration
# Name of the pretrained checkpoint used for both the tokenizer and the model.
MODEL_NAME = 'bert-base-multilingual-cased'
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer and model
# Both are module-level globals that the functions below read directly.
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
model = BertForTokenClassification.from_pretrained(MODEL_NAME, num_labels=3)  # Adjust num_labels based on unique labels in your dataset
model.to(DEVICE)

def load_data(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path to a UTF-8 text file holding one JSON document per
            line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank lines (for example an extra empty line at the end of the
        # file) are skipped instead of being handed to json.loads, which
        # would raise on them.
        return [json.loads(line) for line in file if line.strip()]

def create_labels(tokens, ners):
    """Build one label per token from a list of entity spans.

    Args:
        tokens: Sequence of objects exposing an ``offsets`` pair
            ``(start, end)``; the end is treated as exclusive and is
            converted to an inclusive position before comparing.
        ners: Iterable of ``(start, end, label)`` spans, or a falsy value
            when there are no entities. ``end`` is compared as inclusive.

    Returns:
        A list as long as ``tokens``. A token lying fully inside a span gets
        that span's label (a later span overwrites an earlier one); every
        other position holds -100 so it is ignored in loss calculation.
    """
    aligned = [-100 for _ in range(len(tokens))]
    if not ners:
        return aligned

    for span_start, span_end, tag in ners:
        for position, tok in enumerate(tokens):
            first_char = tok.offsets[0]
            last_char = tok.offsets[1] - 1  # inclusive last character
            inside_span = span_start <= first_char and last_char <= span_end
            if inside_span:
                aligned[position] = tag
    return aligned

def encode_data(data, max_length=128):
    """Tokenize each record and build a label list aligned with its tokens.

    Args:
        data: Iterable of dict records. The text is read from the
            'sentences' key (or the misspelled 'senences' key when the
            former is absent). An optional 'ners' entry holds
            ``(start, end, label)`` spans.
        max_length: Every encoding is padded or truncated to this many
            tokens. Equal lengths are needed because the Trainer's default
            collator stacks the per-example tensors into one batch tensor.

    Returns:
        A tuple ``(encoded_texts, encoded_labels)`` of two parallel lists:
        the tokenizer output for each record (without 'offset_mapping') and
        a list with one label per token, where -100 marks positions that
        are ignored in loss calculation.

    Raises:
        KeyError: If a record has neither text key.
    """
    encoded_texts = []
    encoded_labels = []
    for item in data:
        text_field = 'sentences' if 'sentences' in item else 'senences'
        # Tokenize the raw string rather than a pre-split word list: with
        # is_split_into_words=True the offsets restart at 0 for every word,
        # so they cannot be compared with spans over the whole text.
        encodings = tokenizer(
            item[text_field],
            return_offsets_mapping=True,
            padding='max_length',
            truncation=True,
            max_length=max_length,
        )
        # The offsets are only needed for alignment; they are removed so
        # they are not passed on to the model as an input.
        offsets = encodings.pop('offset_mapping')
        labels = [-100] * len(encodings['input_ids'])  # Use -100 to ignore certain positions in loss calculation

        # NOTE(review): spans are assumed to be character offsets into the
        # text with `end` exclusive, as the original comparison implied --
        # confirm against the dataset.
        # `or []` also covers records that carry an explicit null 'ners'.
        for start, end, label in item.get('ners') or []:
            # Scan every token for each span so that unsorted or
            # overlapping spans are all applied.
            for idx, (token_start, token_end) in enumerate(offsets):
                if token_start == token_end:
                    # Special and padding tokens have an empty (0, 0) span.
                    continue
                if token_start >= start and token_end <= end:
                    labels[idx] = label

        encoded_texts.append(encodings)
        encoded_labels.append(labels)

    return encoded_texts, encoded_labels


def encode_tags(texts, tags, max_length):
    """Copy per-token labels into integer arrays, masking padding tokens.

    Args:
        texts: Sequence of encodings, each providing 'input_ids'.
        tags: Sequence of label sequences, parallel to ``texts``.
        max_length: Accepted for interface compatibility; not used here.

    Returns:
        A list of integer NumPy arrays, one per encoding and as long as its
        'input_ids'. Positions holding the tokenizer's padding id, or lying
        beyond the end of the label sequence, stay at -100.
    """
    pad_id = tokenizer.pad_token_id
    aligned_rows = []
    for encoding, tag_seq in zip(texts, tags):
        token_ids = encoding['input_ids']
        row = np.full(len(token_ids), -100, dtype=int)  # -100 is ignored by the loss
        tag_count = len(tag_seq)
        for position, token_id in enumerate(token_ids):
            if position >= tag_count or token_id == pad_id:
                continue
            row[position] = tag_seq[position]
        aligned_rows.append(row)
    return aligned_rows

class NERDataset(Dataset):
    """Torch dataset pairing per-example encodings with their label rows."""

    def __init__(self, encodings, labels):
        """Store the encodings and labels; both are indexed by example."""
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label list."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        Every key of the stored encoding is converted to a tensor, and the
        matching label row is added under 'labels'.
        """
        sample = {}
        for name, values in self.encodings[idx].items():
            sample[name] = torch.tensor(values)
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

def create_dataset(file_path, max_length=128):
    """Build an NERDataset from a JSON Lines file.

    Args:
        file_path: Path passed to ``load_data``.
        max_length: Forwarded to ``encode_tags``.

    Returns:
        An NERDataset holding the encodings from ``encode_data`` and the
        label arrays from ``encode_tags``.
    """
    records = load_data(file_path)
    encodings, raw_tags = encode_data(records)
    label_rows = encode_tags(encodings, raw_tags, max_length)
    return NERDataset(encodings, label_rows)

def train_and_save_model(train_dataset, save_dir='./bert_ner_model',
                         num_train_epochs=3, batch_size=16):
    """Fine-tune the module-level model and save it with the tokenizer.

    Args:
        train_dataset: Dataset passed to the Trainer as the training set.
        save_dir: Directory that receives the trained model and the
            tokenizer files.
        num_train_epochs: Number of training epochs.
        batch_size: Per-device training batch size.

    Side effects:
        Trains the global ``model``, writes Trainer output under
        './results' and logs under './logs', and saves the model and
        tokenizer to ``save_dir``.
    """
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=batch_size,
        logging_dir='./logs',
        logging_steps=10,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )
    trainer.train()
    # The tokenizer is saved next to the model so that the directory can be
    # reloaded on its own.
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

def predict_and_save_results(data_file, output_file):
    """Predict labels for a JSON Lines file and write them as JSON Lines.

    Args:
        data_file: Path of the input file, loaded through ``create_dataset``.
        output_file: Path of the UTF-8 file to write. Each line is a JSON
            object ``{"id": <example index>, "predictions": [<label id>, ...]}``.

    Only positions whose dataset label is not -100 are kept.
    NOTE(review): a record without 'ners' has -100 at every position, so its
    prediction list is empty -- confirm this is intended for unlabeled
    test data.
    """
    test_dataset = create_dataset(data_file)
    trainer = Trainer(model=model)
    logits, _, _ = trainer.predict(test_dataset)
    predicted_ids = np.argmax(logits, axis=2)  # most likely class per token

    output_predictions = []
    for i, (pred, label) in enumerate(zip(predicted_ids, test_dataset.labels)):
        # np.argmax yields NumPy integers, which the json module cannot
        # serialize; convert them to plain Python ints.
        true_pred = [int(p) for (p, l) in zip(pred, label) if l != -100]
        output_predictions.append({'id': i, 'predictions': true_pred})

    with open(output_file, 'w', encoding='utf-8') as f:
        for record in output_predictions:
            json.dump(record, f)
            f.write('\n')

def zip_output(output_filename, zip_filename='test.zip'):
    """Write a single file into a new DEFLATE-compressed zip archive.

    Args:
        output_filename: Path of the file to add to the archive.
        zip_filename: Path of the archive to create; an existing file at
            this path is overwritten.
    """
    archive = zipfile.ZipFile(
        zip_filename,
        mode='w',
        compression=zipfile.ZIP_DEFLATED,
    )
    with archive:
        archive.write(output_filename)

# Main execution logic
# Build the training set from the JSONL file, fine-tune the model, and save
# the model and tokenizer.
train_dataset = create_dataset('../data/train.jsonl')
train_and_save_model(train_dataset)

# Predict on the test file, write one JSON record per line to 'test.jsonl',
# then pack that file into 'test.zip'.
predict_and_save_results('../data/test.jsonl', 'test.jsonl')
zip_output('test.jsonl', 'test.zip')
