# Fine-Tuning

## Load Dataset into Google Colab

### Install Required Packages

In [3]:
# Install required packages
!pip install transformers datasets torch scikit-learn



### Import Necessary Libraries

In [4]:
# Import necessary libraries
from google.colab import drive # Library to mount Google Drive in Colab
import os # For file operations like reading file names from a directory
import json # Library to handle JSON data
import glob # For recursive file search
import re # For regular expression matching
from datasets import Dataset, concatenate_datasets # For creating Hugging Face Datasets
import torch # PyTorch library
import numpy as np # For numerical operations
from transformers import AutoConfig, AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification  # Load pre-trained models and tokenizers for token classification tasks (e.g., NER), training arguments, trainer for training models, and data collator for dynamic padding during batch processing
from sklearn.metrics import precision_recall_fscore_support, classification_report # For evaluating the model

### Mount Google Drive and Dataset Accessibility

In [5]:
# Mount Google Drive in Google Colab
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Check if the dataset is accessible in Colab
dataset_path = "/content/drive/MyDrive/Final Project/Fine Tuning "

# List files to verify dataset accessibility
print(os.listdir(dataset_path))

['FUND_LiLT_Format', 'XFUND_LiLT_Final_Format']


### Define Dataset Paths

In [7]:
# Base path to the project in Google Drive
base_path = "/content/drive/MyDrive/Final Project/Fine Tuning "

# FUNSD dataset (English)
funsd_path = os.path.join(base_path, "FUND_LiLT_Format/dataset")
funsd_train_path = os.path.join(funsd_path, "training_data")
funsd_test_path = os.path.join(funsd_path, "testing_data")

# XFUND dataset (Multilingual)
xfund_path = os.path.join(base_path, "XFUND_LiLT_Final_Format")

# Print directories to verify
print("FUNSD Train Path:", funsd_train_path)
print("FUNSD Test Path:", funsd_test_path)
print("XFUND Path:", xfund_path)

FUNSD Train Path: /content/drive/MyDrive/Final Project/Fine Tuning /FUND_LiLT_Format/dataset/training_data
FUNSD Test Path: /content/drive/MyDrive/Final Project/Fine Tuning /FUND_LiLT_Format/dataset/testing_data
XFUND Path: /content/drive/MyDrive/Final Project/Fine Tuning /XFUND_LiLT_Final_Format


###  Load FUNSD (English) Dataset

In [8]:
def load_dataset(data_path):
    """
    Loads the dataset from a directory path

    Args:
        data_path (str): Path to the directory containing JSON files

    Returns:
        list: A list of dictionaries where each dictionary represents a document
              in the dataset and contains 'id', 'words', 'bboxes', and 'ner_tags'
    """
    dataset = [] # Initialize an empty list to store dataset samples

    # Iterate through all files in the specified directory in a sorted order
    for file_name in sorted(os.listdir(data_path)): # Sorting ensures consistent order
        file_path = os.path.join(data_path, file_name) # Construct the full file path

        # Open and read the JSON file
        with open(file_path, "r") as f:
            data = json.load(f) # Load JSON data into a Python dictionary
            dataset.append(data) # Append the loaded data to the dataset list

    return dataset # Return the complete dataset as a list of dictionaries

# Load FUNSD training and testing datasets
funsd_train = load_dataset(funsd_train_path) # Load training data
funsd_test = load_dataset(funsd_test_path) # Load testing data

# Print confirmation message and display one sample document for verification
print("Loaded FUNSD dataset:")
print(json.dumps(funsd_train[0], indent=2)) # Print the first sample document for inspection

Loaded FUNSD dataset:
{
  "id": "0000971160",
  "words": [
    "R&D",
    ":",
    "Suggestion:",
    "Date:",
    "Licensee",
    "",
    "Yes",
    "No",
    "597005708",
    "R&D",
    "QUALITY",
    "IMPROVEMENT",
    "SUGGESTION/",
    "",
    "SOLUTION",
    "FORM",
    "Name",
    "/",
    "Phone",
    "Ext.",
    ":",
    "M.",
    "Hamann",
    "P.",
    "Harper,",
    "P.",
    "Martinez",
    "9/",
    "3/",
    "92",
    "R&D",
    "Group:",
    "J.",
    "S.",
    "Wigand",
    "Supervisor",
    "/",
    "Manager",
    "Discontinue",
    "coal",
    "retention",
    "analyses",
    "on",
    "licensee",
    "submitted",
    "product",
    "samples",
    "(Note",
    ":",
    "Coal",
    "Retention",
    "testing",
    "is",
    "not",
    "performed",
    "by",
    "most",
    "licensees.",
    "Other",
    "B&W",
    "physical",
    "measurements",
    "as",
    "ends",
    "stability",
    "and",
    "inspection",
    "for",
    "soft",
    "spots",
    "in",
    "cipare

### Load XFUND (Multilingual) Dataset

In [9]:
def load_XFUND_dataset(xfund_path):
    """
    Loads the multilingual XFUND dataset from directory

    Args:
        xfund_path (str): Path to the directory containing language-specific subdirectories

    Returns:
        dict: A dictionary where each key is a language code (e.g., "de", "es"),
              and the value is another dictionary with "train" and "test" datasets
    """
    xfund_data = {} # Dictionary to store data for each language

    # Iterate through all language directories in the XFUND dataset path
    for lang in os.listdir(xfund_path):
        lang_path = os.path.join(xfund_path, lang) # Construct full path to the language directory

        # Ensure that only directories (languages) are processed
        if os.path.isdir(lang_path):
            train_path = os.path.join(lang_path, "training_data") # Path to training data
            test_path = os.path.join(lang_path, "testing_data") # Path to testing data

            # Load training and testing data for the current language using the FUNSD loader function
            xfund_data[lang] = {
                "train": load_dataset(train_path), # Load training data
                "test": load_dataset(test_path) # Load testing data
            }

    return xfund_data # Return the loaded dataset as a dictionary

# Load XFUND dataset
xfund_data = load_XFUND_dataset(xfund_path) # Call function to load all languages

# Print sample from one language (e.g., German "de") to verify the data structure
print("Loaded XFUND dataset (German example):")
print(json.dumps(xfund_data["de"]["train"][0], indent=2)) # Print first training sample of German dataset

Loaded XFUND dataset (German example):
{
  "id": "de_train_0",
  "words": [
    "Bezeichnung,",
    "Ort",
    "und",
    "Gesch\u00e4ftsnummer",
    "des",
    "Gerichts:",
    "Erkl\u00e4rung",
    "\u00fcber",
    "die",
    "pers\u00f6nlichen",
    "und",
    "wirtschaftlichen",
    "Verh\u00e4ltnisse",
    "bei",
    "Prozess-",
    "oder",
    "Verfahrenskostenhilfe",
    "-",
    "Belege",
    "sind",
    "in",
    "Kopie",
    "durchnummeriert",
    "beizuf\u00fcgen",
    "-",
    "A",
    "Angaben",
    "zu",
    "Ihrer",
    "Person",
    "Julian",
    "Herrmann",
    "Lehrer",
    "09.08.1987",
    "ledig",
    "Name,",
    "Vorname,",
    "ggf.",
    "Geburtsname",
    "Beruf,",
    "Erwerbst\u00e4tigkeit",
    "Geburtsdatum",
    "Familienstand",
    "55232",
    "Alzey,",
    "Bahnhofstra\u00dfe",
    "19",
    "16724891042",
    "Anschrift",
    "(Stra\u00dfe,",
    "Hausnummer,",
    "Postleitzahl,",
    "Wohnort)",
    "Tags\u00fcber",
    "tel.",
    "erreichbar",
   

###  Verify the Loaded Data

In [10]:
# Print the number of samples in the FUNSD dataset
print(f"FUNSD Train Samples: {len(funsd_train)}") # Display count of training samples
print(f"FUNSD Test Samples: {len(funsd_test)}") # Display count of testing samples

print("XFUND dataset:")
# Iterate over all languages in the XFUND dataset and print dataset sizes
for lang in xfund_data.keys():
    print(f"{lang.upper()} - Train: {len(xfund_data[lang]['train'])}, Test: {len(xfund_data[lang]['test'])}")
    # Convert language code to uppercase and display counts for training & testing samples

FUNSD Train Samples: 149
FUNSD Test Samples: 50
XFUND dataset:
JA - Train: 149, Test: 50
IT - Train: 149, Test: 50
PT - Train: 149, Test: 50
ZH - Train: 149, Test: 50
FR - Train: 149, Test: 50
ES - Train: 149, Test: 50
DE - Train: 149, Test: 50


In [11]:
def print_tree(directory, prefix=""):
    """
    Recursively prints a visual representation of a directory tree, excluding JSON files

    Args:
        directory (str): The path to the root directory to be printed
        prefix (str): The prefix string used to format the tree structure visually

    The function prints each folder and file (excluding JSON files) in a structured
    tree format. Subdirectories are processed recursively to print their contents
    """
    try:
        # Get a sorted list of all files and directories in the given path
        files_and_dirs = sorted(os.listdir(directory))
    except PermissionError:
        # Skip directories that cannot be accessed due to permission issues
        return
    except FileNotFoundError:
        # Handle the case when the specified directory is not found
        print(f"Error: Directory '{directory}' not found.")
        return

    # Exclude JSON files from the list of items
    filtered_items = [name for name in files_and_dirs if not name.endswith(".json")]

    # Iterate through each item in the filtered list
    for index, name in enumerate(filtered_items):
        # Construct the full path to the current item
        path = os.path.join(directory, name)

        # Check if the current item is the last one in the list to format the tree properly
        is_last = index == len(filtered_items) - 1

        # Print the current item with the appropriate tree structure prefix
        if not is_last:
            print(f"{prefix}├── {name}")
        else:
            print(f"{prefix}└── {name}")

        # If the item is a directory, recursively print its contents with an updated prefix
        if os.path.isdir(path):
            # Adjust the prefix for nested items
            new_prefix = prefix + ("│   " if not is_last else "    ")
            print_tree(path, new_prefix)

# Set the dataset directory paths (ensure these paths are accurate)
funsd_dir = "/content/drive/MyDrive/Final Project/Fine Tuning /FUND_LiLT_Format/dataset"
xfund_dir = "/content/drive/MyDrive/Final Project/Fine Tuning /XFUND_LiLT_Final_Format"

# Print the directory structure of the FUNSD dataset, excluding JSON files
print("\nFUNSD Dataset Structure (Excluding JSON Files):")
print_tree(funsd_dir)

# Print the directory structure of the XFUND dataset, excluding JSON files
print("\nXFUND Dataset Structure (Excluding JSON Files):")
print_tree(xfund_dir)


FUNSD Dataset Structure (Excluding JSON Files):
├── testing_data
└── training_data

XFUND Dataset Structure (Excluding JSON Files):
├── de
│   ├── testing_data
│   └── training_data
├── es
│   ├── testing_data
│   └── training_data
├── fr
│   ├── testing_data
│   └── training_data
├── it
│   ├── testing_data
│   └── training_data
├── ja
│   ├── testing_data
│   └── training_data
├── pt
│   ├── testing_data
│   └── training_data
└── zh
    ├── testing_data
    └── training_data


## Fine-Tuning


###  Dataset Path Configuration

In [12]:
# Base path for FUNSD and XFUND datasets
base_path = "/content/drive/MyDrive/Final Project/Fine Tuning "

# FUNSD dataset directories (training and testing data)
funsd_train_path = os.path.join(base_path, "FUND_LiLT_Format/dataset/training_data")
funsd_test_path  = os.path.join(base_path, "FUND_LiLT_Format/dataset/testing_data")

# List of supported languages in the multilingual XFUND dataset
xfund_languages = ["de", "es", "fr", "it", "ja", "pt", "zh"]

# Dictionary to store paths for each language's training and testing data
xfund_paths = {
    lang: {
        "train": os.path.join(base_path, "XFUNDFinalFormat", lang, "training_data"),
        "test":  os.path.join(base_path, "XFUNDFinalFormat", lang, "testing_data")
    } for lang in xfund_languages
}

# Print the FUNSD dataset paths for verification
print("FUNSD Train Path:", funsd_train_path)
print("FUNSD Test Path:", funsd_test_path)

# Print the XFUND dataset paths for each language
for lang, paths in xfund_paths.items():
    print(f"XFUND {lang.upper()} Train Path:", paths["train"])
    print(f"XFUND {lang.upper()} Test Path:", paths["test"])

FUNSD Train Path: /content/drive/MyDrive/Final Project/Fine Tuning /FUND_LiLT_Format/dataset/training_data
FUNSD Test Path: /content/drive/MyDrive/Final Project/Fine Tuning /FUND_LiLT_Format/dataset/testing_data
XFUND DE Train Path: /content/drive/MyDrive/Final Project/Fine Tuning /XFUNDFinalFormat/de/training_data
XFUND DE Test Path: /content/drive/MyDrive/Final Project/Fine Tuning /XFUNDFinalFormat/de/testing_data
XFUND ES Train Path: /content/drive/MyDrive/Final Project/Fine Tuning /XFUNDFinalFormat/es/training_data
XFUND ES Test Path: /content/drive/MyDrive/Final Project/Fine Tuning /XFUNDFinalFormat/es/testing_data
XFUND FR Train Path: /content/drive/MyDrive/Final Project/Fine Tuning /XFUNDFinalFormat/fr/training_data
XFUND FR Test Path: /content/drive/MyDrive/Final Project/Fine Tuning /XFUNDFinalFormat/fr/testing_data
XFUND IT Train Path: /content/drive/MyDrive/Final Project/Fine Tuning /XFUNDFinalFormat/it/training_data
XFUND IT Test Path: /content/drive/MyDrive/Final Project/Fi

### Fine-Tuning on the FUNSD Dataset

#### Data Loading

In [13]:
def load_json_files(directory):
    """
    Recursively load all JSON files (case-insensitive) from the specified directory and its subdirectories
    Each JSON file should contain the keys: "id", "words", "bboxes", and "ner_tags"

    Args:
        directory (str): Path to the directory to search for JSON files

    Returns:
        list: A list of dictionaries, each containing data from a JSON file
    """
    # Recursively find all files in the directory and its subdirectories
    all_files = glob.glob(os.path.join(directory, "**/*"), recursive=True)
    # Filter out only JSON files (case-insensitive)
    json_files = [f for f in all_files if re.search(r'\.json$', f, re.IGNORECASE)]

    # Initialize an empty list to store dataset samples
    dataset = []
    for file in json_files:
        # Open and load each JSON file
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)
            # Append the data to the dataset list with a structured dictionary
            dataset.append({
                "id": data.get("id", os.path.basename(file)), # Use file name if "id" is missing
                "words": data["words"], # Extract words list from JSON
                "bboxes": data["bboxes"], # Extract bounding boxes from JSON
                "ner_tags": data["ner_tags"] # Extract NER tags from JSON
            })
    return dataset

In [14]:
# Load FUNSD data (training and testing)
funsd_train_data = load_json_files(funsd_train_path) # Load training data
funsd_test_data  = load_json_files(funsd_test_path) # Load testing data

# Create Hugging Face datasets from the loaded data
funsd_train_dataset = Dataset.from_list(funsd_train_data) # Create dataset from training data
funsd_test_dataset  = Dataset.from_list(funsd_test_data) # Create dataset from testing data

# Print the total number of training and testing samples
print("Total Training Samples:", len(funsd_train_dataset))
print("Total Testing Samples:", len(funsd_test_dataset))

# Print an example sample from the training dataset to verify correctness
print("Example sample from training dataset:")
print(funsd_train_dataset[0])

Total Training Samples: 149
Total Testing Samples: 50
Example sample from training dataset:
{'id': '71206427', 'words': ['ITEM:', 'DATE:', 'BRAND:', 'SUMMARY', 'OF', 'PROJECT:', 'Attached', 'See', 'FUNDING:', 'SIGNATURES:', '4514cbta', '', 'PROJECT', 'BRIEF', 'June', '1,', '1990', 'General', 'Merchandising', 'Nonspecific', 'Package', 'Fixture', 'Circle-K', 'Convenient', 'Stores', '', '', 'SUPPLIERS', 'BEING', 'CONSIDERED:', 'Chicago', 'Show', 'Display', 'Equation', 'Chicago', 'Display', 'Robert', 'Nielson', '&', 'Associates', '1990', 'Customized', 'Merchandising', 'Services', '', 'ESPNS', 'REQUESTING', 'MANAGER', 'MERCHANDISING', 'MANAGER', 'GROUP', 'PRODUCT', 'DIRECTOR', 'PURCHASING', 'DEPARTMENT', 'RETURN', 'TO:', 'REQUESTING', 'MANAGER', '462231045', 'PRODUCED', 'FROM', 'B&W', 'WEB', 'SITE'], 'bboxes': [[394, 145, 433, 162], [109, 112, 151, 129], [110, 140, 159, 155], [109, 183, 168, 198], [175, 184, 193, 195], [201, 184, 267, 198], [317, 186, 385, 199], [282, 186, 310, 197], [103, 

#### Label Analysis and Counting

In [14]:
# Determine unique labels (NER classes) in the FUNSD training dataset

# Initialize an empty set to store unique NER labels
unique_labels = set()

# Iterate over each sample in the training dataset
for sample in funsd_train_dataset:
    # Update the set with NER tags from the current sample
    unique_labels.update(sample["ner_tags"]) # The 'update' method adds all elements from the list, keeping only unique ones

# Calculate the total number of unique labels
num_labels = len(unique_labels)

# Print the number of unique labels found
print("Number of labels:", num_labels)

Number of labels: 7


#### Sample Verification and Model Integrity Check

In [20]:
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
model = AutoModelForTokenClassification.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base", num_labels=num_labels)

print("Loaded tokenizer and model successfully!\n")

# Define a function to verify a single sample
def verify_sample(sample):
    try:
        # Check for required keys in the sample
        assert 'words' in sample, "Missing 'words' key!"
        assert 'bboxes' in sample, "Missing 'bboxes' key!"
        assert 'ner_tags' in sample, "Missing 'ner_tags' key!"

        # Ensure consistent lengths between words, bboxes, and ner_tags
        assert len(sample['words']) == len(sample['bboxes']) == len(sample['ner_tags']), (
            f"Inconsistent lengths: words={len(sample['words'])}, bboxes={len(sample['bboxes'])}, ner_tags={len(sample['ner_tags'])}"
        )
        print(f"Sample lengths are consistent (words, bboxes, ner_tags: {len(sample['words'])})")

        # Tokenize the input
        encoded = tokenizer(
            sample["words"],
            boxes=sample["bboxes"],
            truncation=True,
            padding="max_length",
            max_length=512,
            return_tensors="pt"
        )
        print(f"Tokenization successful for sample with ID: {sample.get('id', 'Unknown')}")
        print("Tokenized keys:", encoded.keys())

        # Verify tensor shapes
        assert encoded['input_ids'].shape == torch.Size([1, 512]), "Shape mismatch in input_ids!"
        assert encoded['attention_mask'].shape == torch.Size([1, 512]), "Shape mismatch in attention_mask!"
        assert encoded['bbox'].shape == torch.Size([1, 512, 4]), "Shape mismatch in bbox!"
        print("Tensor shapes are correct!")

        # Perform a forward pass and verify output shape
        outputs = model(**encoded)
        logits = outputs.logits
        assert logits.shape[-1] == num_labels, f"Logits shape mismatch: expected {num_labels}, got {logits.shape[-1]}"
        print(f"Forward pass successful, logits shape: {logits.shape}")

        print("Sample verification passed!\n")
    except AssertionError as e:
        print(f"Error in sample verification: {e}\n")

# Verify a few samples from the training dataset
print("Verifying samples from the training dataset:")
for i in range(5):
    print(f"Sample {i+1}:")
    verify_sample(funsd_train_dataset[i])

# Check the entire dataset for consistent formats
print("Verifying the entire dataset:")
for sample in funsd_train_dataset:
    try:
        # Check consistency of lengths in the training dataset
        assert len(sample['words']) == len(sample['bboxes']) == len(sample['ner_tags']), (
            f"Inconsistent lengths in training data: words={len(sample['words'])}, bboxes={len(sample['bboxes'])}, ner_tags={len(sample['ner_tags'])}"
        )
    except AssertionError as e:
        print(f"Error in training dataset: {e}")

for sample in funsd_test_dataset:
    try:
        # Check consistency of lengths in the testing dataset
        assert len(sample['words']) == len(sample['bboxes']) == len(sample['ner_tags']), (
            f"Inconsistent lengths in testing data: words={len(sample['words'])}, bboxes={len(sample['bboxes'])}, ner_tags={len(sample['ner_tags'])}"
        )
    except AssertionError as e:
        print(f"Error in testing dataset: {e}")

print("Dataset verification completed successfully!")

Some weights of LiltForTokenClassification were not initialized from the model checkpoint at SCUT-DLVCLab/lilt-roberta-en-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded tokenizer and model successfully!

Verifying samples from the training dataset:
Sample 1:
Sample lengths are consistent (words, bboxes, ner_tags: 65)
Tokenization successful for sample with ID: 71206427
Tokenized keys: dict_keys(['input_ids', 'attention_mask', 'bbox'])
Tensor shapes are correct!
Forward pass successful, logits shape: torch.Size([1, 512, 7])
Sample verification passed!

Sample 2:
Sample lengths are consistent (words, bboxes, ner_tags: 189)
Tokenization successful for sample with ID: 0001123541
Tokenized keys: dict_keys(['input_ids', 'attention_mask', 'bbox'])
Tensor shapes are correct!
Forward pass successful, logits shape: torch.Size([1, 512, 7])
Sample verification passed!

Sample 3:
Sample lengths are consistent (words, bboxes, ner_tags: 77)
Tokenization successful for sample with ID: 92039708_9710
Tokenized keys: dict_keys(['input_ids', 'attention_mask', 'bbox'])
Tensor shapes are correct!
Forward pass successful, logits shape: torch.Size([1, 512, 7])
Sample 

#### LiLT Fine-Tuning Pipeline for NER and Document Understanding

##### Load Pre-trained Model and Tokenizer

In [21]:
# The number of NER labels
num_labels = num_labels

# Load the LiLT tokenizer and model
tokenizer = tokenizer
model = model

print("Tokenizer and model loaded successfully!\n")

Tokenizer and model loaded successfully!



##### Define the Preprocessing Function

In [22]:
def preprocess_data(data):
    """
    Preprocess a single sample for the LiLT model

      - example["words"] is a list of strings
      - example["bboxes"] is a list of [x1, y1, x2, y2] coordinates for each word
      - example["ner_tags"] is a list of integer labels corresponding to each word
    """
    # Extract words, bounding boxes, and NER labels from the input example
    words = data["words"]
    boxes = data["bboxes"]
    labels = data["ner_tags"]

    # Ensure that each word has a corresponding bounding box
    if len(words) != len(boxes):
        raise ValueError(f"Mismatch between words and bounding boxes: {len(words)} words, {len(boxes)} boxes")

    # Tokenize the words while aligning them with the provided bounding boxes and labels
    # From documentation, this also applies truncation and padding to a maximum sequence length
    encoding = tokenizer(
        words, # A list of words that will be tokenized
        boxes=boxes, # A list of bounding boxes corresponding to the words
        truncation=True, # If the tokenized output exceeds max_length (512 tokens), it will be truncated
        padding="max_length", # Pads the sequence with special tokens (e.g., [PAD]) to ensure it reaches the max_length
        max_length=512, # The maximum length for the tokenized sequence. Any sequence longer than this is truncated
        return_tensors="pt" # Returns the output as PyTorch tensors (instead of lists or NumPy arrays)
    )

    # Initialize lists to hold token-level bounding boxes and NER labels
    token_boxes = []
    new_labels = []

    # Iterate over each word, its bounding box, and label
    for word, bbox, label in zip(words, boxes, labels):
        # Tokenize the word into subword tokens
        word_tokens = tokenizer.tokenize(word)
        # For each subword token, use the same bounding box
        token_boxes.extend([bbox] * len(word_tokens))
        # For token classification, assign the original label to the first token
        # Subsequent subword tokens get a special label (-100) indicating they should be ignored
        new_labels.append(label)
        new_labels.extend([-100] * (len(word_tokens) - 1))

    # Handle special tokens (e.g., [CLS] and [SEP]) added by the tokenizer
    # Start by adding a dummy label and bounding box for the starting special token
    final_labels = [-100]
    final_boxes = [[0, 0, 0, 0]]

    # Append the labels and bounding boxes from the tokenization of the actual words
    final_labels.extend(new_labels)
    final_boxes.extend(token_boxes)

    # Append a dummy label and bounding box for the ending special token
    final_labels.append(-100)
    final_boxes.append([0, 0, 0, 0])

    # Ensure that the final labels and bounding boxes match the sequence length produced by the tokenizer
    seq_len = encoding['input_ids'].size(1)
    if len(final_labels) > seq_len:
        # If the lists are too long, truncate them
        final_labels = final_labels[:seq_len]
        final_boxes = final_boxes[:seq_len]
    else:
        # If the lists are too short, pad them with dummy values
        pad_length = seq_len - len(final_labels)
        final_labels.extend([-100] * pad_length)
        final_boxes.extend([[0, 0, 0, 0]] * pad_length)

    # Convert the lists to PyTorch tensors
    label_tensor = torch.tensor(final_labels, dtype=torch.long)
    bbox_tensor = torch.tensor(final_boxes, dtype=torch.long)

    # Insert the new labels and bounding boxes into the encoding dictionary
    encoding["labels"] = label_tensor
    encoding["bbox"] = bbox_tensor

    # Remove the extra batch dimension (squeeze the tensor) that was added by return_tensors="pt"
    encoding = {key: val.squeeze(0) for key, val in encoding.items()}

    # Return the final preprocessed encoding dictionary
    return encoding

##### Troubleshooting: Tokenization & Data Alignment Issues

In [18]:
# This code is mainly designed for troubleshooting some problems was in my code in the begining

# Define a dummy tokenizer that simulates basic tokenization and encoding
class DummyTokenizer:
    def __call__(self, words, boxes, truncation, padding, max_length, return_tensors):
        # Simulate tokenization by adding special tokens "[CLS]" at start and "[SEP]" at end
        tokens = ["[CLS]"]
        for word in words:
            tokens.extend(self.tokenize(word))
        tokens.append("[SEP]")
        # Create dummy input_ids: just use a range for simplicity
        token_ids = list(range(len(tokens)))
        # Pad token_ids to max_length
        token_ids = token_ids + [0] * (max_length - len(token_ids))
        # Return a dictionary with a dummy tensor
        return {"input_ids": torch.tensor([token_ids])}

    def tokenize(self, word):
        # A simple tokenizer: if a word ends with "!" split it into word and "!"
        if word.endswith("!") and len(word) > 1:
            return [word[:-1], "!"]
        return [word]

# Set the dummy tokenizer as global tokenizer
tokenizer = DummyTokenizer()

def preprocess_data_verbose(example):
    """
    Preprocess a single sample for the LiLT model while printing intermediate outputs

      - example["words"] is a list of strings
      - example["bboxes"] is a list of [x1, y1, x2, y2] coordinates for each word
      - example["ner_tags"] is a list of integer labels corresponding to each word
    """
    # ----- Step 1: Extract data from the input example -----
    print("=== Step 1: Extract Input Data ===")
    words = example["words"]
    boxes = example["bboxes"]
    labels = example["ner_tags"]
    print("Words:", words)
    print("Bounding Boxes:", boxes)
    print("NER Tags:", labels)

    # Ensure consistency between words and bounding boxes
    if len(words) != len(boxes):
        raise ValueError(f"Mismatch between words and bounding boxes: {len(words)} words, {len(boxes)} boxes")

    # ----- Step 2: Tokenization -----
    print("\n=== Step 2: Tokenization with Dummy Tokenizer ===")
    # Use a smaller max_length for demonstration purposes
    encoding = tokenizer(
        words,
        boxes=boxes,
        truncation=True,
        padding="max_length",
        max_length=20,
        return_tensors="pt"
    )
    print("Tokenization output (input_ids tensor):")
    print(encoding["input_ids"])

    # ----- Step 3: Create token-level bounding boxes and labels -----
    print("\n=== Step 3: Generate Token-Level Boxes and Labels ===")
    token_boxes = []
    new_labels = []
    # Process each word along with its bounding box and NER tag
    for word, bbox, label in zip(words, boxes, labels):
        word_tokens = tokenizer.tokenize(word)
        print(f"Word: {word}")
        print("  Tokens:", word_tokens)
        print("  Bounding Box:", bbox, "Label:", label)
        # Replicate bbox for each subword token
        token_boxes.extend([bbox] * len(word_tokens))
        # For the first token assign the real label, for the rest assign -100
        new_labels.append(label)
        new_labels.extend([-100] * (len(word_tokens) - 1))
    print("Token-level bounding boxes:", token_boxes)
    print("Token-level labels:", new_labels)

    # ----- Step 4: Add special tokens -----
    print("\n=== Step 4: Adding Special Tokens ===")
    # Add a dummy entry for the starting special token ([CLS])
    final_labels = [-100]
    final_boxes = [[0, 0, 0, 0]]
    # Append the token-level labels and bounding boxes
    final_labels.extend(new_labels)
    final_boxes.extend(token_boxes)
    # Append a dummy entry for the ending special token ([SEP])
    final_labels.append(-100)
    final_boxes.append([0, 0, 0, 0])
    print("After adding special tokens:")
    print("Final Labels:", final_labels)
    print("Final Boxes:", final_boxes)

    # ----- Step 5: Adjust to match the sequence length -----
    print("\n=== Step 5: Adjusting to Sequence Length ===")
    seq_len = encoding['input_ids'].size(1)
    print("Target sequence length:", seq_len)
    if len(final_labels) > seq_len:
        final_labels = final_labels[:seq_len]
        final_boxes = final_boxes[:seq_len]
        print("Truncated final labels and boxes to match sequence length.")
    else:
        pad_length = seq_len - len(final_labels)
        final_labels.extend([-100] * pad_length)
        final_boxes.extend([[0, 0, 0, 0]] * pad_length)
        print("Padded final labels and boxes to match sequence length.")
    print("Adjusted Final Labels:", final_labels)
    print("Adjusted Final Boxes:", final_boxes)

    # ----- Step 6: Convert lists to tensors and finalize the encoding -----
    print("\n=== Step 6: Convert to Tensors and Finalize ===")
    label_tensor = torch.tensor(final_labels, dtype=torch.long)
    bbox_tensor = torch.tensor(final_boxes, dtype=torch.long)
    # Add the labels and bounding boxes to the encoding
    encoding["labels"] = label_tensor
    encoding["bbox"] = bbox_tensor
    # Remove the extra batch dimension
    encoding = {key: val.squeeze(0) for key, val in encoding.items()}
    print("Final encoding dictionary:")
    for key, value in encoding.items():
        print(f"{key}:")
        print(value)

    return encoding

# ----- Define a small example -----
example = {
    "words": ["Hello", "world!"],
    "bboxes": [[0, 0, 50, 50], [60, 60, 100, 100]],
    "ner_tags": [1, 2]
}

# ----- Run the verbose preprocessing function on the example -----
print("\n\n=== Running Preprocessing on Example ===")
processed_encoding = preprocess_data_verbose(example)



=== Running Preprocessing on Example ===
=== Step 1: Extract Input Data ===
Words: ['Hello', 'world!']
Bounding Boxes: [[0, 0, 50, 50], [60, 60, 100, 100]]
NER Tags: [1, 2]

=== Step 2: Tokenization with Dummy Tokenizer ===
Tokenization output (input_ids tensor):
tensor([[0, 1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

=== Step 3: Generate Token-Level Boxes and Labels ===
Word: Hello
  Tokens: ['Hello']
  Bounding Box: [0, 0, 50, 50] Label: 1
Word: world!
  Tokens: ['world', '!']
  Bounding Box: [60, 60, 100, 100] Label: 2
Token-level bounding boxes: [[0, 0, 50, 50], [60, 60, 100, 100], [60, 60, 100, 100]]
Token-level labels: [1, 2, -100]

=== Step 4: Adding Special Tokens ===
After adding special tokens:
Final Labels: [-100, 1, 2, -100, -100]
Final Boxes: [[0, 0, 0, 0], [0, 0, 50, 50], [60, 60, 100, 100], [60, 60, 100, 100], [0, 0, 0, 0]]

=== Step 5: Adjusting to Sequence Length ===
Target sequence length: 20
Padded final labels and boxes to match sequence length.
Ad

##### Preprocess the Dataset

In [23]:
# Apply the preprocess_data function to each sample in the training and test datasets
train_dataset = funsd_train_dataset.map(preprocess_data, batched=False)
test_dataset = funsd_test_dataset.map(preprocess_data, batched=False)

# Convert selected fields to PyTorch tensors for model compatibility
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'bbox', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'bbox', 'labels'])
# Although preprocess_data returns some tensors, set_format ensures all specified fields are PyTorch tensors

print("Data preprocessing complete!\n")

Map:   0%|          | 0/149 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Data preprocessing complete!



##### Metrics Computation Functions

In [21]:
def compute_metrics(eval_predictions):
    """
    Compute evaluation metrics for NER token classification tasks

    Args:
        eval_predictions (tuple): A tuple containing:
            - predictions (np.ndarray): The model's raw output logits of shape (batch_size, seq_length, num_classes)
            - labels (np.ndarray): The ground truth labels of shape (batch_size, seq_length)

    Returns:
        dict: A dictionary containing the computed metrics:
            - "precision": The precision score
            - "recall": The recall score
            - "f1": The F1 score
    """
    # Unpack predictions and labels from the evaluation tuple
    logits, ground_truth_labels = eval_predictions

    # Convert logits to predicted class labels by taking the argmax along the last dimension
    predicted_labels = np.argmax(logits, axis=2)

    # Initialize empty lists to collect valid predictions and true labels
    valid_predicted_labels = []
    valid_ground_truth_labels = []

    # Iterate over each pair of predicted and ground truth label sequences
    for predicted_sequence, ground_truth_sequence in zip(predicted_labels, ground_truth_labels):
        # Iterate over each pair of predicted and true label in the current sequence
        for predicted_label, ground_truth_label in zip(predicted_sequence, ground_truth_sequence):
            # Exclude special tokens and ignored labels (marked as -100)
            if ground_truth_label != -100:
                valid_predicted_labels.append(predicted_label)  # Append the valid prediction
                valid_ground_truth_labels.append(ground_truth_label)  # Append the valid true label

    # Compute precision, recall, and F1 score using micro averaging (considering all instances equally)
    precision, recall, f1, _ = precision_recall_fscore_support(
        valid_ground_truth_labels, valid_predicted_labels, average="micro"
    )

    # Print a detailed classification report for each label
    print(classification_report(valid_ground_truth_labels, valid_predicted_labels))

    # Return the calculated metrics as a dictionary
    return {"precision": precision, "recall": recall, "f1": f1}

##### Training Setup and Trainer Configuration

In [25]:
# Define training arguments for the fine-tuning process
training_args = TrainingArguments(
    output_dir="./lilt-finetuned", # Directory where the fine-tuned model and checkpoints will be saved
    evaluation_strategy="epoch", # Evaluate the model at the end of each epoch
    save_strategy="epoch", # Save a checkpoint after each epoch
    load_best_model_at_end=True, # Automatically load the best model (as per the specified metric) at the end of training
    metric_for_best_model="f1", # Use F1 score as the metric to determine the best model checkpoint
    learning_rate=2e-5, # Learning rate for the optimizer
    per_device_train_batch_size=4, # Training batch size per device (e.g., per GPU or CPU)
    per_device_eval_batch_size=4, # Evaluation batch size per device
    num_train_epochs=7, # Total number of training epochs
    weight_decay=0.01, # Weight decay (L2 regularization) to apply to the optimizer
    logging_dir="./logs", # Directory to store training logs
    logging_steps=50, # Log training details every 50 steps
    save_total_limit=1, # Limit the total number of saved checkpoints (older checkpoints will be deleted)
    report_to="all", # Report training metrics to all supported logging platforms (e.g., TensorBoard, WandB)
    logging_strategy="epoch", # Logging frequency is based on the epochs
    disable_tqdm=False, # Enable tqdm progress bars during training
    dataloader_num_workers=2, # Number of subprocesses to use for data loading
)

# Initialize the Trainer with the model, training arguments, datasets, and metric computation function.
trainer = Trainer(
    model=model, # The pre-trained model that will be fine-tuned which is LiLT model
    args=training_args, # The training configuration defined above
    train_dataset=train_dataset, # The training dataset
    eval_dataset=test_dataset, # The evaluation dataset
    compute_metrics=compute_metrics, # Function to compute evaluation metrics (e.g., precision, recall, F1 score) during evaluation
)



##### Fine-Tuning the Model

In [26]:
print("Starting training...")
trainer.train() # Start the training process

print("Evaluating the model...")
trainer.evaluate() # Evaluate the model on the test dataset



Starting training...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33myahia-n-ahmed[0m ([33myahia-n-ahmed-york-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.9286,1.018199,0.72,0.72,0.72
2,0.9238,0.731034,0.72,0.72,0.72
3,0.6515,0.715934,0.76,0.76,0.76
4,0.4301,0.576721,0.82,0.82,0.82
5,0.3345,0.483927,0.84,0.84,0.84
6,0.2442,0.643221,0.84,0.84,0.84
7,0.1885,0.602488,0.84,0.84,0.84


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.72      1.00      0.84        36
           3       0.00      0.00      0.00         3
           5       0.00      0.00      0.00         2

    accuracy                           0.72        50
   macro avg       0.18      0.25      0.21        50
weighted avg       0.52      0.72      0.60        50



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.72      1.00      0.84        36
           3       0.00      0.00      0.00         3
           5       0.00      0.00      0.00         2

    accuracy                           0.72        50
   macro avg       0.18      0.25      0.21        50
weighted avg       0.52      0.72      0.60        50



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       1.00      0.11      0.20         9
           1       0.80      1.00      0.89        36
           3       0.25      0.33      0.29         3
           5       0.00      0.00      0.00         2

    accuracy                           0.76        50
   macro avg       0.51      0.36      0.34        50
weighted avg       0.77      0.76      0.69        50



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.67      0.44      0.53         9
           1       0.88      0.97      0.92        36
           3       0.50      0.67      0.57         3
           5       0.00      0.00      0.00         2

    accuracy                           0.82        50
   macro avg       0.51      0.52      0.51        50
weighted avg       0.78      0.82      0.79        50



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.71      0.56      0.62         9
           1       0.90      0.97      0.93        36
           3       0.50      0.67      0.57         3
           5       0.00      0.00      0.00         2

    accuracy                           0.84        50
   macro avg       0.53      0.55      0.53        50
weighted avg       0.80      0.84      0.82        50



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.71      0.56      0.62         9
           1       0.90      0.97      0.93        36
           3       0.67      0.67      0.67         3
           5       0.00      0.00      0.00         2

    accuracy                           0.84        50
   macro avg       0.57      0.55      0.56        50
weighted avg       0.81      0.84      0.82        50

              precision    recall  f1-score   support

           0       0.71      0.56      0.62         9
           1       0.90      0.97      0.93        36
           3       0.67      0.67      0.67         3
           5       0.00      0.00      0.00         2

    accuracy                           0.84        50
   macro avg       0.57      0.55      0.56        50
weighted avg       0.81      0.84      0.82        50

Evaluating the model...


              precision    recall  f1-score   support

           0       0.71      0.56      0.62         9
           1       0.90      0.97      0.93        36
           3       0.50      0.67      0.57         3
           5       0.00      0.00      0.00         2

    accuracy                           0.84        50
   macro avg       0.53      0.55      0.53        50
weighted avg       0.80      0.84      0.82        50



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.4839273691177368,
 'eval_precision': 0.84,
 'eval_recall': 0.84,
 'eval_f1': 0.84,
 'eval_runtime': 1.5283,
 'eval_samples_per_second': 32.717,
 'eval_steps_per_second': 8.506,
 'epoch': 7.0}

##### Save the Fine-Tuned Model and Tokenizer

In [27]:
model.save_pretrained("./lilt-finetuned")
tokenizer.save_pretrained("./lilt-finetuned")
print("Fine-tuned model and tokenizer saved!")

Fine-tuned model and tokenizer saved!


### Fine-Tuning on the XFUND Datasets

##### Data Loading

In [15]:
def load_language_data(lang_dir):
    """
    Load training and testing data from the specified language data directory

    Args:
        lang_dir (str): The base directory containing the subdirectories "training_data" and "testing_data"

    Returns:
        tuple: A tuple of two lists (training_data, testing_data), each containing structured data loaded from JSON files
    """
    # Construct paths for training and testing data subdirectories
    train_dir = os.path.join(lang_dir, "training_data")
    test_dir = os.path.join(lang_dir, "testing_data")

    # Load all JSON files from the training and testing directories using load_json_files function from earlier
    train_data = load_json_files(train_dir)
    test_data = load_json_files(test_dir)

    # Return the loaded training and testing data as a tuple, the tuple contains two lists each for training and testing data, and the lists contain dictionaries for each document
    return train_data, test_data

def load_multilingual_xfund(xfund_dir):
    """
    Load multilingual training and testing data from the specified base directory

    Args:
        xfund_dir (str): The base directory containing subdirectories for each language

    Returns:
        dict: A dictionary where each key is a language name (derived from the subdirectory name),
              and the value is another dictionary containing:
              - "train": Training data loaded from JSON files
              - "test": Testing data loaded from JSON files
    """
    # Initialize an empty dictionary to store data for each language
    data = {}

    # Iterate through each subdirectory (representing a language) within the base directory
    for lang in os.listdir(xfund_dir):
        lang_path = os.path.join(xfund_dir, lang)

        # Check if the current item is a directory (skip files or other items)
        if os.path.isdir(lang_path):
            # Load training and testing data for the current language
            train, test = load_language_data(lang_path)

            # Only add data to the dictionary if both training and testing sets are non-empty
            if train and test:
                data[lang] = {"train": train, "test": test}

    # Return the complete multilingual data dictionary
    return data

In [16]:
# Load the dataset directory
xfund_dir = "/content/drive/MyDrive/Final Project/Fine Tuning /XFUND_LiLT_Final_Format"

# Load multilingual data from the specified directory
multilingual_data = load_multilingual_xfund(xfund_dir)

if not multilingual_data:
    print("No data found in the provided dataset directory.")

##### Label Mapping Functions

In [17]:
def get_unique_labels(examples):
    """
    Extract unique NER (Named Entity Recognition) labels from a list of examples

    Args:
        examples (list): A list of dictionaries, where each dictionary contains an "ner_tags" key
                         with a list of integer labels (including -100 for ignored tags)

    Returns:
        list: A sorted list of unique NER labels, excluding the ignored label (-100).
    """
    # Initialize an empty set to collect unique labels
    unique_labels = set()

    # Iterate through each example and update the set with unique labels, ignoring -100
    for sample in examples:
        unique_labels.update(label for label in sample["ner_tags"] if label != -100)

    # Return the unique labels as a sorted list
    return sorted(list(unique_labels))

##### Dataset Validation Function

In [19]:
def validate_labels(dataset, num_labels):
    """
    Validate the labels in a given dataset to ensure they are within the expected range

    This function is primarily used for debugging purposes to identify any issues
    related to label values, such as out-of-bound labels or incorrect label formatting

    Args:
        dataset (list): A list of examples, where each example is a dictionary containing:
            - "labels" (list or tensor): A list or tensor of integer labels for each token
        num_labels (int): The number of valid label classes (expected labels are in the range [0, num_labels-1])

    Raises:
        ValueError: If any label is out of the valid range or if there is a data formatting issue
    """
    # Iterate through each example in the dataset
    for i in range(len(dataset)):
        labels = dataset[i]["labels"]

        # Convert tensor labels to a list if necessary
        if hasattr(labels, "tolist"):
            labels = labels.tolist()

        # Check each label in the current example
        for label in labels:
            # Ignore special ignored tokens (-100) but check if valid labels are within range
            if label != -100 and (label < 0 or label >= num_labels): # Check if the label is out of range [0, num_labels-1]
                raise ValueError(f"Example {i} has out-of-bound label {label} "
                                 f"(expected 0 <= label < {num_labels}).")

##### Training and Evaluation for XFUND Multilingual Dataset

###### Multi-task Fine-tuning

In [None]:
# Training and Evaluation for XFUND Multilingual Dataset


# Load the dataset directory
xfund_dir = "/content/drive/MyDrive/Final Project/Fine Tuning /XFUND_LiLT_Final_Format"

# Load multilingual data from the specified directory
multilingual_data = load_multilingual_xfund(xfund_dir)

if not multilingual_data: # Check if any data was loaded
    print("No data found in the provided dataset directory.")
    exit()


# Combine training examples from all languages and collect test examples per language

global_train_examples = [] # List to hold training examples from all languages
test_examples_by_lang = {} # Dictionary to hold test examples for each language
global_unique_labels = set() # Set to hold unique labels across all languages

# Iterate over each language and its corresponding data splits (train and test)
for lang, splits in multilingual_data.items():
    train_examples = splits["train"]
    test_examples = splits["test"]

    # Append training examples for global training
    global_train_examples.extend(train_examples)
    # Store test examples for later per-language evaluation
    test_examples_by_lang[lang] = test_examples

    # Update the set of unique labels (from both train and test)
    global_unique_labels.update(get_unique_labels(train_examples))
    global_unique_labels.update(get_unique_labels(test_examples))

# Compute the global label mapping
all_unique = sorted(list(global_unique_labels))
print(f"Global unique labels: {all_unique}")


# Label Mapping

# Create a mapping from original labels to new indices (0, 1, 2, ...), as the model expects labels to be in a certain range
label2id = {orig: new for new, orig in enumerate(all_unique)}
# Create a reverse mapping from indices to original labels (id2label)
id2label = {v: k for k, v in label2id.items()}
# Calculate the number of unique labels
num_labels = len(label2id)

print(f"Original labels: {all_unique}")
print(f"Remapped to indices: {list(range(num_labels))}")


# Dataset Conversion to Hugging Face Format

# Convert the combined training examples to a Hugging Face Dataset
train_dataset = Dataset.from_list(global_train_examples)


# Load Pre-trained Model and Tokenizer

# Define the model name to be loaded
model_name = "SCUT-DLVCLab/lilt-infoxlm-base"

# Load model configuration with customized label mappings
config = AutoConfig.from_pretrained(
    model_name,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label
)
# Load the pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# Load the pre-trained model with the configured settings
model = AutoModelForTokenClassification.from_pretrained(model_name, config=config)


# Preprocess the training dataset

# Define a preprocessing function to map examples with the tokenizer
preprocess_fn = lambda data: preprocess_data(data)

# Apply the preprocessing function to the training dataset
train_dataset = train_dataset.map(preprocess_fn, batched=False)
# Format the training dataset to be compatible with PyTorch tensors
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "bbox", "labels"])


# Label Validation (Debugging Step)

# Validate the labels in both training dataset to ensure it is within the valid range
validate_labels(train_dataset, num_labels)


# Data Collator for Efficient Batching

# Define a data collator to dynamically pad input batches and return tensors
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer, # The tokenizer used for encoding the input data
    padding=True, # Enable padding to the maximum length of the batch
    return_tensors="pt" # Return the output as PyTorch tensors
)

# Configure training arguments and create the Trainer (using a global output directory)
training_args = TrainingArguments(
    output_dir="./lilt-finetuned-all", # Directory to save the fine-tuned model and checkpoints
    eval_strategy="no",  # Disable in-training evaluation, since evaluate separately later
    save_strategy="epoch", # Save model checkpoints at the end of each epoch
    save_total_limit=1,
    learning_rate=2e-5, # Learning rate for the optimizer
    per_device_train_batch_size=4, # Training batch size per device (e.g., per GPU)
    per_device_eval_batch_size=4, # Evaluation batch size per device
    num_train_epochs=7, # Total number of training epochs
    weight_decay=0.01, # Weight decay for L2 regularization
    logging_steps=50, # Log training metrics every 50 steps
    load_best_model_at_end=False, # Disable best model loading (requires eval_dataset)
    metric_for_best_model="f1" # Use F1 score as the metric to select the best model
)

# Initialize the Trainer


# Set up the Trainer with the model, training arguments, datasets, and data collator
trainer = Trainer(
    model=model, # The pre-trained model to be fine-tuned
    args=training_args, # Configuration for training and evaluation
    train_dataset=train_dataset, # Training data
    data_collator=data_collator, # Data collator for efficient batching
    compute_metrics=compute_metrics # Function to compute evaluation metrics
)


# Training the Model

# Start model training and display progress
print(f"Training on {len(train_dataset)} examples from all languages...")
trainer.train()


# Evaluating the Model

# Evaluate the global model on each language's test dataset separately
for lang, test_examples in test_examples_by_lang.items():
    print(f"\n=== Evaluating language: {lang} with {len(test_examples)} examples ===")
    test_dataset = Dataset.from_list(test_examples)
    test_dataset = test_dataset.map(preprocess_fn, batched=False)
    test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "bbox", "labels"])

    # Validate labels in the test dataset
    validate_labels(test_dataset, num_labels)

    results = trainer.evaluate(test_dataset)
    print(f"Results for language {lang}: {results}")

# Save the fine-tuned model and tokenizer for all languages
trainer.save_model(f"./lilt-finetuned-all")
tokenizer.save_pretrained(f"./lilt-finetuned-all")
print("Saved fine-tuned model for all languages.")

Global unique labels: [0, 1, 2, 3, 4, 5, 6]
Original labels: [0, 1, 2, 3, 4, 5, 6]
Remapped to indices: [0, 1, 2, 3, 4, 5, 6]


Some weights of LiltForTokenClassification were not initialized from the model checkpoint at SCUT-DLVCLab/lilt-infoxlm-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1043 [00:00<?, ? examples/s]

Training on 1043 examples from all languages...


Step,Training Loss
50,1.2858
100,1.0084
150,1.0507
200,0.94
250,0.89
300,0.894
350,0.7999
400,0.7894
450,0.7778
500,0.7906



=== Evaluating language: ja with 50 examples ===


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

              precision    recall  f1-score   support

           0       0.71      0.92      0.80        24
           1       0.00      0.00      0.00         2
           5       0.89      0.71      0.79        24

    accuracy                           0.78        50
   macro avg       0.53      0.54      0.53        50
weighted avg       0.77      0.78      0.76        50

Results for language ja: {'eval_loss': 1.0172622203826904, 'eval_precision': 0.78, 'eval_recall': 0.78, 'eval_f1': 0.78, 'eval_runtime': 3.2203, 'eval_samples_per_second': 15.527, 'eval_steps_per_second': 4.037, 'epoch': 7.0}

=== Evaluating language: it with 50 examples ===


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

              precision    recall  f1-score   support

           0       0.83      0.97      0.90        36
           1       0.00      0.00      0.00         1
           5       0.88      0.54      0.67        13

    accuracy                           0.84        50
   macro avg       0.57      0.50      0.52        50
weighted avg       0.83      0.84      0.82        50

Results for language it: {'eval_loss': 0.9083238840103149, 'eval_precision': 0.84, 'eval_recall': 0.84, 'eval_f1': 0.84, 'eval_runtime': 2.6316, 'eval_samples_per_second': 19.0, 'eval_steps_per_second': 4.94, 'epoch': 7.0}

=== Evaluating language: pt with 50 examples ===


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

              precision    recall  f1-score   support

           0       0.83      0.90      0.86        39
           1       0.00      0.00      0.00         1
           5       0.57      0.40      0.47        10

    accuracy                           0.78        50
   macro avg       0.47      0.43      0.44        50
weighted avg       0.76      0.78      0.77        50

Results for language pt: {'eval_loss': 1.111588478088379, 'eval_precision': 0.78, 'eval_recall': 0.78, 'eval_f1': 0.78, 'eval_runtime': 2.6515, 'eval_samples_per_second': 18.857, 'eval_steps_per_second': 4.903, 'epoch': 7.0}

=== Evaluating language: zh with 50 examples ===


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

              precision    recall  f1-score   support

           0       0.70      0.93      0.80        15
           1       0.67      0.40      0.50         5
           3       0.00      0.00      0.00         3
           5       0.93      0.93      0.93        27

    accuracy                           0.82        50
   macro avg       0.57      0.56      0.56        50
weighted avg       0.78      0.82      0.79        50

Results for language zh: {'eval_loss': 0.7660517692565918, 'eval_precision': 0.82, 'eval_recall': 0.82, 'eval_f1': 0.82, 'eval_runtime': 2.6466, 'eval_samples_per_second': 18.892, 'eval_steps_per_second': 4.912, 'epoch': 7.0}

=== Evaluating language: fr with 50 examples ===


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

              precision    recall  f1-score   support

           0       0.97      0.90      0.93        39
           1       0.50      1.00      0.67         1
           5       0.75      0.90      0.82        10

    accuracy                           0.90        50
   macro avg       0.74      0.93      0.81        50
weighted avg       0.92      0.90      0.90        50

Results for language fr: {'eval_loss': 0.5443682670593262, 'eval_precision': 0.9, 'eval_recall': 0.9, 'eval_f1': 0.9, 'eval_runtime': 2.6331, 'eval_samples_per_second': 18.989, 'eval_steps_per_second': 4.937, 'epoch': 7.0}

=== Evaluating language: es with 50 examples ===


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

              precision    recall  f1-score   support

           0       0.76      1.00      0.87        29
           1       0.00      0.00      0.00         5
           5       1.00      0.75      0.86        16

    accuracy                           0.82        50
   macro avg       0.59      0.58      0.57        50
weighted avg       0.76      0.82      0.78        50

Results for language es: {'eval_loss': 0.9392833113670349, 'eval_precision': 0.82, 'eval_recall': 0.82, 'eval_f1': 0.82, 'eval_runtime': 2.6646, 'eval_samples_per_second': 18.765, 'eval_steps_per_second': 4.879, 'epoch': 7.0}

=== Evaluating language: de with 50 examples ===


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

              precision    recall  f1-score   support

           0       0.79      0.88      0.84        26
           1       0.40      0.25      0.31         8
           5       0.62      0.62      0.62        16

    accuracy                           0.70        50
   macro avg       0.61      0.59      0.59        50
weighted avg       0.68      0.70      0.68        50

Results for language de: {'eval_loss': 1.3979443311691284, 'eval_precision': 0.7, 'eval_recall': 0.7, 'eval_f1': 0.7, 'eval_runtime': 2.6541, 'eval_samples_per_second': 18.839, 'eval_steps_per_second': 4.898, 'epoch': 7.0}
Saved fine-tuned model for all languages.
