# Fine-Tuning

## Load Dataset into Google Colab

### Import Necessary Libraries

In [1]:
# Import necessary libraries
from google.colab import drive # Library to mount Google Drive in Colab
import os # Provides functions to interact with the operating system (file paths)
import json # Library to handle JSON data

### Mount Google Drive and Dataset Accessibility

In [2]:
# Mount Google Drive in Google Colab
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Check if the dataset is accessible in Colab
dataset_path = "/content/drive/MyDrive/Final Project/Fine Tuning "

# List files to verify dataset accessibility
print(os.listdir(dataset_path))

['FUND_LiLT_Format', 'XFUND_LiLT_Format']


### Define Dataset Paths

In [4]:
# Base path to the project in Google Drive
base_path = "/content/drive/MyDrive/Final Project/Fine Tuning "

# FUNSD dataset (English)
funsd_path = os.path.join(base_path, "FUND_LiLT_Format/dataset")
funsd_train_path = os.path.join(funsd_path, "training_data")
funsd_test_path = os.path.join(funsd_path, "testing_data")

# XFUND dataset (Multilingual)
xfund_path = os.path.join(base_path, "XFUND_LiLT_Format")

# Print directories to verify
print("FUNSD Train Path:", funsd_train_path)
print("FUNSD Test Path:", funsd_test_path)
print("XFUND Path:", xfund_path)

FUNSD Train Path: /content/drive/MyDrive/Final Project/Fine Tuning /FUND_LiLT_Format/dataset/training_data
FUNSD Test Path: /content/drive/MyDrive/Final Project/Fine Tuning /FUND_LiLT_Format/dataset/testing_data
XFUND Path: /content/drive/MyDrive/Final Project/Fine Tuning /XFUND_LiLT_Format


###  Load FUNSD (English) Dataset

In [5]:
def load_funsd_dataset(data_path):
    """
    Loads the FUNSD dataset from a directory path

    Args:
        data_path (str): Path to the directory containing FUNSD JSON files

    Returns:
        list: A list of dictionaries where each dictionary represents a document
              in the FUNSD dataset format
    """
    dataset = [] # Initialize an empty list to store dataset samples

    # Iterate through all files in the specified directory in a sorted order
    for file_name in sorted(os.listdir(data_path)): # Sorting ensures consistent order
        file_path = os.path.join(data_path, file_name) # Construct the full file path

        # Open and read the JSON file
        with open(file_path, "r") as f:
            data = json.load(f) # Load JSON data into a Python dictionary
            dataset.append(data) # Append the loaded data to the dataset list

    return dataset # Return the complete dataset as a list of dictionaries

# Load FUNSD training and testing datasets
funsd_train = load_funsd_dataset(funsd_train_path) # Load training data
funsd_test = load_funsd_dataset(funsd_test_path) # Load testing data

# Print confirmation message and display one sample document for verification
print("Loaded FUNSD dataset:")
print(json.dumps(funsd_train[0], indent=2)) # Print the first sample document for inspection

Loaded FUNSD dataset:
{
  "id": "0000971160",
  "words": [
    "R&D",
    ":",
    "Suggestion:",
    "Date:",
    "Licensee",
    "",
    "Yes",
    "No",
    "597005708",
    "R&D",
    "QUALITY",
    "IMPROVEMENT",
    "SUGGESTION/",
    "",
    "SOLUTION",
    "FORM",
    "Name",
    "/",
    "Phone",
    "Ext.",
    ":",
    "M.",
    "Hamann",
    "P.",
    "Harper,",
    "P.",
    "Martinez",
    "9/",
    "3/",
    "92",
    "R&D",
    "Group:",
    "J.",
    "S.",
    "Wigand",
    "Supervisor",
    "/",
    "Manager",
    "Discontinue",
    "coal",
    "retention",
    "analyses",
    "on",
    "licensee",
    "submitted",
    "product",
    "samples",
    "(Note",
    ":",
    "Coal",
    "Retention",
    "testing",
    "is",
    "not",
    "performed",
    "by",
    "most",
    "licensees.",
    "Other",
    "B&W",
    "physical",
    "measurements",
    "as",
    "ends",
    "stability",
    "and",
    "inspection",
    "for",
    "soft",
    "spots",
    "in",
    "cipare

### Load XFUND (Multilingual) Dataset

In [6]:
def load_xfund_dataset(xfund_path):
    """
    Loads the multilingual XFUND dataset from directory

    Args:
        xfund_path (str): Path to the directory containing language-specific subdirectories

    Returns:
        dict: A dictionary where each key is a language code (e.g., "de", "es"),
              and the value is another dictionary with "train" and "test" datasets.
    """
    xfund_data = {} # Dictionary to store data for each language

    # Iterate through all language directories in the XFUND dataset path
    for lang in os.listdir(xfund_path):
        lang_path = os.path.join(xfund_path, lang) # Construct full path to the language directory

        # Ensure that only directories (languages) are processed
        if os.path.isdir(lang_path):
            train_path = os.path.join(lang_path, "training_data") # Path to training data
            test_path = os.path.join(lang_path, "testing_data") # Path to testing data

            # Load training and testing data for the current language using the FUNSD loader function
            xfund_data[lang] = {
                "train": load_funsd_dataset(train_path), # Load training data
                "test": load_funsd_dataset(test_path) # Load testing data
            }

    return xfund_data # Return the loaded dataset as a dictionary

# Load XFUND dataset
xfund_data = load_xfund_dataset(xfund_path) # Call function to load all languages

# Print sample from one language (e.g., German "de") to verify the data structure
print("Loaded XFUND dataset (German example):")
print(json.dumps(xfund_data["de"]["train"][0], indent=2)) # Pretty-print first training sample of German dataset

Loaded XFUND dataset (German example):
{
  "id": "de_train_0",
  "words": [
    "Bezeichnung,",
    "Ort",
    "und",
    "Gesch\u00e4ftsnummer",
    "des",
    "Gerichts:",
    "Erkl\u00e4rung",
    "\u00fcber",
    "die",
    "pers\u00f6nlichen",
    "und",
    "wirtschaftlichen",
    "Verh\u00e4ltnisse",
    "bei",
    "Prozess-",
    "oder",
    "Verfahrenskostenhilfe",
    "-",
    "Belege",
    "sind",
    "in",
    "Kopie",
    "durchnummeriert",
    "beizuf\u00fcgen",
    "-",
    "A",
    "Angaben",
    "zu",
    "Ihrer",
    "Person",
    "Julian",
    "Herrmann",
    "Lehrer",
    "09.08.1987",
    "ledig",
    "Name,",
    "Vorname,",
    "ggf.",
    "Geburtsname",
    "Beruf,",
    "Erwerbst\u00e4tigkeit",
    "Geburtsdatum",
    "Familienstand",
    "55232",
    "Alzey,",
    "Bahnhofstra\u00dfe",
    "19",
    "16724891042",
    "Anschrift",
    "(Stra\u00dfe,",
    "Hausnummer,",
    "Postleitzahl,",
    "Wohnort)",
    "Tags\u00fcber",
    "tel.",
    "erreichbar",
   

###  Verify the Loaded Data

In [7]:
# Print the number of samples in the FUNSD dataset
print(f"FUNSD Train Samples: {len(funsd_train)}") # Display count of training samples
print(f"FUNSD Test Samples: {len(funsd_test)}") # Display count of testing samples

# Iterate over all languages in the XFUND dataset and print dataset sizes
for lang in xfund_data.keys():
    print(f"{lang.upper()} - Train: {len(xfund_data[lang]['train'])}, Test: {len(xfund_data[lang]['test'])}")
    # Convert language code to uppercase and display counts for training & testing samples

FUNSD Train Samples: 149
FUNSD Test Samples: 50
JA - Train: 149, Test: 50
IT - Train: 149, Test: 50
PT - Train: 149, Test: 50
ZH - Train: 149, Test: 50
DE - Train: 149, Test: 50
FR - Train: 149, Test: 50
ES - Train: 149, Test: 50


## Fine-Tuning

In [18]:
import os

def print_tree(directory, prefix=""):
    """Recursively prints a directory tree, excluding JSON files."""
    try:
        files_and_dirs = sorted(os.listdir(directory))
    except PermissionError:
        return  # Skip folders with permission issues
    except FileNotFoundError:
        print(f"Error: Directory '{directory}' not found.")
        return

    # Filter out JSON files but keep other files and directories
    filtered_items = [name for name in files_and_dirs if not name.endswith(".json")]

    for index, name in enumerate(filtered_items):
        path = os.path.join(directory, name)
        is_last = index == len(filtered_items) - 1
        print(f"{prefix}├── {name}" if not is_last else f"{prefix}└── {name}")

        # Recurse into directories only
        if os.path.isdir(path):
            print_tree(path, prefix + ("│   " if not is_last else "    "))

# Set your dataset directories (Make sure the paths are correct)
funsd_dir = "/content/drive/MyDrive/Final Project/Fine Tuning /FUND_LiLT_Format/dataset"
xfund_dir = "/content/drive/MyDrive/Final Project/Fine Tuning /XFUND_LiLT_Format"

print("\nFUNSD Dataset Structure (Excluding JSON Files):")
print_tree(funsd_dir)

print("\nXFUND Dataset Structure (Excluding JSON Files):")
print_tree(xfund_dir)


FUNSD Dataset Structure (Excluding JSON Files):
├── testing_data
└── training_data

XFUND Dataset Structure (Excluding JSON Files):
├── de
│   ├── testing_data
│   └── training_data
├── es
│   ├── testing_data
│   └── training_data
├── fr
│   ├── testing_data
│   └── training_data
├── it
│   ├── testing_data
│   └── training_data
├── ja
│   ├── testing_data
│   └── training_data
├── pt
│   ├── testing_data
│   └── training_data
└── zh
    ├── testing_data
    └── training_data
