### Training data creation:

In [12]:
import os
import torch
from torch_geometric.data import Data

# Build the training dataset: load every saved graph file for each document
# type, tag each graph with its task name, and write them all to one file.

# Map each document type (used as the task label) to the folder holding its
# saved graph files.
task_folders = {
    "Invoice": "data/invoice/",
    "Loan": "data/loan/",
    "Final Bill": "data/final_bill/",
    "Background Verification": "data/background_verification/",
    # "Operative Report": "data/operative_report/"
}

# Flat list of every graph loaded across all tasks; each gets a `.task` label.
all_graphs = []

for task_name, folder_path in task_folders.items():
    if not os.path.exists(folder_path):
        print(f"⚠️ Folder not found: {folder_path}")
        continue
    # os.listdir returns entries in arbitrary order; sort so the saved dataset
    # has the same graph order on every run.
    graph_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".pt"))

    print(f"🔍 Processing {len(graph_files)} files for task: {task_name}")
    for file in graph_files:
        graph_path = os.path.join(folder_path, file)
        # The files hold pickled Python objects (presumably torch_geometric
        # Data graphs - confirm), not plain tensors, so full unpickling is
        # required. Pass weights_only=False explicitly: newer PyTorch releases
        # default to weights_only=True, which would reject these objects.
        # NOTE: unpickling can execute arbitrary code - only load trusted files.
        data = torch.load(graph_path, weights_only=False)
        # A file may contain either a single graph or a list of graphs.
        graphs = data if isinstance(data, list) else [data]
        for graph in graphs:
            graph.task = task_name
            all_graphs.append(graph)

# Save to one master file
os.makedirs("data/training_data", exist_ok=True)
torch.save(all_graphs, "data/training_data/training_dataset_05.pt")
print(f"Saved all {len(all_graphs)} graphs to training_dataset_05.pt")


🔍 Processing 5 files for task: Invoice
🔍 Processing 5 files for task: Loan
🔍 Processing 5 files for task: Final Bill
🔍 Processing 5 files for task: Background Verification
Saved all 20 graphs to training_dataset_05.pt


  data = torch.load(graph_path)


### Testing data creation:

In [13]:
import os
import torch
from torch_geometric.data import Data

# Build the test dataset: load every saved graph file for each document type,
# tag each graph with its task name, and write them all to one file.

# Map each document type (used as the task label) to the folder holding its
# saved graph files.
task_folders = {
    # "Invoice": "data/invoice/",
    # "Loan": "data/loan/",
    # "Final Bill": "data/final_bill/",
    # "Background Verification": "data/background_verification/",
    # "Operative Report": "data/operative_report/"
    "PR2": "data/PR2/"
}

# Flat list of every graph loaded across all tasks; each gets a `.task` label.
all_graphs = []

for task_name, folder_path in task_folders.items():
    if not os.path.exists(folder_path):
        print(f" Folder not found: {folder_path}")
        continue
    # Only pick up saved graph files. The previous filter, endswith(""), is
    # true for every name, so subdirectories (e.g. .ipynb_checkpoints) and
    # unrelated files were also passed to torch.load.
    # os.listdir returns entries in arbitrary order; sort so the saved dataset
    # has the same graph order on every run.
    graph_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".pt"))

    print(f"🔍 Processing {len(graph_files)} files for task: {task_name}")
    for file in graph_files:
        graph_path = os.path.join(folder_path, file)
        # The files hold pickled Python objects (presumably torch_geometric
        # Data graphs - confirm), not plain tensors, so full unpickling is
        # required. Pass weights_only=False explicitly: newer PyTorch releases
        # default to weights_only=True, which would reject these objects.
        # NOTE: unpickling can execute arbitrary code - only load trusted files.
        data = torch.load(graph_path, weights_only=False)
        # A file may contain either a single graph or a list of graphs.
        graphs = data if isinstance(data, list) else [data]
        for graph in graphs:
            graph.task = task_name
            all_graphs.append(graph)

# Save to one master file
os.makedirs("data/test_data", exist_ok=True)
torch.save(all_graphs, "data/test_data/test_dataset_PR2.pt")
print(f"Saved all {len(all_graphs)} graphs to test_dataset_PR2.pt")


🔍 Processing 5 files for task: PR2
Saved all 5 graphs to test_dataset_PR2.pt


  data = torch.load(graph_path)
