In [1]:
from argparse import Namespace
from my_code.inference import  blocking_inference, run_blocked_inference
from my_code.DK import GeneralDKInjector

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def main(hp, model_path):
    """
    Run blocking first, then score the blocked pairs with a trained model.

    This is a thin driver around three imported helpers:

    * ``blocking_inference(hp)`` is called first with the whole configuration.
    * If ``hp.dk`` is truthy, a ``GeneralDKInjector`` is built from
      ``hp.task``; otherwise no injector is used (``dk=None``).
    * ``run_blocked_inference`` is called last with the model path, the
      paths/settings read from ``hp``, and the injector.

    Parameters
    ----------
    hp : object
        Configuration object. The attributes read here are ``dk``, ``task``
        (only when ``dk`` is truthy), ``output_pairs_csv``,
        ``table_reference_csv``, ``table_source_csv``,
        ``output_inference_csv``, ``lm``, ``max_len``, ``columns_to_use``
        and ``output_ditto_txt``. ``blocking_inference`` receives the whole
        object and may read more.
    model_path : str
        Path to the trained matching model; forwarded unchanged to
        ``run_blocked_inference``.

    Returns
    -------
    None
        Nothing is returned; the result of ``run_blocked_inference`` is
        discarded.
    """
    # Stage 1: blocking. Presumably writes the candidate pairs that stage 2
    # reads from hp.output_pairs_csv -- confirm against my_code.inference.
    blocking_inference(hp)

    # Optional domain-knowledge injector, only built when requested.
    dk = GeneralDKInjector(hp.task) if hp.dk else None

    # Stage 2: hand the blocked pairs and both tables to the scoring step.
    inference_kwargs = {
        "model_path": model_path,
        "blocked_pairs_csv": hp.output_pairs_csv,
        "reference_table_csv": hp.table_reference_csv,
        "source_table_csv": hp.table_source_csv,
        "output_csv": hp.output_inference_csv,
        "lm": hp.lm,
        "max_len": hp.max_len,
        "columns_to_use": hp.columns_to_use,
        "inference_txt_path": hp.output_ditto_txt,
        "dk": dk,
    }
    run_blocked_inference(**inference_kwargs)

In [None]:
# =============================================================================
# Inference configuration
#
# Edit the three values below for your machine / run; every path stored in
# `hp_inference` is derived from `base_path_inference` and `task_inference`.
#
# NOTE(review): the saved outputs further down in this notebook show a
# different base path (".../ENSIAS/2eme_annee/stage_2/...") than the one set
# here, so they appear to come from an earlier run -- re-run before relying
# on them.
# =============================================================================

# Root folder that holds all inference inputs and outputs.
base_path_inference = "D:/Study/ENSIAS/stage_2/ER/ditto/resultat_inference"

# Free-form name of this inference run (e.g. "inference_1",
# "my_test_inference"); it is used as a sub-folder name in the paths below.
task_inference = "inference_1"

# Trained model checkpoint (.pt) to load for inference; must already exist.
model_path = "D:/Study/ENSIAS/stage_2/ER/ditto/resultat_training/logs/Generated_data/model_Generated_data_bs32_ep1_lmdistilbert_alpha0.8_date2025-09-11.pt"

hp_inference = Namespace(
    # --- Blocking stage -----------------------------------------------------
    # Embedding model name (a sentence-transformers model, per the original
    # author's note).
    model_name_blocking="all-MiniLM-L12-v2",
    # Number of most-similar candidates kept per record.
    top_k_blocking=5,
    # Similarity cut-off applied to candidate pairs.
    threshold_blocking=0.95,
    # Batch size used when computing embeddings.
    batch_size_blocking=512,

    # --- Input tables (CSV) -------------------------------------------------
    # The "reference" table.
    table_reference_csv=f"{base_path_inference}/data/{task_inference}/reference_table.csv",
    # The "source" table.
    table_source_csv=f"{base_path_inference}/data/{task_inference}/source_table.csv",

    # --- Text serialisations of both tables (Ditto-style TXT) ----------------
    table_reference_txt=f"{base_path_inference}/input_txt_blocking/{task_inference}/reference_table.txt",
    table_source_txt=f"{base_path_inference}/input_txt_blocking/{task_inference}/source_table.txt",

    # --- Embedding vector files for both tables ------------------------------
    table_reference_vec=f"{base_path_inference}/vectors_blocking/{task_inference}/reference_table.txt.mat",
    table_source_vec=f"{base_path_inference}/vectors_blocking/{task_inference}/source_table.txt.mat",

    # --- Blocking outputs ----------------------------------------------------
    # Candidate pairs as CSV.
    output_pairs_csv=f"{base_path_inference}/blocking/{task_inference}/blocking_pairs.csv",
    # The same candidate pairs in Ditto TXT format.
    output_ditto_txt=f"{base_path_inference}/blocking/{task_inference}/blocking_pairs_ditto.txt",

    # --- Final inference output ----------------------------------------------
    # Predicted matches.
    output_inference_csv=f"{base_path_inference}/inference/{task_inference}/result.csv",

    # --- Intermediate Ditto datasets (useful for re-runs / inspection) -------
    dataset_csv_dir=f"{base_path_inference}/dataset_ditto_csv",
    dataset_txt_dir=f"{base_path_inference}/dataset_ditto_txt",

    # --- Miscellaneous --------------------------------------------------------
    # Task name (same value as `task_inference`).
    task=task_inference,
    # Language-model backbone used at inference time.
    lm="distilbert",
    # Maximum token length per input pair.
    max_len=256,
    # Enable domain-knowledge (DK) injection.
    dk=True,
    # Columns of the source/reference tables to use; None means all columns.
    columns_to_use=None,
)


In [8]:
# Run the pipeline (blocking, then inference) with the configuration defined above.
main(hp=hp_inference, model_path=model_path)

Cleaning mapping json file done.
✅ Normalized and saved CSV at: D:/Study/ENSIAS/2eme_annee/stage_2/ER/ditto/resultat_inference/data/inference_1/reference_table.csv
Cleaning mapping json file done.
✅ Normalized and saved CSV at: D:/Study/ENSIAS/2eme_annee/stage_2/ER/ditto/resultat_inference/data/inference_1/source_table.csv


  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████| 1/1 [00:00<00:00, 500.75it/s]
Processing blocked inference: 100%|██████████| 30/30 [00:10<00:00,  2.88it/s]

Saved final results to: D:/Study/ENSIAS/2eme_annee/stage_2/ER/ditto/resultat_inference/inference/inference_1/result.csv



