# Setup

### Creating a Conda Virtual Environment
To create a virtual environment using Conda, follow these steps:

1. Open your terminal or command prompt.
2. Run the following command to create a virtual environment named `ea-rag`:

In [None]:
%conda create --name ea-rag python=3.11

Activate the environment

In [None]:
conda activate ea_rag
conda env list          # check that ea_rag* is activated 
python --version        # should be 3.11.x
# from top right corner please select the corect environment and restart the kernel 

### Installing dependencies

In [85]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


Create a `.env` file with the following details:
- `OPENAI_API_KEY`
- `PINECONE_API_KEY`

# Download the dataset
The DBP15K dataset can be downloaded [here](https://huggingface.co/datasets/HackCz/DBP15K_raw/blob/main/DBP_raw.zip). Change the name of the foldder to 'fr_en'.

# Preprocessing

### Relation triples
Convert tabular data in `en_rel_triples` and `fr_rel_triples` to N-Triples format. Example:
- input: `http://dbpedia.org/resource/Virton	http://dbpedia.org/property/nw	http://dbpedia.org/resource/Tintigny`
- output: `<http://dbpedia.org/resource/Virton>	<http://dbpedia.org/property/nw>	<http://dbpedia.org/resource/Tintigny> .`

In [86]:
def convert_to_ntriples(input_file_path, output_file_path):
    """
    Converts a TSV RDF dataset to N-Triples format.
    
    Parameters:
    - input_file_path (str): Path to the input TSV file.
    - output_file_path (str): Path to save the processed N-Triples file.
    """
    try:
        with open(input_file_path, "r", encoding="utf-8") as infile, open(output_file_path, "w", encoding="utf-8") as outfile:
            for line in infile:
                # Split each line using tab as the delimiter
                parts = line.strip().split("\t")
                
                # Ensure the line has exactly 3 parts
                if len(parts) == 3:
                    subject, predicate, obj = parts
                    # Format the line according to the N-Triples format
                    n_triple_line = f"<{subject}> <{predicate}> <{obj}> .\n"
                    outfile.write(n_triple_line)
                else:
                    print(f"Skipping invalid line: {line.strip()}")

        print(f"N-Triples conversion complete for '{input_file_path}'. Output saved to '{output_file_path}'.")

    except FileNotFoundError:
        print(f"Error: The file '{input_file_path}' was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# List of datasets to process
datasets = [
    ("fr_en/en_rel_triples", "fr_en/en_rel_triples_preprocessed"),
    ("fr_en/fr_rel_triples", "fr_en/fr_rel_triples_preprocessed")
]

# Loop through the datasets and process them
for input_path, output_path in datasets:
    convert_to_ntriples(input_path, output_path)

N-Triples conversion complete for 'fr_en/en_rel_triples'. Output saved to 'fr_en/en_rel_triples_preprocessed'.
N-Triples conversion complete for 'fr_en/fr_rel_triples'. Output saved to 'fr_en/fr_rel_triples_preprocessed'.


### Attribute triples
Converting negative dates to a format that can be processed using Python. Example:
- input: `-0043-12-07`
- output: `0043-12-07 BCE`

In [87]:
import os

def preprocess_dates_in_file(input_file_path, output_file_path):
    """
    Preprocesses the RDF file to convert negative xsd:date values to a BCE format as plain strings.

    Parameters:
    - input_file_path (str): Path to the original RDF file.
    - output_file_path (str): Path to save the preprocessed RDF file.
    """
    with open(input_file_path, "r", encoding="utf-8") as infile, open(output_file_path, "w", encoding="utf-8") as outfile:
        for line in infile:
            # Detecting negative dates and converting them to string format
            if '^^<http://www.w3.org/2001/XMLSchema#date>' in line:
                start_index = line.find('"') + 1
                end_index = line.find('"', start_index)
                
                if start_index != -1 and end_index != -1:
                    date_string = line[start_index:end_index]
                    if date_string.startswith("-"):
                        # Convert negative date to BCE string and remove type declaration
                        sanitized_date = f'"{date_string[1:]} BCE"'
                        # Remove the xsd:date type and keep it as a plain string
                        line = line[:start_index-1] + sanitized_date + " .\n"  # Added newline character
            # Ensure each line ends with a newline character even if unmodified
            if not line.endswith("\n"):
                line += "\n"
            outfile.write(line)
    
    print(f"Preprocessed RDF data saved to: {output_file_path}")

# Preprocess both datasets
datasets_to_preprocess = [
    ("fr_en/en_att_triples", "fr_en/en_att_triples_preprocessed"),
    ("fr_en/fr_att_triples", "fr_en/fr_att_triples_preprocessed")
]

for input_file, output_file in datasets_to_preprocess:
    preprocess_dates_in_file(input_file, output_file)

Preprocessed RDF data saved to: fr_en/en_att_triples_preprocessed
Preprocessed RDF data saved to: fr_en/fr_att_triples_preprocessed


# Graph creation & description generation
Processes the datasets with the following steps:
1) Creates a graph comprised of relation and attribute triples;
2) Extracts all pieces of information about a node and stores it in a `.txt` file

In [89]:
from rdflib import Graph
import os

def create_merged_graph(relation_file_path, attribute_file_path):
    """Creates and returns an RDF graph by merging relation and attribute files."""
    graph = Graph()
    graph.parse(relation_file_path, format="nt")
    graph.parse(attribute_file_path, format="nt")

    print(f"Merged graph created with {len(graph)} triples.")
    return graph

def format_triples_for_embedding(graph, entity_uri, language_prefix):
    """
    Formats RDF triples where the given entity is a subject or object into a readable format for embeddings.

    Parameters:
    - graph: RDFLib Graph object
    - entity_uri: URI of the entity to query for
    - language_prefix: Prefix to indicate language (e.g., "FR-" or "EN-")

    Returns:
    - formatted_text: A single string containing all triples where the entity is subject or object, ready for embedding generation.
    """

    def safe_split(uri):
        """Returns the language-prefixed label of a URI."""
        if uri is None:
            return f"{language_prefix}{entity_uri.split('/')[-1]}"
        return f"{language_prefix}{uri.split('/')[-1]}"

    # Prepare SPARQL queries for both subject and object positions
    query_subject = f"""
    SELECT ?s ?p ?o
    WHERE {{
        <{entity_uri}> ?p ?o .
    }}
    """
    
    query_object = f"""
    SELECT ?s ?p ?o
    WHERE {{
        ?s ?p <{entity_uri}> .
    }}
    """

    # Execute the queries
    subject_results = graph.query(query_subject)
    object_results = graph.query(query_object)

    # Prepare the formatted text for embeddings
    formatted_text = []

    # Format triples where the entity is the subject
    formatted_text.append(f"# Triples where '{safe_split(entity_uri)}' is the subject:\n")
    for s, p, o in subject_results:
        formatted_text.append(f"{safe_split(s)} {safe_split(p)} {safe_split(o)}.")

    # Format triples where the entity is the object
    formatted_text.append(f"\n# Triples where '{safe_split(entity_uri)}' is the object:\n")
    for s, p, o in object_results:
        formatted_text.append(f"{safe_split(s)} {safe_split(p)} {safe_split(o)}.")

    # Combine all triples into a single formatted string
    formatted_text = "\n".join(formatted_text)
    return formatted_text

def describe_node_for_embedding_per_subject(graph, output_file_prefix, language_prefix):
    """
    Extracts all attributes and relations for every node and saves them in ten separate text files
    (splitting the entities into ten equal parts for easier visualization).
    """
    subjects = list(set(graph.subjects()))
    total_subjects = len(subjects)
    chunk_size = total_subjects // 10

    os.makedirs(output_file_prefix, exist_ok=True)

    # Split into ten files for easier management
    for i in range(10):
        start_index = i * chunk_size
        end_index = (start_index + chunk_size) if (i < 9) else total_subjects
        output_file = os.path.join(output_file_prefix, f"part{i+1}.txt")
        
        with open(output_file, "w", encoding="utf-8") as outfile:
            for subject in subjects[start_index:end_index]:
                # Use the function to generate formatted triples for the subject
                formatted_text = format_triples_for_embedding(graph, subject, language_prefix)
                # Write the formatted data to the file with a separator for each entity
                outfile.write(f"\n{'='*80}\nEntity: {language_prefix}{subject.split('/')[-1]}\n{'='*80}\n")
                outfile.write(formatted_text + "\n")
        print(f"File saved: '{output_file}'.")

def process_multiple_datasets(datasets):
    """Processes multiple RDF datasets and saves results for each in the specified folder."""
    for relation_file, attribute_file, output_folder, language_prefix in datasets:
        graph = create_merged_graph(relation_file, attribute_file)
        describe_node_for_embedding_per_subject(graph, output_folder, language_prefix)

# List of datasets including both relation and attribute triples with the specified output folder and language prefix
datasets_to_process = [
    ("fr_en/en_rel_triples_preprocessed", "fr_en/en_att_triples_preprocessed", "fr_en/en_combined", "EN-"),
    ("fr_en/fr_rel_triples_preprocessed", "fr_en/fr_att_triples_preprocessed", "fr_en/fr_combined", "FR-")
]

# Run batch processing
process_multiple_datasets(datasets_to_process)

KeyboardInterrupt: 

# Creating the Pinecone index

In [4]:
import os
import time
from pinecone import Pinecone, ServerlessSpec 
from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv()

# Set your API keys for Pinecone
pc = Pinecone(
    api_key=os.environ['PINECONE_API_KEY']
)

# Create Index if not already created
pinecone_index_name = "dl-proj-4"
if pinecone_index_name not in pc.list_indexes().names():
    pc.create_index(
        name=pinecone_index_name, 
        dimension=1536, # '1536' is the dimension for text-embedding-3-small
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )
     
    while not pc.describe_index(pinecone_index_name).index.status['ready']:
        time.sleep(1)
    
    print("✅ Pinecone Index provisioned")
else:
    print("Pinecone Index Already Provisioned")

Pinecone Index provisioned


### English dataset conversion to embeddings
Create and insert embeddings for the english dataset (~ 19 mins)

In [5]:
import os
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import openai
from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv()

# Set your API keys for OpenAI
openai.api_key = os.environ['OPENAI_API_KEY']

# Initialize OpenAI Embeddings using LangChain
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")  # Specify which embedding model

# Load all text files from a directory
directory_path = "fr_en/en_combined"  # directory path with all the national weather service documents
loader = DirectoryLoader(directory_path, glob="*.txt", loader_cls=TextLoader)  # Load only .txt files
documents = loader.load()

# Use a TextSplitter to split the documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
split_documents = text_splitter.split_documents(documents)

# Connect to the Pinecone index using LangChain's Pinecone wrapper
# Add all the split documents into the Pinecone vector database
pinecone_index_name = "dl-proj-4"
vectorstore = PineconeVectorStore(index_name=pinecone_index_name, embedding=embeddings)
vectorstore.add_documents(documents=split_documents )

print("🇬🇧 Embeddings created and inserted in Pinecone Vector Database successfully!")

Embeddings created, and inserted in Pinecone Vector Database successfully!


### French dataset conversion to embeddings
Create and insert embeddings for the french dataset (~ 18 min)

In [6]:
import os
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import openai
from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv()

# Set your API keys for OpenAI
openai.api_key = os.environ['OPENAI_API_KEY']

# Initialize OpenAI Embeddings using LangChain
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")  # Specify which embedding model

# Load all text files from a directory
directory_path = "fr_en/fr_combined" 
loader = DirectoryLoader(directory_path, glob="*.txt", loader_cls=TextLoader)  # Load only .txt files
documents = loader.load()

# Use a TextSplitter to split the documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
split_documents = text_splitter.split_documents(documents)

# Connect to the Pinecone index using LangChain's Pinecone wrapper
# Add all the split documents into the Pinecone vector database
pinecone_index_name = "dl-proj-4"
vectorstore = PineconeVectorStore(index_name=pinecone_index_name, embedding=embeddings)
vectorstore.add_documents(documents=split_documents )

print("🇫🇷 Embeddings created and inserted in Pinecone Vector Database successfully.")

Embeddings created, and inserted in Pinecone Vector Database successfully!


# Generating predictions
Uses the `gpt-4o-mini` pre-trained Large Language Model through OpenAI's API. The model receives the context retrieved from the Pinecone index.

### Splitting the dataset
Splits the dataset into 10 smaller parts for easier processing.

In [None]:
def split_dataset(file_path, output_folder="fr_en/split_parts", num_parts=10):
    """Split a large dataset into smaller parts for batch processing with multiple API keys.
    
    Args:
    - file_path (str): Path to the dataset file.
    - output_folder (str): Folder to save the split parts.
    - num_parts (int): Number of parts to split the dataset into.
    """
    import os

    # Ensure the output directory exists and is empty
    if os.path.exists(output_folder):
        # Clear the folder before saving new parts
        for file in os.listdir(output_folder):
            os.remove(os.path.join(output_folder, file))
    else:
        os.makedirs(output_folder)

    # Read all lines from the dataset
    with open(file_path, "r", encoding="utf-8") as file:
        lines = file.readlines()

    # Calculate the number of lines per part
    total_lines = len(lines)
    lines_per_part = total_lines // num_parts
    print(f"Total lines: {total_lines}, Lines per part: {lines_per_part}")

    # Split the dataset into multiple parts
    for i in range(num_parts):
        start_idx = i * lines_per_part
        # For the last file, include all remaining lines
        end_idx = start_idx + lines_per_part if i < num_parts - 1 else total_lines
        part_lines = lines[start_idx:end_idx]

        # Save each part as a separate file in the specified folder
        part_file_path = os.path.join(output_folder, f"ent_ILLs_part_{i + 1}.txt")
        with open(part_file_path, "w", encoding="utf-8") as part_file:
            part_file.writelines(part_lines)
        print(f"✅ Part {i + 1} saved to '{part_file_path}' with {len(part_lines)} lines.")

# Usage Example
split_dataset("fr_en/ent_ILLs_medium.txt", num_parts=10)

### Querying the model
Uses 5 API keys to avoid hitting the rate limit.

In [None]:
import os
import openai
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
from concurrent.futures import ThreadPoolExecutor, as_completed
import tqdm
import time
import threading
import glob

# Load environment variables for multiple API keys
load_dotenv()
API_KEYS = [
    os.environ.get('OPENAI_API_KEY_01'),
    os.environ.get('OPENAI_API_KEY_02'),
    os.environ.get('OPENAI_API_KEY_03'),
    os.environ.get('OPENAI_API_KEY_04'),
    os.environ.get('OPENAI_API_KEY_05')
]

# Initialize token buckets for rate and daily limits
RATE_LIMIT = 500  # per minute per API key
DAILY_LIMIT = 10000  # per day per API key
api_key_index = 0
rate_limit_buckets = [threading.Semaphore(RATE_LIMIT) for _ in API_KEYS]
daily_token_buckets = [threading.Semaphore(DAILY_LIMIT) for _ in API_KEYS]
lock = threading.Lock()  # For thread-safe key rotation

def refill_tokens():
    """Refill the rate limit buckets every minute."""
    while True:
        time.sleep(60)
        for bucket in rate_limit_buckets:
            for _ in range(RATE_LIMIT):
                bucket.release()

def reset_daily_tokens():
    """Reset the daily token buckets at midnight."""
    while True:
        now = time.localtime()
        seconds_until_midnight = (24 * 3600) - (now.tm_hour * 3600 + now.tm_min * 60 + now.tm_sec)
        time.sleep(seconds_until_midnight)
        for bucket in daily_token_buckets:
            for _ in range(DAILY_LIMIT):
                bucket.release()

# Start the token refill threads
threading.Thread(target=refill_tokens, daemon=True).start()
threading.Thread(target=reset_daily_tokens, daemon=True).start()

def get_next_api_key():
    """Rotate between API keys with thread safety."""
    global api_key_index
    with lock:
        api_key_index = (api_key_index + 1) % len(API_KEYS)
    return API_KEYS[api_key_index], rate_limit_buckets[api_key_index], daily_token_buckets[api_key_index]

def initialize_langchain(api_key):
    """Initialize LangChain components with a specific API key."""
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=api_key)
    pinecone_index_name = "dl-proj-4"
    vector_store = PineconeVectorStore(index_name=pinecone_index_name, embedding=embeddings)
    retriever = vector_store.as_retriever(search_kwargs={"k": 5})
    llm = ChatOpenAI(model="gpt-4o-mini", openai_api_key=api_key, temperature=0.4)

    prompt_template = PromptTemplate(
        template="""
        Use the following context to identify the most similar entity in the French dataset (prefixed with FR-) for the given entity in the English dataset (prefixed with EN-):
        Context: {context}
        Given Entity: {question}
        Provide only the name of the most similar entity from the dataset, prefixed with 'FR-'. Use the entity name, the provided context and your knowledge to identify the best answer.
        Answer:""",
        input_variables=["context", "question"]
    )

    llm_chain = prompt_template | llm | StrOutputParser()
    return retriever, llm_chain

def process_file(file_path):
    """Read the input file and return the modified entities for both EN and FR."""
    en_entities = []
    fr_entities = []
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            parts = line.strip().split("\t")
            if len(parts) == 2:
                _, uri = parts
                modified_uri = uri.replace("http://dbpedia.org/resource/", "")
                if uri.startswith("http://dbpedia.org/resource/FR-"):
                    fr_entities.append(f"FR-{modified_uri}")
                else:
                    en_entities.append(f"EN-{modified_uri}")
    return en_entities, fr_entities

def process_single_entity(entity, retries=3, delay=5):
    """Process a single entity with retries on failure."""
    for attempt in range(retries):
        try:
            # API Key Handling
            api_key, rate_bucket, daily_bucket = get_next_api_key()
            rate_bucket.acquire()
            daily_bucket.acquire()
            
            # Initialize LangChain Components
            retriever, llm_chain = initialize_langchain(api_key)
            
            # Query Preparation
            query = f"{entity}"
            docs = retriever.invoke(query)
            context = "\n\n".join([doc.page_content for doc in docs])

            # Query the API
            answer = llm_chain.invoke({"context": context, "question": entity})
            if not answer: 
                print(f"Empty response for {entity}")
                continue  # Retry if the answer is empty

            # Prepare Result URIs
            english_uri = f"http://dbpedia.org/resource/{entity.replace('EN-', '')}"
            french_uri = f"http://fr.dbpedia.org/resource/{answer.replace('FR-', '').replace(' ', '_')}"
            return english_uri, french_uri

        except Exception as e:
            print(f"Error processing entity {entity} (attempt {attempt + 1}/{retries}): {e}")
            time.sleep(delay)  # Wait before retrying

    # If all retries fail, return None and log it
    print(f"❌ Failed to process {entity} after {retries} attempts.")
    return None, None

def save_failed_entities(failed_entities, output_file):
    """Save the entities that failed processing to a file for later reprocessing."""
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as file:
        for entity in failed_entities:
            file.write(f"{entity}\n")
    print(f"\n❌ Failed entities saved to {output_file}")

# In the parallel processing function, collect failed entities
def query_llm_for_entity_pairing_parallel(en_entities, max_workers=10):
    results = {}
    failed_entities = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_single_entity, entity): entity for entity in en_entities}

        for future in tqdm.tqdm(as_completed(futures), total=len(futures)):
            try:
                english_uri, french_uri = future.result()
                if english_uri and french_uri:
                    results[english_uri] = french_uri
                else:
                    failed_entities.append(futures[future])
            except Exception as e:
                print(f"Error: {e}")
                failed_entities.append(futures[future])

    # Save failed entities for retry later
    if failed_entities:
        save_failed_entities(failed_entities, "fr_en/failed_entities.txt")

    return results

def save_results_to_txt(results, output_file):
    """Save the alignment results to a .txt file."""
    # Ensure the directory exists before saving
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    with open(output_file, "w", encoding="utf-8") as file:
        for en_uri, fr_uri in results.items():
            file.write(f"{fr_uri}\t{en_uri}\n")
    print(f"\n✅ Results saved to {output_file}")

# === Process All Split Files ===
split_files = sorted(glob.glob("fr_en/split_parts/ent_ILLs_part*.txt"))

for split_file in split_files:
    output_file = split_file.replace("split_parts", "aligned_results").replace(".txt", "_aligned.txt")
    
    # Skip if already processed
    if os.path.exists(output_file):
        print(f"✅ {split_file} already processed. Skipping...")
        continue

    print(f"\n🚀 Processing: {split_file}")
    
    # Load the current part
    en_entities, fr_entities = process_file(split_file)

    # Perform parallel processing
    results = query_llm_for_entity_pairing_parallel(en_entities, max_workers=10)

    # Save results immediately after processing each file
    save_results_to_txt(results, output_file)


🚀 Processing: fr_en/split_parts/ent_ILLs_part_1.txt


100%|██████████| 1500/1500 [05:21<00:00,  4.66it/s]



✅ Results saved to fr_en/aligned_results/ent_ILLs_part_1_aligned.txt

🚀 Processing: fr_en/split_parts/ent_ILLs_part_10.txt


100%|██████████| 1500/1500 [05:30<00:00,  4.54it/s]



✅ Results saved to fr_en/aligned_results/ent_ILLs_part_10_aligned.txt

🚀 Processing: fr_en/split_parts/ent_ILLs_part_2.txt


100%|██████████| 1500/1500 [05:12<00:00,  4.80it/s]



✅ Results saved to fr_en/aligned_results/ent_ILLs_part_2_aligned.txt

🚀 Processing: fr_en/split_parts/ent_ILLs_part_3.txt


100%|██████████| 1500/1500 [05:11<00:00,  4.82it/s]



✅ Results saved to fr_en/aligned_results/ent_ILLs_part_3_aligned.txt

🚀 Processing: fr_en/split_parts/ent_ILLs_part_4.txt


100%|██████████| 1500/1500 [05:45<00:00,  4.34it/s]



✅ Results saved to fr_en/aligned_results/ent_ILLs_part_4_aligned.txt

🚀 Processing: fr_en/split_parts/ent_ILLs_part_5.txt


100%|██████████| 1500/1500 [05:20<00:00,  4.69it/s]



✅ Results saved to fr_en/aligned_results/ent_ILLs_part_5_aligned.txt

🚀 Processing: fr_en/split_parts/ent_ILLs_part_6.txt


100%|██████████| 1500/1500 [05:30<00:00,  4.54it/s]



✅ Results saved to fr_en/aligned_results/ent_ILLs_part_6_aligned.txt

🚀 Processing: fr_en/split_parts/ent_ILLs_part_7.txt


100%|██████████| 1500/1500 [05:18<00:00,  4.71it/s]



✅ Results saved to fr_en/aligned_results/ent_ILLs_part_7_aligned.txt

🚀 Processing: fr_en/split_parts/ent_ILLs_part_8.txt


100%|██████████| 1500/1500 [05:21<00:00,  4.67it/s]



✅ Results saved to fr_en/aligned_results/ent_ILLs_part_8_aligned.txt

🚀 Processing: fr_en/split_parts/ent_ILLs_part_9.txt


100%|██████████| 1500/1500 [12:24<00:00,  2.01it/s]


✅ Results saved to fr_en/aligned_results/ent_ILLs_part_9_aligned.txt





# Entity comparison
Compares the predictions generated by the LLM with the ground truth.

### Prediction merging
Combines the predictions generated by the model into a single file.

In [62]:
import os

# Define the directories and output file path
input_directory = "fr_en/aligned_results/"
output_file = "fr_en/aligned_entities.txt"

# Open the output file in write mode
with open(output_file, "w") as outfile:
    for i in range(1, 11):  # Loop from 1 to 10
        filename = f"ent_ILLs_part_{i}_aligned.txt"
        filepath = os.path.join(input_directory, filename)
        
        # Check if the file exists before merging
        if os.path.exists(filepath):
            print(f"⏳ Merging {filename}")
            with open(filepath, "r") as infile:
                outfile.write(infile.read())
                outfile.write("\n")  # Add a newline between merged contents
        else:
            print(f"File {filename} not found!")

print(f"✅ Merging complete. Results saved to {output_file}.")

Merging ent_ILLs_part_1_aligned.txt...
Merging ent_ILLs_part_2_aligned.txt...
Merging ent_ILLs_part_3_aligned.txt...
Merging ent_ILLs_part_4_aligned.txt...
Merging ent_ILLs_part_5_aligned.txt...
Merging ent_ILLs_part_6_aligned.txt...
Merging ent_ILLs_part_7_aligned.txt...
Merging ent_ILLs_part_8_aligned.txt...
Merging ent_ILLs_part_9_aligned.txt...
Merging ent_ILLs_part_10_aligned.txt...
Merging complete! Results saved to fr_en/aligned_entities.txt.


### Hits@1
Computes the hits@1 score and saves non-aligned entities inside a `.txt` file. Both the normalized and original URIs are saved inside the `fr_en` folder.

In [84]:
import unicodedata
import re

# Standardize all abbreviations to the same form
ABBREVIATION_MAP = {
    "f.c.": "football_club",
    "fc": "football_club",
    "football club": "football_club",
    "univ.": "university",
    "univ": "university",
    "co.": "company",
    "co": "company",
    "corp.": "corporation",
    "corp": "corporation"
}

def normalize_uri(uri):
    """
    Normalize a URI by:
    - Removing parentheses (but keeping the content inside)
    - Trimming whitespace
    - Lowercasing
    - Removing accents from characters
    - Standardizing both '-' and '_' to '_'
    - Replacing all abbreviations with consistent forms
    - Sorting words **only after the base URL**
    """
    original_uri = uri.strip()
    uri = uri.strip().lower()
    
    # Remove parentheses but keep their content
    uri = uri.replace("(", "").replace(")", "")

    # Convert accented characters to base form (removes accents)
    uri = unicodedata.normalize("NFD", uri)
    uri = "".join(char for char in uri if unicodedata.category(char) != 'Mn')  # Remove accents

    # Replace all abbreviations with consistent forms
    for abbr, full_form in ABBREVIATION_MAP.items():
        uri = uri.replace(abbr, full_form)

    # Normalize separators: Convert both '-' and '_' into a single '_'
    uri = re.sub(r"[-_]+", "_", uri)  # Merge consecutive separators into one

    # Split base URL and resource part for separate handling
    if "/resource/" in uri:
        base_url, entity = uri.split("/resource/", 1)
        # Sort the words **only for the entity part**
        words = entity.split("_")
        words_sorted = sorted(words)
        entity_normalized = "_".join(words_sorted).strip("_")
        uri = f"{base_url}/resource/{entity_normalized}"
    else:
        # If no base URL detected, normalize the whole string
        words = uri.split("_")
        words_sorted = sorted(words)
        uri = "_".join(words_sorted).strip("_")
    
    return uri, original_uri  # Return both normalized and original versions

def read_alignment_file(file_path):
    """
    Read and normalize the alignment results file.
    """
    alignment_results = {}
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            if not line.strip():
                continue
            parts = line.strip().split("\t")
            if len(parts) == 2:
                fr_uri, en_uri = parts
                # Normalize both URIs and store original versions too
                fr_uri_normalized, fr_uri_original = normalize_uri(fr_uri)
                en_uri_normalized, en_uri_original = normalize_uri(en_uri)
                alignment_results[en_uri_normalized] = (fr_uri_normalized, fr_uri_original, en_uri_original)
            else:
                print(f"Skipping invalid line: {line.strip()}")
    return alignment_results

def read_ground_truth_file(file_path):
    """
    Read and normalize the ground truth file.
    """
    ground_truth = {}
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            if not line.strip():
                continue
            parts = line.strip().split("\t")
            if len(parts) == 2:
                fr_uri, en_uri = parts
                # Normalize both URIs and store original versions too
                fr_uri_normalized, fr_uri_original = normalize_uri(fr_uri)
                en_uri_normalized, en_uri_original = normalize_uri(en_uri)
                ground_truth[en_uri_normalized] = (fr_uri_normalized, fr_uri_original, en_uri_original)
            else:
                print(f"Skipping invalid line: {line.strip()}")
    return ground_truth

def compute_hits_at_1_and_save_misses(alignment_results, ground_truth):
    """
    Compute Hits@1 and save non-aligned entities to separate files:
    - One for normalized results
    - One for original results
    """
    hits = 0
    compared_entities = 0
    non_aligned_normalized = []
    non_aligned_original = []

    # Adding headers for both files
    non_aligned_normalized.append("English URI (normalized)\tGround Truth French URI (normalized)\tPredicted French URI (normalized)")
    non_aligned_original.append("English URI (original)\tGround Truth French URI (original)\tPredicted French URI (original)")

    for en_entity, (actual_fr_entity_norm, actual_fr_entity_orig, en_entity_orig) in ground_truth.items():
        if en_entity in alignment_results:  # Only compare if the entity exists in both sets
            compared_entities += 1
            predicted_fr_entity_norm, predicted_fr_entity_orig, predicted_en_entity_orig = alignment_results[en_entity]
            if predicted_fr_entity_norm == actual_fr_entity_norm:
                hits += 1
            else:
                non_aligned_normalized.append(f"{en_entity}\t{actual_fr_entity_norm}\t{predicted_fr_entity_norm or 'Not Found'}")
                non_aligned_original.append(f"{en_entity_orig}\t{actual_fr_entity_orig}\t{predicted_fr_entity_orig or 'Not Found'}")
        else:
            non_aligned_normalized.append(f"{en_entity}\t{actual_fr_entity_norm}\tNot Found")
            non_aligned_original.append(f"{en_entity_orig}\t{actual_fr_entity_orig}\tNot Found")

    # Save normalized results
    normalized_file = "fr_en/non_aligned_entities_normalized.txt"
    with open(normalized_file, "w", encoding="utf-8") as file:
        file.write("\n".join(non_aligned_normalized))
    
    # Save original results
    original_file = "fr_en/non_aligned_entities_original.txt"
    with open(original_file, "w", encoding="utf-8") as file:
        file.write("\n".join(non_aligned_original))
    
    # Print Hits@1 Score
    print(f"\n✅ Hits@1 Score: {hits / compared_entities:.4f}")
    print(f"❌ Non-aligned normalized entities saved to: {normalized_file}")
    print(f"❌ Non-aligned original entities saved to: {original_file}")
    return hits / compared_entities

# === Running the entire process with strict normalization ===
alignment_results_file = "fr_en/aligned_entities.txt"
ground_truth_file = "fr_en/ent_ILLs.txt"

alignment_results = read_alignment_file(alignment_results_file)
ground_truth = read_ground_truth_file(ground_truth_file)
hits_at_1_score = compute_hits_at_1_and_save_misses(alignment_results, ground_truth)


✅ Hits@1 Score: 0.6465
❌ Non-aligned normalized entities saved to: fr_en/non_aligned_entities_normalized.txt
❌ Non-aligned original entities saved to: fr_en/non_aligned_entities_original.txt
Total entities in ground truth: 15000
Total entities in alignment results: 15000
Matching keys count: 15000
