# Install dependencies

In [None]:
pip install -r requirements.txt

Create a `.env` file that defines the following environment variables:
- `OPENAI_API_KEY`
- `PINECONE_API_KEY`

# Preprocessing

## Relation triples
Convert tabular data in `en_rel_triples` and `fr_rel_triples` to N-Triples format. Example:
- input: `http://dbpedia.org/resource/Virton	http://dbpedia.org/property/nw	http://dbpedia.org/resource/Tintigny`
- output: `<http://dbpedia.org/resource/Virton> <http://dbpedia.org/property/nw> <http://dbpedia.org/resource/Tintigny> .`

In [34]:
def convert_to_ntriples(input_file_path, output_file_path):
    """
    Converts a TSV RDF dataset to N-Triples format.

    Every input line is expected to hold exactly three tab-separated URIs
    (subject, predicate, object). Each URI is wrapped in angle brackets and the
    triple is written as `<s> <p> <o> .`. Blank lines are ignored; any other line
    that does not split into exactly three fields is reported and skipped.

    Errors are reported on stdout instead of being raised, so a failing file does
    not interrupt the processing of the remaining datasets.

    Parameters:
    - input_file_path (str): Path to the input TSV file.
    - output_file_path (str): Path to save the processed N-Triples file.
    """
    try:
        with open(input_file_path, "r", encoding="utf-8") as infile, open(output_file_path, "w", encoding="utf-8") as outfile:
            for line in infile:
                record = line.strip()

                # A blank line (e.g. a trailing empty line) is not a malformed triple, so skip it silently
                if not record:
                    continue

                # Split each line using tab as the delimiter
                parts = record.split("\t")

                # Ensure the line has exactly 3 parts
                if len(parts) == 3:
                    subject, predicate, obj = parts
                    # Format the line according to the N-Triples format
                    outfile.write(f"<{subject}> <{predicate}> <{obj}> .\n")
                else:
                    print(f"Skipping invalid line: {record}")

        print(f"N-Triples conversion complete for '{input_file_path}'. Output saved to '{output_file_path}'.")

    except FileNotFoundError as error:
        # Either open() call can raise this (e.g. the output directory does not exist),
        # so report the path that was actually missing rather than always blaming the input.
        missing_path = error.filename if error.filename is not None else input_file_path
        print(f"Error: The file '{missing_path}' was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# (source TSV, N-Triples destination) pairs for the English and French relation triples
datasets = [
    ("fr_en/en_rel_triples", "fr_en/en_rel_triples_preprocessed"),
    ("fr_en/fr_rel_triples", "fr_en/fr_rel_triples_preprocessed"),
]

# Convert each relation-triple file in turn
for input_path, output_path in datasets:
    convert_to_ntriples(input_file_path=input_path, output_file_path=output_path)

N-Triples conversion complete for 'fr_en/en_rel_triples'. Output saved to 'fr_en/en_rel_triples_preprocessed'.
N-Triples conversion complete for 'fr_en/fr_rel_triples'. Output saved to 'fr_en/fr_rel_triples_preprocessed'.


## Attribute triples
Convert negative (BCE) `xsd:date` literals into plain strings so that the attribute triples can be processed in Python. Example:
- input: `-0043-12-07`
- output: `0043-12-07 BCE`

In [None]:
import os

def preprocess_dates_in_file(input_file_path, output_file_path):
    """
    Copies an RDF file, rewriting negative xsd:date literals as plain "<date> BCE" strings.

    For every line that carries the xsd:date datatype marker, the first quoted
    value on the line is inspected. If it starts with "-", the line is rewritten
    so that it ends with the unsigned value followed by " BCE" as an untyped
    literal. All other lines are copied unchanged, and every written line is
    terminated with a newline.

    Parameters:
    - input_file_path (str): Path to the original RDF file.
    - output_file_path (str): Path to save the preprocessed RDF file.
    """
    date_type_marker = '^^<http://www.w3.org/2001/XMLSchema#date>'

    with open(input_file_path, "r", encoding="utf-8") as infile, open(output_file_path, "w", encoding="utf-8") as outfile:
        for line in infile:
            if date_type_marker in line:
                # Cut the line around the first pair of double quotes
                before_literal, opening_quote, remainder = line.partition('"')
                date_string, closing_quote, _ = remainder.partition('"')

                # Rewrite only when both quotes exist and the quoted value is negative
                if opening_quote and closing_quote and date_string.startswith("-"):
                    # Drop the sign and the datatype, keeping the value as a plain string
                    line = f'{before_literal}"{date_string[1:]} BCE" .\n'

            # Terminate the line with a newline if the source line lacked one
            outfile.write(line if line.endswith("\n") else line + "\n")

    print(f"Preprocessed RDF data saved to: {output_file_path}")

# (original file, preprocessed file) pairs for the English and French attribute triples
datasets_to_preprocess = [
    ("fr_en/en_att_triples", "fr_en/en_att_triples_preprocessed"),
    ("fr_en/fr_att_triples", "fr_en/fr_att_triples_preprocessed"),
]

# Rewrite the negative dates in each attribute-triple file
for input_file, output_file in datasets_to_preprocess:
    preprocess_dates_in_file(input_file_path=input_file, output_file_path=output_file)

Preprocessed RDF data saved to: fr_en/en_att_triples_preprocessed
Preprocessed RDF data saved to: fr_en/fr_att_triples_preprocessed


# Graph creation & description generation
Processes the datasets with the following steps:
1) Creates a graph comprised of relation and attribute triples;
2) Extracts all pieces of information about a node and stores it in a `.txt` file

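**Deprecated:** this version has been replaced by the cell in the "Generate node descriptions" section below and is kept, commented out, only for reference.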

In [None]:
# from rdflib import Graph
# import os

# def create_merged_graph(relation_file_path, attribute_file_path):
#     """Creates and returns an RDF graph by merging relation and attribute files."""
#     graph = Graph()
#     graph.parse(relation_file_path, format="nt")
#     graph.parse(attribute_file_path, format="nt")

#     print(f"Merged graph created with {len(graph)} triples.")
#     return graph

# def format_triples_for_embedding(graph, entity_uri):
#     """
#     Formats RDF triples where the given entity is a subject or object into a readable format for embeddings.

#     Parameters:
#     - graph: RDFLib Graph object
#     - entity_uri: URI of the entity to query for

#     Returns:
#     - formatted_text: A single string containing all triples where the entity is subject or object, ready for embedding generation.
#     """

#     # Extract the entity label from the URI
#     entity_label = entity_uri.split('/')[-1]

#     def safe_split(uri):
#         """Replaces None with the current entity label and extracts labels from URIs."""
#         if uri is None:
#             return entity_label
#         return uri.split('/')[-1] if '/' in str(uri) else str(uri)

#     # Prepare SPARQL queries for both subject and object positions
#     query_subject = f"""
#     SELECT ?s ?p ?o
#     WHERE {{
#         <{entity_uri}> ?p ?o .
#     }}
#     """
    
#     query_object = f"""
#     SELECT ?s ?p ?o
#     WHERE {{
#         ?s ?p <{entity_uri}> .
#     }}
#     """

#     # Execute the queries
#     subject_results = graph.query(query_subject)
#     object_results = graph.query(query_object)

#     # Prepare the formatted text for embeddings
#     formatted_text = []

#     # Format triples where the entity is the subject
#     formatted_text.append(f"# Triples where '{entity_label}' is the subject:\n")
#     for s, p, o in subject_results:
#         formatted_text.append(f"{safe_split(s)} {safe_split(p)} {safe_split(o)}.")

#     # Format triples where the entity is the object
#     formatted_text.append(f"\n# Triples where '{entity_label}' is the object:\n")
#     for s, p, o in object_results:
#         formatted_text.append(f"{safe_split(s)} {safe_split(p)} {safe_split(o)}.")

#     # Combine all triples into a single formatted string
#     formatted_text = "\n".join(formatted_text)
#     return formatted_text

# def describe_node_for_embedding_per_subject(graph, output_folder):
#     """Extracts all attributes and relations for every node and saves them as text files."""
#     os.makedirs(output_folder, exist_ok=True)
#     for subject in set(graph.subjects()):
#         subject_label = subject.split('/')[-1]
#         subject_file_path = os.path.join(output_folder, f"{subject_label}.txt")

#         # Use the new function to generate formatted triples for the subject
#         formatted_text = format_triples_for_embedding(graph, subject)

#         # Write the formatted data to the file
#         with open(subject_file_path, "w", encoding="utf-8") as outfile:
#             outfile.write(formatted_text)
#     print(f"Files saved in '{output_folder}'.")

# def process_multiple_datasets(datasets):
#     """Processes multiple RDF datasets and saves results for each."""
#     for relation_file, attribute_file, output_folder in datasets:
#         graph = create_merged_graph(relation_file, attribute_file)
#         describe_node_for_embedding_per_subject(graph, output_folder)

# # List of datasets including both relation and attribute triples
# datasets_to_process = [
#     ("fr_en/en_rel_triples_preprocessed", "fr_en/en_att_triples_preprocessed", "fr_en/en_combined_triples_folder2"),
#     ("fr_en/fr_rel_triples_preprocessed", "fr_en/fr_att_triples_preprocessed", "fr_en/fr_combined_triples_folder2")
# ]

# # Run batch processing
# process_multiple_datasets(datasets_to_process)

Merged graph created with 855133 triples.
Files saved in 'fr_en/en_combined_triples_folder2'.
Merged graph created with 720856 triples.
Files saved in 'fr_en/fr_combined_triples_folder2'.


# Generate node descriptions (~ 2 mins)

In [11]:
from rdflib import Graph
import os

def create_merged_graph(relation_file_path, attribute_file_path):
    """Parses the relation file and then the attribute file (both N-Triples) into one RDF graph and returns it."""
    merged = Graph()
    for source_path in (relation_file_path, attribute_file_path):
        merged.parse(source_path, format="nt")

    triple_count = len(merged)
    print(f"Merged graph created with {triple_count} triples.")
    return merged

def format_triples_for_embedding(graph, entity_uri, language_prefix):
    """
    Formats RDF triples where the given entity is a subject or object into a readable format for embeddings.

    The triples are read from the graph with triple-pattern lookups rather than
    SPARQL queries assembled from the URI text, so the URI is never re-parsed as
    query syntax and no query has to be compiled for each entity.

    Every term is rendered as `language_prefix` + label. For URIs the label is the
    part after the last '/'. Literal values are kept whole, so a value such as
    "3/4" is not truncated to "4".

    Parameters:
    - graph: RDFLib Graph object
    - entity_uri: URI of the entity to query for (an rdflib URIRef or a plain string)
    - language_prefix: Prefix to indicate language (e.g., "FR-" or "EN-")

    Returns:
    - formatted_text: A single string containing all triples where the entity is subject or object, ready for embedding generation.
    """
    # Imported locally because only `Graph` is imported at the top of this cell
    from rdflib import Literal, URIRef

    # Keep accepting plain strings by wrapping them in a URIRef for the lookups
    entity = entity_uri if isinstance(entity_uri, URIRef) else URIRef(str(entity_uri))

    def safe_split(term):
        """Returns the language-prefixed label of an RDF term."""
        if isinstance(term, Literal):
            # A literal is a value, not a path: splitting on '/' would discard part of it
            return f"{language_prefix}{term}"
        return f"{language_prefix}{term.split('/')[-1]}"

    entity_label = safe_split(entity)

    # Triples where the entity is the subject
    formatted_text = [f"# Triples where '{entity_label}' is the subject:\n"]
    for s, p, o in graph.triples((entity, None, None)):
        formatted_text.append(f"{safe_split(s)} {safe_split(p)} {safe_split(o)}.")

    # Triples where the entity is the object
    formatted_text.append(f"\n# Triples where '{entity_label}' is the object:\n")
    for s, p, o in graph.triples((None, None, entity)):
        formatted_text.append(f"{safe_split(s)} {safe_split(p)} {safe_split(o)}.")

    # Combine all triples into a single formatted string
    return "\n".join(formatted_text)

def describe_node_for_embedding_per_subject(graph, output_file_prefix, language_prefix, num_parts=10):
    """
    Extracts all attributes and relations for every subject in the graph and saves them
    in `num_parts` text files named part1.txt, part2.txt, ... inside `output_file_prefix`.

    Subjects are de-duplicated and sorted by their string form, so the contents of
    each file are the same on every run. The subjects are spread as evenly as
    possible: file sizes differ by at most one entity, with the earlier files taking
    the extra ones.

    Parameters:
    - graph: graph object providing `subjects()`; also passed on to format_triples_for_embedding
    - output_file_prefix: Directory in which the part files are written (created if missing)
    - language_prefix: Prefix to indicate language (e.g., "FR-" or "EN-")
    - num_parts: Number of files to split the entities into (default 10)

    Raises:
    - ValueError: if num_parts is smaller than 1
    """
    if num_parts < 1:
        raise ValueError("num_parts must be at least 1")

    # A set has no stable order between runs; sorting makes the file split reproducible
    subjects = sorted(set(graph.subjects()), key=str)
    base_size, remainder = divmod(len(subjects), num_parts)

    os.makedirs(output_file_prefix, exist_ok=True)

    separator = "=" * 80
    start_index = 0
    for part_number in range(1, num_parts + 1):
        # The first `remainder` files take one extra entity so that no file is oversized
        end_index = start_index + base_size + (1 if part_number <= remainder else 0)
        output_file = os.path.join(output_file_prefix, f"part{part_number}.txt")

        with open(output_file, "w", encoding="utf-8") as outfile:
            for subject in subjects[start_index:end_index]:
                # Use the function to generate formatted triples for the subject
                formatted_text = format_triples_for_embedding(graph, subject, language_prefix)
                # Write the formatted data to the file with a separator for each entity
                outfile.write(f"\n{separator}\nEntity: {language_prefix}{subject.split('/')[-1]}\n{separator}\n")
                outfile.write(formatted_text + "\n")
        print(f"File saved: '{output_file}'.")

        start_index = end_index

def process_multiple_datasets(datasets):
    """
    Builds the merged graph for each dataset entry and writes its node descriptions.

    Each entry is a 4-tuple: (relation file, attribute file, output folder, language prefix).
    """
    for entry in datasets:
        relation_file, attribute_file, output_folder, language_prefix = entry
        describe_node_for_embedding_per_subject(
            create_merged_graph(relation_file, attribute_file),
            output_folder,
            language_prefix,
        )

# One entry per language: (relation triples, attribute triples, output folder, language prefix)
datasets_to_process = [
    ("fr_en/en_rel_triples_preprocessed", "fr_en/en_att_triples_preprocessed", "fr_en/en_combined", "EN-"),
    ("fr_en/fr_rel_triples_preprocessed", "fr_en/fr_att_triples_preprocessed", "fr_en/fr_combined", "FR-"),
]

# Build the graphs and write the node descriptions for both languages
process_multiple_datasets(datasets=datasets_to_process)

Merged graph created with 855133 triples.
File saved: 'en_combined/part1.txt'.
File saved: 'en_combined/part2.txt'.
File saved: 'en_combined/part3.txt'.
File saved: 'en_combined/part4.txt'.
File saved: 'en_combined/part5.txt'.
File saved: 'en_combined/part6.txt'.
File saved: 'en_combined/part7.txt'.
File saved: 'en_combined/part8.txt'.
File saved: 'en_combined/part9.txt'.
File saved: 'en_combined/part10.txt'.
Merged graph created with 720856 triples.
File saved: 'fr_combined/part1.txt'.
File saved: 'fr_combined/part2.txt'.
File saved: 'fr_combined/part3.txt'.
File saved: 'fr_combined/part4.txt'.
File saved: 'fr_combined/part5.txt'.
File saved: 'fr_combined/part6.txt'.
File saved: 'fr_combined/part7.txt'.
File saved: 'fr_combined/part8.txt'.
File saved: 'fr_combined/part9.txt'.
File saved: 'fr_combined/part10.txt'.


## Creating the Pinecone index

In [12]:
import os
import time
from pinecone import Pinecone, ServerlessSpec 
from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv()

# Create the Pinecone client; os.environ[...] raises KeyError if PINECONE_API_KEY is not set
pc = Pinecone(
    api_key=os.environ['PINECONE_API_KEY']
)

# Create the index only if no index with this name is listed yet
pinecone_index_name = "dl-proj-4"
if pinecone_index_name not in pc.list_indexes().names():
    pc.create_index(
        name=pinecone_index_name, 
        dimension=1536, # '1536' is the dimension for text-embedding-3-small
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )
     
    # Poll once per second until the index reports ready. There is no timeout, so this
    # blocks for as long as provisioning takes.
    # NOTE(review): the status is reached through `.index` — confirm this attribute is
    # supported by the pinecone client version pinned in requirements.txt.
    while not pc.describe_index(pinecone_index_name).index.status['ready']:
        time.sleep(1)
    
    print("Pinecone Index provisioned")
else:
    print("Pinecone Index Already Provisioned")

Pinecone Index provisioned


## English dataset conversion to embeddings
Create and insert embeddings for the English dataset (~ 13 min)

In [13]:
import os
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import openai
from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv()

# Set the OpenAI API key; os.environ[...] raises KeyError if OPENAI_API_KEY is not set
openai.api_key = os.environ['OPENAI_API_KEY']

# Initialize OpenAI Embeddings using LangChain
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")  # Specify which embedding model

# Load all text files from a directory
directory_path = "fr_en/en_combined"  # directory holding the English node-description .txt files
loader = DirectoryLoader(directory_path, glob="*.txt", loader_cls=TextLoader)  # Load only .txt files
documents = loader.load()

# Use a TextSplitter to split the documents into chunks
# NOTE(review): chunk_overlap is 200 here but 50 in the French cell — confirm the difference is intended.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
split_documents = text_splitter.split_documents(documents)

# Connect to the Pinecone index using LangChain's Pinecone wrapper
# Add all the split documents into the Pinecone vector database
# NOTE(review): no explicit ids are passed, so re-running this cell presumably inserts
# the same chunks again — confirm before re-executing.
pinecone_index_name = "dl-proj-4"
vectorstore = PineconeVectorStore(index_name=pinecone_index_name, embedding=embeddings)
vectorstore.add_documents(documents=split_documents )

print("Embeddings created, and inserted in Pinecone Vector Database successfully!")

Embeddings created, and inserted in Pinecone Vector Database successfully!


## French dataset conversion to embeddings
Create and insert embeddings for the French dataset (~ 12 min)

In [6]:
import os
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import openai
from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv()

# Set the OpenAI API key; os.environ[...] raises KeyError if OPENAI_API_KEY is not set
openai.api_key = os.environ['OPENAI_API_KEY']

# Initialize OpenAI Embeddings using LangChain
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")  # Specify which embedding model

# Load all text files from the directory holding the French node-description .txt files
directory_path = "fr_en/fr_combined" 
loader = DirectoryLoader(directory_path, glob="*.txt", loader_cls=TextLoader)  # Load only .txt files
documents = loader.load()

# Use a TextSplitter to split the documents into chunks
# NOTE(review): chunk_overlap is 50 here but 200 in the English cell — confirm the difference is intended.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=50)
split_documents = text_splitter.split_documents(documents)

# Connect to the Pinecone index using LangChain's Pinecone wrapper
# Add all the split documents into the Pinecone vector database
# (same index name as the English cell, so both languages end up in one index)
# NOTE(review): no explicit ids are passed, so re-running this cell presumably inserts
# the same chunks again — confirm before re-executing.
pinecone_index_name = "dl-proj-4"
vectorstore = PineconeVectorStore(index_name=pinecone_index_name, embedding=embeddings)
vectorstore.add_documents(documents=split_documents )

print("Embeddings created, and inserted in Pinecone Vector Database successfully!")

Embeddings created, and inserted in Pinecone Vector Database successfully!


# Retrieving results from the database

In [2]:
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import PromptTemplate
import os
import openai

from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv()

# Set the OpenAI API key; os.environ[...] raises KeyError if OPENAI_API_KEY is not set.
# The Pinecone key is not set in this cell — presumably the client reads it from the
# environment loaded above; confirm.
openai.api_key = os.environ['OPENAI_API_KEY']

# Initialize OpenAI Embeddings using LangChain
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")  # Specify which embedding model

# Connect to the Pinecone index using LangChain's Pinecone wrapper
pinecone_index_name = "dl-proj-4"
vector_store = PineconeVectorStore(index_name=pinecone_index_name, embedding=embeddings)

# Define the retrieval mechanism
retriever = vector_store.as_retriever(search_kwargs={"k": 3})  # Retrieve top-3 relevant documents

# Initialize the chat model (gpt-4o-mini)
# NOTE(review): temperature=0.7 allows varied answers between runs — confirm this is
# wanted for a prompt that asks for the most accurate answer.
llm = ChatOpenAI( model="gpt-4o-mini", openai_api_key=openai.api_key, temperature=0.7 )

# Define Prompt Template: the retrieved text fills {context}, the user query fills {question}
prompt_template = PromptTemplate(
    template="""
    Use the following context to answer the question as accurately as possible:
    Context: {context}
    Question: {question}
    Answer:""",
    input_variables=["context", "question"]
)

# Create LLM Chain: prompt -> chat model -> plain string output
llm_chain = prompt_template | llm | StrOutputParser()

# Retrieve documents
query = "Which entity is most similar to Cicéron? Provide only entity name"
docs = retriever.invoke(query)
# Join the retrieved chunks into one context string, separated by blank lines
context = "\n\n".join([doc.page_content for doc in docs])
    
# Run LLM chain with the retrieved context
answer = llm_chain.invoke({"context": context, "question": query})

# Print the answer, then the retrieved context that was passed to the model
print("Answer:", answer)
print("Context: ", context)

Answer: Marcus Tullius Cicero


# Aligning entities

# TODO: should any validation be done here?

# Computing evaluation metrics