In [0]:
import pandas as pd

# Load the datasets
relationship_df = pd.read_csv('relationship-export.csv')
node_df = pd.read_csv('node-export.csv')
graph_df = pd.read_csv('graph-export.csv')

# Merge node properties
merged_node_df = pd.merge(node_df, graph_df, how='outer', left_on='~id', right_on='~start_node_id', suffixes=('_node', '_graph'))

# Inspect the columns
print("Columns after merging node and graph datasets:")
print(merged_node_df.columns)

# Select only the existing columns
columns_to_keep = ['~id', '~labels', 
                   'text_node', 'embedding_node', 'position_node', 
                   'length_node', 'content_offset_node', 'page_number_node', 
                   '~start_node_property_text', '~start_node_property_embedding']

# Filter columns dynamically
columns_to_keep_existing = [col for col in columns_to_keep if col in merged_node_df.columns]

# Create the cleaned node dataframe
cleaned_node_df = merged_node_df[columns_to_keep_existing]

# Merge the cleaned node dataset with the relationship dataset using the '~start_node_id' and '~end_node_id'
# attach the corresponding node properties to each relationship
merged_relationship_df = pd.merge(relationship_df, cleaned_node_df, how='left', left_on='~start_node_id', right_on='~id')
merged_relationship_df = pd.merge(merged_relationship_df, cleaned_node_df, how='left', left_on='~end_node_id', right_on='~id', suffixes=('_start', '_end'))

# Inspect the columns after merging the relationships and node data
print("Columns after merging relationships with node data:")
print(merged_relationship_df.columns)

# select the columns for start and end node text and embeddings
final_columns_to_keep = ['~start_node_id', '~end_node_id', '~relationship_type']

# Filter columns that start with 'text_' or 'embedding_' for both start and end nodes
final_columns_to_keep += [col for col in merged_relationship_df.columns if 'text_' in col or 'embedding_' in col]

# Filter the final merged DataFrame
final_merged_df = merged_relationship_df[final_columns_to_keep]

# Clean up any remaining NaN or unnecessary rows
final_cleaned_df = final_merged_df.dropna(subset=['~start_node_id', '~end_node_id'])

# Save dataset
final_cleaned_df.to_csv('final_merged_cleaned_dataset.csv', index=False)

# Display
print("First few rows of the cleaned and merged dataset:")
print(final_cleaned_df.head())

Columns after merging node and graph datasets:
Index(['~id', '~labels', 'fileName', 'errorMessage', 'fileSource',
       'total_chunks', 'processingTime', 'createdAt', 'fileSize', 'nodeCount',
       'model', 'processed_chunk', 'fileType', 'relationshipCount',
       'is_cancelled', 'status', 'updatedAt', 'content_offset', 'page_number',
       'length', 'id', 'text', 'position', 'embedding', 'description',
       '~start_node_id', '~start_node_labels', '~start_node_property_fileName',
       '~start_node_property_content_offset',
       '~start_node_property_page_number', '~start_node_property_length',
       '~start_node_property_id', '~start_node_property_text',
       '~start_node_property_position', '~start_node_property_embedding',
       '~relationship_type', '~end_node_id', '~end_node_labels',
       '~end_node_property_fileName', '~end_node_property_errorMessage',
       '~end_node_property_fileSource', '~end_node_property_total_chunks',
       '~end_node_property_processingTi

In [0]:
import networkx as nx
import pandas as pd

# Load dataset
final_cleaned_df = pd.read_csv('final_merged_cleaned_dataset.csv')

# Inspect the columns to find the correct names for text and embedding columns
print("Columns in the final dataset:")
print(final_cleaned_df.columns)

# Create an empty directed graph
G = nx.DiGraph()

# Add nodes with their properties (text and embeddings)
# Convert text and embedding values to strings, using placeholders for None values
for index, row in final_cleaned_df.iterrows():
    # Convert to string or use empty string if None
    start_text = str(row.get('~start_node_property_text_start', ''))
    start_embedding = str(row.get('~start_node_property_embedding_start', ''))
    
    end_text = str(row.get('~start_node_property_text_end', ''))
    end_embedding = str(row.get('~start_node_property_embedding_end', ''))
    
    # Add start node with its properties
    G.add_node(row['~start_node_id'], text=start_text, embedding=start_embedding)
    
    # Add end node with its properties
    G.add_node(row['~end_node_id'], text=end_text, embedding=end_embedding)

# Add edges (relationships) between nodes
for index, row in final_cleaned_df.iterrows():
    G.add_edge(row['~start_node_id'], row['~end_node_id'], relationship=row['~relationship_type'])

# Save the graph structure (now it will work because all attributes are strings)
nx.write_gml(G, 'kg_graph.gml')

# Querying the graph
# Query the graph to find relationships or node information
example_node = final_cleaned_df.iloc[0]['~start_node_id']  
print(f"Node {example_node} text:", G.nodes[example_node]['text'])
print(f"Node {example_node} embedding:", G.nodes[example_node]['embedding'])

connected_nodes = list(G.successors(example_node))
print(f"Nodes connected to {example_node}:", connected_nodes)

Columns in the final dataset:
Index(['~start_node_id', '~end_node_id', '~relationship_type',
       '~start_node_property_text_start',
       '~start_node_property_embedding_start', '~start_node_property_text_end',
       '~start_node_property_embedding_end'],
      dtype='object')
Node 1 text: Definitions accepted offer is acceptance is entering a written agreement the offer of placement is accepted within the nominated tim navigate accepted offers offer status a ie accepted includes cancelled studylink ltigtpending conditionsltigt created by australian government department of education esos framework admission is students admission into a program covering the point of admission and record of ongoing status of their adm does not include cancelled ltigtall conditions have been metltigt created by australian government dictionary advanced diploma is the purpose of the advanced diploma qualification type is to qualify individuals who apply specialised uk level 4 equivalent qualification

In [0]:
pip install faiss-cpu

Collecting faiss-cpu
  Obtaining dependency information for faiss-cpu from https://files.pythonhosted.org/packages/76/6c/256239bd142101cd2ce50d920622ab6d5a03742eabc462db49d7910c69c7/faiss_cpu-1.8.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading faiss_cpu-1.8.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Downloading faiss_cpu-1.8.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/27.0 MB[0m [31m?[0m eta [36m-:--:--[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/27.0 MB[0m [31m7.5 MB/s[0m eta [36m0:00:04[0m
[2K   [91m━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/27.0 MB[0m [31m45.0 MB/s[0m eta [36m0:00:01[0m
[2K   [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/27.0 MB[0m [31m51.8 MB/s[0m eta [36m0:00:01[0m
[2K   [91

In [0]:
import networkx as nx
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import faiss
import time
import psutil
from functools import lru_cache
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Define Document class
class Document:
    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        self.metadata = metadata or {}

# Load definitions from a text file
def load_definitions(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return [line.strip() for line in file if line.strip()]

# Define the KG-augmented retriever class
class KGAndFAISSRetriever:
    def __init__(self, definitions, kg, num_retrieved_docs=5):
        # FAISS document retrieval
        all_documents = [Document(definition) for definition in definitions]
        embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        self.db = FAISS.from_documents(all_documents, embeddings)
        self.retriever = self.db.as_retriever(search_kwargs={"k": num_retrieved_docs})
        
        # Knowledge Graph (KG)
        self.kg = kg
        self.num_retrieved_docs = num_retrieved_docs

    def search(self, query):
        # FAISS retrieval
        faiss_docs = self.retriever.get_relevant_documents(query)
        
        # KG retrieval
        kg_info = self.query_kg(query)
        
        return faiss_docs, kg_info

    def query_kg(self, query):
        # Query the KG for relevant nodes
        relevant_nodes = []
        for node in self.kg.nodes:
            if query.lower() in self.kg.nodes[node]['text'].lower():
                relevant_nodes.append(self.kg.nodes[node]['text'])
        return " ".join(relevant_nodes[:self.num_retrieved_docs])

# Define the T5 Assistant class for generation
class T5Assistant:
    def __init__(self, model_name='t5-small'):
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)

    def create_prompt(self, query, retrieved_info):
        return (f"Explain the concept or answer the question in a detailed manner using simple words and examples.\n"
                f"Instruction: {query}\n"
                f"Relevant information: {retrieved_info}\n"
                f"Output:")

    def generate_reply(self, query, retrieved_info):
        prompt = self.create_prompt(query, retrieved_info)
        input_ids = self.tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).input_ids
        outputs = self.model.generate(input_ids, max_length=100, num_beams=5, early_stopping=True)
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

# Calculate BLEU score
def calculate_bleu(reference, candidate):
    reference = [reference.split()]
    candidate = candidate.split()
    smoothie = SmoothingFunction().method4
    return sentence_bleu(reference, candidate, smoothing_function=smoothie)

# Print memory usage
def print_memory_usage():
    process = psutil.Process()
    memory_info = process.memory_info()
    print(f"Memory Usage: {memory_info.rss / 1024 / 1024:.2f} MB")

# LRU cache to speed up repeated queries
@lru_cache(maxsize=10)
def cached_generate_reply(assistant, query, retrieved_info):
    return assistant.generate_reply(query, retrieved_info)

if __name__ == "__main__":
    # Load definitions and KG
    definitions = load_definitions('ctx_pd.txt')
    
    # Load the knowledge graph
    kg = nx.read_gml('kg_graph.gml')
    
    # Initialize the retriever (with KG and FAISS)
    retriever = KGAndFAISSRetriever(definitions, kg, num_retrieved_docs=5)
    assistant = T5Assistant(model_name='t5-small')

    # Sample queries
    generated_queries = [
        "What is accreditation?",
        "How is assessment done?",
        "What is a bachelor's degree?",
        "What is a campus?"
    ]

    # Process each query
    for query in generated_queries:
        start_time = time.time()
        
        # Retrieve from both FAISS and KG
        faiss_docs, kg_info = retriever.search(query)
        faiss_info = " ".join([doc.page_content for doc in faiss_docs])
        retrieved_info = f"FAISS info: {faiss_info}\nKG info: {kg_info}"

        # Find the reference answer
        reference_answer = faiss_docs[0].page_content if faiss_docs else ""
        
        # Generate reply
        generated_reply = cached_generate_reply(assistant, query, retrieved_info)

        # Calculate BLEU score
        if reference_answer:
            bleu_score = calculate_bleu(reference_answer, generated_reply)
        else:
            bleu_score = "N/A"

        # Output the results
        print(f"Query: {query}")
        print(f"Generated Reply:\n{generated_reply}")
        print(f"Reference Answer:\n{reference_answer}")
        print(f"BLEU Score: {bleu_score}")

        print_memory_usage()

        end_time = time.time()
        print(f"Time taken for this query: {end_time - start_time:.2f} seconds\n")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  warn_deprecated(


Query: What is accreditation?
Generated Reply:
: What is accreditation? Relevant information: FAISS info: What is accreditation? Relevant information: FAISS info: What is accreditation? Relevant information: FAISS info: Academic Accreditation is Approval process by an accrediting authority of a learning program leading to an AQF qualification. OR the evaluation of a course of study against course requirements specified in the Higher Education Standards Framework.
Reference Answer:
Academic Accreditation is Approval process by an accrediting authority of a learning program leading to an AQF qualification using the quality assurance standards for the relevant education and training sector OR the evaluation of a course of study (either by TEQSA or by a self-accrediting provider such as a university) against course requirements specified in the Higher Education Standards Framework. created by AQF (Australian Qualifications Framework)
BLEU Score: 0.4674996214456577
Memory Usage: 1706.05 MB


[Trace(request_id=tr-9fc6b9f9d2a64216b04e7adb9d816df7), Trace(request_id=tr-8cae4326ca6d4e389bfdf111b0386f68), Trace(request_id=tr-2663d7722c864d6599b7bcd8ee13ef9e), Trace(request_id=tr-02c9782bdba148358e2555aa6127bbc0)]