**Data Processing**

In [0]:
import pandas as pd

# Load the three CSV exports: relationships, nodes, and the graph export
# (the graph export carries start/end node properties per relationship).
relationship_df = pd.read_csv('relationship-export.csv')
node_df = pd.read_csv('node-export.csv')
graph_df = pd.read_csv('graph-export.csv')

# Attach graph-export properties to each node by matching the node id against
# the export's start-node id. Because the graph export has one row per
# relationship, a node with several outgoing relationships appears several
# times in this merged frame; it is reduced to one row per node further down.
merged_node_df = pd.merge(node_df, graph_df, how='outer', left_on='~id', right_on='~start_node_id', suffixes=('_node', '_graph'))

# Inspect the columns
print("Columns after merging node and graph datasets:")
print(merged_node_df.columns)

# Candidate columns to keep. The '_node' suffixed names only exist when both
# inputs share a column name; any name that is absent is filtered out below.
columns_to_keep = ['~id', '~labels',
                   'text_node', 'embedding_node', 'position_node',
                   'length_node', 'content_offset_node', 'page_number_node',
                   '~start_node_property_text', '~start_node_property_embedding']

# Keep only the candidates that are actually present after the merge
columns_to_keep_existing = [col for col in columns_to_keep if col in merged_node_df.columns]

# Build a node lookup table with exactly one row per node id.
# Without this, every relationship row below would be multiplied by the number
# of times its start node and its end node occur in the merged node frame.
# NOTE(review): assumes the kept properties are identical across the repeated
# rows of one node, so keeping the first occurrence loses nothing - confirm.
cleaned_node_df = (
    merged_node_df[columns_to_keep_existing]
    .dropna(subset=['~id'])
    .drop_duplicates(subset='~id', keep='first')
)

# Attach node properties to each relationship, first for the start node and
# then for the end node. `validate` makes pandas raise if the node table ever
# stops being unique on '~id', instead of silently duplicating relationships.
merged_relationship_df = pd.merge(relationship_df, cleaned_node_df, how='left', left_on='~start_node_id', right_on='~id', validate='many_to_one')
merged_relationship_df = pd.merge(merged_relationship_df, cleaned_node_df, how='left', left_on='~end_node_id', right_on='~id', suffixes=('_start', '_end'), validate='many_to_one')

# Inspect the columns after merging the relationships and node data
print("Columns after merging relationships with node data:")
print(merged_relationship_df.columns)

# Relationship identity columns that are always kept
final_columns_to_keep = ['~start_node_id', '~end_node_id', '~relationship_type']

# Add every column whose name contains 'text_' or 'embedding_' (substring
# match), which picks up the '_start' / '_end' suffixed property columns.
final_columns_to_keep += [col for col in merged_relationship_df.columns if 'text_' in col or 'embedding_' in col]

# Filter the final merged DataFrame
final_merged_df = merged_relationship_df[final_columns_to_keep]

# Drop relationships that are missing either endpoint id
final_cleaned_df = final_merged_df.dropna(subset=['~start_node_id', '~end_node_id'])

# Save dataset (read back by the knowledge-graph construction cell)
final_cleaned_df.to_csv('final_merged_cleaned_dataset.csv', index=False)

# Display
print("First few rows of the cleaned and merged dataset:")
print(final_cleaned_df.head())

Columns after merging node and graph datasets:
Index(['~id', '~labels', 'fileName', 'errorMessage', 'fileSource',
       'total_chunks', 'processingTime', 'createdAt', 'fileSize', 'nodeCount',
       'model', 'processed_chunk', 'fileType', 'relationshipCount',
       'is_cancelled', 'status', 'updatedAt', 'content_offset', 'page_number',
       'length', 'id', 'text', 'position', 'embedding', 'description',
       '~start_node_id', '~start_node_labels', '~start_node_property_fileName',
       '~start_node_property_content_offset',
       '~start_node_property_page_number', '~start_node_property_length',
       '~start_node_property_id', '~start_node_property_text',
       '~start_node_property_position', '~start_node_property_embedding',
       '~relationship_type', '~end_node_id', '~end_node_labels',
       '~end_node_property_fileName', '~end_node_property_errorMessage',
       '~end_node_property_fileSource', '~end_node_property_total_chunks',
       '~end_node_property_processingTi

**Knowledge Graph Construction and Querying**

In [0]:
import networkx as nx
import pandas as pd

# Load dataset
final_cleaned_df = pd.read_csv('final_merged_cleaned_dataset.csv')

# Inspect the columns to find the correct names for text and embedding columns
print("Columns in the final dataset:")
print(final_cleaned_df.columns)

# Create an empty directed graph
G = nx.DiGraph()


def _as_text(value):
    """Return ``value`` as a string, mapping missing values (None/NaN) to ''.

    A plain ``str(value)`` would turn pandas' NaN into the literal text 'nan',
    which would then be stored on the node as if it were real content.
    """
    return '' if pd.isna(value) else str(value)


def _upsert_node(graph, node_id, text, embedding):
    """Add ``node_id`` to ``graph`` or update it without erasing known values.

    A node can occur in many rows (as start and as end node). An empty value
    from a later row must not overwrite a non-empty value stored earlier;
    non-empty values still follow last-write-wins.
    """
    if node_id not in graph:
        graph.add_node(node_id, text=text, embedding=embedding)
        return
    attrs = graph.nodes[node_id]
    if text:
        attrs['text'] = text
    if embedding:
        attrs['embedding'] = embedding


# Single pass over the relationships: register both endpoints with their
# text/embedding, then connect them with an edge labelled by relationship type.
for _, row in final_cleaned_df.iterrows():
    start_id = row['~start_node_id']
    end_id = row['~end_node_id']

    _upsert_node(
        G,
        start_id,
        _as_text(row.get('~start_node_property_text_start')),
        _as_text(row.get('~start_node_property_embedding_start')),
    )
    _upsert_node(
        G,
        end_id,
        _as_text(row.get('~start_node_property_text_end')),
        _as_text(row.get('~start_node_property_embedding_end')),
    )

    G.add_edge(start_id, end_id, relationship=row['~relationship_type'])

# Save the graph structure
nx.write_gml(G, 'kg_graph.gml')

# Querying the graph
# Look up the first relationship's start node and show its stored properties
example_node = final_cleaned_df.iloc[0]['~start_node_id']
print(f"Node {example_node} text:", G.nodes[example_node]['text'])
print(f"Node {example_node} embedding:", G.nodes[example_node]['embedding'])

# Successors are the nodes reachable through one outgoing edge
connected_nodes = list(G.successors(example_node))
print(f"Nodes connected to {example_node}:", connected_nodes)

Columns in the final dataset:
Index(['~start_node_id', '~end_node_id', '~relationship_type',
       '~start_node_property_text_start',
       '~start_node_property_embedding_start', '~start_node_property_text_end',
       '~start_node_property_embedding_end'],
      dtype='object')
Node 1 text: Definitions accepted offer is acceptance is entering a written agreement the offer of placement is accepted within the nominated tim navigate accepted offers offer status a ie accepted includes cancelled studylink ltigtpending conditionsltigt created by australian government department of education esos framework admission is students admission into a program covering the point of admission and record of ongoing status of their adm does not include cancelled ltigtall conditions have been metltigt created by australian government dictionary advanced diploma is the purpose of the advanced diploma qualification type is to qualify individuals who apply specialised uk level 4 equivalent qualification

**Import Libraries and Define Classes**

In [0]:
pip install ragas

Collecting numpy (from ragas)
  Obtaining dependency information for numpy from https://files.pythonhosted.org/packages/3a/d0/edc009c27b406c4f9cbc79274d6e46d634d139075492ad055e3d68445925/numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.0 kB[0m [31m?[0m eta [36m-:--:--[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/18.3 MB[0m [31m?[0m eta [36m-:--:--[0m
[2K   [91m━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/18.3 MB[0m [31m84.7 MB/s[0m eta [36m0:00:01[0m
[2K   [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━

In [0]:
pip install faiss-cpu

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
import networkx as nx
from transformers import T5Tokenizer, T5ForConditionalGeneration
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import time
import psutil
from functools import lru_cache
from ragas.metrics import context_precision, faithfulness, answer_relevancy, context_recall
from ragas import evaluate
from datasets import Dataset
import pandas as pd

# Document class definition
class Document:
    """Minimal document record: a text payload plus an optional metadata dict."""

    def __init__(self, page_content, metadata=None):
        """Store the content; a missing or empty ``metadata`` becomes a new dict."""
        if not metadata:
            metadata = {}
        self.metadata = metadata
        self.page_content = page_content

# Function to load definitions from a text file
def load_definitions(file_path):
    """Read a UTF-8 text file and return its non-blank lines, whitespace-stripped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of strings in file order; lines that are empty after stripping
        are left out.
    """
    definitions = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped:
                definitions.append(stripped)
    return definitions

**Knowledge Graph & FAISS Retriever**

In [0]:
# KG and FAISS retriever class
class KGAndFAISSRetriever:
    """Retrieve context from a FAISS vector index and a knowledge graph.

    The FAISS index is built from the given definitions using the
    ``sentence-transformers/all-MiniLM-L6-v2`` embedding model; the knowledge
    graph is searched by case-insensitive substring match on node text.
    """

    def __init__(self, definitions, kg, num_retrieved_docs=5):
        """Build the FAISS index and remember the graph.

        Args:
            definitions: Iterable of definition strings; each is wrapped in a
                ``Document`` and indexed.
            kg: Graph object exposing ``kg.nodes`` (iterable of node keys that
                also maps a key to that node's attribute dict).
            num_retrieved_docs: Number of FAISS hits requested per query and
                the cap on graph texts returned by ``query_kg``.
        """
        # FAISS retrieval setup
        all_documents = [Document(definition) for definition in definitions]
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        self.db = FAISS.from_documents(all_documents, embeddings)
        self.retriever = self.db.as_retriever(search_kwargs={"k": num_retrieved_docs})

        # Knowledge Graph (KG)
        self.kg = kg
        self.num_retrieved_docs = num_retrieved_docs

    def search(self, query):
        """Return ``(faiss_docs, kg_info)`` for ``query``.

        ``faiss_docs`` is whatever the FAISS retriever returns for the query;
        ``kg_info`` is the string produced by ``query_kg``.
        """
        faiss_docs = self.retriever.get_relevant_documents(query)
        kg_info = self.query_kg(query)
        return faiss_docs, kg_info

    def query_kg(self, query):
        """Return the texts of graph nodes that contain ``query``, space-joined.

        Matching is a case-insensitive substring test of the whole query
        against each node's ``text`` attribute. At most ``num_retrieved_docs``
        texts are returned, in graph iteration order. Nodes without a string
        ``text`` attribute are skipped, and a blank query returns ``""``
        (an empty needle would otherwise match every node).
        """
        if not query.strip():
            return ""

        needle = query.lower()  # hoisted: invariant across nodes
        limit = self.num_retrieved_docs
        relevant_nodes = []
        for node in self.kg.nodes:
            text = self.kg.nodes[node].get('text')
            if not isinstance(text, str):
                continue
            if needle in text.lower():
                relevant_nodes.append(text)
                # Stop scanning once enough matches have been collected
                if len(relevant_nodes) == limit:
                    break
        # The slice keeps the original semantics for a zero or negative limit
        return " ".join(relevant_nodes[:limit])

**Flan-T5 Assistant Model**

In [0]:
# Flan-T5 assistant class for generation
class FlanT5Assistant:
    """Text generator backed by a T5 checkpoint (default ``google/flan-t5-small``)."""

    def __init__(self, model_name='google/flan-t5-small'):
        """Load the tokenizer and the conditional-generation model for ``model_name``."""
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)

    def create_prompt(self, query, retrieved_info):
        """Assemble the prompt: fixed instruction, the query, the retrieved text, then 'Output:'."""
        header = "Explain the concept or answer the question in a detailed manner using simple words and examples.\n"
        instruction_line = f"Instruction: {query}\n"
        context_line = f"Relevant information: {retrieved_info}\n"
        return header + instruction_line + context_line + "Output:"

    def generate_reply(self, query, retrieved_info):
        """Generate a reply for ``query`` given ``retrieved_info``.

        The prompt is tokenized with truncation enabled, decoded with beam
        search (5 beams, ``max_length=100``), and the first returned sequence
        is decoded to text with special tokens removed.
        """
        encoded = self.tokenizer(
            self.create_prompt(query, retrieved_info),
            return_tensors="pt",
            padding=True,
            truncation=True,
        )
        sequences = self.model.generate(
            encoded.input_ids,
            max_length=100,
            num_beams=5,
            early_stopping=True,
        )
        best_sequence = sequences[0]
        return self.tokenizer.decode(best_sequence, skip_special_tokens=True)

**T5 Assistant Model**

In [0]:
# T5 Assistant class for generation
class T5Assistant:
    """Text generator backed by a T5 checkpoint (default ``t5-small``)."""

    def __init__(self, model_name='t5-small'):
        """Load the tokenizer and the conditional-generation model for ``model_name``."""
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)

    def create_prompt(self, query, retrieved_info):
        """Return the prompt text: instruction header, query, retrieved text, 'Output:' marker."""
        parts = (
            "Explain the concept or answer the question in a detailed manner using simple words and examples.\n",
            f"Instruction: {query}\n",
            f"Relevant information: {retrieved_info}\n",
            "Output:",
        )
        return "".join(parts)

    def generate_reply(self, query, retrieved_info):
        """Build the prompt, run beam-search generation, and decode the first sequence.

        Tokenization uses truncation; generation uses 5 beams with
        ``max_length=100``; special tokens are dropped when decoding.
        """
        prompt_text = self.create_prompt(query, retrieved_info)
        tokenized = self.tokenizer(prompt_text, return_tensors="pt", padding=True, truncation=True)
        generation_options = dict(max_length=100, num_beams=5, early_stopping=True)
        generated = self.model.generate(tokenized.input_ids, **generation_options)
        return self.tokenizer.decode(generated[0], skip_special_tokens=True)


**Utility Functions**

In [0]:
# Calculate BLEU score
def calculate_bleu(reference, candidate):
    """Compute a smoothed sentence-level BLEU score between two strings.

    Both strings are tokenized by whitespace; ``reference`` is used as the
    single reference and ``candidate`` as the hypothesis. NLTK's
    ``SmoothingFunction().method4`` is applied.

    Args:
        reference: Reference text.
        candidate: Generated text to score.

    Returns:
        The value returned by ``nltk.translate.bleu_score.sentence_bleu``.
    """
    # Imported here so the function works as soon as this cell has run; the
    # notebook otherwise only imports these names in a later cell.
    from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

    reference_tokens = reference.split()
    candidate_tokens = candidate.split()
    smoothing = SmoothingFunction().method4
    return sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=smoothing)

# Print memory usage
def print_memory_usage():
    """Print the current process's resident set size in MB, to two decimals."""
    rss_bytes = psutil.Process().memory_info().rss
    rss_megabytes = rss_bytes / 1024 / 1024
    print(f"Memory Usage: {rss_megabytes:.2f} MB")

# LRU cache to speed up repeated queries
@lru_cache(maxsize=10)
def cached_generate_reply(assistant, query, retrieved_info):
    """Memoized wrapper around ``assistant.generate_reply``.

    Results are cached (up to 10 entries, least-recently-used eviction) under
    the key ``(assistant, query, retrieved_info)``, so all three arguments
    must be hashable.
    """
    reply = assistant.generate_reply(query, retrieved_info)
    return reply

**Process Queries and Evaluate**

In [0]:
import networkx as nx
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import faiss
import time
import psutil
from functools import lru_cache
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def run_queries_with_assistant(assistant, model_label, queries, retriever):
    """Answer every query with ``assistant`` and print a per-query report.

    For each query this retrieves FAISS documents and KG text, builds the
    combined context string, generates a reply through the LRU-cached
    wrapper, scores it with BLEU against the reference, and prints the
    reply, reference, score, process memory usage and elapsed time.

    Args:
        assistant: Object with a ``generate_reply(query, retrieved_info)`` method.
        model_label: Short model name shown in the "Generated Reply (...)" line.
        queries: Iterable of query strings.
        retriever: Object whose ``search(query)`` returns ``(faiss_docs, kg_info)``.
    """
    for query in queries:
        start_time = time.time()

        # Retrieve from both FAISS and KG
        faiss_docs, kg_info = retriever.search(query)
        faiss_info = " ".join([doc.page_content for doc in faiss_docs])
        retrieved_info = f"FAISS info: {faiss_info}\nKG info: {kg_info}"

        # The top FAISS hit doubles as the reference answer. It is also part
        # of the prompt, so BLEU here rewards copying the retrieved text.
        reference_answer = faiss_docs[0].page_content if faiss_docs else ""

        # Generate reply
        generated_reply = cached_generate_reply(assistant, query, retrieved_info)

        # Calculate BLEU score (only meaningful when a reference exists)
        if reference_answer:
            bleu_score = calculate_bleu(reference_answer, generated_reply)
        else:
            bleu_score = "N/A"

        # Output the results
        print(f"Query: {query}")
        print(f"Generated Reply ({model_label}):\n{generated_reply}")
        print(f"Reference Answer:\n{reference_answer}")
        print(f"BLEU Score: {bleu_score}")

        print_memory_usage()

        end_time = time.time()
        print(f"Time taken for this query: {end_time - start_time:.2f} seconds\n")


if __name__ == "__main__":
    # Load definitions and KG
    definitions = load_definitions('ctx_pd.txt')

    # Load the knowledge graph
    kg = nx.read_gml('kg_graph.gml')

    # Initialize the retriever (with KG and FAISS)
    retriever = KGAndFAISSRetriever(definitions, kg, num_retrieved_docs=5)

    # Sample queries
    generated_queries = [
        "What is the definition of 'Articulation'?",
        "Which accreditation framework is mentioned under the definition of 'Doctoral Degree'?",
        "Which organization is referenced in the definition of 'Assessment'?",
        "What is the difference between a 'Bachelor Degree' and a 'Bachelor Honours Degree' with reference to qualification level in the AQF?",
        "If a student studies in a foreign educational institution and is not a citizen or permanent resident of Australia, what term would the dataset use to categorize them as per the definitions?",
        "If a student completes a Certificate III and intends to directly pursue a Bachelor Degree, which concept from this dataset would likely apply to their transition?",
        "Describe the relationship between 'Foundation Course' and 'Pathway Course' based on their respective definitions.",
        "What percentage of modules studied that received a pass grade is referred to in the dataset, and under what name is this metric captured?",
        "If a student qualifies under the 'Doctoral Degree (Research)' category of the AQF, what learning outcome is significant in their qualification process?",
        "Based on the definitions provided, how would the process of 'Admission' differ from the process of an 'Application', and what criteria must a student meet to progress from one to the other?"
    ]

    ### Run queries using `FlanT5Assistant`
    print("Running with Flan-T5-Small Model")
    assistant_flan = FlanT5Assistant(model_name='google/flan-t5-small')
    run_queries_with_assistant(assistant_flan, "Flan-T5", generated_queries, retriever)

    ### Run queries using `T5Assistant`
    print("Running with T5-Small Model")
    assistant_t5 = T5Assistant(model_name='t5-small')
    run_queries_with_assistant(assistant_t5, "T5-Small", generated_queries, retriever)



Running with Flan-T5-Small Model


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Query: What is the definition of 'Articulation'?
Generated Reply (Flan-T5):
KG
Reference Answer:
Articulation is Arrangements enabling students to progress from a completed qualification to another with admission and/or credit in a defined qualification pathway  Or qualifying to enter the Bachelor program. created by TEQSA
BLEU Score: 0
Memory Usage: 2425.71 MB
Time taken for this query: 3.06 seconds

Query: Which accreditation framework is mentioned under the definition of 'Doctoral Degree'?
Generated Reply (Flan-T5):
professional or highly skilled work
Reference Answer:
"Doctoral Degree is Course with major research component: comprised of two-thirds or more research leading to a thesis/dissertation OR qualifies individuals who apply a substantial body of knowledge to research, investigate and develop new knowledge, in one or more fields of investigation, scholarship or professional practice. Two forms of Doctoral Degree with the same descriptor within the Doctoral Degree qualificati

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Query: What is the definition of 'Articulation'?
Generated Reply (T5-Small):
'Articulation'? Relevant information: FAISS info: Articulation is Arrangements enabling students to progress from a completed qualification to another with admission and/or credit in a defined qualification pathway Or qualifying to enter the Bachelor program.
Reference Answer:
Articulation is Arrangements enabling students to progress from a completed qualification to another with admission and/or credit in a defined qualification pathway  Or qualifying to enter the Bachelor program. created by TEQSA
BLEU Score: 0.8459547884985757
Memory Usage: 2569.15 MB
Time taken for this query: 4.08 seconds

Query: Which accreditation framework is mentioned under the definition of 'Doctoral Degree'?
Generated Reply (T5-Small):
a major research component: comprised of two-thirds or more research leading to a thesis/dissertation OR qualifies individuals who apply a substantial body of knowledge to research, investigate and d

[Trace(request_id=tr-8416d425d19b4c23aac172355dfdd152), Trace(request_id=tr-4bb9b4e994d8431fa3f4ed2e1726bb57), Trace(request_id=tr-11c0dadd9a89421db2743504c5825517), Trace(request_id=tr-f5817a55deb548cdaad1781fc6f15cae), Trace(request_id=tr-b0a117bcd3a94022a9a96c99220169cf), Trace(request_id=tr-eed4b9cca9394854ac742602bd8e11fd), Trace(request_id=tr-3c733ef4488b4dfd9c0cb600a3d19a3f), Trace(request_id=tr-3da249f9f9574c0388e4a44d5be49634), Trace(request_id=tr-c8a7ede2726043018ca68e5c94b7acf0), Trace(request_id=tr-eccb726f306d4172b83f43f391d4e4c2)]

In [0]:
# Define the function to create a comparison table
def create_comparison_table(queries, generated_replies, file_name="comparison_table.csv"):
    """Pair each query with its generated reply, save the table as CSV, and return it.

    Args:
        queries: Sequence of query strings (column "Query").
        generated_replies: Sequence of replies (column "Generated Reply").
        file_name: Destination CSV path; the index is not written.

    Returns:
        The two-column ``pandas.DataFrame`` that was written.
    """
    comparison_df = pd.DataFrame({"Query": queries, "Generated Reply": generated_replies})
    comparison_df.to_csv(file_name, index=False)
    print(f"Comparison table saved to {file_name}")
    return comparison_df

# Initialize an empty list for the generated replies
generated_replies = []

# Assuming `generated_queries` contains your sample queries
for query in generated_queries:
    # Retrieve from both FAISS and KG
    faiss_docs, kg_info = retriever.search(query)
    faiss_info = " ".join([doc.page_content for doc in faiss_docs])
    
    # Generate reply
    retrieved_info = f"FAISS info: {faiss_info}\nKG info: {kg_info}"
    generated_reply = cached_generate_reply(assistant, query, retrieved_info)
    
    # Store the generated reply
    generated_replies.append(generated_reply)

# Now that `generated_replies` are collected, we can create the comparison table
create_comparison_table(generated_queries, generated_replies)

Comparison table saved to comparison_table.csv


Unnamed: 0,Query,Generated Reply
0,What is the definition of 'Articulation'?,KG
1,Which accreditation framework is mentioned und...,professional or highly skilled work
2,Which organization is referenced in the defini...,Assessment
3,What is the difference between a 'Bachelor Deg...,Bachelor Honours Degree
4,If a student studies in a foreign educational ...,KG
5,If a student completes a Certificate III and i...,Bachelor degree
6,Describe the relationship between 'Foundation ...,Foundation Course)
7,What percentage of modules studied that receiv...,Assessment Result is the mark given for an ass...
8,If a student qualifies under the 'Doctoral Deg...,Diploma
9,"Based on the definitions provided, how would t...","Not commenced i.e. """"""A"""""""". Offer status"


[Trace(request_id=tr-41683e31064f4cfe836510a1780b123d), Trace(request_id=tr-46bc7b4a7dd747b4955c8556abd2fdf2), Trace(request_id=tr-7e1d92af08374981a568260d1eb2afa3), Trace(request_id=tr-1fd89d5d67c6417893726de9cdd8d64e), Trace(request_id=tr-3157b9226f5846958b4cb5463ebd8363), Trace(request_id=tr-d6786062086e4c91a1a776ecd8cdd6ae), Trace(request_id=tr-5886b034d2584f4f8381f53668865e38), Trace(request_id=tr-9de1a267bcce4c44a9a195c181d0713d), Trace(request_id=tr-94ad3e009c5044aaaf5b0e011e10cf94), Trace(request_id=tr-ce6b4f57046745519b2375076b7d2c0e)]