In [1]:
import os
import json
from groq import Groq
from elasticsearch import Elasticsearch
from tqdm import tqdm  # tqdm used for progress visualization


In [2]:
# Show the kernel's current working directory; relative paths resolve against it.
h = os.getcwd()
print(h)

C:\Users\EstifanosT


In [3]:
# Directory containing all JSON files.
# The NBE_DIRECTIVES_JSON_DIR environment variable overrides the original
# machine-specific default so the notebook can run on other machines.
json_folder_path = os.environ.get(
    'NBE_DIRECTIVES_JSON_DIR',
    'D:\\NBEDirectivesAssist\\ConvertedToJson\\',
)

# Flat list of section-level documents gathered from every JSON file
all_documents = []

# sorted() makes the load order deterministic; os.listdir returns entries
# in arbitrary order, which would make runs differ from one another.
for json_file in sorted(os.listdir(json_folder_path)):
    if not json_file.endswith(".json"):
        continue

    json_file_path = os.path.join(json_folder_path, json_file)
    with open(json_file_path, 'rt', encoding='utf-8') as f_in:
        docs_raw = json.load(f_in)

    # Each top-level entry carries 'document_id', 'title' and a list of 'sections'
    documents = []
    for directive_id_dict in docs_raw:
        for doc in directive_id_dict['sections']:
            # Copy the directive-level identifiers onto the section so each
            # section document is self-describing once flattened.
            doc['document_id'] = directive_id_dict['document_id']
            doc['title'] = directive_id_dict['title']
            documents.append(doc)

    # Append documents from this file to the global list
    all_documents.extend(documents)

In [4]:
# Sentence-embedding model used to vectorise document content and search terms.
# Requires: pip install sentence_transformers==2.7.0
from sentence_transformers import SentenceTransformer

# If the import above fails, try:
#   1. pip uninstall numpy
#   2. pip uninstall torch
#   3. pip install numpy==1.26.4
#   4. pip install torch
# and then re-run this cell.
EMBEDDING_MODEL_NAME = "all-mpnet-base-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [None]:
# Create a dense vector for each document's "content" using the pre-trained model.
# All texts are encoded in one batched call rather than one model call per
# document, which avoids per-call overhead and shows a single progress bar.
content_texts = [doc["content"] for doc in all_documents]
content_vectors = model.encode(content_texts, show_progress_bar=True)

operations = []
for doc, content_vector in zip(all_documents, content_vectors):
    # Store the embedding as a plain list of floats on the document itself
    doc["content_vector"] = content_vector.tolist()
    operations.append(doc)

In [None]:
# Initialize Elasticsearch client
es_client = Elasticsearch('http://localhost:9200')

# Define index settings and mappings
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "title": {"type": "text"},
            "section_title": {"type": "text"},
            "content": {"type": "text"},
            "document_id": {"type": "keyword"},
            "section_id": {"type": "integer"},  # the only integer field
            # "dims" must equal the length of the vectors stored in "content_vector"
            "content_vector": {"type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"},
        }
    }
}

index_name = "directivesanalysissvector"

# Drop any index left over from a previous run. Without this, re-running the
# cell fails because the index already exists, and re-indexing into an old
# index would duplicate every document.
es_client.indices.delete(index=index_name, ignore_unavailable=True)

# Create index
es_client.indices.create(index=index_name, body=index_settings)

In [None]:
# Send every entry of `all_documents` (a list of dicts) to Elasticsearch,
# one index request per document, with a tqdm progress bar.
for section_doc in tqdm(all_documents):
    es_client.index(index=index_name, document=section_doc)

In [None]:
# Keyword search
def elastic_search(query):
    """Run a ``multi_match`` text query and return the sources of the top five hits.

    The ``content`` field is boosted by a factor of 10 relative to ``title``,
    ``section_title`` and ``document_id``.

    NOTE: a later cell defines another ``elastic_search`` (vector based); when
    the notebook is run top to bottom that later definition replaces this one.

    Args:
        query: Free-text search string.

    Returns:
        A list with the ``_source`` dict of each hit.
    """
    multi_match = {
        "query": query,
        "fields": ["content^10", "title", "section_title", "document_id"],
        "type": "best_fields",
    }
    search_query = {
        "size": 5,
        "query": {"bool": {"must": {"multi_match": multi_match}}},
    }

    response = es_client.search(index=index_name, body=search_query)
    return [hit['_source'] for hit in response['hits']['hits']]

In [None]:
# Vector (kNN) search
def elastic_search(search_term):
    """Embed ``search_term`` and return the five nearest documents by ``content_vector``.

    Args:
        search_term: Free-text question or phrase to search for.

    Returns:
        A list with the ``_source`` dict of each hit, restricted to the fields
        the prompt builder reads (the stored vector itself is not returned).
    """
    # Convert the embedding to a plain list of floats, the same form used
    # when the document vectors were stored.
    vector_search_term = model.encode(search_term).tolist()
    query = {
        "field": "content_vector",
        "query_vector": vector_search_term,
        "k": 5,
        "num_candidates": 10000,
    }

    # "page_number" is requested as well: build_prompt reads it, and leaving
    # it out of the source filter made it unavailable in every result.
    response = es_client.search(
        index=index_name,
        knn=query,
        source=["content", "section_id", "section_title", "page_number", "document_id", "title"],
    )
    return [hit['_source'] for hit in response['hits']['hits']]

In [None]:
def build_prompt(search_term,search_results):
    """Build the LLM prompt from a question and the retrieved documents.

    Args:
        search_term: The user's question; inserted as QUESTION.
        search_results: Iterable of document dicts. Each may contain
            ``section_id``, ``section_title``, ``page_number``, ``content``,
            ``document_id`` and ``title``; missing keys are rendered as ``N/A``.

    Returns:
        The formatted prompt string with surrounding whitespace stripped.
    """
    prompt_template = """
You're National Bank of Ethiopia Directives assistant. Answer the QUESTION based on the CONTEXT from the directive document.
Use  facts from the CONTEXT when answering the QUESTION.
If the CONTEXT does not contain the answer, Answer from your general knowledge.

QUESTION: {question}
CONTEXT: {context}
     """.strip()

    # Generate the context from the search results. .get() is used because a
    # result may lack a field (e.g. when the search restricts returned fields),
    # and a missing field should not abort prompt construction.
    context = ""
    for doc in search_results:
        context += (
            f"section_id: {doc.get('section_id', 'N/A')}\n"
            f"section_title: {doc.get('section_title', 'N/A')}\n"
            f"page_number: {doc.get('page_number', 'N/A')}\n"
            f"content: {doc.get('content', 'N/A')}\n"
            f"document_id: {doc.get('document_id', 'N/A')}\n"
            f"title: {doc.get('title', 'N/A')}\n\n"
        )

    # Format the prompt with the function's own parameter; the previous code
    # referenced an undefined name `query` here.
    prompt = prompt_template.format(question=search_term, context=context).strip()
    return prompt

In [None]:
def build_prompt(search_term, search_results):
    """Assemble the LLM prompt from the user's question and retrieved documents.

    Args:
        search_term: The question text placed after "Question:".
        search_results: Iterable of document dicts; any of the listed fields
            that is absent from a document is rendered as ``N/A``.

    Returns:
        The filled-in prompt with leading and trailing whitespace stripped.
    """
    prompt_template = """
    You're National Bank of Ethiopia Directives assistant. Answer the QUESTION based on the CONTEXT from the directive document.
Use  facts from the CONTEXT when answering the QUESTION.
If the CONTEXT does not contain the answer, Answer from your general knowledge.
    Question: {question}

    Context:
    {context}
    """

    # Fields rendered for every document, in output order
    field_names = (
        "section_id",
        "section_title",
        "page_number",
        "content",
        "document_id",
        "title",
    )

    # One "name: value" line per field, with a blank line closing each document
    context = "".join(
        "\n".join(f"{name}: {doc.get(name, 'N/A')}" for name in field_names) + "\n\n"
        for doc in search_results
    )

    return prompt_template.format(question=search_term, context=context).strip()


In [None]:
def llm(prompt):
    """Send ``prompt`` to the Groq chat-completions API and return the reply text.

    The API key is read from the ``GROQ_API_KEY`` environment variable rather
    than being embedded in the notebook.
    SECURITY: the key that was previously hard-coded here has been exposed in
    the notebook's history and should be revoked.

    Args:
        prompt: Full prompt text, sent as a single user message.

    Returns:
        The content of the first choice in the model's response.

    Raises:
        RuntimeError: If ``GROQ_API_KEY`` is not set.
    """
    api_key = os.environ.get("GROQ_API_KEY")
    if not api_key:
        raise RuntimeError(
            "GROQ_API_KEY environment variable is not set; "
            "export it before calling llm()."
        )

    # Initialize Groq client
    client = Groq(api_key=api_key)

    # Get the answer from the LLM model
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="llama3-8b-8192",
    )
    llm_answer = chat_completion.choices[0].message.content
    return llm_answer

In [None]:
def rag(query):
    """Answer ``query`` by retrieving documents, building a prompt and calling the LLM.

    Args:
        query: The user's question.

    Returns:
        The answer text returned by ``llm``.
    """
    # Search with the function's own argument; the previous code passed an
    # undefined name `search_term`, which raised NameError (or silently used
    # a stale global left in the kernel).
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [None]:
# Example question run through the full retrieve -> prompt -> LLM pipeline
question = 'Tell me about board of directors size and composition?'
rag(question)