In [2]:
import json
import os
from tqdm.auto import tqdm
from dotenv import load_dotenv
import pandas as pd
from litellm import completion
from groq import Groq
from elasticsearch import Elasticsearch
# Load environment variables and set API keys
load_dotenv()
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")
import warnings
warnings.filterwarnings('ignore')

In [3]:
with open('/home/nkama/LLM_and_RAG_Course/LLM_and_RAG-/personal_assg_project/interview_qa.json', 'rt') as f_in:
    documents = json.load(f_in)

In [4]:
documents[1]

{'doc_id': 'qa_002',
 'question': 'Given an array, find all the duplicates in this array? For example: input: [1,2,3,1,3,6,5] output: [1,3]',
 'answer': 'set1=set()\nres=set()\nfor i in list:\n  if i in set1:\n    res.add(i)\n  else:\n    set1.add(i)\nprint(res)',
 'course': 'python'}

## Implementing Key Word Search with Elasticsearch

In [5]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200')

In [7]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "doc_id": {"type": "keyword"},
            "question":{"type": "text"},
            "answer": {"type": "text"},
            "course": {"type": "keyword"}
            }
        }
    }

index_name = 'interview_qa_kw'

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'interview_qa_kw'})

In [8]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 175/175 [00:01<00:00, 122.68it/s]


In [9]:
query = "What is Data Science?"

In [19]:
def keyword_search(client, query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "answer", "course"],
                        "type": "best_fields"
                    }
                },
                }
            }
        }


    response = client.search(index=index_name, body=search_query)

    result_docs = []

    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

In [20]:
keyword_search(es_client, query)

[{'doc_id': 'qa_0052',
  'question': 'What is Bayes’ Theorem and when is it used in data science?',
  'answer': 'The Bayes theorem predicts the probability that an event connected to any condition would occur. It is also taken into account in the situation of conditional probability. The probability of “causes” formula is another name for the Bayes theorem.\nIn data science, Bayes’ Theorem is used primarily in:\nBayesian Inference\nMachine Learning\nText Classification\nMedical Diagnosis\nPredictive Modeling\nWhen working with ambiguous or sparse data, Bayes’ Theorem is very helpful since it enables data scientists to continually revise their assumptions and come to more sensible conclusions.',
  'course': 'data_science'},
 {'doc_id': 'qa_00126',
  'question': 'Explain multivariate distribution in data science.',
  'answer': 'A vector with several normally distributed variables is said to have a multivariate normal distribution if any linear combination of the variables likewise has a 

In [16]:
from groq import Groq
from litellm import completion
import os
from dotenv import load_dotenv

load_dotenv()

os.environ['GROQ_API_KEY'] = os.getenv("GROQ_API_KEY")

In [None]:
def build_prompt(query, search_results):
    prompt_template = """
You're a Technical Interview Assistant. Your role is to Assist candidates 
preparing for interviews by providing detailed 
explanations, sample answers, and coding examples for Data Science, 
Python, and SQL-related interview questions. 
Use only the facts from the CONTEXT when answering the QUESTION. Do not answer from
own knowledge. If you do not find an appropriate answer to the query, just return the text
"No suitable answer found"

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    context = ""
    
    for doc in search_results:
        context = context + f"question: {doc['question']}\nanswer: {doc['answer']}\ncourse: {doc['course']}\n\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [None]:
def llm_response(prompt, model="groq/llama3-8b-8192"):
    response = completion(
    model="groq/llama3-8b-8192", 
    messages=[
       {"role": "user", "content": prompt}
   ],
    )
    return response.choices[0].message.content

In [None]:
def rag(es_client, query, search="keyword search"):
    if search=
    search_results = keyword_search(es_client, query)
    prompt = build_prompt(query, search_results)
    answer = llm_response(prompt)
    return answer

In [26]:
def build_prompt(query):
    prompt_template = """
You're a Technical Interview Assistant. Your role is to Assist candidates 
preparing for interviews by providing detailed 
explanations, sample answers, and coding examples for Data Science, 
Python, and SQL-related interview questions. 
Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION. Do not answer from
own knowledge. If you do not find an appropriate answer to the query, just return the text
"No suitable answer found"

QUESTION: {question}

CONTEXT:
{context}
""".strip()

    context = ""
    search_results = elastic_search(query)
    for doc in search_results:
        context = context + f"course: {doc['course']}\nquestion: {doc['question']}\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

prompt = build_prompt(query)
def llm(prompt):
    response = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content":prompt,
            }
        ],
        model="llama3-8b-8192",
    )

    # print the response
    print(response.choices[0].message.content)


def keyword_rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query)
    answer = llm(prompt)
    return answer

In [27]:
prompt = build_prompt(query)
def llm(prompt):
    response = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content":prompt,
            }
        ],
        model="llama3-8b-8192",
    )

    # print the response
    print(response.choices[0].message.content)


In [28]:
def keyword_rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query)
    answer = llm(prompt)
    return answer

In [29]:
keyword_rag("What is data science?")

According to the course on Data Science, the QUESTION "What is data science?" is answered as follows:

Data Science is an interdisciplinary field that combines statistics, computer science, and domain-specific knowledge to extract insights and knowledge from data. Data Science involves the process of creating and deploying predictive models, analyzing and interpreting complex data, and visualizing the insights gleaned from the data to inform business decisions.


## Implementing Vector Search with Elastic Search

In [2]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200')

In [3]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "index_mapping": {
        "properties": {
            "question": {"type": "text"},
            "answer": {"type": "text"},
            "question_vector": {"type": "dense_vector", "dims": 384},
            "answer_vector": {"type": "dense_vector", "dims": 384},
            "question_answer_vector": {"type": "dense_vector", "dims": 384},
            "doc_id": {"type": "keyword"}
        }
    }
}

index_name = 'interview_qa'

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)


from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-mpnet-base-v2")

with open('interview_qa.json','r') as f:
    documents = json.load(f)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'interview_qa'})

In [5]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-mpnet-base-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# from tqdm import tqdm
# from elasticsearch.helpers import bulk

# def index_documents(es_client, documents, index_name='interview_qa', model=None):
#     def generate_actions():
#         for doc in documents:
#             if model:
#                 # Encode vector fields if a model is provided
#                 question = doc['question']
#                 answer = doc['answer']
#                 qa = question + ' ' + answer
#                 doc['question_vector'] = model.encode(question).tolist()
#                 doc['answer_vector'] = model.encode(answer).tolist()
#                 doc['question_answer_vector'] = model.encode(qa).tolist()

#             yield {
#                 "_index": index_name,
#                 "_id": doc['doc_id'],
#                 "_source": doc
#             }

#     # Use bulk indexing for better performance
#     success, failed = bulk(es_client, generate_actions(), stats_only=True, raise_on_error=False)

#     print(f"Indexed {success} documents successfully.")
#     if failed:
#         print(f"Failed to index {failed} documents.")


In [None]:
from tqdm import tqdm
import json
from sentence_transformers import SentenceTransformer
from elasticsearch.helpers import bulk

# Prepare and index the documents
def index_documents(es_client, index_name, documents, model):
    for doc in tqdm(documents, desc="Indexing documents"):
        # Encode the text fields
        question_vector = model.encode(doc['question']).tolist()
        answer_vector = model.encode(doc['answer']).tolist()
        question_answer_vector = model.encode(doc['question'] + " " + doc['answer']).tolist()

        # Prepare the document for indexing
        index_doc = {
            "question": doc['question'],
            "answer": doc['answer'],
            "question_vector": question_vector,
            "answer_vector": answer_vector,
            "question_answer_vector": question_answer_vector,
            "doc_id": doc.get('doc_id', None)  # Use None if 'doc_id' is not present
        }

        # Index the document
        es_client.index(index=index_name, body=index_doc)

    # Refresh the index to make the documents searchable immediately
    es_client.indices.refresh(index=index_name)

In [None]:


# Load the model (you've already done this)
model = SentenceTransformer("all-mpnet-base-v2")

# Load the documents (you've already done this)
with open('interview_qa.json', 'r') as f:
    documents = json.load(f)
# Call the function to index the documents
# Use bulk indexing for better performance
success, failed = bulk(es_client, index_documents(), stats_only=True, raise_on_error=False)

print(f"Indexed {success} documents successfully.")
if failed:
    print(f"Failed to index {failed} documents.")

In [5]:
# import json
# # Usage
# from sentence_transformers import SentenceTransformer

# # Load your documents
# with open('/home/nkama/LLM_and_RAG_Course/LLM_and_RAG-/personal_assg_project/interview_qa.json','r') as f:
#     documents = json.load(f)

# # If you need to encode vector fields
# model = SentenceTransformer('all-MiniLM-L6-v2')  # or your preferred model

# # Index the documents
# index_documents(es_client, documents, model=model)

  from .autonotebook import tqdm as notebook_tqdm


Indexed 175 documents successfully.


In [34]:
# for doc in tqdm(documents):
#     try:
#         es_client.index(index=index_name, document=doc)
#     except Exception as e:
#         print(e)

Error executing search: BadRequestError(400, 'illegal_argument_exception', '[knn] queries cannot be provided directly, use the [knn] body parameter instead')
No results found or an error occurred.


In [70]:
def perform_search(es_client, query, model='all-MiniLM-L6-v2', course=None, field='question_answer_vector',
                             index_name='interview_qa', k=5, num_candidates=10000):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    query_vector = model.encode(query).tolist()  # Encode query to vector
    
    search_query = {
        "size": k,
        "query": {
            "script_score": {
                "query": {
                    "bool": {
                        "must": [
                            {
                                "match_all": {}
                            }
                        ],
                        "filter": [{"term": {"course": course}}] if course else []
                    }
                },
                "script": {
                    "source": f"cosineSimilarity(params.query_vector, '{field}') + 1.0",
                    "params": {
                        "query_vector": query_vector
                    }
                }
            }
        },
        "_source": ["answer", "question", "course", "doc_id"]
    }

    # Perform the search in Elasticsearch
    try:
        response = es_client.search(index=index_name, body=search_query)
    except Exception as e:
        print(f"Error executing search: {str(e)}")
        return None
    if response:
        result = []
        for hit in response['hits']['hits']:
            result.append(hit['_source'])
        return result


In [71]:
# Example usage:
query = "Tell me about python"
course = "python"  # Specify the course
perform_search(es_client, query)




[{'question': 'What does one understand by the term Data Science?',
  'answer': 'An interdisciplinary field that constitutes various scientific processes, algorithms, tools, and machine learning techniques working to help find common patterns and gather sensible insights from the given raw input data using statistical and mathematical analysis is called Data Science. The following The life cycle of data science starts with gathering the business requirements and relevant data. Once the data is acquired, it is maintained by performing data cleaning, data warehousing, data staging, and data architecture. Data processing does the task of exploring the data, mining it, analyzing it which can be finally used to generate the summary of the insights extracted from the data. Once the exploratory steps are completed, the cleansed data is subjected to various algorithms like predictive analysis, regression, text mining, recognition patterns, etc depending on the requirements. In the final stage,

In [80]:
def rag(es_client, query):
    search_results = perform_search(es_client, query)
    prompt = build_prompt(query, search_results)
    answer = llm_response(prompt)
    return answer

In [85]:
rag(es_client,"tell me about sql and what it stands for")

The answer to the QUESTION "tell me about sql and what it stands for" based on the CONTEXT is:

SQL stands for Structured Query Language. It is a specialized programming language used for managing and manipulating relational databases. It is designed for tasks related to database management, data retrieval, data manipulation, and data definition.
