In [9]:
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-06-22 19:31:54--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py.2’


2024-06-22 19:31:55 (23.7 MB/s) - ‘minsearch.py.2’ saved [3832/3832]



In [10]:
import minsearch, json

In [11]:
# Load the raw FAQ dump. The encoding is pinned to UTF-8 so the result does not
# depend on the platform's default locale encoding.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [12]:
# Flatten the per-course structure into one list of documents, tagging each one
# with the course it belongs to. Every entry is a new dict, so `docs_raw` is left
# exactly as loaded and this cell can be re-run safely.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in docs_raw
    for doc in course_dict['documents']
]

In [13]:
# Sample question (not referenced by the other cells visible here).
q = 'the course has already started, can I still enroll?'

# Index with three text fields and `course` as a keyword field, fitted on the
# flattened document list.
index = minsearch.Index(text_fields=["question", "text", "section"], keyword_fields=["course"])
index.fit(documents)

<minsearch.Index at 0x7aa601b4e1a0>

In [14]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Look up FAQ documents for `query` in the module-level minsearch `index`.

    Args:
        query: Free-text question to search for.
        course: Value passed as the `course` entry of `filter_dict`. Defaults to
            'data-engineering-zoomcamp', the value that used to be hard-coded.
        num_results: Number of results requested from the index (default 5).

    Returns:
        Whatever `index.search` returns for these arguments.
    """
    # Boost values handed to index.search: `question` 3.0, `section` 0.5.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

# MISTRAL

In [21]:
import os
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage
from dotenv import load_dotenv


load_dotenv()
api_key = os.getenv("MISTRAL_API_KEY_2")
# Never print the key: cell output is saved with the notebook and would leak the
# secret to anyone who can read the file. Only check that it was found.
if not api_key:
    raise RuntimeError("MISTRAL_API_KEY_2 is not set; add it to the environment or to .env")
model = "mistral-large-latest"

client = MistralClient(api_key=api_key)


In [22]:
def llm(prompt):
    """Send `prompt` as a single user message and return the text of the reply.

    Uses the module-level `client` and `model`; only the first choice of the
    response is read.
    """
    user_message = ChatMessage(role="user", content=prompt)
    completion = client.chat(model=model, messages=[user_message])
    first_choice = completion.choices[0]
    return first_choice.message.content

In [23]:
def build_prompt(query, search_results):
    """Build the LLM prompt from a question and the retrieved FAQ entries.

    Args:
        query: The user's question; substituted for the QUESTION slot.
        search_results: Iterable of dicts, each providing the keys 'section',
            'question' and 'text'.

    Returns:
        The prompt string, with surrounding whitespace stripped.

    Raises:
        KeyError: If an entry lacks one of the three keys read here.
    """
    # The template is assembled line by line so that no source-code indentation
    # ends up inside the prompt text sent to the model.
    prompt_template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One blank line between entries; join avoids repeated string concatenation.
    context = "\n\n".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}"
        for doc in search_results
    )

    return prompt_template.format(question=query, context=context).strip()

In [24]:
query = 'how do I run kafka?'


def rag(query):
    """Answer `query`: retrieve entries with `search`, build the prompt, ask the LLM."""
    retrieved = search(query)
    return llm(build_prompt(query, retrieved))

In [25]:
# Run the pipeline on the Kafka question; the returned answer is the cell output.
rag(query)

"To run Kafka, you can follow the instructions for running a Java Kafka producer or a Python Kafka producer, depending on your preference.\n\nFor Java Kafka:\n1. Navigate to your project directory in the terminal.\n2. Run the following command:\n```\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n```\nReplace `<jar_name>` with the actual name of your jar file.\n\nFor Python Kafka:\n1. Create a virtual environment by running the following command:\n```\npython -m venv env\n```\n2. Activate the virtual environment using one of the following commands, depending on your operating system:\n   - For MacOS, Linux, or Windows:\n     ```\n     source env/bin/activate\n     ```\n   - For Windows (if the above command doesn't work):\n     ```\n     env/Scripts/activate\n     ```\n3. Install the required packages by running:\n```\npip install -r ../requirements.txt\n```\n4. Now you can run your Python Kafka producer or consumer within the activated

## ELASTICSEARCH

In [32]:
from elasticsearch import Elasticsearch


In [35]:
# Client for the Elasticsearch node at http://localhost:9200.
es_client = Elasticsearch("http://localhost:9200")

In [36]:
# Display the cluster metadata as a quick check that the node responds.
es_client.info()

ObjectApiResponse({'name': 'ab877cb8fbfe', 'cluster_name': 'docker-cluster', 'cluster_uuid': '6lrC2wraQOWuUpkvz_TN7Q', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [37]:
# Index definition: one shard, no replicas, and a mapping for the four document fields.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            # The three free-text fields are mapped as `text`.
            **{field: {"type": "text"} for field in ("text", "section", "question")},
            # `course` is mapped as `keyword`; elastic_search filters on it with a `term` query.
            "course": {"type": "keyword"},
        }
    },
}


In [38]:
index_name = "course-questions"
# Drop any index left over from a previous run so this cell can be re-executed;
# creating an index that already exists raises an error otherwise.
# NOTE: this removes every document currently stored under `index_name`.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [39]:
from tqdm.auto import tqdm 

# Send the documents to Elasticsearch one request at a time, with a progress bar.
for faq_doc in tqdm(documents):
    es_client.index(index=index_name, document=faq_doc)

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████████████████████████████████████| 948/948 [00:24<00:00, 39.16it/s]


In [46]:
def elastic_search(query, course="data-engineering-zoomcamp", num_results=5):
    """Retrieve FAQ documents for `query` from the Elasticsearch index `index_name`.

    Args:
        query: Free-text question to match against the document fields.
        course: Value for the `term` filter on the `course` field. Defaults to
            "data-engineering-zoomcamp", the value that used to be hard-coded.
        num_results: Value sent as the request's `size` (default 5).

    Returns:
        A list with the `_source` dict of every hit in the response, in the
        order the response lists them.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # `^3` is the boost suffix applied to the question field.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [47]:
# Question used for the Elasticsearch-backed run below.
query = "I just discovered the course. Can I still join?"

In [48]:
def rag(query):
    """Answer `query` using Elasticsearch retrieval.

    Rebinds the name `rag`, so calls made after this cell runs use
    `elastic_search` for retrieval.
    """
    hits = elastic_search(query)
    return llm(build_prompt(query, hits))

In [49]:
# Run the Elasticsearch-backed pipeline on the enrollment question; the answer is the cell output.
rag(query)

"Yes, you can still join the course even if it has already started. You're eligible to submit homeworks regardless of registration. However, please note that there will be deadlines for turning in the final projects. It's advisable not to leave everything for the last minute. If you're just starting, you might want to begin by installing and setting up all the dependencies and requirements such as Google Cloud account, Google Cloud SDK, Python 3 (installed with Anaconda), Terraform, and Git."