In [None]:
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
!wget https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/documents.json

In [37]:
import minsearch
import json
from tqdm import tqdm
from elasticsearch import Elasticsearch
import tiktoken

In [2]:
import os
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage

api_key = os.environ['MISTRAL_API_KEY']
model = "mistral-large-latest"

client = MistralClient(api_key=api_key)

In [3]:
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [6]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)
        
index = minsearch.Index(
    text_fields=['question', 'text', 'section'],
    keyword_fields=[]
)

index.fit(documents)

es_client = Elasticsearch('http://localhost:9200')

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = 'course-questions'

es_client.indices.create(index=index_name, body=index_settings)

for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

SyntaxError: expected 'except' or 'finally' block (531745374.py, line 37)

In [17]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict= boost,
        num_results= 5
    )
    
    return results

def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    result_docs = []
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
        
    return result_docs

def build_prompt(query, search_results):
    prompt_template = """
    Tu es un assistant qui répond aux questions sur des cours en ligne. Répond à la QUESTION basé sur le CONTEXT issu des données de la FAQ.
    Utilise seulement les données présent dans le CONTEXT pour répondre à la QUESTION.
    Si le CONTEXT ne continet pas la réponse, réponds NONE.

    Tu me répondras en Français.

    QUESTION: {question}

    CONTEXT:
    {context}
    """.strip()

    context = ''

    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        
    prompt = prompt_template.format(question=query, context=context)
    return prompt

def llm(prompt):
    chat_response = client.chat(
        model=model,
        messages=[ChatMessage(role="user", content=prompt)]
    )

    return chat_response.choices[0].message.content

def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    
    return answer

In [12]:
#answer = rag('How do I run Kafka ?')
answer = rag('I just discovered the course, can I still enroll ?')
print(answer)

Yes, you can still enroll in the course even if you discovered it after the start date. You're eligible to submit the homeworks, but be aware that there will be deadlines for turning in the final projects.


In [29]:
#--------------------------
#--------------------------
##Homework :
#--------------------------
#--------------------------

In [14]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [16]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = 'course-faq'

es_client.indices.create(index=index_name, body=index_settings)

for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)    

100%|██████████| 948/948 [00:19<00:00, 48.94it/s]


In [42]:
homework_query = 'How do I execute a command in a running docker container?'

search_query = {
        "size": 3,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": homework_query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "machine-learning-zoomcamp"
                    }
                }
            }
        }
    }

response = es_client.search(index=index_name, body=search_query)

homework_answers = []
for hit in response['hits']['hits']:
    homework_answers.append(hit['_source'])
    print(hit['_score'])
    
print(homework_answers[2])


84.050095
51.04628
49.938507
{'text': 'You can copy files from your local machine into a Docker container using the docker cp command. Here\'s how to do it:\nIn the Dockerfile, you can provide the folder containing the files that you want to copy over. The basic syntax is as follows:\nCOPY ["src/predict.py", "models/xgb_model.bin", "./"]\t\t\t\t\t\t\t\t\t\t\tGopakumar Gopinathan', 'section': '5. Deploying Machine Learning Models', 'question': 'How do I copy files from a different folder into docker container’s working directory?', 'course': 'machine-learning-zoomcamp'}


In [36]:
context_template = """
Q: {question}
A: {text}
""".strip()

homework_context = ''

for homework_answer in homework_answers:
        homework_context = homework_context + f"{context_template.format(question=homework_answer['question'], text=homework_answer['text'])}\n\n"
        

prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

homework_prompt = prompt_template.format(question=homework_query, context=homework_context)
print(len(homework_prompt))


1464


In [40]:
encoding = tiktoken.encoding_for_model('gpt-4o')
len(encoding.encode(homework_prompt))

323

In [43]:
chat_response = client.chat(
    model=model,
    messages=[ChatMessage(role="user", content=homework_prompt)]
)

print(chat_response.choices[0].message.content)

To execute a command in a running Docker container, you first need to find the container's ID. You can do this by running the command `docker ps`, which will list all running containers along with their IDs. Once you have the container's ID, you can use the `docker exec` command to run a command in that container. The syntax for this command is `docker exec -it <container-id> bash`, where `<container-id>` is the ID of the container in which you want to execute the command. This will start a bash shell in the running container where you can execute your desired command.


In [48]:
(1000 * 150 / 1000 * 0.005) + (1000 * 250 / 1000 * 0.015)
150 * 0.005 + 250 * 0.015

(1000 * 1464 / 1000 * 0.005) + (1000 * 323 / 1000 * 0.015)
(1464 * 0.005) + (323 * 0.015)

12.165