In [1]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [5]:
import openai
from openai import OpenAI
client = OpenAI()
import os
response = client.chat.completions.create(
        model='gpt-4o',
        messages = [{"role":"user","content":"how is it to join the course?"}]
    )
response.choices[0].message.content

"Joining a course can be an enriching experience, but there are several factors to consider to ensure you make the most out of it. Here's a step-by-step guide to help you with the process:\n\n### 1. Determine Your Objectives\n- **Understand Your Goals:** Before enrolling, clearly define what you hope to achieve. Whether it’s gaining specific skills, earning a certification, or personal enrichment, knowing your objectives can help you choose the right course.\n\n### 2. Research Options\n- **Course Content:** Look into what the course offers. Ensure the syllabus and modules align with your learning goals.\n- **Reputation:** Check reviews, testimonials, and the institution's reputation.\n- **Instructor Credentials:** Look into the instructor’s background, experience, and teaching style.\n\n### 3. Evaluate Practical Considerations\n- **Duration:** Ensure you have the time commitment required to complete the course.\n- **Cost:** Assess the tuition fees, and if necessary, look into financial

In [3]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [7]:
import minsearch

In [9]:
index = minsearch.Index(
    text_fields=["question","text","section"],
    keyword_fields = ["course"]
)

In [10]:
q =  "How do I execute a command in a running docker container?"

In [11]:
index.fit(documents)

<minsearch.Index at 0x7489e42541f0>

In [26]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch('https://elastic:ONprwUBmcjEchjLpmLaYU3TE@my-deployment-806fcf.es.us-central1.gcp.cloud.es.io:9243')




In [27]:


index_settings = {
    "settings":{
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings":{
        "properties":{
            "text":{"type":"text"},
            "section":{"type":"text"},
            "question":{"type":"text"},
            "course":{"type":"keyword"}
        }
    }
}

index_name = "course-questions4"
es_client.indices.create(index = index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions4'})

In [28]:
from tqdm.auto import tqdm
for doc in tqdm(documents):
    es_client.index(index=index_name, document = doc)
    

  0%|          | 0/948 [00:00<?, ?it/s]

In [29]:
def search(query):
    boost = {'question': 4.0,  'text': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=1
        
    )

    return results

In [30]:
def elastic_serach(query):
    search_query = {
        "size": 3,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }
    response = es_client.search(index=index_name, body = search_query)
    print(response)
    print("\n ***************************************\n")
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [32]:
response = elastic_serach(q)
print(response)

{'took': 4, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 390, 'relation': 'eq'}, 'max_score': 75.54128, 'hits': [{'_index': 'course-questions4', '_id': 'x23fSpABorkyMAIni0Lg', '_score': 75.54128, '_source': {'text': 'In case running pgcli  locally causes issues or you do not want to install it locally you can use it running in a Docker container instead.\nBelow the usage with values used in the videos of the course for:\nnetwork name (docker network)\npostgres related variables for pgcli\nHostname\nUsername\nPort\nDatabase name\n$ docker run -it --rm --network pg-network ai2ys/dockerized-pgcli:4.0.1\n175dd47cda07:/# pgcli -h pg-database -U root -p 5432 -d ny_taxi\nPassword for root:\nServer: PostgreSQL 16.1 (Debian 16.1-1.pgdg120+1)\nVersion: 4.0.1\nHome: http://pgcli.com\nroot@pg-database:ny_taxi> \\dt\n+--------+------------------+-------+-------+\n| Schema | Name             | Type  | Owner |\n|--------+--------

In [33]:

prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

Q: {question}

A: {text}

""".strip()



In [42]:

context = ""

for doc in results:
    context = context + f"question: {doc['question']}\n\nanswer: {doc['text']}\n\n"


In [37]:
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

In [39]:

context = ""

for doc in results:
    context = context + f"section:{doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"


In [43]:
print(len(prompt_template.format(question=q, context=context)))

1163


In [44]:
pip install tiktoken

Note: you may need to restart the kernel to use updated packages.


In [45]:
import tiktoken

In [49]:
import tiktoken

# Get the encoding for the model specified in the prompt
encoding = tiktoken.encoding_for_model("gpt-4o")

# The prompt text
prompt = "encoding = tiktoken.encoding_for_model(\"gpt-4o\") Use the encode function. How many tokens does our prompt have? 122 222 322 422 Note: to decode back a token into a word, you can use the decode_single_token_bytes function:"

# Encode the prompt
tokens = encoding.encode(prompt)

# Count the number of tokens
token_count = len(tokens)

print(f"Number of tokens: {token_count}")

Number of tokens: 57
