<h1>Fetch and Process Documents from JSON URL</h1>

In [1]:
import requests 

# Download the course FAQ JSON and flatten it into a single list of documents,
# tagging every document with the name of the course it belongs to.

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'

# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Copy each document instead of mutating documents_raw in place, so the raw
# payload stays exactly as downloaded. 'course' is added as the last key.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]


<h1>Elasticsearch Index Creation for Course Questions</h1>

In [2]:
from elasticsearch import Elasticsearch

In [3]:
# Client for the Elasticsearch node listening on localhost:9200
es_client = Elasticsearch('http://localhost:9200')

In [4]:
# Index definition: one shard and no replicas, with three analysed text fields
# and 'course' stored as a keyword so it can be matched exactly by a term filter.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Drop any index left over from a previous run so the notebook survives
# "Restart & Run All": without this, create() fails because the index already
# exists. ignore_unavailable=True makes the delete a no-op on the first run.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

<h1>Indexing Documents into Elasticsearch and Performing a Search Query</h1>

In [5]:
from tqdm import tqdm

In [6]:
# Send the documents to the index one request at a time; tqdm shows the progress bar
for faq_entry in tqdm(documents):
    es_client.index(index=index_name, document=faq_entry)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:23<00:00, 39.89it/s]


In [7]:
# Free-text question used for the first search
query = 'I just discovered the course. Can I still join it?'

In [8]:
  # Top 5 hits for `query` across 'question' (boosted 4x) and 'text',
  # restricted to documents whose course is 'data-engineering-zoomcamp'.
  search_query = {
      "size": 5,
      "query": {
          "bool": {
              "must": {
                  "multi_match": {"query": query, "fields": ["question^4", "text"], "type": "best_fields"}
              },
              "filter": {"term": {"course": "data-engineering-zoomcamp"}},
          }
      },
  }

In [9]:
# Run the search built above against the course index
response = es_client.search(index=index_name, body=search_query)

In [10]:
# Show the raw hit list: each entry carries '_index', '_id', '_score' and the stored '_source' document
response['hits']['hits']

[{'_index': 'course-questions',
  '_id': 'IpiuOpABAJ2DeV4MSrBP',
  '_score': 96.28143,
  '_source': {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
   'section': 'General course-related questions',
   'question': 'Course - Can I still join the course after the start date?',
   'course': 'data-engineering-zoomcamp'}},
 {'_index': 'course-questions',
  '_id': 'J5iuOpABAJ2DeV4MSrDb',
  '_score': 71.28208,
  '_source': {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
   'section': 'General course-related questions',
   'question': 'Course - Can I follow the course after it

<h1>Search and Retrieve Documents from Elasticsearch</h1>

In [11]:
# Question to look up (one of the multiple choice questions)
query = "How do I debug a docker container?"

# Full-text part: match the question against both fields, boosting 'question' by a factor of 4
text_match = {
    "multi_match": {
        "query": query,
        "fields": ["question^4", "text"],
        "type": "best_fields",
    }
}

# Exact-match part: only keep documents from the 'machine-learning-zoomcamp' course
course_filter = {"term": {"course": "machine-learning-zoomcamp"}}

# Return the top 3 hits that satisfy both parts
search_query = {
    "size": 3,
    "query": {"bool": {"must": text_match, "filter": course_filter}},
}

# Execute the search and keep only the stored document of each hit
response = es_client.search(index=index_name, body=search_query)
result_docs = [hit['_source'] for hit in response['hits']['hits']]

In [12]:
# Question text of the third search hit (index 2)
# NOTE(review): the output saved under this cell is a whole document dict, not a
# question string, so it looks stale; re-run the cell to refresh it.
result_docs[2]['question']

{'text': 'I wanted to understand how lambda container images work in depth and how lambda functions are initialized, for this reason, I found the following documentation\nhttps://docs.aws.amazon.com/lambda/latest/dg/images-create.html\nhttps://docs.aws.amazon.com/lambda/latest/dg/runtimes-api.html\nAdded by Alejandro aponte',
 'section': '9. Serverless Deep Learning',
 'question': 'How do Lambda container images work?',
 'course': 'machine-learning-zoomcamp'}

<h1>Create Formatted Prompt for Teaching Assistant</h1>

In [13]:
# Prompt skeleton with {question} and {context} placeholders, filled in later via str.format().
# The trailing strip() removes the newlines right after the opening and before the closing triple quotes.
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

In [14]:
# One "section / question / answer" block per retrieved document, each followed by a blank line
context = "".join(
    f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    for doc in result_docs
)

# Fill the template; the outer strip() drops the blank lines left after the last block
prompt = prompt_template.format(question=query, context=context).strip()

In [15]:
# Prompt length in characters
len(prompt)

1538

<h1>Encode Prompt Using TikToken for GPT-4o</h1>

In [16]:
import tiktoken

In [17]:
# Look up the tiktoken encoding registered for the "gpt-4o" model name
encoding = tiktoken.encoding_for_model("gpt-4o")

In [18]:
# Encode the prompt to get the tokens
# NOTE(review): this uses the gpt-4o encoding, while the chat request later in the
# notebook targets "phi3" (presumably a different tokenizer), so treat the
# resulting token count as an estimate; confirm.
encoded_prompt = encoding.encode(prompt)

In [19]:
 # Number of tokens in the encoded prompt
 len(encoded_prompt)

333

<h1>Communicate with a Local Ollama Model via the OpenAI-Compatible API</h1>

In [20]:
import  ollama

In [21]:
from openai import OpenAI

In [22]:
# Point the OpenAI SDK at the endpoint on localhost:11434 instead of the default API host.
# The api_key value is required by the SDK, but unused.
client = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')

In [23]:
# Send the assembled prompt as a single user message to the "phi3" model
user_message = {"role": "user", "content": prompt}
response = client.chat.completions.create(model="phi3", messages=[user_message])

In [24]:
# Text of the model's reply, taken from the first returned choice
response.choices[0].message.content

" To debug a Docker container, you can launch the container image in interactive mode and overriding the entrypoint using:\n\ndocker run -it --entrypoint bash <image>\n\nIf the container is already running, execute a command in the specific container by first finding its ID with:\n\ndocker ps (find the container-id)\n\nThen use the following command to enter the container's bash shell for debugging purposes:\n\ndocker exec -it <container-id> bash"