In [102]:
import minsearch
import json

In [103]:
with open('documents-llm.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [104]:
documents = []

In [105]:
for course_dict in docs_raw:
    for docs in course_dict['documents']:
        docs['course'] = course_dict['course']
        documents.append(docs)

In [106]:
documents[0]

{'text': 'Yes, but if you want to receive a certificate, you need to submit your project while we’re still accepting submissions.',
 'section': 'General course-related questions',
 'question': 'I just discovered the course. Can I still join?',
 'course': 'llm-zoomcamp'}

In [107]:
# We are creating a search index using minsearch library based on the structure of the document. This makes the document easy to search using feilds like 'text' and 'question'

index = minsearch.Index(
    text_fields=["question", "section", "text"],
    keyword_fields=["course"]
)

In [108]:
q = 'the course has already started, can I still enroll?'

In [109]:
index.fit(documents)

<minsearch.minsearch.Index at 0x700ed28737a0>

In [110]:
from openai import OpenAI

In [111]:
client = OpenAI(api_key="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", base_url="https://api.groq.com/openai/v1")

In [112]:
response = client.chat.completions.create(
    model='llama3-70b-8192',
    messages=[{"role":"user", "content": q}]
)
print(response.choices[0].message.content)

Whether you can still enroll in a course that has already started depends on the institution's policies and the instructor's discretion. Here are some possible scenarios:

1. **Maybe**: Some institutions may allow late enrollments, but you'll need to check with the course instructor or the registrar's office to see if it's possible. They may consider your request if there's still space available in the course and you can catch up on the missed material.
2. **Limited access**: In some cases, you might be allowed to enroll, but you may not have access to all the course materials or assignments that have already been covered. You may need to work with the instructor to create a plan to catch up on the missed content.
3. **Self-study or audit**: If the course is not fully enrolled, you might be allowed to audit the course (attend classes without receiving credit) or access course materials as a self-study student. This way, you can still learn from the course, but you won't receive academi

In [113]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'llm-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

In [114]:
search('what is data warehouse')

[{'text': 'Cosine similarity is a measure used to calculate the similarity between two non-zero vectors, often used in text analysis to determine how similar two documents are based on their content. This metric computes the cosine of the angle between two vectors, which are typically word counts or TF-IDF values of the documents. The cosine similarity value ranges from -1 to 1, where 1 indicates that the vectors are identical, 0 indicates that the vectors are orthogonal (no similarity), and -1 represents completely opposite vectors.',
  'section': 'Module 3: X',
  'question': 'What is the cosine similarity?',
  'course': 'llm-zoomcamp'},
 {'text': 'A “document” is a collection of fields, which are the key-value pairs that contain your data, that have been serialized as a JSON object.',
  'section': 'Module 3: X',
  'question': 'What are documents in ElasticSearch?',
  'course': 'llm-zoomcamp'},
 {'text': 'When you stop the container, the data you previously added to elastic will be go

In [115]:
def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [116]:
def llm(prompt):
    response = client.chat.completions.create(
        model='llama3-70b-8192',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [117]:
query = 'can I still join the course?'

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [118]:
rag(query)

'According to the context, the answer to the question "can I still join the course?" is simply "Yes". There are no restrictions mentioned in the provided context that would prevent someone from joining the course.'

In [119]:
rag('the course has already started, can I still enroll?')

'According to the FAQ database, yes, you can still enroll in the course even though it has already started. However, if you want to receive a certificate, you need to submit your project while the course is still accepting submissions.'

In [120]:
documents[0]

{'text': 'Yes, but if you want to receive a certificate, you need to submit your project while we’re still accepting submissions.',
 'section': 'General course-related questions',
 'question': 'I just discovered the course. Can I still join?',
 'course': 'llm-zoomcamp'}

In [121]:
from elasticsearch import Elasticsearch

In [122]:
from elasticsearch import Elasticsearch

In [123]:
es_client = Elasticsearch('http://localhost:9200')

In [None]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

es_client.indices.create(index=index_name, body=index_settings)

In [127]:
documents[0]

{'text': 'Yes, but if you want to receive a certificate, you need to submit your project while we’re still accepting submissions.',
 'section': 'General course-related questions',
 'question': 'I just discovered the course. Can I still join?',
 'course': 'llm-zoomcamp'}

In [128]:
from tqdm.auto import tqdm

In [129]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/86 [00:00<?, ?it/s]

In [134]:
query = 'Can I still join the course??'

In [135]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [136]:
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [137]:
rag(query)

"I apologize, but there is no context provided for me to answer your question. Please provide the context from the FAQ database, and I'll be happy to help you with your question."