In [14]:
import minsearch
import json

In [31]:
with open('documents-llm.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [32]:
documents = []

In [33]:
for course_dict in docs_raw:
    for docs in course_dict['documents']:
        docs['course'] = course_dict['course']
        documents.append(docs)

In [34]:
documents[0]

{'text': 'Yes, but if you want to receive a certificate, you need to submit your project while we’re still accepting submissions.',
 'section': 'General course-related questions',
 'question': 'I just discovered the course. Can I still join?',
 'course': 'llm-zoomcamp'}

In [35]:
# We are creating a search index using minsearch library based on the structure of the document. This makes the document easy to search using feilds like 'text' and 'question'

index = minsearch.Index(
    text_fields=["question", "section", "text"],
    keyword_fields=["course"]
)

In [36]:
q = 'the course has already started, can I still enroll?'

In [37]:
index.fit(documents)

<minsearch.minsearch.Index at 0x70b0195a7cb0>

In [38]:
from openai import OpenAI

In [42]:
client = OpenAI(api_key= your_key, base_url="https://api.groq.com/openai/v1")

In [43]:
import os
os.environ["OPENAI_API_KEY"] = "your-key"


In [44]:
response = client.chat.completions.create(
    model='llama3-70b-8192',
    messages=[{"role":"user", "content": q}]
)
print(response.choices[0].message.content)

Whether you can still enroll in a course that has already started depends on several factors, which I'll outline below:

**Factors to consider:**

1. **Course format**: If the course is self-paced or has flexible start dates, you might be able to enroll at any time. However, if it's a traditional, instructor-led course with fixed start and end dates, it's less likely you can join mid-way.
2. **Instructor permission**: Reach out to the course instructor or teaching assistant to ask if they can accommodate a late enrollment. They might require you to catch up on missed material or assignments, or they might not allow late enrollment at all.
3. **Course platform or institution**: Check the course platform or institution's policy on late enrollments. Some might have specific rules or procedures for enrolling in a course that has already started.
4. **Coursework and assignments**: If the course has already started, you might have missed important lectures, assignments, or discussions. You'l

In [45]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'llm-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

In [46]:
search('what is data warehouse')

[{'text': 'Cosine similarity is a measure used to calculate the similarity between two non-zero vectors, often used in text analysis to determine how similar two documents are based on their content. This metric computes the cosine of the angle between two vectors, which are typically word counts or TF-IDF values of the documents. The cosine similarity value ranges from -1 to 1, where 1 indicates that the vectors are identical, 0 indicates that the vectors are orthogonal (no similarity), and -1 represents completely opposite vectors.',
  'section': 'Module 3: X',
  'question': 'What is the cosine similarity?',
  'course': 'llm-zoomcamp'},
 {'text': 'A “document” is a collection of fields, which are the key-value pairs that contain your data, that have been serialized as a JSON object.',
  'section': 'Module 3: X',
  'question': 'What are documents in ElasticSearch?',
  'course': 'llm-zoomcamp'},
 {'text': 'When you stop the container, the data you previously added to elastic will be go

In [47]:
def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [48]:
def llm(prompt):
    response = client.chat.completions.create(
        model='llama3-70b-8192',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [49]:
query = 'can I still join the course?'

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [50]:
rag(query)

'According to the provided FAQ database, the answer to your question "can I still join the course?" is yes. However, if you want to receive a certificate, you need to submit your project while the course is still accepting submissions.'

In [51]:
rag('the course has already started, can I still enroll?')

'Based on the context, the answer to the question "Can I still enroll?" is YES. Although the course has already started, you can still join and even receive a certificate if you submit your project while submissions are still being accepted.'

In [None]:
documents[0]

 ## RAG with Vector Search

In [52]:
from qdrant_client import QdrantClient, models

In [53]:
qd_client = QdrantClient("http://localhost:6333") #connecting to local Qdrant instance

In [54]:
EMBEDDING_DIMENSIONALITY = 512
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [57]:
collection_name = "zoomcamp-faq"

In [58]:
qd_client.delete_collection(collection_name=collection_name)

True

In [61]:
qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,
        distance=models.Distance.COSINE
    )
)

True

In [64]:
qd_client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword"
)

UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [None]:
points = []

for i, doc in enumerate(documents):
    text = doc['question'] + ' ' + doc['text']
    vector = models.Document(text=text, model=model_handle)
    point = models.PointStruct(
        id=i,
        vector=vector,
        payload=doc
    )
    points.append(point)

In [65]:
qd_client.upsert(
    collection_name=collection_name,
    points=points
)

UpdateResult(operation_id=3, status=<UpdateStatus.COMPLETED: 'completed'>)

In [66]:
question = 'I just discovered the course. Can I still join it?'

In [67]:
def vector_search(question):
    print('vector_search is used')
    
    course = 'data-engineering-zoomcamp'
    query_points = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=question,
            model=model_handle 
        ),
        query_filter=models.Filter( 
            must=[
                models.FieldCondition(
                    key="course",
                    match=models.MatchValue(value=course)
                )
            ]
        ),
        limit=5,
        with_payload=True
    )
    
    results = []
    
    for point in query_points.points:
        results.append(point.payload)
    
    return results

In [68]:
def rag(query):
    search_results = vector_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [69]:
rag('how do I run kafka?')

vector_search is used


"Unfortunately, the context is empty, so I don't have enough information to provide a specific answer on how to run Kafka. If you can provide more context or details about the environment, setup, or Kafka version, I'd be happy to help!"