In [1]:
import openai
import elasticsearch

In [2]:
!wget https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json

--2024-06-25 08:22:02--  https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/alexeygrigorev/llm-rag-workshop/main/notebooks/documents.json [following]
--2024-06-25 08:22:02--  https://raw.githubusercontent.com/alexeygrigorev/llm-rag-workshop/main/notebooks/documents.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 658332 (643K) [text/plain]
Saving to: ‘documents.json’


2024-06-25 08:22:02 (33.1 MB/s) - ‘documents.json’ saved [658332/658332]



In [3]:
import json

In [4]:
with open('./documents.json', 'rt') as f_in:
    documents_all = json.load(f_in)

In [7]:
documents_all[0]['documents'][0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?'}

In [9]:
documents = []

for course in documents_all:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [10]:
len(documents)

948

In [13]:
documents[3]

{'text': "You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
 'section': 'General course-related questions',
 'question': 'Course - I have registered for the Data Engineering Bootcamp. When can I expect to receive the confirmation email?',
 'course': 'data-engineering-zoomcamp'}

In [15]:
from elasticsearch import Elasticsearch

In [16]:
# Starting an Elasticsearch instance (connecting to our existing one)
es_client = Elasticsearch(hosts = ['http://localhost:9200'])

In [17]:
es_client.info()

ObjectApiResponse({'name': '54db729ef7cd', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'feMkOA19S1aKyxyx-D9aCg', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [18]:
# creating an index in ElasticSearch
# We want to filter on the course attribute like "Select * From ... Where course = 'xxx' "

index_settings = {
    "settings": {
        # Kind a cluster in elastic search
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            # we want to break s text into separate words
            "question": {"type": "text"},
            # the ability to filter on this
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"
response = es_client.indices.create(index=index_name, body=index_settings)

response

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [19]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:24<00:00, 39.02it/s]


In [23]:
query = "I just discovered th ecourse. can I still join?"

# It is by default case insensetive
# There are other analyzers, which we can use for case sensitive problems (where we need it)

In [24]:
search_query = {
    # retrieves top 5
    "size": 5,
    "query": {
        "bool": {
            # There are must and should queries in elastic search
            # In our case we take one of them
            "must": {
                # We have 3 field in this case, where the search query can be included
                # The HAT 3 means that the question is 3 times more important than other sections (^3).
                ## So firstly it will search the question sectionand then the other ones, also called BOOST
                "multi_match": {
                    "query": query,
                    "fields": ["question^3", "text", "section"],
                    # Here it will take a hogh score for the fields it found
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    # here we only consider documents of this course
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
}

In [25]:
response = es_client.search(index=index_name, body=search_query)

In [27]:
response['hits']

{'total': {'value': 470, 'relation': 'eq'},
 'max_score': 53.894825,
 'hits': [{'_index': 'course-questions',
   '_id': 'avuGTpAB9aNjXEtHMxKX',
   '_score': 53.894825,
   '_source': {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
    'section': 'General course-related questions',
    'question': 'Course - Can I still join the course after the start date?',
    'course': 'data-engineering-zoomcamp'}},
  {'_index': 'course-questions',
   '_id': 'HvuHTpAB9aNjXEtHLhbJ',
   '_score': 53.894825,
   '_source': {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
    'section': 'General course-related questions',
    'question': 'Course - Can I still join

In [29]:
# Now we clean the output so that it's more fancy and readable

relevant_docs = []

for hit in response['hits']['hits']:
    doc = hit['_source']
    relevant_docs.append(doc)
    print(f"Section: {doc['section']}")
    print(f"Question: {doc['question']}")
    print(f"Answer: {doc['text'][:60]}...\n")

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to su...

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to su...

Section: General course-related questions
Question: Course - Can I get support if I take the course in the self-paced mode?
Answer: Yes, the slack channel remains open and you can ask question...

Section: General course-related questions
Question: Course - Can I get support if I take the course in the self-paced mode?
Answer: Yes, the slack channel remains open and you can ask question...

Section: General course-related questions
Question: Course - What can I do before the course starts?
Answer: You can start by installing and setting up all the dependenc...



In [30]:
relevant_docs

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, the slack channel remains open and you can ask questions there. But always sDocker containers exit code w search the channel first and second, check the FAQ (this document), most likely all your questio