In [None]:
!pip install elasticsearch

In [10]:
import os 
import dotenv 
from elasticsearch import Elasticsearch, helpers

dotenv.load_dotenv()

client = Elasticsearch(
    "https://my-elasticsearch-project-f29d56.es.us-central1.gcp.elastic.cloud:443",
    api_key=os.getenv('ELASTICSEARCH_API_KEY')
)
# -----------------------------
# Define the index name and create index (if it doesn't exist)
# -----------------------------
index_name = "national-parks-questions"
if not client.indices.exists(index=index_name):
    create_response = client.indices.create(index=index_name)
    print("Index created:", create_response)
else:
    print(f"Index '{index_name}' already exists.")
# -----------------------------
# Add or update mappings for the index
# -----------------------------
mappings = {
    "properties": {
        "text": {
            "type": "semantic_text"
        }
    }
}
mapping_response = client.indices.put_mapping(
    index=index_name,
    body=mappings
)
print("Mappings updated:", mapping_response)
# -----------------------------
# Sample documents to ingest
# -----------------------------
docs = [
    {
        "text": "Yellowstone National Park is one of the largest national parks in the United States. It ranges from the Wyoming to Montana and Idaho, and contains an area of 2,219,791 acres across three different states. Its most famous for hosting the geyser Old Faithful and is centered on the Yellowstone Caldera, the largest super volcano on the American continent. Yellowstone is host to hundreds of species of animal, many of which are endangered or threatened. Most notably, it contains free-ranging herds of bison and elk, alongside bears, cougars and wolves. The national park receives over 4.5 million visitors annually and is a UNESCO World Heritage Site."
    },
    {
        "text": "Yosemite National Park is a United States National Park, covering over 750,000 acres of land in California. A UNESCO World Heritage Site, the park is best known for its granite cliffs, waterfalls and giant sequoia trees. Yosemite hosts over four million visitors in most years, with a peak of five million visitors in 2016. The park is home to a diverse range of wildlife, including mule deer, black bears, and the endangered Sierra Nevada bighorn sheep. The park has 1,200 square miles of wilderness, and is a popular destination for rock climbers, with over 3,000 feet of vertical granite to climb. Its most famous and cliff is the El Capitan, a 3,000 feet monolith along its tallest face."
    },
    {
        "text": "Rocky Mountain National Park is one of the most popular national parks in the United States. It receives over 4.5 million visitors annually, and is known for its mountainous terrain, including Longs Peak, which is the highest peak in the park. The park is home to a variety of wildlife, including elk, mule deer, moose, and bighorn sheep. The park is also home to a variety of ecosystems, including montane, subalpine, and alpine tundra. The park is a popular destination for hiking, camping, and wildlife viewing, and is a UNESCO World Heritage Site."
    }
]
# -----------------------------
# Bulk ingest documents
# -----------------------------
ingestion_timeout=300 # Allow time for semantic ML model to load
bulk_response = helpers.bulk(
    client.options(request_timeout=ingestion_timeout),
    docs,
    index=index_name,
    refresh="wait_for" # Wait until indexed documents are visible for search before returning the response
)
print(bulk_response)
# -----------------------------
# Define semantic search query
# -----------------------------
retriever_object = {
    "standard": {
        "query": {
            "semantic": {
                "field": "text",
                "query": "Sierra Nevada"
            }
        }
    }
}
search_response = client.search(
    index=index_name,
    retriever=retriever_object,
)
print(search_response['hits']['hits'])

Index created: {'acknowledged': True, 'shards_acknowledged': True, 'index': 'national-parks-questions'}
Mappings updated: {'acknowledged': True}
(3, [])
[{'_index': 'national-parks-questions', '_id': 'LN-qA5wBhIsKZOuoCsUA', '_score': 9.606189, '_source': {'text': 'Yosemite National Park is a United States National Park, covering over 750,000 acres of land in California. A UNESCO World Heritage Site, the park is best known for its granite cliffs, waterfalls and giant sequoia trees. Yosemite hosts over four million visitors in most years, with a peak of five million visitors in 2016. The park is home to a diverse range of wildlife, including mule deer, black bears, and the endangered Sierra Nevada bighorn sheep. The park has 1,200 square miles of wilderness, and is a popular destination for rock climbers, with over 3,000 feet of vertical granite to climb. Its most famous and cliff is the El Capitan, a 3,000 feet monolith along its tallest face.'}}, {'_index': 'national-parks-questions', 

In [None]:
%pip uninstall -y elasticsearch
%pip install "elasticsearch>=8,<9"

In [1]:
#Docker 
from elasticsearch import Elasticsearch
es_client = Elasticsearch("http://localhost:9200/")
es_client.info()

ObjectApiResponse({'name': '94d25bd7d523', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'WwIEJnfbT5uBidHTKQSsXw', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [4]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}
index_name = "course_faq"
es_client.indices.create(index=index_name,body=index_settings)
 

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course_faq'})

In [11]:
import json 

with open('documents.json','r') as f:
          docs = json.load(f)

documents_flatten = []

for doc in docs:
    for q_a in doc.get('documents'):
        q_a['course'] = doc.get('course')
        documents_flatten.append(q_a)

len(documents_flatten)

948

In [14]:
from tqdm.auto import tqdm 

for doc in tqdm(documents_flatten):
    es_client.index(index=index_name,document=doc)


100%|████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:03<00:00, 259.98it/s]


In [15]:
query = 'When is the deadline of course?' 
search_query =  {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
}

In [None]:
search_response = es_client.search(index=index_name,body=search_query)


In [19]:
print(search_response['hits']['hits']) 

[{'_index': 'course_faq', '_id': 'SFXLA5wBfbegxpJiWXkl', '_score': 30.219374, '_source': {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.", 'section': 'General course-related questions', 'question': 'Course - When will the course start?', 'course': 'data-engineering-zoomcamp'}}, {'_index': 'course_faq', '_id': 'aVXLA5wBfbegxpJiWXmz', '_score': 23.583492, '_source': {'text': 'Yes, this applies if you want to use Airflow or Prefect instead of Mage, AWS or Snowflake instead of GCP products or Tableau instead of Metabase or Google data studio.\nThe course covers 2 alterna

In [21]:
print(search_response['hits']['hits'][0].get('_source').get('text'))

The purpose of this document is to capture frequently asked technical questions
The exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1
Subscribe to course public Google Calendar (it works from Desktop only).
Register before the course starts using this link.
Join the course Telegram channel with announcements.
Don’t forget to register in DataTalks.Club's Slack and join the channel.


In [22]:
def searchwithelastic(query):
    search_query =  {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }
    search_response = es_client.search(index=index_name,body=search_query)
    return search_response['hits']['hits'][0].get('_source').get('text')

In [26]:
searchwithelastic('When is homework deadline?')

'Answer: In short, it’s your repository on github, gitlab, bitbucket, etc\nIn long, your repository or any other location you have your code where a reasonable person would look at it and think yes, you went through the week and exercises.'