#### **Import Libraries**

In [None]:
%cd D:/VNM-Multimodal-Video-Search/dataset

In [2]:
import os
import json
from elasticsearch import Elasticsearch, helpers, NotFoundError, RequestError

#### **Elastic Database**

In [3]:
class ElasticDB:
    """Small convenience wrapper around a single Elasticsearch index.

    All methods report success or failure with ``print`` and, where an
    error is handled, return ``None`` instead of raising.
    """

    def __init__(self, index_name, user_name, password, es_host="http://localhost:9200", request_timeout=60):
        """Connect to ``es_host`` with basic auth and remember ``index_name``."""
        self.index_name = index_name
        self.es = Elasticsearch(es_host, basic_auth=(user_name, password), request_timeout=request_timeout)

    def create_index(self, settings=None, mappings=None):
        """Create ``self.index_name``; if it already exists, delete it first.

        ``settings`` and ``mappings`` are added to the create request only
        when they are truthy. If the existing index cannot be deleted, the
        method prints a message and returns without creating anything.
        """
        try:
            if self.es.indices.exists(index=self.index_name):
                print(f"Index '{self.index_name}' already exists. So, we'll delete and create new index.")
                self.delete_index(self.index_name)
                # delete_index() swallows its own errors, so re-check instead
                # of recursing: a failed delete would otherwise loop forever.
                if self.es.indices.exists(index=self.index_name):
                    print(f"Index '{self.index_name}' could not be deleted. Skipping creation.")
                    return

            body = {}
            if settings:
                body["settings"] = settings
            if mappings:
                body["mappings"] = mappings
            self.es.indices.create(index=self.index_name, body=body)
            print(f"Index '{self.index_name}' created successfully.")
        except RequestError as e:
            print(f"Error creating index: {e.info}")

    def generate_id(self, L, V, ID_FRAME):
        """Build the document ID string from the L, V and ID_FRAME fields."""
        return f"./distilled_keyframe/{L}/{V}/{ID_FRAME}.jpg"

    def index_document(self, document):
        """Index a single document unless its ID is already present.

        The ID is derived from the document's ``'L'``, ``'V'`` and
        ``'ID_FRAME'`` keys. Returns the client response on success, or
        ``None`` when the document is skipped or a ``RequestError`` occurs.
        """
        try:
            doc_id = self.generate_id(document['L'], document['V'], document['ID_FRAME'])

            # Skip documents whose ID already exists in the index.
            if self.es.exists(index=self.index_name, id=doc_id):
                print(f"Document with ID: {doc_id} already exists. Skipping indexing.")
                return None

            response = self.es.index(index=self.index_name, id=doc_id, body=document)
            print(f"Document indexed with ID: {doc_id}")
            return response
        except RequestError as e:
            print(f"Error indexing document: {e.info}")
            return None

    def bulk_index_documents(self, documents):
        """Index many documents with a single bulk helper call.

        Each document's ID comes from ``generate_id``. No existence check is
        performed here: the actions carry no explicit op type, so a document
        whose ID is already in the index is sent again rather than skipped.
        """
        try:
            actions = [
                {
                    "_index": self.index_name,
                    "_id": self.generate_id(doc['L'], doc['V'], doc['ID_FRAME']),
                    "_source": doc,
                }
                for doc in documents
            ]

            if actions:
                helpers.bulk(self.es, actions)
                print(f"Bulk indexed {len(actions)} documents successfully.")
            else:
                print("No new documents to index.")
        except RequestError as e:
            print(f"Error bulk indexing documents: {e.info}")
        except helpers.BulkIndexError as e:
            # helpers.bulk reports per-document failures with this exception,
            # which is not a RequestError.
            print(f"Error bulk indexing documents: {e}")

    def load_data_from_directory(self, directory_path):
        """Collect the items of every ``*.json`` file under ``directory_path``.

        The tree is walked recursively; each file is parsed as UTF-8 JSON and
        its top-level items are appended to one flat list, which is returned.
        Directories and files are visited in sorted order so the result does
        not depend on filesystem enumeration order.
        """
        documents = []
        for root, dirs, files in os.walk(directory_path):
            dirs.sort()  # in-place sort steers os.walk's descent order
            for file in sorted(files):
                if file.endswith('.json'):
                    file_path = os.path.join(root, file)
                    with open(file_path, 'r', encoding='utf-8') as f:
                        documents.extend(json.load(f))
        return documents

    def delete_index(self, index_name):
        """Delete ``index_name`` if it exists.

        Returns the client response on success, or ``None`` when the index
        does not exist or the deletion fails.
        """
        if self.es.indices.exists(index=index_name):
            try:
                response = self.es.indices.delete(index=index_name)
                print(f"Index '{index_name}' deleted successfully.")
                return response
            except Exception as e:
                print(f"Delete '{index_name}' Fail. Error: {e}")
                return None
        else:
            print(f"Index '{index_name}' not exist.")
            return None

    def delete_document(self, index_name, id):
        """Delete the document ``id`` from ``index_name``.

        Returns the client response on success, or ``None`` on any error.
        """
        try:
            response = self.es.delete(index=index_name, id=id)
            print(f"Document with id '{id}' deleted successfully from index '{index_name}'.")
            return response
        except Exception as e:
            print(f"Delete document with id '{id}' Fail. Error: {e}")
            return None

    def list_all_indices(self):
        """Return the alias information for every index (pattern ``*``)."""
        # Keyword form: recent clients reject positional API arguments.
        return self.es.indices.get_alias(index="*")

#### **Create Elastic Database**

In [None]:
# Connect to the local cluster and drop the OCR index if it is present.
# NOTE(review): credentials are hard-coded in this cell.
es = ElasticDB(
    index_name="ocr_engine",
    user_name="elastic",
    password="123456",
)
es.delete_index("ocr_engine")

In [None]:
# Connect, (re)create the OCR index, then bulk-load every JSON file found
# under the feature directory.
# NOTE(review): credentials are hard-coded in this cell.
es = ElasticDB(
    index_name="ocr_engine",
    user_name="elastic",
    password="123456",
)

# Index-level settings passed to create_index.
settings = dict(number_of_shards=1, number_of_replicas=0)

# L, V and ID_FRAME are mapped as keyword fields; TEXT as a text field.
mappings = {
    "properties": {
        **{field: {"type": "keyword"} for field in ("L", "V", "ID_FRAME")},
        "TEXT": {"type": "text"},
    }
}

es.create_index(mappings=mappings, settings=settings)

# Path is relative to the current working directory.
directory_path = r"./filter/ocr_features"
documents = es.load_data_from_directory(directory_path)
es.bulk_index_documents(documents)