## Env Variables

In [None]:
import os

# Elasticsearch connection settings. Each one can be overridden with an
# environment variable of the same name, so real credentials never have to be
# typed into (and saved with) the notebook. The fallbacks are the original
# hard-coded defaults.
ELASTIC_ENDPOINT = os.getenv("ELASTIC_ENDPOINT", "")
ELASTIC_USERNAME = os.getenv("ELASTIC_USERNAME", "elastic")
ELASTIC_PASSWORD = os.getenv("ELASTIC_PASSWORD", "")
ELASTIC_INDEX = os.getenv("ELASTIC_INDEX", "dk_semantic_search")

# Input datasets and sampling parameters.
DIGIKALA_DATASET_PATH = "data/dk.csv"
CUSTOM_DATASET_PATH = "data/custom.csv"
SAMPLE_COUNT = 1000  # rows drawn from the Digikala dataset
RANDOM_STATE = 42  # seed that makes the sample reproducible

In [None]:
from elasticsearch import Elasticsearch

In [None]:
# Build the client with HTTP basic authentication, then ping the cluster so
# the cell output shows whether it is reachable.
credentials = (ELASTIC_USERNAME, ELASTIC_PASSWORD)
es = Elasticsearch(ELASTIC_ENDPOINT, basic_auth=credentials)
es.ping()

## Prepare the data

In [None]:
import pandas as pd

# Load the full Digikala dataset and display its (rows, columns) shape.
df_all = pd.read_csv(DIGIKALA_DATASET_PATH)
df_all.shape

In [None]:
# Work on a reproducible random subset of the dataset. The sample size is
# capped at the number of available rows because DataFrame.sample raises
# ValueError when asked for more rows than exist (sampling without
# replacement).
sample_size = min(SAMPLE_COUNT, len(df_all))
df = df_all.sample(n=sample_size, random_state=RANDOM_STATE)
del df_all  # release the full dataset; only the sample is used from here on
df.head()

In [None]:
# Count how many rows share each combination of missing (True) / present
# (False) values across the columns.
df.isna().value_counts()

In [None]:
# Replace every missing value with the literal string "None".
df = df.fillna("None")

In [None]:
# Load the additional, hand-maintained rows and preview them.
custom_csv = pd.read_csv(CUSTOM_DATASET_PATH)
custom_csv.head()

In [None]:
# Append the custom rows to the sampled data and renumber the index.
# The "None" placeholder is re-applied after merging: the custom rows were
# loaded after the earlier fillna, so any missing values they carry (or any
# columns they lack, which concat fills with NaN) would otherwise stay NaN.
df = pd.concat([df, custom_csv], ignore_index=True).fillna("None")
df.head()

In [None]:
# Show the last rows, i.e. where the appended custom rows end up.
df.tail()

## Convert the titles to vectors

In [None]:
from sentence_transformers import SentenceTransformer
# Instantiate the sentence-embedding model used to encode both the product
# titles and the search keyword.
model = SentenceTransformer("intfloat/multilingual-e5-large")

In [None]:
# Encode all titles in one batched call instead of one model invocation per
# row; encode() accepts a list of sentences and returns one embedding per
# input, in order. list() splits the 2-D result into one vector per row.
# NOTE(review): the E5 model card recommends prefixing inputs with
# "passage: " / "query: " — not applied here because indexing and querying
# must stay consistent with each other; confirm before changing.
title_vectors = model.encode(df["title_fa"].tolist())
df["titleVector"] = list(title_vectors)
df.head()

In [None]:
# List the column names; "titleVector" should now be among them.
df.columns

In [None]:
# Write each vector as a plain list of floats. to_csv stores object cells via
# their string form, and NumPy abbreviates arrays with more than 1000
# elements using "...", which would silently drop most of every 1024-d
# embedding. The conversion is done on a copy so `df` keeps its arrays.
csv_ready = df.assign(
    titleVector=df["titleVector"].map(lambda vector: [float(x) for x in vector])
)
csv_ready.to_csv("data/dk_small.csv", index=False)

## Create a new index in Elasticsearch

In [None]:
from indexMapping import indexMapping

# Creating an index that already exists raises an error, so only create it
# when it is missing; this keeps the cell safe to re-run. An existing index
# keeps whatever mapping it was created with.
if not es.indices.exists(index=ELASTIC_INDEX):
    es.indices.create(index=ELASTIC_INDEX, mappings=indexMapping)

## Ingest the data into the index

In [None]:
# One dict per row, keyed by column name, ready to be sent as documents.
record_list = df.to_dict(orient="records")

In [None]:
from elasticsearch import helpers

# Send the documents through the bulk helper instead of one HTTP request per
# record. Each action reuses the record's own "id" as the document id, so
# re-running the cell overwrites documents rather than duplicating them.
bulk_actions = (
    {"_index": ELASTIC_INDEX, "_id": record["id"], "_source": record}
    for record in record_list
)

# Best-effort like the original loop: failures are collected and printed
# (including the offending document's id) rather than aborting the ingest.
indexed_count, failures = helpers.bulk(
    es,
    bulk_actions,
    raise_on_error=False,
    raise_on_exception=False,
)
for failure in failures:
    print(failure)
print(f"Indexed {indexed_count} of {len(record_list)} documents")

In [None]:
# Ask Elasticsearch how many documents the index now holds.
es.count(index=ELASTIC_INDEX)

## Search the data

In [None]:
# Embed the search keyword with the same model that produced "titleVector".
input_keyword = "چاپگر"
vector_of_input_keyword = model.encode(input_keyword)

top_k = 5
query = {
    "field": "titleVector",
    # Plain list so the request body is JSON-serialisable without relying on
    # NumPy support in the client.
    "query_vector": vector_of_input_keyword.tolist(),
    "k": top_k,
    # Elasticsearch requires k <= num_candidates <= 10000.
    "num_candidates": max(top_k, min(SAMPLE_COUNT, 10_000)),
}

# The standalone kNN search API (`knn_search`) is deprecated in favour of the
# `knn` option of the regular search API (Elasticsearch 8.4+); the response
# has the same hits structure.
res = es.search(
    index=ELASTIC_INDEX,
    knn=query,
    size=top_k,
    source=["id", "title_fa", "Category1", "Category2"],
)
res["hits"]["hits"]

Now edit `searchApp.py` and run `streamlit run searchApp.py`