In [1]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, parallel_bulk
import pandas as pd
import numpy as np
from tqdm import tqdm


In [2]:
# Client for the Elasticsearch instance listening on localhost:9200;
# every later cell talks to the cluster through this object.
es = Elasticsearch(hosts="http://localhost:9200")


In [3]:
# Load the document table; later cells filter it by 'sys_lang' and index its
# 'text1' / 'text2' columns into Elasticsearch.
df = pd.read_csv('df.csv')


In [4]:
# Keep only the rows whose language code is one of the four listed values.
# The surviving rows keep their original index labels, so after this step the
# labels are no longer guaranteed to be contiguous.
df = df.loc[df['sys_lang'].isin(['en', 'ru', 'kk', 'qq'])]
df

Unnamed: 0.1,Unnamed: 0,id,sys_lang,url,text1,text2
0,0,3087,en,https://beta2.egov.kz/services/3087?lang=en,rules for providing state services dear citize...,"issuance of passports, identification cards fo..."
1,1,3081,en,https://beta2.egov.kz/services/3081?lang=en,rules for providing state services information...,acceptance of documents for issuance of passpo...
2,5,3263,en,https://beta2.egov.kz/services/3263?lang=en,rules for providing state services information...,obtaining a certificate on the status of the i...
3,6,3040,en,https://beta2.egov.kz/services/3040?lang=en,rules for providing state services dear citize...,removal from registration at the place of resi...
4,7,3759,en,https://beta2.egov.kz/services/3759?lang=en,rules for providing state service what is temp...,issuance of a list on temporary work incapacit...
...,...,...,...,...,...,...
196129,207287,225,kk,https://beta2.egov.kz/situations/225/618?lang=kk,қр президенті жанындағы мемлекеттік басқару ак...,қазақстан республикасы президенті жанындағы ме...
196130,207288,225,kk,https://beta2.egov.kz/situations/225/619?lang=kk,қр президенті жанындағы мемлекеттік басқару ак...,қазақстан республикасы президенті жанындағы ме...
196131,207289,225,kk,https://beta2.egov.kz/situations/225/621?lang=kk,қр президенті жанындағы мемлекеттік басқару ак...,қазақстан республикасы президенті жанындағы ме...
196132,207290,225,kk,https://beta2.egov.kz/situations/225/623?lang=kk,қр президенті жанындағы мемлекеттік басқару ак...,қазақстан республикасы президенті жанындағы ме...


In [5]:
# Remove any 'my_index' left over from an earlier run so the index can be
# created from scratch when the notebook is re-executed.
if es.indices.exists(index='my_index'):
    es.indices.delete(index='my_index')

In [6]:
# Define the settings and mappings for the index.
#
# Analysis: a custom analyzer built on the standard tokenizer (max token length
# 30) followed by a lowercase filter. The corpus rows shown above are entirely
# lowercase, so without lowercasing at analysis time any capitalised query term
# (e.g. the first word of a question) could never match an indexed token.
#
# Mappings: both text fields use that analyzer and BM25 scoring.
settings = {
    "settings": {
        "analysis": {
            "tokenizer": {
                "custom_tokenizer": {
                    "type": "standard",
                    "max_token_length": 30
                }
            },
            "analyzer": {
                "custom_analyzer": {
                    "type": "custom",
                    "tokenizer": "custom_tokenizer",
                    "filter": ["lowercase"]
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "text1": {
                "type": "text",
                "analyzer": "custom_analyzer",
                "similarity": "BM25"
            },
            "text2": {
                "type": "text",
                "analyzer": "custom_analyzer",
                "similarity": "BM25"
            }
        }
    }
}

# Create the index. Settings and mappings are passed as separate keyword
# arguments because the catch-all `body` parameter is deprecated in the client
# and emits a DeprecationWarning.
es.indices.create(
    index="my_index",
    settings=settings["settings"],
    mappings=settings["mappings"],
)

  es.indices.create(index="my_index", body=settings)


{'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_index'}

In [7]:
def generate_bulk_data(df):
    """Yield one bulk "index" action per row of ``df``, targeting ``my_index``.

    Each action uses the row's DataFrame index label as the Elasticsearch
    document ``_id`` and stores the row's ``text1`` and ``text2`` values as the
    document source. Iteration is wrapped in a tqdm progress bar sized to the
    number of rows.

    Args:
        df: DataFrame with ``text1`` and ``text2`` columns.

    Yields:
        dict: An action dictionary in the format consumed by the bulk helper.
    """
    # Walk the index labels and the two text columns in lock-step.
    rows = zip(df.index, df['text1'], df['text2'])
    for doc_id, first_text, second_text in tqdm(rows, total=len(df)):
        action = {
            "_op_type": "index",
            "_index": "my_index",
            "_id": doc_id,  # DataFrame index label doubles as the document ID
            "_source": {
                "text1": first_text,
                "text2": second_text,
            },
        }
        yield action

# Bulk indexing: stream every generated action to Elasticsearch. The helper's
# return value (success count and list of errors) is shown as the cell output.
bulk(client=es, actions=generate_bulk_data(df))

100%|█████████████████████████████████| 195917/195917 [01:03<00:00, 3078.54it/s]


(195917, [])

In [8]:
def search(query):
    """Return the document ID of the best-scoring match for ``query``.

    Runs a ``multi_match`` query over the ``text1`` and ``text2`` fields of
    ``my_index`` and inspects only the top-ranked hit.

    Args:
        query: Free-text query string.

    Returns:
        int: The ``_id`` of the highest-scoring hit converted to ``int``, or
        ``0`` when nothing matches. Note that ``0`` is therefore ambiguous for
        callers: it is also a possible document ID.
    """
    response = es.search(
        index="my_index",
        # Passed by keyword: the catch-all `body` parameter is deprecated in
        # the client and would emit a DeprecationWarning on every call.
        query={
            "multi_match": {
                "query": query,
                "fields": ["text1", "text2"]
            }
        },
        # Only the first hit is ever read, so do not fetch a full page.
        size=1,
    )
    hits = response["hits"]["hits"]
    if not hits:
        return 0
    return int(hits[0]['_id'])
    

In [None]:
# Load the test-question table; later cells read its 'question' and 'id' columns.
epir_test = pd.read_csv('case1-datasaur/epir_test.csv')
epir_test

In [None]:
# Resolve every test question to the 'Unnamed: 0' value of its best match.
#
# search() returns the Elasticsearch `_id`, which was set to the DataFrame
# index *label* at indexing time. The row must therefore be looked up by label
# with `.loc`. Positional `.iloc` returns a different row, or raises
# IndexError, once the language filter has removed rows and labels no longer
# coincide with positions.
#
# NOTE(review): search() returns 0 when nothing matches, so such questions
# resolve to the row labelled 0 - confirm that this fallback is intended.
url_idxs = []

for question in epir_test['question']:
    df_idx = search(question)
    url_idxs.append(df.loc[df_idx, 'Unnamed: 0'].item())


In [None]:
# Pair each test-question id with the document index resolved for it.
res = pd.DataFrame(dict(id=epir_test['id'], index=url_idxs))
res


In [None]:
# Write the results to disk without the DataFrame's row index column.
res.to_csv('res.csv', index=False)