# Dictionary 적재

In [1]:
from pathlib import Path
import os

# Move the working directory two levels up from where the notebook starts,
# so the relative "data" paths used below resolve from there.
os.chdir(Path.cwd().parent.parent)

In [49]:
from os.path import join

import json
import ast
import timeit

import pandas as pd
import numpy as np
from elasticsearch import AsyncElasticsearch
from collections import Counter

In [3]:
# Directory layout, resolved against the current working directory:
#   <root>/data/raw            input files
#   <root>/data/preprocessing  intermediate CSV output
#   <root>/data/bulk           bulk payload output
root_path = os.getcwd()
data_path = join(root_path, "data")
raw_path, pre_path, bulk_path = (
    join(data_path, sub_dir) for sub_dir in ("raw", "preprocessing", "bulk")
)

## Elasticsearch client

In [13]:
async def close(es_client: AsyncElasticsearch) -> None:
    """Close the given Elasticsearch client by awaiting its ``close()``.

    Args:
        es_client: The async client to shut down.
    """
    await es_client.close()

In [15]:
# Keyword arguments passed to AsyncElasticsearch.
# The credentials can be overridden with the ES_USERNAME / ES_PASSWORD
# environment variables so real secrets do not have to live in the notebook;
# the fallbacks are the original local-development values.
es_config = {
    "hosts": "http://localhost:9200",
    "http_auth": (
        os.environ.get("ES_USERNAME", "elastic"),
        os.environ.get("ES_PASSWORD", "42maru"),
    ),
    "scheme": "http",
}

In [16]:
# Shared async client built from es_config; get_terms() below uses it.
es_client = AsyncElasticsearch(**es_config)

In [None]:
es_client.close()

## Load news_data_20000.csv

In [4]:
# Source dataset: a short label and the CSV file name that is read from the
# raw data directory.
target_name = "news"
target_file_name = "news_data_20000.csv"

In [10]:
# Read the raw CSV and keep only the two text columns that get analyzed.
news_csv_path = join(raw_path, target_file_name)
target_df = pd.read_csv(news_csv_path).loc[:, ["title", "content"]]

In [11]:
# Preview the first rows to check that the title/content columns loaded.
target_df.head()

Unnamed: 0,title,content
0,"안산시, 생활임금 시급 1만500원 의결",[헤럴드경제(안산)=박정규 기자]안산시(시장 윤화섭) 생활임금위원회는 2022년 생...
1,"울산 하이본 병원, 6월부터 본격 진료 ""척추와 관절 분야 전문""",척추와 관절 분야 전문인 울산 하이본 병원이 6월부터 본격 진료를 시작했다. \r\...
2,금융권 최초 공동 정보지갑서비스 출시…1호 증명서는 '뱅크아이디',[아시아경제 송승섭 기자]금융분산ID 추진 협의회는 국내 최초 정보지갑인 마이인포에...
3,“총장 후보자 수사심의위 신청은 모순” “정치적 수단 전락”,서울 서초구 대검찰청 검찰기. 연합뉴스 “정치적 중립성을 지키고 수사 결정 전 과정...
4,[인사]현대해상,◆현대해상 ◇임원 전보 △리스크관리본부장 홍사경 △고객지원본부장 황미은 △CCO 윤...


## Analyze text into terms

In [20]:
# Name of the Elasticsearch index passed to the analyze API in get_terms().
INDEX = "dictionary"

In [42]:
async def get_terms(query: str) -> list[str]:
    """Return the distinct tokens the analyzer produces for ``query``.

    The text is sent to the analyze API of the ``INDEX`` index using the
    ``korean_mixed_analyzer`` analyzer. A token that occurs several times is
    reported once, at the position of its first occurrence.

    Args:
        query: Text to analyze.

    Returns:
        The unique tokens in first-seen order.
    """
    analyze_request = {"analyzer": "korean_mixed_analyzer", "text": query}
    response = await es_client.indices.analyze(index=INDEX, body=analyze_request)
    # A dict keeps one key per distinct token, in insertion order.
    unique_tokens = dict.fromkeys(entry["token"] for entry in response["tokens"])
    return list(unique_tokens)

In [43]:
# Try the analyzer on one sample headline to inspect the resulting tokens.
await get_terms("안산시, 생활임금 시급 1만500원 의결")

['안산시', '안산', '시', '생활', '임금', '시급', '원', '의결']

## Build the dictionary

In [44]:
# Empty frame that will hold one analyzed term per row. The column must be
# named "dictionary" because the later cells sort, de-duplicate and export
# by that name (it was previously misspelled "dictoinary").
dictionary_df = pd.DataFrame(columns=["dictionary"])

In [51]:
start = timeit.default_timer()

for column, series in target_df.items():
    print(f"column: {column}")
    for index, value in series.items():
        if index % 100 == 0:
            stop = timeit.default_timer()
            print(f"{index}th row, {stop - start}")
        terms = await get_terms(value)
        dictionary_df = pd.concat([dictionary_df, pd.DataFrame(terms)], ignore_index=True)

column: title
0th row, 0.000537999999778549
100th row, 9.456434499999887
200th row, 18.995236100000056
300th row, 28.787248899999668
400th row, 38.58944519999977
500th row, 48.246252399999776
600th row, 58.12640209999972
700th row, 68.12610349999977
800th row, 78.28880290000006
900th row, 89.08022170000004
1000th row, 99.44027989999995
1100th row, 109.75192870000001
1200th row, 120.17947860000004
1300th row, 130.60229459999982


CancelledError: 

In [None]:
# Remove repeated terms: order by term, keep one row per term, then restore
# the original index order.
terms_by_value = dictionary_df.sort_values(by="dictionary")
unique_terms = terms_by_value.drop_duplicates(subset="dictionary")
dictionary_df = unique_terms.sort_index()

In [None]:
# Write the terms to the preprocessing directory without the index column.
# The "utf-8-sig" codec prefixes the file with a UTF-8 byte-order mark.
dictionary_csv_path = join(pre_path, "dictionary.csv")
dictionary_df.to_csv(dictionary_csv_path, index=False, encoding="utf-8-sig")

## bulk

In [None]:
# Bulk "index" action targeting the "dictionary" index, plus its JSON form;
# the serialized line is placed before every document in the bulk payload.
action_str = {"index": {"_index": "dictionary"}}
action = json.dumps(action_str)

In [None]:
# Build the bulk payload: for every term, one action line followed by one
# document line, each terminated by "\n". The pieces are collected in a list
# and joined once rather than growing a string with += on every row.
bulk_lines = []
for term in dictionary_df["dictionary"]:
    # ensure_ascii=False keeps non-ASCII terms readable instead of \u escapes.
    row_bulk = json.dumps({"dictionary": term}, ensure_ascii=False)
    bulk_lines.append(f"{action}\n{row_bulk}\n")

bulk_data = "".join(bulk_lines)

In [None]:
# Save the bulk payload as UTF-8 text in the bulk directory.
bulk_file_path = join(bulk_path, "dictionary.json")
with open(bulk_file_path, mode="w", encoding="utf-8") as bulk_file:
    bulk_file.write(bulk_data)