In [1]:
COLUMN_NAME = "content"  # column of the loaded frame whose text is analyzed
CHUNK_SIZE = 20000  # number of rows handled by one run of this notebook
I = 1  # 1-based chunk number; also used in the output file name
START = (I - 1) * CHUNK_SIZE  # first row label of the chunk
END = START + CHUNK_SIZE - 1  # last row label, inclusive (`.loc` slices include the end)

# Dictionary 적재

In [2]:
from pathlib import Path
import os

# Move the working directory two levels up from where the notebook started.
# The starting directory is remembered on the first run so that re-executing
# this cell in the same kernel does not climb two more levels each time.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = Path(os.getcwd())
os.chdir(NOTEBOOK_DIR.parent.parent)

In [3]:
from os.path import join

import json
import ast
import timeit

import pandas as pd
import numpy as np
from elasticsearch import AsyncElasticsearch
from collections import Counter

In [4]:
# Every data directory is resolved relative to the current working directory.
root_path = os.getcwd()
data_path = join(root_path, "data")
raw_path, pre_path, bulk_path = (
    join(data_path, leaf) for leaf in ("raw", "preprocessing", "bulk")
)

## Elasticsearch client

In [5]:
async def close(es_client: AsyncElasticsearch):
    """Await the ``close()`` coroutine of the given Elasticsearch client.

    Args:
        es_client: The async client to close.
    """
    shutdown = es_client.close()
    await shutdown

In [6]:
# Connection settings for Elasticsearch. Host, user and password can each be
# overridden through an environment variable so credentials do not have to be
# edited into the notebook.
# NOTE(review): the fallback password is kept only so existing setups keep
# working; rotate it and drop the default once ES_PASSWORD is set everywhere.
es_config = {
    "hosts": os.environ.get("ES_HOST", "http://localhost:9200"),
    "http_auth": (
        os.environ.get("ES_USER", "elastic"),
        os.environ.get("ES_PASSWORD", "42maru"),
    ),
    "scheme": "http",
}

In [7]:
# Shared async client built from `es_config`; `get_terms` uses it as a global.
es_client = AsyncElasticsearch(**es_config)

## Load news_data_20000.csv

In [8]:
# Label for the dataset and the name of the CSV file that is read from `raw_path`.
target_name = "news"
target_file_name = "news_data_20000.csv"

In [9]:
# Read the raw CSV and keep only the title and content columns.
source_csv = join(raw_path, target_file_name)
target_df = pd.read_csv(source_csv)[["title", "content"]]

In [10]:
# Preview the first rows of the loaded frame.
target_df.head()

Unnamed: 0,title,content
0,"안산시, 생활임금 시급 1만500원 의결",[헤럴드경제(안산)=박정규 기자]안산시(시장 윤화섭) 생활임금위원회는 2022년 생...
1,"울산 하이본 병원, 6월부터 본격 진료 ""척추와 관절 분야 전문""",척추와 관절 분야 전문인 울산 하이본 병원이 6월부터 본격 진료를 시작했다. \n\...
2,금융권 최초 공동 정보지갑서비스 출시…1호 증명서는 '뱅크아이디',[아시아경제 송승섭 기자]금융분산ID 추진 협의회는 국내 최초 정보지갑인 마이인포에...
3,“총장 후보자 수사심의위 신청은 모순” “정치적 수단 전락”,서울 서초구 대검찰청 검찰기. 연합뉴스 “정치적 중립성을 지키고 수사 결정 전 과정...
4,[인사]현대해상,◆현대해상 ◇임원 전보 △리스크관리본부장 홍사경 △고객지원본부장 황미은 △CCO 윤...


## Tokenize text with the Elasticsearch analyzer

In [11]:
async def get_terms(query: str) -> list[str]:
    """Return the distinct analyzer tokens of ``query``.

    The text is passed to ``es_client.indices.analyze`` for the ``dictionary``
    index with the ``korean_mixed_analyzer`` analyzer. ``es_client`` is the
    module-level client.

    Args:
        query: Text to tokenize.

    Returns:
        The tokens with duplicates removed, in order of first appearance.
        An empty list when ``query`` is not a string (for example a missing
        value read from a DataFrame) or is empty; no request is sent in
        that case.
    """
    # Missing cells come out of pandas as float NaN; skip them instead of
    # sending a non-text payload to Elasticsearch.
    if not isinstance(query, str) or not query:
        return []

    response = await es_client.indices.analyze(
        index="dictionary",
        body={
            "analyzer": "korean_mixed_analyzer",
            "text": query,
        },
    )
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(token["token"] for token in response["tokens"]))

In [12]:
# Try the analyzer call on a single sample headline.
terms = await get_terms("안산시, 생활임금 시급 1만500원 의결")

## Build the term dictionary

In [13]:
# Start from an empty frame with a single "dictionary" column so that
# `dictionary_df` exists before the analysis loop runs.
dictionary_df = pd.DataFrame(columns=["dictionary"])

In [14]:
partial_series = target_df.loc[START:END, COLUMN_NAME]

start = timeit.default_timer()
for index, value in partial_series.items():
    if index % 100 == 0:
        stop = timeit.default_timer()
        print(f"{index}th row, {stop - start}")
    terms = await get_terms(value)
    dictionary_df = pd.concat([dictionary_df, pd.DataFrame(data=terms, columns=["dictionary"])], ignore_index=True)

0th row, 8.562499999964501e-05
100th row, 0.4319714579999996
200th row, 0.9411792080000003
300th row, 1.4020822079999995
400th row, 1.9199462499999997
500th row, 2.46599275
600th row, 3.134784917
700th row, 3.6873559579999995
800th row, 4.226666583
900th row, 4.7891797920000005
1000th row, 5.40172175
1100th row, 5.920452542
1200th row, 6.433045207999999
1300th row, 7.077325875
1400th row, 7.807831332999999
1500th row, 8.403308707999999
1600th row, 8.979479583
1700th row, 9.537917292
1800th row, 10.270692541999999
1900th row, 10.85922375
2000th row, 11.49976075
2100th row, 12.146882332999999
2200th row, 12.775832792000001
2300th row, 13.463917167
2400th row, 14.133349832999999
2500th row, 14.831604042
2600th row, 15.484729958
2700th row, 16.206206792000003
2800th row, 16.935765916999998
2900th row, 17.647917874999997
3000th row, 18.311192083
3100th row, 18.964965458
3200th row, 19.636684291999998
3300th row, 20.37875925
3400th row, 21.111317375
3500th row, 21.862433333
3600th row, 22.60

In [15]:
# Preview the first collected terms.
dictionary_df.head()

Unnamed: 0,dictionary
0,헤럴드
1,경제
2,안산
3,박정규
4,기자


In [16]:
# Total number of collected terms (duplicates across rows included).
dictionary_df.shape

(2439344, 1)

In [17]:
# Reduce the collected terms to a sorted series of unique values,
# renumbering the index after each step.
dict_series = (
    dictionary_df["dictionary"]
    .drop_duplicates(ignore_index=True)
    .sort_values(ignore_index=True)
)
len(dict_series)

71714

In [18]:
# Wrap the unique, sorted terms in a one-column frame for export.
new_dict_df = dict_series.to_frame(name="dictionary")
new_dict_df.shape

(71714, 1)

In [19]:
# Preview the first entries of the de-duplicated, sorted dictionary.
new_dict_df.head()

Unnamed: 0,dictionary
0,read
1,ㄱ
2,ㄴ
3,ㄷ
4,ㄹ


In [20]:
# Write this chunk's dictionary to `pre_path`, creating the directory first so
# the export does not fail when it does not exist yet.
os.makedirs(pre_path, exist_ok=True)
output_csv = join(pre_path, f"dictionary_{I}.csv")
# "utf-8-sig" prepends a BOM to the file.
new_dict_df.to_csv(output_csv, index=False, encoding="utf-8-sig")

In [21]:
# Close the Elasticsearch client now that the analysis is finished.
await es_client.close()