In [1]:
# Run parameters for this notebook.
COLUMN_NAME = "title"  # column of the loaded news frame whose text is analyzed
I = 0  # suffix of the output file name (dictionary_{I}.csv)
START = 0  # first row label of the slice taken with .loc
END = 19999  # last row label of the slice; .loc label slicing includes this row

# Dictionary 적재

In [2]:
from pathlib import Path
import os

# Move the working directory two levels up from the directory the kernel
# started in, so the "data" paths built from os.getcwd() below resolve.
# The starting directory is remembered on the first run, which keeps this cell
# idempotent: re-running it no longer climbs two more levels each time.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = Path(os.getcwd())
os.chdir(NOTEBOOK_DIR.parent.parent)

In [3]:
from os.path import join

import json
import ast
import timeit

import pandas as pd
import numpy as np
from elasticsearch import AsyncElasticsearch
from collections import Counter

In [4]:
# Directory layout, all derived from the current working directory:
#   <cwd>/data/raw            input files
#   <cwd>/data/preprocessing  intermediate outputs
#   <cwd>/data/bulk           bulk files
root_path = os.getcwd()
data_path = join(root_path, "data")
raw_path, pre_path, bulk_path = (
    join(data_path, sub_dir) for sub_dir in ("raw", "preprocessing", "bulk")
)

## Elasticsearch client

In [5]:
async def close(es_client: AsyncElasticsearch) -> None:
    """Close the given Elasticsearch client by awaiting its ``close()`` method.

    Args:
        es_client: The async client to close.
    """
    await es_client.close()

In [6]:
# Connection settings passed to AsyncElasticsearch. Host and credentials can be
# overridden through environment variables so secrets do not have to be edited
# into the notebook.
# NOTE(review): the literal fallbacks keep existing local runs working; drop the
# password fallback once ES_PASSWORD is provided by the environment.
es_config = {
    "hosts": os.environ.get("ES_HOSTS", "http://localhost:9200"),
    "http_auth": (
        os.environ.get("ES_USERNAME", "elastic"),
        os.environ.get("ES_PASSWORD", "42maru"),
    ),
    "scheme": "http"
}

In [7]:
# Async Elasticsearch client built from es_config; get_terms uses it and it is
# closed at the end of the notebook.
es_client = AsyncElasticsearch(**es_config)

## Load news_data_20000.csv

In [8]:
# Dataset label and the name of its CSV file inside raw_path.
target_name, target_file_name = "news", "news_data_20000.csv"

In [9]:
# Parse only the two columns the notebook uses instead of loading the whole
# file. The trailing selection keeps the title/content column order fixed
# regardless of the column order in the CSV.
target_df = pd.read_csv(
    join(raw_path, target_file_name),
    usecols=["title", "content"],
)[["title", "content"]]

In [10]:
# Preview the first rows to confirm the title/content columns loaded.
target_df.head()

Unnamed: 0,title,content
0,"안산시, 생활임금 시급 1만500원 의결",[헤럴드경제(안산)=박정규 기자]안산시(시장 윤화섭) 생활임금위원회는 2022년 생...
1,"울산 하이본 병원, 6월부터 본격 진료 ""척추와 관절 분야 전문""",척추와 관절 분야 전문인 울산 하이본 병원이 6월부터 본격 진료를 시작했다. \n\...
2,금융권 최초 공동 정보지갑서비스 출시…1호 증명서는 '뱅크아이디',[아시아경제 송승섭 기자]금융분산ID 추진 협의회는 국내 최초 정보지갑인 마이인포에...
3,“총장 후보자 수사심의위 신청은 모순” “정치적 수단 전락”,서울 서초구 대검찰청 검찰기. 연합뉴스 “정치적 중립성을 지키고 수사 결정 전 과정...
4,[인사]현대해상,◆현대해상 ◇임원 전보 △리스크관리본부장 홍사경 △고객지원본부장 황미은 △CCO 윤...


## Analyze text into terms

In [11]:
async def get_terms(
    query: str,
    index: str = "dictionary",
    analyzer: str = "korean_mixed_analyzer",
) -> list[str]:
    """Tokenize ``query`` with an Elasticsearch analyzer and return its distinct terms.

    Args:
        query: Text to analyze.
        index: Index the analyze request is sent to.
        analyzer: Name of the analyzer to run.

    Returns:
        The distinct tokens in order of first appearance. An empty list is
        returned without calling Elasticsearch when ``query`` is not a string
        or is blank (for example a missing value read from a CSV as NaN).

    Note:
        Uses the module-level ``es_client``.
    """
    # Nothing to analyze: skip the round trip instead of sending NaN/blank text.
    if not isinstance(query, str) or not query.strip():
        return []

    response = await es_client.indices.analyze(
        index=index,
        body={
            "analyzer": analyzer,
            "text": query
        }
    )
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(token["token"] for token in response["tokens"]))

In [12]:
# Smoke-test the analyzer on a sample title (the first row of the preview above).
terms = await get_terms("안산시, 생활임금 시급 1만500원 의결")

## Build the term dictionary

In [13]:
# Start from an empty single-column frame for the extracted dictionary terms.
dictionary_df = pd.DataFrame(columns=["dictionary"])

In [14]:
partial_series = target_df.loc[START:END, COLUMN_NAME]

start = timeit.default_timer()
for index, value in partial_series.items():
    if index % 100 == 0:
        stop = timeit.default_timer()
        print(f"{index}th row, {stop - start}")
    terms = await get_terms(value)
    dictionary_df = pd.concat([dictionary_df, pd.DataFrame(data=terms, columns=["dictionary"])], ignore_index=True)

0th row, 9.366699999979744e-05
100th row, 0.3243147500000001
200th row, 0.5984092090000006
300th row, 0.852913
400th row, 1.154368625
500th row, 1.4685454170000005
600th row, 1.7187252500000003
700th row, 2.0154444170000003
800th row, 2.3091982920000005
900th row, 2.567639959
1000th row, 2.8553194590000004
1100th row, 3.1132139589999994
1200th row, 3.3805720420000007
1300th row, 3.665128834000001
1400th row, 3.980722084
1500th row, 4.383577959
1600th row, 4.659521250000001
1700th row, 4.984237542000001
1800th row, 5.283519209
1900th row, 5.649941084
2000th row, 6.035315125
2100th row, 6.398661209
2200th row, 6.738504834
2300th row, 7.163927709000001
2400th row, 7.964307584
2500th row, 8.445130875
2600th row, 8.830494584
2700th row, 9.174216334
2800th row, 9.592409625
2900th row, 9.942711917
3000th row, 10.383452417
3100th row, 10.739038584000001
3200th row, 11.078501709
3300th row, 11.435341917
3400th row, 11.803134167000001
3500th row, 12.187079250000002
3600th row, 12.517053124999999

In [15]:
# Preview the first collected terms (duplicates across rows are still present here).
dictionary_df.head()

Unnamed: 0,dictionary
0,안산시
1,안산
2,시
3,생활
4,임금


In [16]:
# Total number of collected terms before de-duplication.
dictionary_df.shape

(185667, 1)

In [21]:
# Reduce the collected terms to a unique, sorted Series with a fresh 0..n-1 index,
# then show how many distinct terms remain.
dict_series = (
    dictionary_df["dictionary"]
    .drop_duplicates(ignore_index=True)
    .sort_values(ignore_index=True)
)
len(dict_series)

(185667, 1)

In [25]:
# Preview the de-duplicated, sorted terms straight from dict_series so this
# cell does not rely on new_dict_df, which is only created in a later cell
# (a fresh Restart & Run All would otherwise stop here with a NameError).
dict_series.to_frame(name="dictionary").head()

Unnamed: 0,dictionary
0,가
1,가가
2,가감
3,가게
4,가격


In [23]:
# Wrap the unique, sorted terms in a one-column frame for export and show its shape.
new_dict_df = dict_series.to_frame(name="dictionary")
new_dict_df.shape

(20430, 1)

In [28]:
# Make sure the output folder exists; to_csv does not create missing directories.
os.makedirs(pre_path, exist_ok=True)
# utf-8-sig prepends a BOM to the file (presumably so spreadsheet tools detect
# the encoding of the Korean text - confirm with the consumers of this file).
new_dict_df.to_csv(join(pre_path, f"dictionary_{I}.csv"), index=False, encoding="utf-8-sig")

In [27]:
# Close the Elasticsearch client once all analyze requests are done.
await es_client.close()