In [2]:
import MeCab

# Mecab 토크나이저 생성
mecab = MeCab.Tagger()

# 예시 텍스트
text = "건강한 사람이 에너지 균형을 평형 상태로 유지하는 것은 중요합니다."

# 형태소 분석 수행
parsed = mecab.parse(text)

In [3]:
parsed

'건강\tNNG,정적사태,T,건강,*,*,*,*\n한\tXSA+ETM,*,T,한,Inflect,XSA,ETM,하/XSA/*+ᆫ/ETM/*\n사람\tNNG,*,T,사람,*,*,*,*\n이\tJKS,*,F,이,*,*,*,*\n에너지\tNNG,*,F,에너지,*,*,*,*\n균형\tNNG,정적사태,T,균형,*,*,*,*\n을\tJKO,*,T,을,*,*,*,*\n평형\tNNG,정적사태,T,평형,*,*,*,*\n상태\tNNG,정적사태,F,상태,*,*,*,*\n로\tJKB,*,F,로,*,*,*,*\n유지\tNNG,*,F,유지,*,*,*,*\n하\tXSV,*,F,하,*,*,*,*\n는\tETM,*,T,는,*,*,*,*\n것\tNNB,*,T,것,*,*,*,*\n은\tJX,*,T,은,*,*,*,*\n중요\tNNG,*,F,중요,*,*,*,*\n합니다\tXSV+EF,*,F,합니다,Inflect,XSV,EF,하/XSV/*+ᄇ니다/EF/*\n.\tSF,*,*,*,*,*,*,*\nEOS\n'

In [None]:
import MeCab

# Mecab 토크나이저 생성
mecab = MeCab.Tagger()

# 불필요한 품사 태그 정의 (세종 품사 태그 기준)
stoptags = {"E", "J", "SC", "SE", "SF", "VCN", "VCP", "VX"}

# Mecab을 사용하여 문서 토큰화 및 불필요한 품사 태그 제거
def tokenize_with_mecab(text):
    tokens = mecab.parse(text).splitlines()  # Mecab 결과를 라인 단위로 분리
    processed_tokens = []
    
    for token in tokens:
        if "\t" in token:  # 형태소와 품사 태그가 \t로 구분됨
            word, tag_info = token.split("\t")
            pos_tag = tag_info.split(",")[0]  # 품사 태그는 ,로 구분된 첫 번째 요소
            if pos_tag not in stoptags:  # 불필요한 품사 태그가 아닌 경우에만 추가
                processed_tokens.append(word)
    
    return processed_tokens

# 예시 문장
text = "금성에서 달의 관측 모습은 매우 흥미롭다."

# 토큰화 및 불필요한 품사 제거
filtered_tokens = tokenize_with_mecab(text)

print("필터링된 토큰:", filtered_tokens)


In [4]:
import jsonlines
import MeCab
from rank_bm25 import BM25Okapi

# Mecab 토크나이저 생성
mecab = MeCab.Tagger()

# JSONL 파일 경로
jsonl_file_path = '/upstage-ai-advanced-ir7/data/documents.jsonl'

stoptags = {"E", "J", "SC", "SE", "SF", "VCN", "VCP", "VX"}
# 문서와 docid 저장할 리스트
documents = []
docids = []

# JSONL 파일 읽기
with jsonlines.open(jsonl_file_path) as reader:
    for obj in reader:
        docids.append(obj['docid'])      # docid 저장
        documents.append(obj['content']) # content 저장

# Mecab을 사용하여 문서 토큰화
def tokenize_with_mecab(text):
    tokens = mecab.parse(text).splitlines()  # Mecab 결과를 라인 단위로 분리
    processed_tokens = []
    
    for token in tokens:
        if "\t" in token:  # 형태소와 품사 태그가 \t로 구분됨
            word, tag_info = token.split("\t")
            pos_tag = tag_info.split(",")[0]  # 품사 태그는 ,로 구분된 첫 번째 요소
            if pos_tag not in stoptags:  # 불필요한 품사 태그가 아닌 경우에만 추가
                processed_tokens.append(word)
    
    return processed_tokens

tokenized_corpus = [tokenize_with_mecab(doc) for doc in documents]

# BM25 인덱서 생성
bm25 = BM25Okapi(tokenized_corpus)


In [5]:
# 검색할 쿼리
query = "금성에서 달의 관측 모습"  # 예시 쿼리
tokenized_query = tokenize_with_mecab(query)

# BM25로 점수 계산
doc_scores = bm25.get_scores(tokenized_query)

# 점수가 높은 순서대로 문서 정렬 (상위 200개만)
ranked_docs = sorted(zip(docids, doc_scores), key=lambda x: x[1], reverse=True)[:200]

# 결과 출력 (상위 200개만)
print("검색 결과 (상위 200개):")
for docid, score in ranked_docs:
    print(f"DocID: {docid}, Score: {score}")

검색 결과 (상위 200개):
DocID: 35c5dcc7-4720-4318-901e-770105ae63fd, Score: 23.255246546985646
DocID: 553989d9-ee23-4203-b244-a941b6fa8d99, Score: 21.834324928354185
DocID: b2e0e809-c9e9-4465-9248-07a9b49b034f, Score: 20.79078652142007
DocID: efb313ef-d7af-4d82-86f4-b5f013714a0c, Score: 19.800191372463598
DocID: 45b8eb6a-87e3-4333-b01b-7c8b772f827f, Score: 17.722095036521026
DocID: bb6d04b6-a6cf-4a9f-8324-4e06e6e81c86, Score: 17.700545666361972
DocID: da6c8a3f-45a9-4025-a63a-47c05ba2b336, Score: 17.358241879199515
DocID: 340485f8-4e78-44f4-a53a-2df21915367f, Score: 16.891084046529215
DocID: 464ace62-ddf2-423d-a5d7-2f17e6785c8e, Score: 16.693979158365906
DocID: 8a78364e-63bf-4915-b718-fdc461bc62c9, Score: 16.670926297164456
DocID: 2b40e339-174c-462f-8607-7a6be35ccd6e, Score: 16.644933505247753
DocID: d1cbb6a8-6346-4e84-b294-0c8e84d37c07, Score: 16.573896850266394
DocID: 43b53301-468b-41a2-ad67-63d8ecd84596, Score: 16.429139000098953
DocID: 79216c43-fe13-4413-abcc-a8b9f70dcdad, Score: 16.292181

In [18]:
import numpy as np

def min_max_normalize(ranked_docs):
    """
    Min-Max 정규화를 수행하는 함수
    Args:
        ranked_docs (list): (docid, score)의 리스트
    Returns:
        normalized_ranked_docs (list): 정규화된 (docid, normalized_score)의 리스트
    """
    # 점수만 추출
    scores = np.array([score for _, score in ranked_docs])

    # Min-Max 정규화
    min_score = np.min(scores)
    max_score = np.max(scores)
    
    # 0으로 나누는 오류를 방지 (최소값과 최대값이 같은 경우)
    if max_score == min_score:
        normalized_scores = np.ones_like(scores)
    else:
        normalized_scores = (scores - min_score) / (max_score - min_score)

    # 정규화된 점수와 docid를 다시 결합
    normalized_ranked_docs = [(docid, score) for (docid, _), score in zip(ranked_docs, normalized_scores)]
    
    return normalized_ranked_docs

# 함수 호출
normalized_ranked_docs = min_max_normalize(ranked_docs)

In [19]:
normalized_ranked_docs

[('35c5dcc7-4720-4318-901e-770105ae63fd', 1.0),
 ('553989d9-ee23-4203-b244-a941b6fa8d99', 0.9444848888812836),
 ('b2e0e809-c9e9-4465-9248-07a9b49b034f', 0.8842733457626346),
 ('efb313ef-d7af-4d82-86f4-b5f013714a0c', 0.8193222278618356),
 ('bb6d04b6-a6cf-4a9f-8324-4e06e6e81c86', 0.6831144213783673),
 ('340485f8-4e78-44f4-a53a-2df21915367f', 0.6269314548145943),
 ('2b40e339-174c-462f-8607-7a6be35ccd6e', 0.6226977606628725),
 ('d1cbb6a8-6346-4e84-b294-0c8e84d37c07', 0.6169972099203739),
 ('8a78364e-63bf-4915-b718-fdc461bc62c9', 0.6149938243837528),
 ('79216c43-fe13-4413-abcc-a8b9f70dcdad', 0.5730639633952893),
 ('f016fa89-bfab-44ab-a7e4-a8979cf931ec', 0.5609983469940513),
 ('89cc1287-30e4-4319-8288-9453ea1ebdac', 0.5548349253922615),
 ('4d59d546-b596-4a07-aa3f-6d6e4e6d1129', 0.54882121864961),
 ('68fa9783-3297-4a26-a50d-5a3d0b7a4159', 0.5453978045150492),
 ('cb579876-ea7a-4bd8-aae3-ceee27238435', 0.532464206211747),
 ('940293bb-5bbf-475c-bae4-12b46e9fe8a3', 0.5304403908427593),
 ('fcfee92

In [14]:
print("Min-Max 정규화된 결과:")
for docid, score in normalized_ranked_docs:
    print(f"DocID: {docid}, Normalized Score: {score}")

Min-Max 정규화된 결과:
DocID: 42508ee0-c543-4338-878e-d98c6babee66, Normalized Score: 1.0
DocID: b2b8016e-70e6-4671-8c5e-1f0be65a929a, Normalized Score: 0.858401391619583
DocID: 07e56a06-cdba-4f74-8e9c-2f1940e4a382, Normalized Score: 0.6375152254111333
DocID: 3635bcf6-0390-48ca-8654-918b5b0d4d55, Normalized Score: 0.6190418417076347
DocID: 94cd01c5-d04f-4998-937d-3160ab46a1f1, Normalized Score: 0.58258327078511
DocID: 6382146e-8065-454c-9796-abb43a0c038d, Normalized Score: 0.5579644364752666
DocID: 03ba25ae-cd21-4d49-b192-09b53302f610, Normalized Score: 0.5380274709758294
DocID: 8d02085d-eb02-4c6a-a317-f12bb9573ae7, Normalized Score: 0.531699509015633
DocID: 3369e44b-ef46-4280-b377-2a1dee7b93e3, Normalized Score: 0.528317094671855
DocID: 3a6f491b-228e-40c4-a805-af9a523ecbd0, Normalized Score: 0.5038325804790447
DocID: 4534c0e3-946a-4b02-bdf9-e18ce1f1633a, Normalized Score: 0.5030836352898936
DocID: d198a560-29f2-45e3-a6c8-71738a6b450b, Normalized Score: 0.4956940972523521
DocID: 18290d45-2bf

In [15]:
def z_score_normalize(ranked_docs):
    """
    Z-Score 정규화를 수행하는 함수
    Args:
        ranked_docs (list): (docid, score)의 리스트
    Returns:
        normalized_ranked_docs (list): Z-Score로 정규화된 (docid, normalized_score)의 리스트
    """
    # 점수만 추출
    scores = np.array([score for _, score in ranked_docs])

    # Z-Score 정규화
    mean_score = np.mean(scores)
    std_dev = np.std(scores)
    
    # 표준편차가 0인 경우(모든 점수가 동일한 경우) 처리
    if std_dev == 0:
        normalized_scores = np.zeros_like(scores)
    else:
        normalized_scores = (scores - mean_score) / std_dev

    # 정규화된 점수와 docid를 다시 결합
    normalized_ranked_docs = [(docid, score) for (docid, _), score in zip(ranked_docs, normalized_scores)]
    
    return normalized_ranked_docs

# 함수 호출
normalized_ranked_docs = z_score_normalize(ranked_docs)

# 결과 출력
print("Z-Score 정규화된 결과:")
for docid, score in normalized_ranked_docs:
    print(f"DocID: {docid}, Normalized Score: {score}")

Z-Score 정규화된 결과:
DocID: 42508ee0-c543-4338-878e-d98c6babee66, Normalized Score: 5.069031226397429
DocID: b2b8016e-70e6-4671-8c5e-1f0be65a929a, Normalized Score: 4.242397929024895
DocID: 07e56a06-cdba-4f74-8e9c-2f1940e4a382, Normalized Score: 2.9528947198809243
DocID: 3635bcf6-0390-48ca-8654-918b5b0d4d55, Normalized Score: 2.845049633978491
DocID: 94cd01c5-d04f-4998-937d-3160ab46a1f1, Normalized Score: 2.6322094864382772
DocID: 6382146e-8065-454c-9796-abb43a0c038d, Normalized Score: 2.488488100926212
DocID: 03ba25ae-cd21-4d49-b192-09b53302f610, Normalized Score: 2.3720988247962813
DocID: 8d02085d-eb02-4c6a-a317-f12bb9573ae7, Normalized Score: 2.3351570488802937
DocID: 3369e44b-ef46-4280-b377-2a1dee7b93e3, Normalized Score: 2.3154109768386415
DocID: 3a6f491b-228e-40c4-a805-af9a523ecbd0, Normalized Score: 2.1724737337875544
DocID: 4534c0e3-946a-4b02-bdf9-e18ce1f1633a, Normalized Score: 2.1681014942695227
DocID: d198a560-29f2-45e3-a6c8-71738a6b450b, Normalized Score: 2.124962382495049
DocI