In [1]:
import os
import pandas as pd
import numpy as np
import classla
import string
from collections import Counter
import re

# Initialize the CLASSLA pipeline for Croatian (tokenizer, POS tagger, lemmatizer)
nlp = classla.Pipeline(lang='hr', processors='tokenize, pos, lemma')

# Load the data and build a dictionary mapping each document id to its title
x = pd.read_json('individual_data.json')
new_dict = {doc_id: title for doc_id, title in zip(x['id'], x['title'])}

# Load the stop words: one per line, lower-cased, blank lines skipped
with open('stopwords-hr.txt', 'r', encoding='utf-8') as f:
    stopwords = {entry.lower() for entry in map(str.strip, f) if entry}

# Cleaning and lemmatization helper
def clean_lemmatize(text):
    """Return the filtered, lower-cased lemmas of ``text``.

    Non-string input yields an empty list. Otherwise the text is run
    through the module-level ``nlp`` pipeline and every word's lemma is
    lower-cased and stripped. A lemma is dropped when it is empty, when it
    occurs as a substring of ``string.punctuation`` (ASCII punctuation
    only), or when it is in the module-level ``stopwords`` set.
    """
    if not isinstance(text, str):
        return []

    kept = []
    for sentence in nlp(text).sentences:
        normalized = (token.lemma.lower().strip() for token in sentence.words)
        kept.extend(
            lem
            for lem in normalized
            if lem and lem not in string.punctuation and lem not in stopwords
        )
    return kept

# Load the cached lemmatized data, or generate it and write the cache
lemmatized_file = 'lemmatized_data.pkl'
if os.path.exists(lemmatized_file):
    # NOTE: unpickling executes code embedded in the file; only load a cache
    # that this script produced itself. The cache is also reused as-is even
    # if 'individual_data.json' has changed since it was written.
    df = pd.read_pickle(lemmatized_file)
else:
    df = pd.read_json('individual_data.json')
    df = df.drop('title', axis=1)
    df['body'] = df['body'].apply(clean_lemmatize)
    df.to_pickle(lemmatized_file)

# Build the vocabulary: every distinct lemma that occurs in any document body
vocab = set()
for body in df['body']:
    vocab.update(body)
# Sort so the term order (and thus the matrix row order) is identical on
# every run; plain set iteration order for strings varies between runs.
vocablist = sorted(vocab)

# Build the term-document matrix
def term_document_matrix(data, vocab, document_index='id', text='body'):
    """Count how often each vocabulary term occurs in each document.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per document; must contain the ``document_index`` column
        and a ``text`` column holding an iterable of lemmas per row.
    vocab : sequence
        Terms that become the row labels of the result.
    document_index : str
        Name of the column whose values become the column labels.
    text : str
        Name of the column holding the lemmas.

    Returns
    -------
    pandas.DataFrame
        Rows are ``vocab``, columns are the document ids, cells are term
        counts (0 where a term does not occur). Lemmas that are not in
        ``vocab`` are ignored.

    Raises
    ------
    ValueError
        If ``document_index`` is not a column of ``data``.
    """
    if document_index not in data.columns:
        raise ValueError(f"Column '{document_index}' not found in data")

    doc_ids = data[document_index]
    matrix = pd.DataFrame(0, index=vocab, columns=doc_ids)
    known_terms = matrix.index

    for current_id, tokens in zip(doc_ids, data[text]):
        for term, occurrences in Counter(tokens).items():
            if term not in known_terms:
                continue
            matrix.at[term, current_id] = occurrences
    return matrix

term_doc_matrix = term_document_matrix(
    df,
    vocablist,
    document_index='id',
    text='body',
)

# Compute IDF: log2(number of documents / number of documents containing the term)
document_index = df['id'].values
doc_freq = term_doc_matrix[document_index].gt(0).sum(axis=1)
# A document frequency of 0 is replaced by 1 so the division is always defined.
idf_series = np.log2(len(document_index) / doc_freq.replace(0, 1))

# Query processing
def query_processing(query):
    """Turn a raw query string into a list of filtered lemmas.

    Non-string input yields an empty list. Runs of non-word characters are
    collapsed to a single space, the result is stripped and lower-cased,
    and it is then lemmatized with the module-level ``nlp`` pipeline.
    Lemmas that are empty, that occur as a substring of
    ``string.punctuation``, or that are in ``stopwords`` are dropped.
    """
    if not isinstance(query, str):
        return []

    normalized = re.sub(r'\W+', ' ', query).strip().lower()
    parsed = nlp(normalized)
    return [
        lem
        for sentence in parsed.sentences
        for lem in (token.lemma.lower().strip() for token in sentence.words)
        if lem and lem not in string.punctuation and lem not in stopwords
    ]

# BM25 scoring function
def bm25_score(term_doc_matrix, query_lemmas, idf_series, document_index, k1=1.5, b=0.75):
    """Score the selected documents against a lemmatized query with BM25.

    Parameters
    ----------
    term_doc_matrix : pandas.DataFrame
        Term counts with terms on the index and document ids as columns.
    query_lemmas : iterable
        Query terms. Duplicates are collapsed, so each distinct term
        contributes once; terms absent from the matrix index are skipped.
    idf_series : pandas.Series
        IDF weight per term; a term missing from it is weighted 0.
    document_index : sequence
        Column labels of ``term_doc_matrix`` to score.
    k1 : float
        Term-frequency saturation parameter.
    b : float
        Document-length normalization parameter.

    Returns
    -------
    pandas.Series
        One score per entry of ``document_index``, sorted in descending
        order. All scores are 0.0 when the selected documents contain no
        terms at all (or when ``document_index`` is empty).
    """
    scores = pd.Series(0.0, index=document_index)
    doc_lengths = term_doc_matrix[document_index].sum(axis=0)
    avgdl = doc_lengths.mean()

    # With an average length of 0 (or NaN for an empty selection) the length
    # ratio below would be 0/0 and turn every score into NaN. No term can
    # match in that case, so all-zero scores are the correct answer.
    if not avgdl > 0:
        return scores

    # The length normalization depends only on the documents, not on the
    # query term, so compute it once instead of once per term.
    length_norm = k1 * (1 - b + b * doc_lengths / avgdl)

    for term in set(query_lemmas):
        if term not in term_doc_matrix.index:
            continue
        tf = term_doc_matrix.loc[term, document_index]
        idf = idf_series.get(term, 0)
        numerator = tf * (k1 + 1)
        denominator = tf + length_norm
        # The small epsilon keeps the division defined if the denominator is 0
        # (possible when tf is 0, b is 1 and the document is empty).
        scores += idf * numerator / (denominator + 1e-10)
    return scores.sort_values(ascending=False)

# Retrieve documents
def retrieve_index(data, scores, document_index):
    """Return the index labels of the five highest-scoring rows.

    ``data`` is re-indexed with ``document_index`` (passed straight to
    ``DataFrame.set_index``), ``scores`` is attached as a ``scores`` column
    aligned on that index, and the labels of the top five rows by score are
    returned as a list. The caller's ``data`` is not modified.
    """
    ranked = data.set_index(document_index).assign(scores=scores)
    best_rows = ranked.sort_values('scores', ascending=False).head(5)
    return best_rows.index.tolist()

# Evaluation state: one query per (document id, title) pair in new_dict
total = len(new_dict)
counter = 0               # queries whose own document appears in the top 5
ranks, average_precisions, errors = [], [], []

# Pairs of document ids read from the "4 and 5" file (columns "id" and "id2"
# are used by the evaluation loop)
fours_fives = pd.read_json('changed_4_and_5.json')

# Counters for the usefulness check
total_fourfive = 0
fourfive_count = 0

# Index the rated pairs once, keyed by their "id" value. Walking the frame
# with iterrows() for every candidate of every query repeated the same
# loop-invariant work and built a fresh Series per row each time.
partners_by_id = {}
for pair_id, pair_id2 in zip(fours_fives["id"], fours_fives["id2"]):
    partners_by_id.setdefault(pair_id, []).append(pair_id2)
pair_count = len(fours_fives)

for doc_id, title in new_dict.items():
    # Step 1: Process the title (query)
    qlemmas = query_processing(title)

    # Step 2: Compute BM25 scores
    scores = bm25_score(term_doc_matrix, qlemmas, idf_series, document_index)

    # Step 3: Get top 5 doc IDs (ranked). bm25_score already returns a sorted
    # Series; the sort is kept as-is so the order of tied scores is unchanged.
    sorted_doc_ids = scores.sort_values(ascending=False).index.tolist()
    top_doc_ids = sorted_doc_ids[:5]

    # Accuracy@5 and rank tracking
    if doc_id in top_doc_ids:
        counter += 1
        rank = top_doc_ids.index(doc_id) + 1
        ranks.append(rank)
        # Reciprocal rank of the query's own document within the top 5
        average_precisions.append(1 / rank)
    else:
        errors.append(doc_id)
        average_precisions.append(0.0)

    # Usefulness check for 4s and 5s: count rated pairs whose "id" is one of
    # the top results while the paired "id2" is missing from the top results.
    # NOTE(review): total_fourfive counts every (candidate, rated pair)
    # comparison, matching or not, so the final ratio is diluted by unrelated
    # pairs - confirm that this is the intended denominator.
    total_fourfive += len(top_doc_ids) * pair_count
    for doc_id_candidate in top_doc_ids:
        for partner_id in partners_by_id.get(doc_id_candidate, ()):
            if partner_id not in top_doc_ids:
                fourfive_count += 1

# Final metrics. Every ratio falls back to a neutral value when its
# denominator is zero, so an empty dataset reports instead of crashing.
accuracy = counter / total if total else 0.0
avg_rank = sum(ranks) / len(ranks) if ranks else None
map_score = sum(average_precisions) / len(average_precisions) if average_precisions else 0.0
usefulness = fourfive_count / total_fourfive if total_fourfive > 0 else 0.0

# Output
print(f"Found correct doc in top 5 for {counter}/{total} queries.")
print(f"Accuracy@5: {accuracy:.2%}")
if avg_rank is not None:
    print(f"Average rank position (for successful hits): {avg_rank:.2f}")
else:
    print("No correct documents found in top 5; cannot compute average rank.")
print(f"Mean Average Precision (MAP): {map_score:.4f}")
print(f"Similarity score usefulness: {usefulness:.4f}")
# Ids of the queries whose own document was not in the top 5
print(errors)

2025-05-27 19:14:18 INFO: Loading these models for language: hr (Croatian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |
| lemma     | standard |

2025-05-27 19:14:18 INFO: Use device: cpu
2025-05-27 19:14:18 INFO: Loading: tokenize
2025-05-27 19:14:18 INFO: Loading: pos
2025-05-27 19:14:24 INFO: Loading: lemma
2025-05-27 19:14:45 INFO: Done loading processors!


Found correct doc in top 5 for 2020/2109 queries.
Accuracy@5: 95.78%
Average rank position (for successful hits): 1.31
Mean Average Precision (MAP): 0.8466
Similarity score usefulness: 0.0001
[9, 45, 92, 125, 134, 141, 178, 187, 206, 265, 315, 351, 380, 391, 394, 396, 401, 415, 533, 576, 594, 662, 666, 675, 676, 681, 682, 722, 723, 748, 768, 783, 791, 891, 902, 903, 908, 923, 926, 931, 941, 1002, 1008, 1030, 1034, 1052, 1059, 1081, 1116, 1137, 1162, 1175, 1197, 1198, 1284, 1317, 1354, 1375, 1383, 1384, 1391, 1413, 1472, 1493, 1498, 1517, 1536, 1549, 1556, 1564, 1587, 1637, 1679, 1689, 1719, 1770, 1871, 1872, 1873, 1875, 1897, 1919, 1951, 1992, 2032, 2039, 2042, 2081, 2098]
