In [1]:
import os
import pandas as pd
import numpy as np
import classla
import string
from collections import Counter
import re
from sentence_transformers import SentenceTransformer, util
import torch

# Read the corpus and build an id -> title lookup (titles are used as queries below)
x = pd.read_json('individual_data.json')
new_dict = dict(zip(x['id'], x['title']))

# CLASSLA pipeline for Croatian: tokenization, POS tagging and lemmatization
nlp = classla.Pipeline(lang='hr', processors='tokenize,pos,lemma')

# Croatian stopwords, one per line; blank lines are ignored and entries are lowercased
with open('stopwords-hr.txt', 'r', encoding='utf-8') as f:
    stopwords = {entry.lower() for entry in map(str.strip, f) if entry}

def clean_lemmatize(text):
    """Lemmatize *text* with the module-level CLASSLA pipeline and drop noise.

    Args:
        text: Raw document text. Any value that is not a ``str`` yields ``[]``.

    Returns:
        list[str]: Lowercased, whitespace-stripped lemmas in document order,
        excluding empty lemmas, lemmas made up solely of ASCII punctuation
        characters, and entries of the module-level ``stopwords`` set.
    """
    if not isinstance(text, str):
        return []

    punctuation_chars = set(string.punctuation)
    lemmas = []
    for sent in nlp(text).sentences:
        for word in sent.words:
            # NOTE(review): guards against a token without a lemma; confirm
            # whether the lemma processor can actually return None.
            lemma = (word.lemma or "").lower().strip()
            if not lemma or lemma in stopwords:
                continue
            # `lemma in string.punctuation` would be a *substring* test, so
            # tokens such as "..." or "?!" slipped through; check per character.
            if all(ch in punctuation_chars for ch in lemma):
                continue
            lemmas.append(lemma)
    return lemmas

# Reuse the cached lemmatization when it exists; otherwise compute it and cache it
lemmatized_file = 'lemmatized_data.pkl'
if not os.path.exists(lemmatized_file):
    print("Lemmatizing data from scratch...")
    df = pd.read_json('individual_data.json').drop('title', axis=1)
    df['body'] = df['body'].apply(clean_lemmatize)
    df.to_pickle(lemmatized_file)
else:
    print("Loading cached lemmatized data...")
    # NOTE(review): unpickling can execute arbitrary code; only load a cache
    # file this script produced. The cache is keyed on file existence alone,
    # so it must be deleted by hand when the corpus or the lemmatizer changes.
    df = pd.read_pickle(lemmatized_file)

# Vocabulary: every distinct lemma that occurs in any document body
vocab = {lemma for body in df['body'] for lemma in body}
vocablist = list(vocab)

# Build term-document matrix
def term_document_matrix(data, vocab, document_index='id', text='body'):
    """Build a term-by-document count matrix.

    Args:
        data: DataFrame with one row per document.
        vocab: Iterable of terms; each becomes one row, in iteration order.
            Terms are expected to be unique.
        document_index: Name of the column holding the document ids, which
            become the column labels of the result.
        text: Name of the column holding each document's list of lemmas.

    Returns:
        pandas.DataFrame: int64 counts with ``vocab`` as the index and the
        document ids as columns. Lemmas that are not in ``vocab`` are ignored.

    Raises:
        ValueError: If ``document_index`` is not a column of ``data``.
    """
    if document_index not in data.columns:
        raise ValueError(f"Column '{document_index}' not found in data")

    vocab = list(vocab)
    row_of = {term: row for row, term in enumerate(vocab)}

    # Fill a plain NumPy array by position instead of writing single cells
    # through DataFrame.at: this avoids the per-cell pandas overhead and keeps
    # every document in its own column even when document ids repeat.
    counts = np.zeros((len(vocab), len(data)), dtype=np.int64)
    for col, lemmas in enumerate(data[text]):
        for lemma, freq in Counter(lemmas).items():
            row = row_of.get(lemma)
            if row is not None:
                counts[row, col] = freq

    return pd.DataFrame(counts, index=vocab, columns=data[document_index])

# Term counts for the whole corpus: rows are lemmas, columns are document ids
term_doc_matrix = term_document_matrix(df, vocablist, document_index='id', text='body')
document_ids = df['id'].values

# Document length = sum of all term counts in that document's column
doc_lengths = term_doc_matrix.loc[:, document_ids].sum(axis=0)
avgdl = doc_lengths.mean()

# Compute IDF as log2(N / df); a document frequency of 0 is replaced by 1 so
# the division is always defined
doc_freq = term_doc_matrix.loc[:, document_ids].gt(0).sum(axis=1)
idf_series = np.log2(doc_freq.replace(0, 1).rdiv(len(document_ids)))

# Query processing
def query_processing(query):
    """Normalize a free-text query and turn it into a list of lemmas.

    Runs of non-word characters are collapsed to single spaces and the text is
    lowercased; the result is then lemmatized and filtered by
    ``clean_lemmatize`` so that queries and documents always go through the
    same lemma filter.

    Args:
        query: Raw query text. Any value that is not a ``str`` yields ``[]``.

    Returns:
        list[str]: Filtered lemmas of the query; ``[]`` when nothing is left
        after normalization.
    """
    if not isinstance(query, str):
        return []
    normalized = re.sub(r'\W+', ' ', query).strip().lower()
    if not normalized:
        # Nothing to lemmatize; skip the pipeline call for empty input.
        return []
    return clean_lemmatize(normalized)

# Hardcoded BM25 retrieval
def bm25_score(term_doc_matrix, query_lemmas, idf_series, document_ids, k1=1.5, b=0.75):
    """Score every document against a query with a BM25-style formula.

    Note that length normalization reads the module-level ``doc_lengths`` and
    ``avgdl`` rather than taking them as arguments.

    Args:
        term_doc_matrix: Term-by-document count DataFrame (terms as index).
        query_lemmas: Lemmas of the query; duplicates are counted once.
        idf_series: Per-term IDF weights; terms missing from it weigh 0.
        document_ids: Labels of the documents (columns) to score.
        k1: Term-frequency saturation parameter.
        b: Strength of the document-length normalization.

    Returns:
        pandas.Series: One score per document id, sorted in descending order.
        Query lemmas that are not in the matrix index contribute nothing.
    """
    totals = pd.Series(0.0, index=document_ids)
    known_terms = term_doc_matrix.index
    for lemma in set(query_lemmas):
        if lemma in known_terms:
            term_freq = term_doc_matrix.loc[lemma, document_ids]
            weight = idf_series.get(lemma, 0)
            length_norm = term_freq + k1 * (1 - b + b * doc_lengths / avgdl)
            # The small epsilon keeps the division defined for a zero denominator
            totals += weight * (term_freq * (k1 + 1)) / (length_norm + 1e-10)
    return totals.sort_values(ascending=False)

def bm25_retrieval(query_lemmas, term_doc_matrix, idf_series, document_ids, top_k=100):
    """Return the ``top_k`` best-scoring documents for a lemmatized query.

    Args:
        query_lemmas: Lemmas of the query.
        term_doc_matrix: Term-by-document count DataFrame.
        idf_series: Per-term IDF weights.
        document_ids: Labels of the documents to rank.
        top_k: Maximum number of documents to return.

    Returns:
        tuple: ``(doc_ids, scores)`` where ``doc_ids`` is a list of document
        ids in descending score order and ``scores`` is the array of scores
        looked up for those ids.
    """
    ranked = bm25_score(term_doc_matrix, query_lemmas, idf_series, document_ids)
    best_ids = ranked.head(top_k).index.tolist()
    best_scores = ranked.loc[best_ids].values
    return best_ids, best_scores

# SBERT reranking: multilingual sentence encoder plus the original
# (un-lemmatized) document bodies keyed by document id
sbert_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
raw_documents = pd.read_json('individual_data.json')
original_texts = raw_documents.set_index('id')['body'].to_dict()

def sbert_rerank(query, doc_ids, original_texts, sbert_model, top_k=5):
    """Rerank candidate documents by cosine similarity to the query.

    Args:
        query: Raw query text to embed.
        doc_ids: Candidate document ids, e.g. from the BM25 stage.
        original_texts: Mapping from document id to its text.
        sbert_model: Encoder exposing ``encode(..., convert_to_tensor=True)``.
        top_k: Maximum number of document ids to return.

    Returns:
        list: Up to ``top_k`` document ids ordered from most to least similar.
        Candidates whose text is missing, not a string, or blank are skipped;
        ``[]`` is returned when no candidate has usable text.
    """
    kept_ids = []
    kept_texts = []
    for candidate_id in doc_ids:
        body = original_texts.get(candidate_id, "")
        if not isinstance(body, str) or not body.strip():
            continue
        kept_ids.append(candidate_id)
        kept_texts.append(body)

    if not kept_texts:
        return []

    query_vec = sbert_model.encode(query, convert_to_tensor=True)
    doc_vecs = sbert_model.encode(kept_texts, convert_to_tensor=True)
    similarities = util.cos_sim(query_vec, doc_vecs)[0]

    # topk cannot be asked for more entries than there are candidates
    n_best = min(top_k, len(kept_ids))
    best = torch.topk(similarities, k=n_best)
    return [kept_ids[pos] for pos in best.indices.tolist()]

# Evaluation
def average_precision_single_relevant(retrieved, relevant_doc_id):
    """Return the average precision for a query with one relevant document.

    With a single relevant document this is the reciprocal of its 1-based
    position in ``retrieved``.

    Args:
        retrieved: Ranked list of retrieved document ids, best first.
        relevant_doc_id: Id of the only relevant document.

    Returns:
        float: ``1 / rank`` of the first occurrence of ``relevant_doc_id``,
        or ``0.0`` when it was not retrieved.
    """
    if relevant_doc_id not in retrieved:
        return 0.0
    position = retrieved.index(relevant_doc_id)
    return 1.0 / (position + 1)

# Running totals for the title -> document retrieval evaluation
counter = 0
total = len(new_dict)
ranks = []
total_ap = 0.0
errors = []

fours_fives = pd.read_json('changed_4_and_5.json')
total_fourfive = 0
fourfive_count = 0

# Index the (id, id2) pairs by their first id once, so the per-query check
# below is a dictionary lookup instead of a full iterrows() scan per candidate.
partners_by_id = {}
for first_id, second_id in zip(fours_fives["id"], fours_fives["id2"]):
    partners_by_id.setdefault(first_id, []).append(second_id)
pair_count = len(fours_fives)

for doc_id, title in new_dict.items():
    # Each document's title is the query; its own id is the only relevant result
    qlemmas = query_processing(title)
    top_doc_ids, _ = bm25_retrieval(qlemmas, term_doc_matrix, idf_series, document_ids, top_k=100)
    reranked_doc_ids = sbert_rerank(title, top_doc_ids, original_texts, sbert_model, top_k=5)

    if doc_id in reranked_doc_ids:
        counter += 1
        rank = reranked_doc_ids.index(doc_id) + 1
        ranks.append(rank)
    else:
        errors.append(doc_id)

    ap = average_precision_single_relevant(reranked_doc_ids, doc_id)
    total_ap += ap

    # Usefulness check for 4s and 5s: count pairs whose first id was returned
    # while the paired id2 was not.
    # NOTE(review): the denominator counts every (candidate, pair) comparison,
    # exactly as before, which keeps the ratio very small - confirm that this
    # is the intended normalization.
    total_fourfive += len(reranked_doc_ids) * pair_count
    for doc_id_candidate in reranked_doc_ids:
        for partner_id in partners_by_id.get(doc_id_candidate, ()):
            if partner_id not in reranked_doc_ids:
                fourfive_count += 1

# Results (every ratio is guarded so an empty query set reports 0 instead of crashing)
accuracy = counter / total if total > 0 else 0.0
avg_rank = sum(ranks) / len(ranks) if ranks else None
map_score = total_ap / total if total > 0 else 0.0
usefulness = fourfive_count / total_fourfive if total_fourfive > 0 else 0.0

print(f"\nFound correct doc in top 5 for {counter}/{total} queries.")
print(f"Accuracy@5: {accuracy:.2%}")
if avg_rank is not None:
    print(f"Average rank position (for successful hits): {avg_rank:.2f}")
else:
    print("No correct documents found in top 5; cannot compute average rank.")
print(f"Mean Average Precision (MAP): {map_score:.4f}")
print(f"Similarity score usefulness: {usefulness:.4f}")
print(errors)

2025-05-28 11:38:22 INFO: Loading these models for language: hr (Croatian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |
| lemma     | standard |

2025-05-28 11:38:22 INFO: Use device: cpu
2025-05-28 11:38:22 INFO: Loading: tokenize
2025-05-28 11:38:22 INFO: Loading: pos
2025-05-28 11:38:23 INFO: Loading: lemma
2025-05-28 11:38:34 INFO: Done loading processors!


Loading cached lemmatized data...

Found correct doc in top 5 for 1362/2109 queries.
Accuracy@5: 64.58%
Average rank position (for successful hits): 1.71
Mean Average Precision (MAP): 0.5009
Similarity score usefulness: 0.0003
[1, 2, 3, 7, 8, 9, 11, 18, 24, 27, 28, 33, 37, 38, 39, 40, 41, 43, 45, 46, 56, 60, 62, 66, 74, 76, 91, 92, 93, 95, 98, 105, 106, 116, 118, 121, 125, 127, 131, 132, 133, 134, 137, 140, 141, 142, 145, 147, 148, 151, 155, 156, 157, 158, 161, 166, 168, 170, 171, 173, 175, 178, 183, 186, 187, 195, 196, 198, 199, 204, 205, 207, 208, 210, 211, 212, 216, 222, 229, 234, 235, 238, 240, 246, 247, 248, 249, 252, 253, 256, 257, 259, 260, 262, 265, 267, 269, 273, 275, 277, 279, 283, 284, 286, 291, 294, 295, 299, 300, 302, 304, 305, 307, 308, 313, 315, 317, 320, 321, 322, 326, 334, 337, 339, 346, 347, 351, 357, 359, 366, 368, 369, 370, 371, 379, 381, 383, 384, 385, 386, 387, 391, 392, 394, 401, 405, 408, 409, 410, 412, 415, 417, 419, 422, 423, 427, 432, 434, 437, 438, 443, 445,