In [1]:
import os
import pandas as pd
import numpy as np
import classla
import string
from collections import Counter
import re

In [2]:
x = pd.read_json('individual_data.json')

# Build dictionary from 'id' to 'title'
new_dict = dict(zip(x['id'], x['title']))

In [3]:
# Initialize CLASSLA pipeline with tokenization and lemmatization
nlp = classla.Pipeline(lang='hr', processors='tokenize, pos, lemma')

# Load Croatian stopwords
with open('stopwords-hr.txt', 'r', encoding='utf-8') as f:
    stopwords = set(line.strip().lower() for line in f if line.strip())

def clean_lemmatize(text):
    # Handle non-string input (e.g., NaN or non-text)
    if not isinstance(text, str):
        return []

    # Process text with CLASSLA
    doc = nlp(text)
    lemmas = []
    for sent in doc.sentences:
        for word in sent.words:
            lemma = word.lemma.lower().strip()
            if (
                lemma and
                lemma not in string.punctuation and
                lemma not in stopwords
            ):
                lemmas.append(lemma)
    return lemmas

2025-05-27 18:33:28 INFO: Loading these models for language: hr (Croatian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |
| lemma     | standard |

2025-05-27 18:33:28 INFO: Use device: cpu
2025-05-27 18:33:28 INFO: Loading: tokenize
2025-05-27 18:33:28 INFO: Loading: pos
2025-05-27 18:33:33 INFO: Loading: lemma
2025-05-27 18:33:55 INFO: Done loading processors!


In [4]:
lemmatized_file = 'lemmatized_data.pkl' # path to cached version

if os.path.exists(lemmatized_file):
    print("Loading cached lemmatized data...")
    df = pd.read_pickle(lemmatized_file)
else:
    print("Lemmatizing data from scratch...")
    df = pd.read_json('individual_data.json')
    df = df.drop('title', axis=1)
    df['body'] = df['body'].apply(clean_lemmatize)

    # Save to disk for next time
    df.to_pickle(lemmatized_file)

# Print the first few rows to verify
print(df.head(10))

Loading cached lemmatized data...
   id                                               body
0   1  [dan, mjesni, odbor, josipovac, crkven, god, s...
1   2  [osijek, htjeti, percipirati, središte, turbo,...
2   3  [pet, hrvatski, liječnički, udruga, predstavit...
3   4  [velik, prosvjed, liječnik, danas, ići, pred, ...
4   5  [beširević, trebati, razmisliti, sustav, azil,...
5   6  [nov, izdanje, školski, knjiga, predstaviti, k...
6   7  [mlad, hrvatski, ženski, nogometni, u-17, repr...
7   8  [mlad, hrvatski, ženski, u-17, reprezentacija,...
8   9  [budimirov, stožer, obično, provoditi, pun, ra...
9  10  [prosvjed, protiv, pobačaj, njemačka, čest, vl...


In [5]:
# create vocab of all unique words using a set

vocab = set()
for body in df['body']:
    for word in body:
        vocab.add(word)
vocablist = list(vocab)
print(len(vocablist))

39954


In [6]:
def term_document_matrix(data, vocab, document_index='id', text='body'):
    """
    Create a term-document matrix from lemmatized text data.
   
    Parameters:
    - data: DataFrame containing document IDs and lemmatized text
    - vocab: List of unique terms to include
    - document_index: Column name for document IDs
    - text: Column name for lemmatized text (list of lemmas)
   
    Returns:
    - vocab_index: DataFrame with terms as rows, document IDs as columns, and term frequencies as values
    """
    if document_index not in data.columns:
        raise ValueError(f"Column '{document_index}' not found in data")
    
    vocab_index = pd.DataFrame(0, index=vocab, columns=data[document_index])

    for doc_id, lemmas in zip(data[document_index], data[text]):
        counts = Counter(lemmas)
        for lemma, freq in counts.items():
            if lemma in vocab_index.index:
                vocab_index.at[lemma, doc_id] = freq

    return vocab_index

term_doc_matrix = term_document_matrix(df, vocablist, document_index='id', text='body')
term_doc_matrix

id,1,2,3,4,5,6,7,8,9,10,...,2100,2101,2102,2103,2104,2105,2106,2107,2108,2109
omražen,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
pitom,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ofenziva,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aida,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
nedovršeno,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
shout,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
sobov,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
desetorica,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
comodin,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
def tf_idf_score(vocab_index, document_index, inv_df='inverse_document_frequency'):
    """
    Efficiently compute TF-IDF scores from a term-document matrix.
    Returns a new DataFrame with TF-IDF scores (same shape as vocab_index[document_index]).
    """
    # Number of documents
    total_docs = len(document_index)

    # Compute Document Frequency (DF)
    df_series = (vocab_index[document_index] > 0).sum(axis=1)

    # Compute Inverse Document Frequency (IDF)
    idf = np.log2(total_docs / df_series.replace(0, 1))

    # Apply log-scaled TF: log2(1 + tf)
    tf = np.log2(1 + vocab_index[document_index])

    # Multiply TF by IDF (broadcasting the IDF over rows)
    tf_idf_matrix = tf.mul(idf, axis=0)

    return tf_idf_matrix

tfidf_matrix = tf_idf_score(term_doc_matrix, df.id.values)
tfidf_matrix

id,1,2,3,4,5,6,7,8,9,10,...,2100,2101,2102,2103,2104,2105,2106,2107,2108,2109
omražen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pitom,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ofenziva,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aida,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nedovršeno,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
shout,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sobov,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
desetorica,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
comodin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
def query_processing(query):
    """
    Clean and lemmatize the input query using CLASSLA.
    Removes stopwords and punctuation.

    Returns:
        list of lemmatized, filtered tokens
    """
    if not isinstance(query, str):
        return []

    # Basic cleanup
    query = re.sub(r'\W+', ' ', query).strip().lower()

    # Lemmatize using CLASSLA
    doc = nlp(query)
    lemmas = []
    for sent in doc.sentences:
        for word in sent.words:
            lemma = word.lemma.lower().strip()
            if lemma and lemma not in string.punctuation and lemma not in stopwords:
                lemmas.append(lemma)
    return lemmas

#query = "Andrej Kramarić"
#qlemmas = query_processing(query)
#print(qlemmas)

In [9]:
def query_vector(query_lemmas, vocab_index, idf_series):
    """
    Build a TF-IDF vector for the query using the global vocabulary and IDF values.

    Parameters:
        query_lemmas: list of preprocessed query lemmas
        vocab_index: term-document matrix (to get vocab)
        idf_series: Series with IDF values for each term

    Returns:
        Series: TF-IDF vector for the query (index aligned with vocab_index)
    """
    query_counts = Counter(query_lemmas)
    tf_query = pd.Series(0.0, index=vocab_index.index)

    for term, freq in query_counts.items():
        if term in idf_series.index:
            tf = np.log2(1 + freq)
            idf = idf_series[term]
            tf_query[term] = tf * idf  # TF-IDF

    return tf_query

document_index = df.id.values
doc_freq = (term_doc_matrix[document_index] > 0).sum(axis=1)
idf_series = np.log2(len(document_index) / doc_freq.replace(0, 1))

#query_vec = query_vector(qlemmas, term_doc_matrix, idf_series)
#print(query_vec)

In [10]:
def cosine_similarity(tfidf_matrix, query_vector):
    """
    Compute cosine similarity between the query vector and all documents.

    Parameters:
        tfidf_matrix: DataFrame (terms x documents), with TF-IDF values
        query_vector: Series (terms), with TF-IDF scores of query

    Returns:
        Series: Cosine similarity scores indexed by document ID
    """
    # Compute dot product between query and each document
    dot_products = tfidf_matrix.T.dot(query_vector)

    # Norms
    query_norm = np.sqrt((query_vector ** 2).sum())
    doc_norms = np.sqrt((tfidf_matrix ** 2).sum(axis=0))

    # Avoid division by zero
    similarities = dot_products / (query_norm * doc_norms + 1e-10)

    return similarities.sort_values(ascending=False)

#similarities = cosine_similarity(tfidf_matrix, query_vec)
#print(similarities.head(10))

In [11]:
def retrieve_index(data, cosine_scores, document_index):
    """
    Retrieves top 10 document indices based on cosine similarity scores.

    Parameters:
        data: DataFrame
            The original DataFrame with document texts and IDs.
        cosine_scores: pd.Series
            Cosine similarity scores where the index corresponds to document IDs.
        document_index: str
            Column name in `data` corresponding to document IDs.

    Returns:
        Int64Index:
            Index positions in the original DataFrame of the top 10 most similar documents.
    """
    # Ensure document_index is the index so we can match with cosine_scores
    data = data.set_index(document_index)

    # Add the cosine scores
    data['scores'] = cosine_scores

    top_ids = data.sort_values('scores', ascending=False).head(5).index
    return top_ids.tolist() # list of top 5 most relevant doc IDs

# Get the IDs of the top 5 similar documents
#top_doc_ids = retrieve_index(df, similarities, 'id')
#print(top_doc_ids)

In [12]:
counter = 0
total = len(new_dict)
ranks = []
average_precisions = []
errors = []

fours_fives = pd.read_json('changed_4_and_5.json')

total_fourfive = 0
fourfive_count = 0

for doc_id, title in new_dict.items():
     # Step 1: Process the title (query)
    qlemmas = query_processing(title)
    
    # Step 2: Build query vector
    qvec = query_vector(qlemmas, term_doc_matrix, idf_series)
    
    # Step 3: Compute similarities
    similarities = cosine_similarity(tfidf_matrix, qvec)
    
    # Step 4: Get top 5 doc IDs (ranked)
    top_doc_ids = retrieve_index(df, similarities, 'id')

    # Accuracy@5 and Rank tracking
    if doc_id in top_doc_ids:
        counter += 1
        rank = top_doc_ids.index(doc_id) + 1
        ranks.append(rank)
        average_precisions.append(1 / rank)  # AP for this query
    else:
        errors.append(doc_id)
        average_precisions.append(0.0)

    # Usefulness check for 4s and 5s
    for doc_id_candidate in top_doc_ids:
        for _, d in fours_fives.iterrows():
            total_fourfive += 1
            if doc_id_candidate == d["id"] and d["id2"] not in top_doc_ids:
                fourfive_count += 1

# Final metrics
accuracy = counter / total
avg_rank = sum(ranks) / len(ranks) if ranks else None
map_score = sum(average_precisions) / len(average_precisions) if average_precisions else 0.0
usefulness = fourfive_count / total_fourfive if total_fourfive > 0 else 0.0

# Output
print(f"Found correct doc in top 5 for {counter}/{total} queries.")
print(f"Accuracy@5: {accuracy:.2%}")
if avg_rank is not None:
    print(f"Average rank position (for successful hits): {avg_rank:.2f}")
else:
    print("No correct documents found in top 5; cannot compute average rank.")
print(f"Mean Average Precision (MAP): {map_score:.4f}")
print(f"Similarity score usefulness: {usefulness:.4f}")
print(errors)

Found correct doc in top 5 for 1987/2109 queries.
Accuracy@5: 94.22%
Average rank position (for successful hits): 1.39
Mean Average Precision (MAP): 0.8061
Similarity score usefulness: 0.0002
[9, 39, 45, 92, 125, 131, 134, 141, 171, 178, 184, 187, 206, 211, 234, 265, 308, 310, 315, 351, 391, 394, 396, 401, 415, 478, 500, 511, 518, 522, 533, 576, 594, 659, 662, 666, 675, 676, 681, 682, 722, 723, 748, 767, 783, 791, 891, 902, 903, 908, 926, 931, 941, 942, 943, 944, 970, 1002, 1008, 1024, 1030, 1059, 1076, 1080, 1081, 1116, 1117, 1137, 1153, 1156, 1162, 1168, 1170, 1197, 1198, 1204, 1280, 1284, 1354, 1375, 1384, 1391, 1413, 1465, 1472, 1498, 1508, 1512, 1536, 1540, 1556, 1564, 1574, 1587, 1608, 1631, 1637, 1657, 1679, 1689, 1693, 1719, 1770, 1777, 1871, 1872, 1873, 1875, 1877, 1897, 1912, 1919, 1936, 1951, 2020, 2032, 2039, 2067, 2081, 2094, 2098, 2105]
