In [58]:
import pandas as pd
import numpy as np
from scipy.sparse import  lil_matrix
import string
from gensim.models import KeyedVectors
from gensim import downloader as api
from nltk.stem import PorterStemmer
import nltk
from nltk.corpus import stopwords
import re
from sklearn.metrics.pairwise import linear_kernel



In [59]:
def dcg_at_k(r, k):
    """
    Calculate Discounted Cumulative Gain at k.
    
    Parameters:
    - r: List of relevance scores
    - k: The position at which to stop the calculation
    
    Returns:
    - DCG value at position k
    """
    r = np.asarray(r)[:k]
    return np.sum((2 ** r - 1) / np.log2(np.arange(2, r.size + 2)))

def ndcg_at_k(r, k):
    """
    Calculate Normalized Discounted Cumulative Gain at k.
    
    Parameters:
    - r: List of relevance scores
    - k: The position at which to stop the calculation
    
    Returns:
    - NDCG value at position k
    """
    ideal_ranking = sorted(r, reverse=True)
    ideal_dcg = dcg_at_k(ideal_ranking, k)
    actual_dcg = dcg_at_k(r, k)
    
    if ideal_dcg == 0:
        return 0.0
    
    return actual_dcg / ideal_dcg

# Example usage:
# Relevance scores for a recommendation list
relevance_scores = [3, 2, 3, 0, 1, 2]

# Calculate NDCG at position 4
k = 6
ndcg_value = ndcg_at_k(relevance_scores, k)

print(f"NDCG at position {k}: {ndcg_value}")


NDCG at position 6: 0.9488107485678985


In [60]:
df_query = pd.read_json(path_or_buf='data/queries.jsonl', lines=True)
df_query['text'] = df_query['text'].str.strip()

In [None]:
df_train = pd.read_csv("data/task2_train.tsv", sep="\t")
df_train = pd.merge(df_train, df_query, left_on='query-id', right_on='_id')
df_train = df_train.drop(columns=['query-id', '_id','metadata'])
df_train['corpus-id'] = df_train['corpus-id'].str.replace("[",'')
df_train['corpus-id'] = df_train['corpus-id'].str.replace("]",'')
df_train['score'] = df_train['score'].str.replace("[",'')
df_train['score'] = df_train['score'].str.replace("]",'')
df_train['score'] = df_train['score'].str.split(", ")
df_train['corpus-id'] = df_train['corpus-id'].str.split(", ")
df_train

In [62]:
corpus = pd.read_json('data/corpus.jsonl', lines=True).sort_values(by=["_id"]).rename(columns={"_id": "corpus-id"})
corpus = corpus.reset_index(drop=True)
corpus

Unnamed: 0,corpus-id,text
0,0,The presence of communication amid scientific ...
1,8,"In June 1942, the United States Army Corps of ..."
2,12,Tutorial: Introduction to Restorative Justice....
3,16,The approach is based on a theory of justice t...
4,23,Phloem is a conductive (or vascular) tissue fo...
...,...,...
1471401,8841780,Wolves don't hide. They don't even live in cav...
1471402,8841787,The UNHCR Country Representative in Kenya. Str...
1471403,8841790,2. Describe the misery at Kakuma. 3. Compariso...
1471404,8841800,Following the death of his employer and mentor...


In [63]:
corpusss_idx = []
corpusss_score = []

for row in df_train.iloc:
    corpuss_idx = []
    corpuss_score = []
    for corpuss in row['corpus-id']:
        corpuss_idx.append(corpus.index[corpus['corpus-id'] == int(corpuss)][0])
    for scoree in row['score']:
        corpuss_score.append(int(scoree))
    corpusss_idx.append(corpuss_idx)
    corpusss_score.append(corpuss_score)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (10,) + inhomogeneous part.

In [None]:
try:
    model = KeyedVectors.load('data/glove.model.d2v')
except:
    print("model not found, loading from api")
    model = api.load("glove-wiki-gigaword-50")
    model.save('data/glove.model.d2v')
STEMMER = PorterStemmer()


def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    def is_bad_word(s):
        return len(s) <= 1 or any(i not in string.printable or i in string.digits for i in s)

    words = text.split()
    cleaned_words = [word for word in words if not is_bad_word(word)]
    text = " ".join(cleaned_words)

    # the text contains many unidentified character, we decide to keep only ASCII characters
    text = re.sub(r'\\u[0-9a-fA-F]{4}', '', text)

    #remove punctuation and digits
    text = text.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    text = text.translate(str.maketrans('', '', string.digits))

    # Tokenize the text (split it into words)
    words = nltk.word_tokenize(text)

    # Remove stopwords
    stopwords_ = stopwords.words('english')
    words = [STEMMER.stem(word) for word in words if word not in stopwords_ and word in model]

    # Lemmatize words
    #words = [lemmatizer.lemmatize(word) for word in words]
    return words

model not found, loading from api


In [None]:
def vectorize_query(queries_df, vocabulary, idf):
    """Convert each query in the DataFrame into its TF-IDF vector."""
    
    print("Process queries 2 ...")
    # Preprocess all queries
    queries_df['processed'] = queries_df['text'].apply(preprocess_text)

    print("Initialize sparse matrix ...")
    num_queries = len(queries_df)
    num_terms = len(vocabulary)
    
    # Using a dictionary for term index lookup
    vocab_dict = {term: index for index, term in enumerate(vocabulary)}
    tf_matrix = lil_matrix((num_queries, num_terms))

    print("Compute  tf ...")
    # Populate the sparse matrix
    for idx, row in queries_df.iterrows():
        for term in row['processed']:
            if term in vocab_dict:
                tf_matrix[idx, vocab_dict[term]] += 1

    print("Multiply by idf ...")
    # Convert to CSR format for efficient multiplication and transform TFs to TF-IDF
    print(tf_matrix.shape, idf.shape)
    tfidf_matrix = (tf_matrix).multiply(idf)

    print("Done !")
    return tfidf_matrix

In [None]:
tfidf_corpus = pd.read_pickle("output/tfidf-stem.pkl").tocsr()
idf = pd.read_pickle("output/idf-stem.pkl")
vocab = pd.read_pickle("output/vocabulary-stem.pkl")

In [None]:
vectorized = vectorize_query(df_train, vocab, idf).tocsr()
for idx, vector_query in enumerate(vectorized):
    docc = tfidf_corpus[corpusss_idx[idx]]
    similarity = linear_kernel(vector_query, docc).flatten()
    ranking = np.argsort(similarity)
    print(ndcg_at_k(np.array(corpusss_score[idx])[ranking], ranking.shape[0]))

Process queries 2 ...
Initialize sparse matrix ...
Compute  tf ...
Multiply by idf ...
(10, 805289) (1, 805289)
Done !
0.6457128233925549
0.35709351552029606
0.43082118867564106
0.41596543182216583
0.5367035582507329
0.6169921963839984
0.542508600980465
0.36107290644545437
0.49105065091080663
0.545610972533918


In [82]:
df_task2 = pd.read_csv("data/task2_test.tsv", sep="\t")
df_task2 = pd.merge(df_task2, df_query, left_on='query-id', right_on='_id')
df_task2 = df_task2.drop(columns=['query-id', '_id','metadata'])
df_task2['corpus-id'] = df_task2['corpus-id'].str.replace("[",'')
df_task2['corpus-id'] = df_task2['corpus-id'].str.replace("]",'')
df_task2['corpus-id'] = df_task2['corpus-id'].str.split(", ")
df_task2

  df_task2['corpus-id'] = df_task2['corpus-id'].str.replace("[",'')
  df_task2['corpus-id'] = df_task2['corpus-id'].str.replace("]",'')


Unnamed: 0,id,corpus-id,text
0,7437,"[1036904, 1225084, 1440035, 1470412, 1540837, ...",what is an aml surveillance analyst
1,7438,"[1033903, 1116862, 1160501, 1313466, 1398253, ...",definition of a sigmet
2,7439,"[1055834, 1055835, 1061980, 1091609, 1091614, ...",lps laws definition
3,7440,"[1006865, 1006866, 1006867, 1006868, 1016940, ...",when was the salvation army founded
4,7441,"[1017759, 1082489, 109063, 1160863, 1160871, 1...",anthropological definition of environment
5,7442,"[1030532, 120982, 1267246, 1284406, 1307232, 1...",example of monotonic function
6,7443,"[104095, 1092118, 123547, 1281893, 1346114, 14...",causes of left ventricular hypertrophy
7,7444,"[103181, 1078132, 1078134, 1094131, 1094132, 1...",medicare's definition of mechanical ventilation
8,7445,"[1035060, 1035061, 1035062, 1035063, 1035064, ...",what is the daily life of thai people
9,7446,"[1011043, 1090088, 1090089, 1090091, 1090095, ...",does legionella pneumophila cause pneumonia


In [83]:
corpusss_idx = []
corpus_idss = []
for row in df_task2.iloc:
    corpuss_idx = []
    corpus_ids = []
    for corpuss in row['corpus-id']:
        corpus_ids.append(int(corpuss))
        corpuss_idx.append(corpus.index[corpus['corpus-id'] == int(corpuss)][0])
    corpusss_idx.append(corpuss_idx)
    corpus_idss.append(corpus_ids)

In [84]:
vectorized = vectorize_query(df_task2, vocab, idf).tocsr()
relevant_scores = []

for idx, vector_query in enumerate(vectorized):
    docc = tfidf_corpus[corpusss_idx[idx]]
    similarity = linear_kernel(vector_query, docc).flatten()

    relevant_scores.append(similarity.tolist())

Process queries 2 ...
Initialize sparse matrix ...
Compute  tf ...
Multiply by idf ...
(33, 805289) (1, 805289)
Done !


In [85]:
df_task2['score'] = relevant_scores
df_task2['corpus-id'] = -1

df_task2= df_task2.drop(columns=['processed', 'text'])
df_task2

Unnamed: 0,id,corpus-id,score
0,7437,-1,"[7.582589830310517, 11.193368595700777, 15.446..."
1,7438,-1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,7439,-1,"[2.0674917440032976, 0.9691305791318093, 2.696..."
3,7440,-1,"[9.334367895077003, 12.11722092725928, 9.77195..."
4,7441,-1,"[2.884645137354277, 2.0421651192889385, 6.9487..."
5,7442,-1,"[3.4157984230400604, 5.123697634560091, 4.3146..."
6,7443,-1,"[0.3510730502413966, 0.0, 10.752892643886767, ..."
7,7444,-1,"[0.0, 7.300408984621267, 5.7747081853577376, 6..."
8,7445,-1,"[2.457809247329394, 3.6144253637196972, 2.7308..."
9,7446,-1,"[6.255094107297959, 10.903264560810667, 4.5386..."
