In [None]:
from tqdm.auto import tqdm
import numpy as np
from sklearn.metrics.pairwise import linear_kernel
from operator import itemgetter
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from numba import jit
from concurrent.futures import ProcessPoolExecutor
from sklearn.feature_extraction.text import TfidfVectorizer
from annoy import AnnoyIndex
import random
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.random_projection import SparseRandomProjection
from annoy import AnnoyIndex
import time
from collections import defaultdict
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import string

# Preprocessing
## Data Retrieval

In [None]:
# Read the corpus (one JSON record per line), order it by its "_id" column
# and expose that column under the name "corpus-id".
corpus = (
    pd.read_json('corpus.jsonl', lines=True)
    .sort_values(by=["_id"])
    .rename(columns={"_id": "corpus-id"})
)
corpus

In [None]:
# Read the queries (one JSON record per line), order them by "_id", trim
# surrounding whitespace from the text, drop the "metadata" column and expose
# the id under the name "query-id".
queries = (
    pd.read_json(path_or_buf='queries.jsonl', lines=True)
    .sort_values(by=["_id"])
    .assign(text=lambda frame: frame['text'].str.strip())
    .drop(columns=["metadata"])
    .rename(columns={"_id": "query-id"})
)
queries

In [None]:
# Tab-separated training file; the merge cell that follows drops its
# "score" and "corpus-id" columns after joining on "query-id".
query_corpus_train_map = pd.read_csv("task1_train.tsv", sep="\t")

# Display only: the sorted copy is shown but not stored.
query_corpus_train_map.sort_values("query-id")

In [None]:
# Keep the queries that occur in the training map (inner join on "query-id")
# and discard the judgement columns, leaving one row per (query, judgement).
queries_train = queries.merge(
    query_corpus_train_map,
    left_on='query-id',
    right_on='query-id',
    how='inner',
)
queries_train = queries_train.drop(columns=["score", "corpus-id"])

# First 7437 rows only.
queries_train_subset = queries_train.iloc[:7437]
queries_train_subset

In [None]:
# Tab-separated test file; joined to the queries on "query-id" below.
df_test = pd.read_csv("task1_test.tsv", sep="\t")

# Attach the query text to every test row, then drop the test file's "id" column.
queries_test = queries.merge(
    df_test,
    left_on='query-id',
    right_on='query-id',
    how='inner',
)
queries_test = queries_test.drop(columns=["id"])
queries_test

## Tools preparation & usage

### TEST 1

In [None]:
def cosine_distance(u: np.ndarray, v: np.ndarray):
    """Cosine similarity between one query vector and every row of a matrix.

    Despite its name, the function returns *similarities* (1.0 for identical
    directions, 0.0 for orthogonal vectors), not distances.

    The previous ``@jit(nopython=True)`` decorator was removed: the body used
    ``.toarray()`` and ``np.linalg.norm(..., axis=1)``, neither of which Numba
    can compile in nopython mode, so the first call failed.  The computation
    is already fully vectorised in NumPy.

    Parameters
    ----------
    u : array-like or sparse matrix
        The query vector; flattened to shape ``(d,)``.
    v : array-like or sparse matrix
        The corpus vectors, shape ``(n, d)``.  A single 1-D vector is treated
        as a one-row matrix.

    Returns
    -------
    np.ndarray
        Shape ``(n,)``; entry ``i`` is the cosine similarity of ``u`` and
        ``v[i]``.  All-zero vectors yield ``0.0`` instead of dividing by zero.
    """
    # Sparse inputs (anything exposing ``toarray``) are densified first.
    if hasattr(u, "toarray"):
        u = u.toarray()
    if hasattr(v, "toarray"):
        v = v.toarray()

    query = np.asarray(u, dtype=float).ravel()
    corpus_matrix = np.atleast_2d(np.asarray(v, dtype=float))

    dot_products = corpus_matrix @ query

    # Compute norms
    query_norm = np.linalg.norm(query)
    corpus_norms = np.linalg.norm(corpus_matrix, axis=1)

    # Small epsilon keeps the division defined for all-zero vectors.
    return dot_products / (query_norm * corpus_norms + 1e-10)

In [None]:
# Word-level unigram TF-IDF with English stop words removed, fitted on the
# text of the whole corpus.
vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    min_df=1,
    stop_words='english',
)
vectorizer.fit(corpus["text"])

In [None]:
n = 100  # maximum size of each chunk

# Rows 0..n-1 get chunk number 0, rows n..2n-1 get chunk number 1, and so on;
# grouping by that number splits the corpus into consecutive frames.
chunk_numbers = np.arange(len(corpus)) // n
sub_corpus_list = [chunk for _, chunk in corpus.groupby(chunk_numbers)]
sub_corpus_list[1]

In [None]:
def find_top_k_matches(query, k=10):
    """Return the ``k`` best-matching documents of the current sub-corpus.

    Reads three notebook globals: ``vectorizer`` (used to transform the query),
    ``sub_corpus_matrix`` (the vectorised chunk) and ``sub_corpus_df`` (the
    chunk's rows, which must carry a ``"corpus-id"`` column).

    Parameters
    ----------
    query : mapping or pandas.Series
        Must provide ``"text"`` and ``"query-id"``.
    k : int, default 10
        Maximum number of matches to return.

    Returns
    -------
    list of dict
        ``{'query_id': ..., 'corpus_id': ...}`` records, best match first.
        Fewer than ``k`` records are returned when the chunk is smaller.
    """
    if k <= 0:
        return []

    query_vector = vectorizer.transform([query['text']])

    # Score the query against every document of the chunk.  (The previous
    # version used a hard-coded placeholder array here, so the query was
    # ignored.)
    similarities = linear_kernel(query_vector, sub_corpus_matrix).flatten()

    # Ascending argsort -> keep the last k -> reverse so the best comes first.
    top_k_indices = similarities.argsort()[-k:][::-1]

    # The corpus id column was renamed from "_id" to "corpus-id" at load time.
    chunk_ids = sub_corpus_df['corpus-id']
    return [
        {'query_id': query['query-id'], 'corpus_id': chunk_ids.iloc[index]}
        for index in top_k_indices
    ]

In [None]:
results = []
for sub_corpus_df in sub_corpus_list:
    print("Transforming")
    # transform(), not fit_transform(): refitting here would overwrite the
    # vectorizer fitted on the whole corpus and give every chunk its own
    # vocabulary and IDF weights.
    sub_corpus_matrix = vectorizer.transform(sub_corpus_df["text"])
    print("Matching queries")
    # Queries are processed sequentially; find_top_k_matches reads
    # sub_corpus_df / sub_corpus_matrix from the notebook globals set above.
    for count, (_, query) in enumerate(queries_test.iterrows()):
        results.extend(find_top_k_matches(query))
        if count % 100 == 0:
            print(count)

# NOTE(review): every chunk contributes its own matches, so each query ends up
# with one group of rows per chunk rather than a single corpus-wide top 10.

# Convert results to DataFrame and save to CSV
results_df = pd.DataFrame(results)
results_df.to_csv('query_corpus_mapping.csv', index=False)

In [None]:
# Approximate nearest-neighbour search over TF-IDF vectors with Annoy.
documents = corpus
qs = queries_test

# 1. Vectorize the documents using TF-IDF.  The matrix is kept sparse: calling
#    .toarray() on it would allocate a dense (documents x vocabulary) array.
vectorizer = TfidfVectorizer(stop_words='english')
doc_vectors = vectorizer.fit_transform(documents["text"])

# 2. Build Annoy Index
f = doc_vectors.shape[1]  # Number of dimensions of the vector
t = AnnoyIndex(f, 'angular')  # Use 'angular' for cosine similarity

for i in range(doc_vectors.shape[0]):
    # Densify one row at a time to keep peak memory at a single vector.
    t.add_item(i, doc_vectors[i].toarray().ravel())

t.build(50)  # 50 trees. Increase if needed.

# 3. Query the index
results = {}

# Iterate over the query *rows*; iterating the DataFrame itself would yield
# its column names.
for _, query in qs.iterrows():
    query_vector = vectorizer.transform([query["text"]]).toarray().ravel()
    top10_indices = t.get_nns_by_vector(query_vector, 10)  # Find the top 10 document indices
    # Annoy returns the positions passed to add_item, so look rows up with iloc.
    results[query["query-id"]] = documents.iloc[top10_indices]["corpus-id"].tolist()

print(results)


### TEST 2

In [None]:
def map_qrs_to_docs(qrs, dcs, top_k=10, n_components=100, n_trees=50, random_state=None):
    """Map every query to its approximate nearest documents.

    Pipeline: hash the texts with ``HashingVectorizer`` (2**20 features, no
    normalisation), reduce them with ``SparseRandomProjection``, index the
    reduced documents in an angular Annoy index and query it once per query.
    The duration of each stage is printed.

    Parameters
    ----------
    qrs : pandas.DataFrame
        Queries; must have ``"text"`` and ``"query-id"`` columns.
    dcs : pandas.DataFrame
        Documents; must have ``"text"`` and ``"corpus-id"`` columns.
    top_k : int, default 10
        Number of neighbours requested per query.
    n_components : int, default 100
        Dimensionality after the random projection.
    n_trees : int, default 50
        Number of trees built by Annoy.
    random_state : int or None, default None
        When given, seeds both the random projection and the Annoy index so
        repeated runs give the same result.  ``None`` keeps the previous
        unseeded behaviour.

    Returns
    -------
    dict
        ``{query-id: [corpus-id, ...]}`` with the neighbours in the order
        returned by Annoy.
    """
    # 1. Hashing: Convert documents to hashed vectors
    start = time.time()
    hash_vectorizer = HashingVectorizer(n_features=2**20, stop_words='english', norm=None)
    hashed_docs = hash_vectorizer.transform(dcs["text"])
    end = time.time()
    print(f"Hashing documents took {end - start} seconds.")

    # 2. Dimensionality Reduction
    start = time.time()
    transformer = SparseRandomProjection(n_components=n_components, random_state=random_state)
    reduced_docs = transformer.fit_transform(hashed_docs).toarray()
    end = time.time()
    print(f"Dimensionality reduction took {end - start} seconds.")

    # Build Annoy Index
    start = time.time()
    f = reduced_docs.shape[1]
    t = AnnoyIndex(f, 'angular')
    if random_state is not None:
        # Must be set before items are added to affect tree construction.
        t.set_seed(random_state)
    for i, vector in enumerate(reduced_docs):
        t.add_item(i, vector)
    t.build(n_trees)
    end = time.time()
    print(f"Building Annoy index took {end - start} seconds.")

    # Query the index
    results = {}
    start = time.time()
    # Annoy item numbers are row positions in ``dcs``; resolve them to ids
    # through a plain array instead of building a row Series per hit.
    corpus_ids = dcs["corpus-id"].to_numpy()
    if len(qrs) > 0:
        # Hash and project all queries in one call rather than once per row.
        hashed_queries = hash_vectorizer.transform(qrs["text"])
        reduced_queries = transformer.transform(hashed_queries).toarray()
        for query_id, reduced_query in zip(qrs["query-id"], reduced_queries):
            neighbour_positions = t.get_nns_by_vector(reduced_query, top_k)
            results[query_id] = [corpus_ids[i] for i in neighbour_positions]
    end = time.time()
    print(f"Querying the index took {end - start} seconds.")

    return results


In [None]:
# Run the hashing + Annoy pipeline on the training subset and show the
# resulting mapping as a table with one column per query id.
results = map_qrs_to_docs(qrs=queries_train_subset, dcs=corpus)
pd.DataFrame(results)

### TEST 3

In [None]:
# Translation table deleting every character of string.punctuation; built
# once instead of on every remove_punctuation() call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character removed."""
    return text.translate(_PUNCTUATION_TABLE)


def build_inverted_index(docs):
    """Build a word -> set-of-document-labels index from ``docs["text"]``.

    Each text is lower-cased, stripped of punctuation and split on
    whitespace.  Documents are identified by their DataFrame index label
    (not by a column value).

    Returns
    -------
    collections.defaultdict
        Maps each word to the set of index labels of the rows containing it.
    """
    index = defaultdict(set)
    for doc_id, text in docs["text"].items():
        # Remove punctuation and convert to lowercase
        for word in remove_punctuation(text.lower()).split():
            index[word].add(doc_id)
    return index


def filter_docs(query, index):
    """Return the labels of all documents sharing at least one word with ``query``.

    The query is normalised exactly like the indexed text (lower-cased,
    punctuation removed); without that, a query word such as ``"World!"``
    could never match the index key ``"world"``.

    Parameters
    ----------
    query : str
        Raw query text.
    index : mapping
        Word -> set of document labels, as built by ``build_inverted_index``.

    Returns
    -------
    set
        Union of the posting sets of the query words; empty when nothing
        matches.
    """
    relevant_doc_ids = set()
    for word in remove_punctuation(query.lower()).split():
        # .get() avoids inserting empty entries into a defaultdict index.
        relevant_doc_ids.update(index.get(word, set()))
    return relevant_doc_ids

# Build the inverted index over the whole corpus and report how long it took.
start = time.time()
inverted_index = build_inverted_index(corpus)
end = time.time()
print(f"Indexing took {end - start} seconds.")

# Show the number of distinct words rather than echoing the whole index,
# which would write every word and posting set into the cell output.
len(inverted_index)

In [None]:
def remove_stopwords_from_index(inverted_index):
    """Delete every ``ENGLISH_STOP_WORDS`` entry from ``inverted_index``.

    The mapping is modified in place and the same object is returned.
    """
    for term in ENGLISH_STOP_WORDS:
        # pop with a default is a no-op for words that are not indexed.
        inverted_index.pop(term, None)
    return inverted_index
# The helper mutates the index in place and returns the same object.
inverted_index = remove_stopwords_from_index(inverted_index)

# Show the remaining number of distinct words instead of echoing the whole
# index into the cell output.
len(inverted_index)

## JUNK & OTHERS

In [None]:
# The three triple-quoted blocks below are bare string expressions: Python
# evaluates and discards them, so none of the code inside them runs.
#
# Disabled experiment 1: dump each chunk's TF-IDF matrix to a dense CSV file.
"""for idx, df in enumerate(sub_corpus_list):
    # Transform the text in the dataframe using the vectorizer
    tfidf_matrix = vectorizer.transform(df["text"])
    
    # Convert the sparse matrix to a dense DataFrame
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
    
    # Save the DataFrame to a CSV file
    tfidf_df.to_csv(f'tfidf_matrix_{idx}.csv', index=False)"""
# Disabled experiment 2: per-chunk transform followed by a query loop.  It
# refers to names (tf, query, corpus_feature) that are not defined anywhere
# in the visible notebook.
"""
for idx, df in enumerate(sub_corpus_list):
    print("Transforming")
    tfidf_matrix = vectorizer.transform(df["text"])
    #print("framing")
    #tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
    #print("storing")
    #tfidf_df.to_parquet(f'tfidf_matrix_{idx}.parquet')
    for idx, q_text  in queries:
        query_feature = tf.transform([query])
        cosine_similarities = linear_kernel(query_feature,corpus_feature).flatten()
        top_10 = np.argpartition(cosine_similarities,-5)[-5:]"""  
# Placeholder for the results

# Disabled experiment 3: sequential chunk x query loop using linear_kernel,
# writing the matches to 'query_corpus_mapping.csv'.
"""
results = []
# Assuming list_of_dfs is the list of your sub-corpuses created earlier
for sub_corpus_df in sub_corpus_list:
    print("Transform sub")
    sub_corpus_matrix = vectorizer.transform(sub_corpus_df["text"])
    
    for _, query in queries.iterrows():
        print("Treat query")
        query_vector = vectorizer.transform([query['text']])
        
        # Compute cosine similarities
        cosine_similarities = linear_kernel(query_vector, sub_corpus_matrix).flatten()
        
        # Get top k corpus indices from this sub-corpus
        top_k_indices = cosine_similarities.argsort()[-10:][::-1]  # Here, k=10
        
        for index in top_k_indices:
            results.append({
                'query_id': query['_id'],
                'corpus_id': sub_corpus_df.iloc[index]['_id']
            })

# Convert results to DataFrame and save to CSV
results_df = pd.DataFrame(results)
results_df.to_csv('query_corpus_mapping.csv', index=False)
"""
def find_top_k_matches(query, k=10):
    """Return the ``k`` documents of the current sub-corpus most similar to ``query``.

    NOTE: this redefines the ``find_top_k_matches`` declared in the "TEST 1"
    section; whichever cell ran last wins.

    Reads the notebook globals ``vectorizer``, ``sub_corpus_matrix`` and
    ``sub_corpus_df``.  The frames built earlier in this notebook name their
    id columns ``"query-id"`` and ``"corpus-id"`` (with hyphens), so those
    are the keys read here; the previous underscore spellings raised
    ``KeyError``.  The per-query debug prints were dropped because they
    produce several lines of output for every query.

    Parameters
    ----------
    query : mapping or pandas.Series
        Must provide ``"text"`` and ``"query-id"``.
    k : int, default 10
        Maximum number of matches to return.

    Returns
    -------
    list of dict
        ``{'query_id': ..., 'corpus_id': ...}`` records, best match first.
    """
    if k <= 0:
        return []

    query_vector = vectorizer.transform([query['text']])

    # Compute cosine similarities
    cosine_similarities = linear_kernel(query_vector, sub_corpus_matrix).flatten()

    # Get top k corpus indices from this sub-corpus (best first)
    top_k_indices = cosine_similarities.argsort()[-k:][::-1]

    corpus_ids = sub_corpus_df['corpus-id']
    local_results = []
    for index in top_k_indices:
        local_results.append({
            'query_id': query['query-id'],
            'corpus_id': corpus_ids.iloc[index]
        })
    return local_results
# Disabled experiment (a bare string expression, never executed): a variant
# of find_top_k_matches that scores the query against the chunk one row at a
# time through cosine_distance instead of a single linear_kernel call.
"""   
def find_top_k_matches(query):
    query_vector = vectorizer.transform([query['text']])
    cosine_similarities = []

    for idx in range(sub_corpus_matrix.shape[0]):
        corpus_vector = sub_corpus_matrix[idx].toarray().flatten()
        similarity = cosine_distance(query_vector.toarray().flatten(), corpus_vector)
        cosine_similarities.append(similarity)

    # Get top k corpus indices from this sub-corpus
    top_k_indices = np.argsort(cosine_similarities)[-10:][::-1]  # Here, k=10
    
    local_results = []
    for index in top_k_indices:
        local_results.append({
            'query_id': query['query_id'],
            'corpus_id': sub_corpus_df.iloc[index]['corpus_id']
        })
    
    return local_results
"""
results = []

# iterrows() yields (index, row) tuples; find_top_k_matches expects the row
# only, so unpack once up front and reuse the list for every chunk.
query_rows = [row for _, row in queries_test.iterrows()]

for sub_corpus_df in sub_corpus_list:
    print("Transforming")
    sub_corpus_matrix = vectorizer.transform(sub_corpus_df["text"])
    print("Parallelizing")
    # Use ProcessPoolExecutor to parallelize the inner loop.
    # NOTE(review): find_top_k_matches reads sub_corpus_matrix / sub_corpus_df
    # as globals; this presumably only works when worker processes inherit
    # the notebook's state (fork start method) - confirm on the target OS.
    with ProcessPoolExecutor() as executor:
        # map() yields one list of matches per query; flatten them so that
        # `results` holds one dict per match, as the DataFrame below expects.
        for matches in executor.map(find_top_k_matches, query_rows):
            results.extend(matches)

# Convert results to DataFrame and save to CSV
results_df = pd.DataFrame(results)
results_df.to_csv('query_corpus_mapping.csv', index=False)
        
        
