In [None]:
import os

# Move the working directory up one level before the project imports below
# (presumably so the `engine` package resolves from the project root - TODO confirm).
# The path is built with os.path.join/os.pardir instead of a hard-coded "\\.."
# so the cell also works on non-Windows systems.
# NOTE: this is not idempotent - every re-run of the cell moves up one more level.
os.chdir(os.path.join(os.getcwd(), os.pardir))

from engine.notebook_utils import *
from engine.utils import *
from sklearn.metrics.pairwise import cosine_similarity


# Fetch the metadata records of the training corpus (limit=None is passed through as-is).
documents_metadata = fetch_training_dataset1_metadata(limit=None)

# Report how many metadata records were returned.
corpus_size = len(documents_metadata)
print(f"Corpus size = {corpus_size}")


In [None]:

# Build two parallel lists in a single pass over the metadata:
# documents[i] is the preprocessed content whose id is documents_ids[i].
documents, documents_ids = [], []

for meta in documents_metadata:
    raw_text = load_document_content(meta.key)
    documents.append(preprocess(raw_text))
    documents_ids.append(meta.id)


In [None]:
from engine.utils import *

# Training inputs (kept as separate notebook-level names so they stay easy to tune).
min_count, vector_size, window = 1, 300, 5
epochs, workers = 600, 6

# Token lists, one per preprocessed document, in corpus order.
tokenized_documents = list(map(tokenize_content, documents))

# Train the model on the tokenised corpus.
# sg=1 is forwarded unchanged (skip-gram, assuming this is gensim's Word2Vec - TODO confirm).
model = Word2Vec(
    tokenized_documents,
    min_count=min_count,
    vector_size=vector_size,
    window=window,
    sg=1,
    epochs=epochs,
    workers=workers,
)

In [None]:
import numpy as np

def get_embedding_vector(doc_tokens, w2v_model=None):
    """Return the mean word vector of a tokenised document.

    Args:
        doc_tokens: iterable of token strings for one document.
        w2v_model: object exposing ``vector_size`` and ``wv`` (with
            ``key_to_index`` and ``get_vector``). Defaults to the
            notebook-level ``model`` so existing one-argument calls keep working.

    Returns:
        A 1-D numpy array of length ``vector_size``: the element-wise mean of
        the vectors of all tokens present in the vocabulary, or a zero vector
        when the document is empty or none of its tokens is known.
    """
    if w2v_model is None:
        w2v_model = model

    keyed_vectors = w2v_model.wv
    # key_to_index is a dict, so membership is O(1); the previous check against
    # the index_to_key list scanned the whole vocabulary for every token.
    vocabulary = keyed_vectors.key_to_index

    # Unknown tokens are skipped rather than replaced by unseeded random
    # vectors, which made the result non-reproducible and shifted the mean.
    known_vectors = [
        keyed_vectors.get_vector(tok) for tok in doc_tokens if tok in vocabulary
    ]

    if not known_vectors:
        return np.zeros(w2v_model.vector_size)

    return np.mean(known_vectors, axis=0)

# One averaged embedding per document, in the same order as `documents`.
train_matrix = list(
    map(lambda text: get_embedding_vector(tokenize_content(text)), documents)
)

In [None]:
# Persist the trained model plus the id list and embedding matrix, all under
# the relative "word2vec" directory (resolved against the current working directory).
artifacts_dir = "word2vec"

save_w2v_model(model, os.path.join(artifacts_dir, "word2vec.model"))

for artifact, artifact_name in (
    (documents_ids, "documents_ids.pk"),
    (train_matrix, "matrix.pk"),
):
    save_model(artifact, os.path.join(artifacts_dir, artifact_name))

In [None]:
# Move the working directory up one level before importing the evaluation code.
# Built with os.path.join/os.pardir instead of a hard-coded "\\.." so it also
# works on non-Windows systems.
# NOTE(review): the first cell of this notebook already moves up one level, so on
# a top-to-bottom run this ends up two levels above the notebook directory, and
# every re-run moves up again - confirm that this is the intended location.
os.chdir(os.path.join(os.getcwd(), os.pardir))
from evaluation import *
from engine.word2vec_engine import Word2VecEngine
from engine.notebook_utils import get_test_queries_path, get_test_queries_matches_path

# Locations of the test queries and of their expected matches.
test_queries_path, test_queries_matches_path = (
    get_test_queries_path(),
    get_test_queries_matches_path(),
)

# Engine under evaluation, configured with the threshold and result limit below.
engine = Word2VecEngine(
    threshold=0.63,
    results_limit=30,
)

def listener(query_id, precision, recall, avg_precision, precision_at_10):
    """Print the metrics of one evaluated query on a single line.

    The fields are printed in a fixed order (recall before precision), which
    differs from the parameter order.
    """
    fields = (
        f"query id: {query_id}",
        f"recall: {recall}",
        f"precision: {precision}",
        f"precision@10: {precision_at_10}",
        f"avg_precision: {avg_precision}",
    )
    print(", ".join(fields))

# Run the evaluation, passing `listener` so it can report per-query metrics.
evaluation = evaluate(
    engine,
    test_queries_path,
    test_queries_matches_path,
    listener,
)

In [None]:
import pandas as pd

# Table built from the 'queries' entry of the evaluation result.
df = pd.DataFrame(evaluation['queries'])

# Single-row table with the aggregate metrics.
aggregate_metrics = {
    'Average Recall': evaluation['average_recall'],
    'Average Precision': evaluation['average_precision'],
    'MAP': evaluation['mean_average_precision'],
    'MRR': evaluation['mean_reciprocal_rank'],
}
map_mrr_df = pd.DataFrame([aggregate_metrics])

# Print both tables separated by one blank line.
print(df, "", map_mrr_df, sep="\n")