# Models

In [1]:
from gensim.models import Word2Vec
from nltk import download, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

import dill as pickle
import gensim.downloader as api
import joblib
import numpy as np
import re
import sys

In [5]:
# Make the parent directory importable so the local `src` package resolves.
# Guarded so that re-running this cell does not keep appending duplicates.
if '..' not in sys.path:
    sys.path.append('..')

from src.baselines import RIH_Cosine, RIH_QL
from src.expansion import query_expansion
from src.metrics import MAP, NDCG, RPrec

In [3]:
def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file afterwards.

    NOTE: unpickling can execute arbitrary code; only load files from a
    trusted source.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Load a derived dataset from the "TREC CAR 2017" dataset
dataset = {
    "queries": _load_pickle("../data/queries.pkl"),
    "documents": _load_pickle("../data/documents.pkl"),
    "relevances": _load_pickle("../data/relevances.pkl"),
}

## TF-IDF Vectorizer

In [6]:
# Download the NLTK resources used by the preprocessing below:
# "punkt" for word_tokenize, "stopwords" for the English stopword list,
# and "wordnet" for WordNetLemmatizer.
download("punkt")
download("stopwords")
download("wordnet")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ethan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ethan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ethan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
# Kept as a list because it is also handed to TfidfVectorizer(stop_words=...).
english_stopwords = stopwords.words("english")

# Built once when the cell runs rather than once per token: a frozenset gives
# constant-time membership tests and a single lemmatizer instance is reused.
_stopword_set = frozenset(english_stopwords)
_lemmatizer = WordNetLemmatizer()


def preprocess(text):
    """Turn raw text into a list of lemmatized, stopword-free tokens.

    The text is lowercased, every character that is not an ASCII letter is
    replaced by a space, the result is tokenized with NLTK's `word_tokenize`,
    English stopwords are dropped, and each remaining token is lemmatized
    with WordNet.

    Args:
        text (str): Raw input text.

    Returns:
        list[str]: The processed tokens, in their original order.
    """
    letters_only = re.sub(r"[^a-zA-Z]", " ", text.lower())
    return [
        _lemmatizer.lemmatize(token)
        for token in word_tokenize(letters_only)
        if token not in _stopword_set
    ]

In [12]:
def generate_docs_texts():
    """Yield the text of every document stored in `dataset["documents"]`."""
    yield from dataset["documents"].values()


# TF-IDF vectorizer that tokenizes with `preprocess` and filters English stopwords.
vectorizer = TfidfVectorizer(stop_words=english_stopwords, tokenizer=preprocess)

# Fit on the whole corpus, then convert the resulting matrix to a dense array.
documents_vectors = vectorizer.fit_transform(generate_docs_texts()).toarray()



In [13]:
# Save the fitted vectorizer and the dense document matrix for future use.
# NOTE(review): the vectorizer uses `preprocess` as its tokenizer, so that
# function presumably has to be defined before the file is loaded again — verify.
joblib.dump(vectorizer, "../models/vectorizer.pkl")
joblib.dump(documents_vectors, "../models/documents_vectors.pkl")

['../models/documents_vectors.pkl']

## Skip-gram

In [15]:
# Load the pretrained "fasttext-wiki-news-subwords-300" vectors through
# gensim's downloader API (imported as `api`).
pretrained_model = api.load("fasttext-wiki-news-subwords-300")

In [17]:
# Word2Vec expects every training example to be a list of tokens. The documents
# are raw strings, and iterating a string yields single characters, so they are
# tokenized with the same `preprocess` function the TF-IDF vectorizer uses.
corpus = [preprocess(text) for text in dataset["documents"].values()]

# Skip-gram model (sg=1) with 300-dimensional vectors; min_count=0 keeps every token.
model = Word2Vec(vector_size=300, window=10, sample=1e-3, min_count=0, sg=1)
model.build_vocab(corpus)

# Capture the number of training documents now, before the vocabulary update
# below scans a different (single-sentence) corpus.
total_examples = model.corpus_count

# Extend the vocabulary with every word known to the pretrained model.
# NOTE(review): only the words are added here; nothing in this cell copies the
# pretrained vectors into `model`, so those entries start from the model's own
# initialization — confirm this is intended.
model.build_vocab([list(pretrained_model.key_to_index.keys())], update=True)

model.train(corpus, total_examples=total_examples, epochs=2)
model.save("../models/skipgram.model")

## Baselines

In [97]:
# Cosine-similarity baseline built over the document ids and their TF-IDF vectors.
document_ids = np.array(list(dataset["documents"].keys()))
rih_cosine = RIH_Cosine(document_ids, documents_vectors, vectorizer)

# Smoke test: show the top 5 results for the first query only.
query = next(iter(dataset["queries"].values()), None)
if query is not None:
    print(rih_cosine.get_top_k(query[1], k=5))

(array(['77f8425e7cc8e66fc3f48cc16cfd1b13519fc7bc',
       'f22da1c14e22d1cd1a88b6ac14cb26a993a281c1',
       '9044167fd5073fd17bf3ddbfd56a98a2058a35f2',
       '86a0f8aa4a60091074f7d312983d367c48bafba1',
       '0b62577001d4e4acec755801c29e68b7de25c72b'], dtype='<U40'), array([0.00654811, 0.0052162 , 0.00521528, 0.00422302, 0.00325326]))


In [96]:
# Query-likelihood baseline; it receives the (id, text) pairs at query time.
rih_ql = RIH_QL(vectorizer)

# Smoke test: show the top 5 results for the first query only.
query = next(iter(dataset["queries"].values()), None)
if query is not None:
    documents_items = list(dataset["documents"].items())
    print(rih_ql.get_top_k(query[1], documents_items, k=5))

(array(['77f8425e7cc8e66fc3f48cc16cfd1b13519fc7bc',
       '04aff0aa2eb9d52284455c8930af1aa8ff7f0e0a',
       '86a0f8aa4a60091074f7d312983d367c48bafba1',
       '20a0ba0f73d96877afcbb54ae6fb6eb6dcaa7646',
       '83ee36eebe2245d12a43775389cb0c0660f76c03'], dtype='<U40'), array([-1.89176667, -3.01677618, -4.10782848, -4.34684473, -4.43275143]))


In [4]:
# Reload the skip-gram model from the path it was saved to in the Skipgram section.
w2v_model = Word2Vec.load("../models/skipgram.model")

In [13]:
# Reload the persisted TF-IDF artifacts. Passing the path (instead of an
# `open(...)` handle that is never closed) lets joblib open and close the file.
# NOTE(review): the vectorizer was built with `preprocess` as its tokenizer, so
# that function presumably must already be defined in this session — verify.
documents_vectors = joblib.load("../models/documents_vectors.pkl")
vectorizer = joblib.load("../models/vectorizer.pkl")

In [42]:
# Experiment: build the TF-IDF query vector and the relevant / non-relevant
# document vectors for the first query, then print the shape of the
# embedding-based expansion.
# NOTE(review): `top_k_documents` and `embedding_expansion` are not defined or
# imported anywhere in the visible notebook (only `query_expansion` is imported
# from src.expansion), so this cell relies on leftover kernel state and would
# presumably fail on a fresh Restart & Run All — confirm where they come from.
rih_cosine = RIH_Cosine(np.array(list(dataset["documents"].keys())), documents_vectors, vectorizer)

for query_id, query in dataset["queries"].items():
    # TF-IDF vector of the query text produced by `rih_cosine.get_query`.
    q = vectorizer.transform([rih_cosine.get_query(query)]).toarray()[0]
    # Split the top-k documents by whether (query_id, doc_id) appears in the relevance data.
    relevant_documents = vectorizer.transform([dataset["documents"][doc_id] for doc_id in top_k_documents if (query_id, doc_id) in dataset["relevances"]]).toarray()
    nonrelevant_documents = vectorizer.transform([dataset["documents"][doc_id] for doc_id in top_k_documents if (query_id, doc_id) not in dataset["relevances"]]).toarray()

    # NOTE(review): `q`, `relevant_documents` and `nonrelevant_documents` are not
    # used by the call below — presumably meant for a later feedback step; verify.
    print(embedding_expansion(w2v_model, query[0], [query[1], *query[2]]).shape)
    # Only the first query is processed.
    break

['0fb555e014a1ccca30dd9514625d44f986db3e87']
('Hog-dog rodeo Typical match', 'Hog-dog rodeo', ('Typical match',))
(300,)
