In [1]:
import collections
import io
import time
import math
import pickle
import os
import pyndri
import pyndri.compat
import logging
import sys
import numpy as np
import gensim

In [2]:
# Open the Indri index stored under ../index/ through pyndri.
index = pyndri.Index('../index/')

In [3]:
def write_run(model_name, data, out_f,
              max_objects_per_query=sys.maxsize,
              skip_sorting=False):
    """
    Emit a ranking ("run") in TREC format, one line per ranked object:
    ``<topic> Q0 <object> <rank> <relevance> <model_name>``.

    Parameters:
        - model_name: identifier of the run (last column of every line).
        - data: dictionary mapping topic_id to object_assesments;
            object_assesments is an indexable sequence (list or tuple) of
            (relevance, object_id) pairs. Topic and object ids may be
            str or bytes; bytes are decoded as UTF-8.
        - out_f: output file stream (anything with a ``write`` method).
        - max_objects_per_query: cut-off for number of objects per query.
        - skip_sorting: when False (default) every ranking is sorted in
            decreasing order before writing; when True the given order
            is written as-is.

    Topics with an empty ranking are skipped with a logged warning.
    """
    line_template = ('{subject} Q0 {object} {rank} {relevance} '
                     '{model_name}\n')

    for subject_id, object_assesments in data.items():
        if not object_assesments:
            logging.warning('Received empty ranking for %s; ignoring.',
                            subject_id)
            continue

        # Probe the first pair only: object ids must be text or bytes.
        assert isinstance(object_assesments[0][1], (str, bytes))

        ranking = (object_assesments if skip_sorting
                   else sorted(object_assesments, reverse=True))

        if max_objects_per_query < sys.maxsize:
            ranking = ranking[:max_objects_per_query]

        topic = (subject_id.decode('utf8')
                 if isinstance(subject_id, bytes) else subject_id)

        # Ranks are 1-based in the output.
        for position, (relevance, object_id) in enumerate(ranking, start=1):
            if isinstance(object_id, bytes):
                object_id = object_id.decode('utf8')

            out_f.write(line_template.format(
                subject=topic,
                object=object_id,
                rank=position,
                relevance=relevance,
                model_name=model_name))
            

# Example: write a small two-topic run to standard output.
# The rankings are passed unsorted on purpose; write_run sorts each one
# by decreasing relevance (skip_sorting defaults to False).
# In your code, you should write the runs to local
# storage in order to pass them to trec_eval.
write_run(
    model_name='example',
    data={
        'Q1': ((1.0, 'DOC1'), (0.5, 'DOC2'), (0.75, 'DOC3')),
        'Q2': ((-0.1, 'DOC1'), (1.25, 'DOC2'), (0.0, 'DOC3')),
    },
    out_f=sys.stdout,
    max_objects_per_query=1000)

Q1 Q0 DOC1 1 1.0 example
Q1 Q0 DOC3 2 0.75 example
Q1 Q0 DOC2 3 0.5 example
Q2 Q0 DOC2 1 1.25 example
Q2 Q0 DOC3 2 0.0 example
Q2 Q0 DOC1 3 -0.1 example


In [4]:
def parse_topics(file_or_files,
                 max_topics=sys.maxsize, delimiter=';'):
    """
    Parse topic files with one ``<topic_id><delimiter><terms>`` entry per line.

    Parameters:
        - file_or_files: a single open text file, or a list/tuple/iterable
            of open text files (``io.IOBase`` instances).
        - max_topics: maximum number of distinct topics to read across all
            files; ``None`` or 0 means no limit.
        - delimiter: separator between topic id and terms; only the first
            occurrence on a line is used as the split point.

    Returns:
        collections.OrderedDict mapping topic_id -> terms, in reading order.
        Blank lines are skipped. When a topic id re-appears with different
        terms an error is logged and the later entry wins.

    Raises:
        ValueError: if a non-blank line does not contain the delimiter.
    """
    # Test for None first: comparing None with an int raises TypeError.
    assert max_topics is None or max_topics >= 0

    topics = collections.OrderedDict()

    # A single open file is itself iterable (over its lines), so it has to
    # be recognised before the generic "iterable of files" case; otherwise
    # it would be expanded into a list of strings.
    if isinstance(file_or_files, io.IOBase):
        file_or_files = [file_or_files]
    elif not isinstance(file_or_files, (list, tuple)):
        if hasattr(file_or_files, '__iter__'):
            file_or_files = list(file_or_files)
        else:
            file_or_files = [file_or_files]

    # Both None and 0 disable the cut-off.
    limit = max_topics if max_topics else None

    for f in file_or_files:
        assert isinstance(f, io.IOBase)

        for line in f:
            assert isinstance(line, str)

            line = line.strip()

            if not line:
                continue

            topic_id, terms = line.split(delimiter, 1)

            if topic_id in topics and topics[topic_id] != terms:
                logging.error('Duplicate topic "%s" (%s vs. %s).',
                              topic_id,
                              topics[topic_id],
                              terms)

            topics[topic_id] = terms

            # Return (rather than break) so the limit also holds when
            # several files are supplied.
            if limit is not None and len(topics) >= limit:
                return topics

    return topics


# Sanity check: parse the `topics_title` file and print the resulting
# topic_id -> query string mapping.
with open('../ap_88_89/topics_title', 'r') as f_topics:
    print(parse_topics([f_topics]))

OrderedDict([('51', 'Airbus Subsidies'), ('52', 'South African Sanctions'), ('53', 'Leveraged Buyouts'), ('54', 'Satellite Launch Contracts'), ('55', 'Insider Trading'), ('56', 'Prime (Lending) Rate Moves, Predictions'), ('57', 'MCI'), ('58', 'Rail Strikes'), ('59', 'Weather Related Fatalities'), ('60', 'Merit-Pay vs. Seniority'), ('61', 'Israeli Role in Iran-Contra Affair'), ('62', "Military Coups D'etat"), ('63', 'Machine Translation'), ('64', 'Hostage-Taking'), ('65', 'Information Retrieval Systems'), ('66', 'Natural Language Processing'), ('67', 'Politically Motivated Civil Disturbances'), ('68', 'Health Hazards from Fine-Diameter Fibers'), ('69', 'Attempts to Revive the SALT II Treaty'), ('70', 'Surrogate Motherhood'), ('71', 'Border Incursions'), ('72', 'Demographic Shifts in the U.S.'), ('73', 'Demographic Shifts across National Boundaries'), ('74', 'Conflicting Policy'), ('75', 'Automation'), ('76', 'U.S. Constitution - Original Intent'), ('77', 'Poaching'), ('78', 'Greenpeace'

In [5]:
# Read the queries (topic_id -> query string) from the topics file.
with open('../ap_88_89/topics_title', 'r') as f_topics:
    queries = parse_topics([f_topics])

# (Re-)open the index and derive the collection size and the dictionary
# used to translate between tokens and token ids.
index = pyndri.Index('../index/')
num_documents = index.maximum_document() - index.document_base()
dictionary = pyndri.extract_dictionary(index)

# Map each query to the ids of its tokens; tokens for which
# dictionary.has_token() is false are dropped.
tokenized_queries = {
    topic_id: [dictionary.translate_token(term)
               for term in index.tokenize(topic_text)
               if dictionary.has_token(term)]
    for topic_id, topic_text in queries.items()
}

# Every distinct token id that occurs in at least one query.
query_term_ids = {
    term_id
    for term_ids in tokenized_queries.values()
    for term_id in term_ids
}

# Inverted index creation.
#
# Collection statistics restricted to the query terms are cached as pickles
# under ../pickles/. If any cache file is missing, everything is recomputed
# in one pass over the index and written back.
#
# NOTE: pickle.load can execute arbitrary code; only load cache files that
# were produced by this notebook.

start_time = time.time()

document_lengths = {}
unique_terms_per_document = {}

try:
    print('Trying to load statistics from file...', end='')
    with open('../pickles/inverted_index.pkl', 'rb') as file:
        inverted_index = pickle.load(file)
    with open('../pickles/collection_frequencies.pkl', 'rb') as file:
        collection_frequencies = pickle.load(file)
    with open('../pickles/document_lengths.pkl', 'rb') as file:
        document_lengths = pickle.load(file)
    with open('../pickles/unique_terms_per_document.pkl', 'rb') as file:
        unique_terms_per_document = pickle.load(file)
    with open('../pickles/document_term_frequency.pkl', 'rb') as file:
        document_term_frequency = pickle.load(file)
    with open('../pickles/avg_doc_length.pkl', 'rb') as file:
        avg_doc_length = pickle.load(file)
    print('Success!')
except FileNotFoundError:
    print('Error!')
    print('Gathering statistics about', len(query_term_ids), 'terms.')

    # Start from a clean slate: a partially successful load above may have
    # rebound these to stale cached dictionaries.
    document_lengths = {}
    unique_terms_per_document = {}

    # term id -> {internal document id -> term frequency in that document}
    inverted_index = collections.defaultdict(dict)
    # term id -> total number of occurrences in the collection
    collection_frequencies = collections.defaultdict(int)

    # NOTE(review): this name is only the scratch value of the inner loop
    # (the last term frequency looked up), yet it is pickled below and
    # required by the load path above. It is kept for compatibility with
    # existing cache files; initialised here so the dump cannot fail with a
    # NameError when there are no documents or no query terms.
    document_term_frequency = 0

    total_terms = 0

    for int_doc_id in range(index.document_base(), index.maximum_document()):
        ext_doc_id, doc_token_ids = index.document(int_doc_id)

        # Token ids <= 0 are excluded from the bag-of-words and the length.
        document_bow = collections.Counter(
            token_id for token_id in doc_token_ids
            if token_id > 0)
        document_length = sum(document_bow.values())

        document_lengths[int_doc_id] = document_length
        total_terms += document_length

        unique_terms_per_document[int_doc_id] = len(document_bow)

        for query_term_id in query_term_ids:
            assert query_term_id is not None

            document_term_frequency = document_bow.get(query_term_id, 0)

            if document_term_frequency == 0:
                continue

            collection_frequencies[query_term_id] += document_term_frequency
            inverted_index[query_term_id][int_doc_id] = document_term_frequency

    avg_doc_length = total_terms / num_documents

    print('Inverted index creation took', time.time() - start_time, 'seconds.')

    print('Saving statistics for future use...', end='')
    # Without this, a missing ../pickles directory would raise an uncaught
    # FileNotFoundError here, after the whole pass over the collection.
    os.makedirs('../pickles', exist_ok=True)
    with open('../pickles/inverted_index.pkl', 'wb') as file:
        pickle.dump(inverted_index, file)
    with open('../pickles/collection_frequencies.pkl', 'wb') as file:
        pickle.dump(collection_frequencies, file)
    with open('../pickles/document_lengths.pkl', 'wb') as file:
        pickle.dump(document_lengths, file)
    with open('../pickles/unique_terms_per_document.pkl', 'wb') as file:
        pickle.dump(unique_terms_per_document, file)
    with open('../pickles/document_term_frequency.pkl', 'wb') as file:
        pickle.dump(document_term_frequency, file)
    with open('../pickles/avg_doc_length.pkl', 'wb') as file:
        pickle.dump(avg_doc_length, file)
    print('Success!')

Trying to load statistics from file...Success!


### Task 2: Latent Semantic Models (LSMs) [15 points] ###

In this task you will experiment with applying distributional semantics methods ([LSI](http://lsa3.colorado.edu/papers/JASIS.lsi.90.pdf) **[5 points]** and [LDA](https://www.cs.princeton.edu/~blei/papers/BleiNgJordan2003.pdf) **[5 points]**) for retrieval.

You do not need to implement LSI or LDA on your own. Instead, you can use [gensim](http://radimrehurek.com/gensim/index.html). An example on how to integrate Pyndri with Gensim for word2vec can be found [here](https://github.com/cvangysel/pyndri/blob/master/examples/word2vec.py). For the remaining latent vector space models, you will need to implement connector classes (such as `IndriSentences`) by yourself.

In order to use a latent semantic model for retrieval, you need to:
   * build a representation of the query **q**,
   * build a representation of the document **d**,
   * calculate the similarity between **q** and **d** (e.g., cosine similarity, KL-divergence).
     
The exact implementation here depends on the latent semantic model you are using. 
   
Each of these LSMs comes with various hyperparameters to tune. Choose values for these parameters, and explicitly state the reasons that led you to these decisions. You can use the validation set to optimize any hyperparameters you see fit; motivate your decisions. In addition, state clearly how the query/document representations were constructed for each LSM and explain your choices.

In this experiment, you will first obtain an initial top-1000 ranking for each query using TF-IDF in **Task 1**, and then re-rank the documents using the LSMs. Use TREC Eval to obtain the results and report on `NDCG@10`, Mean Average Precision (`MAP@1000`), `Precision@5` and `Recall@1000`.

Perform significance testing **[5 points]** (similar to Task 1) within the class of semantic matching methods.

In [6]:
# Load the pickled TF-IDF rankings that the LSMs will re-rank. The
# run_retrieval methods below iterate this as
# query_id -> iterable of internal document ids.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('../pickles/preprocessed_tfidf_collection.pkl', 'rb') as file:
    tfidf_data = dict(pickle.load(file))

In [7]:
class Sentences2Vec(pyndri.compat.IndriSentences):
    """Corpus connector that yields each indexed document as a bag-of-words.

    Iterating produces, per document, a tuple of (token_id, weight) pairs
    as returned by ``dictionary.doc2bow``, restricted to positive token ids
    that are contained in the dictionary.
    """

    def __init__(self, index, dictionary, max_documents=None):
        super().__init__(index, dictionary, max_documents)

    def __iter__(self):
        first_doc_id = self.index.document_base()
        # _maximum_document() comes from the base class; presumably it
        # applies the max_documents cut-off (verify in pyndri.compat).
        for int_doc_id in range(first_doc_id, self._maximum_document()):
            _, doc_token_ids = self.index.document(int_doc_id)

            kept_pairs = []
            for token_id, weight in self.dictionary.doc2bow(doc_token_ids):
                if token_id in self.dictionary and token_id > 0:
                    kept_pairs.append((token_id, weight))

            yield tuple(kept_pairs)

### LSI

In [10]:
class LatentSemanticIndexing():
    """Latent Semantic Indexing re-ranker built on gensim's ``LsiModel``.

    Queries and documents are projected into the LSI topic space and
    compared with cosine similarity. Document vectors are computed once and
    cached on disk (see ``load_documents_representation``).
    """

    # On-disk cache with the LSI vector of every indexed document.
    DOC_REPRESENTATIONS_PATH = '../pickles/LSI_DocRepresentations.pkl'

    def __init__(self, index: "pyndri.Index", dictionary, num_topics=200, load_model=False, fname=""):
        """Load or train the model, then load/compute document vectors.

        Args:
            index: pyndri index holding the document collection.
            dictionary: dictionary object providing ``doc2bow``,
                ``id2token`` and ``in`` (as used by the methods below).
            num_topics: number of latent topics; only used when training.
            load_model: load a saved model from ``fname`` instead of training.
            fname: path of the saved model; required when ``load_model``.
        Raises:
            ValueError: if ``load_model`` is set but ``fname`` is empty.
        """
        self.index = index
        self.dictionary = dictionary

        if load_model:
            if fname == "":
                raise ValueError('File path not provided.')
            self.load(fname)
        else:
            self.train(num_topics)

        self.doc_representations_dict = collections.defaultdict(list)
        self.load_documents_representation()

    @property
    def model_name(self):
        """Model name"""
        return "LSI"

    def load(self, fpath: str):
        """Load model from file.

        Args:
            fpath: file path to load model.
        """
        self.model = gensim.models.lsimodel.LsiModel.load(fpath)
        print("Model loaded.")

    def save(self, fpath: str):
        """Save current model to file for further use.

        Args:
            fpath: file path to save model.
        """
        self.model.save(fpath)
        print("Model saved.")

    def train(self, num_topics=200):
        """Train LSI model given the index and dictionary.

        Args:
            num_topics: number of latent topics requested from gensim.
        """
        print("Training started...")
        train_start_time = time.time()

        corpus = Sentences2Vec(self.index, self.dictionary)
        self.model = gensim.models.lsimodel.LsiModel(corpus=corpus,
                                                     id2word=self.dictionary.id2token,
                                                     num_topics=num_topics)
        print("Model trained in {} seconds.".format(time.time() - train_start_time))

    def load_documents_representation(self):
        """Get and store document representations for future use.

        Loads the cache file if it exists; otherwise computes the vector of
        every document in the index and writes the cache.

        NOTE: the cache is not tied to a particular model. Delete it after
        retraining or changing ``num_topics``, and if it was written by a
        version that stored variable-length lists. Only load trusted
        pickles (``pickle.load`` can execute arbitrary code).
        """
        cache_path = self.DOC_REPRESENTATIONS_PATH
        try:
            print("Loading document representations from file...", end='')
            with open(cache_path, 'rb') as file:
                self.doc_representations_dict = pickle.load(file)
            print("Success!")
        except FileNotFoundError:
            print("Error!")
            print("Computing and loading documents' representations...")

            compute_start_time = time.time()
            for int_doc_id in range(self.index.document_base(), self.index.maximum_document()):
                ext_doc_id, doc = self.index.document(int_doc_id)
                self.doc_representations_dict[int_doc_id] = self.get_representation(doc)

            print("Documents successfully loaded in {} seconds.".format(time.time() - compute_start_time))

            # Make sure the cache directory exists so the work is not lost.
            os.makedirs(os.path.dirname(cache_path), exist_ok=True)
            with open(cache_path, 'wb') as file:
                pickle.dump(self.doc_representations_dict, file)

    def get_representation(self, tokens_list):
        """Build representation given list of token ids.

        Args:
            tokens_list: list of token ids.
        Return:
            Dense numpy vector of length ``self.model.num_topics``. Topics
            missing from gensim's sparse output are left at zero, so every
            vector has the same length and aligned dimensions (an empty
            bag-of-words yields the all-zero vector).
        """
        tokens_bow = self.dictionary.doc2bow(tokens_list)
        doc_representation = [(token_id, weight)
                              for (token_id, weight) in tokens_bow
                              if token_id in self.dictionary and token_id > 0]
        lsi_repr = np.zeros(self.model.num_topics)
        # Place each value at its topic id instead of relying on position.
        for topic_id, value in self.model[doc_representation]:
            lsi_repr[topic_id] = value
        return lsi_repr

    def cosine_similarity(self, vec1, vec2):
        """Compute cosine similarity of 2 vectors.

        Args:
            vec1: 1st vector.
            vec2: 2nd vector.
        Return:
            Cosine similarity, or 0.0 when either vector has zero norm
            (avoids NaN scores, which would corrupt the ranking order).
        """
        norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
        if norm_product == 0:
            return 0.0
        return np.dot(vec1, vec2) / norm_product

    def run_retrieval(self, tfidf_data, query_tokens=None):
        """
        Runs a retrieval method for all the queries and writes the TREC-friendly results in a file.

        Nothing is computed when ``<model_name>.run`` already exists.

        Args:
            tfidf_data: top-1000 query-document rankings from TF-IDF
                (query_id -> iterable of internal document ids).
            query_tokens: mapping query_id -> list of token ids; defaults to
                the notebook-level ``tokenized_queries``.
        """
        run_out_path = '{}.run'.format(self.model_name)

        if os.path.exists(run_out_path):
            print('RUN file already existing')
            return

        if query_tokens is None:
            query_tokens = tokenized_queries

        data = collections.defaultdict(list)

        print('Retrieving using {}'.format(self.model_name))
        retrieval_start_time = time.time()

        for query_id, doc_list in tfidf_data.items():
            query_representation = self.get_representation(query_tokens[query_id])

            for int_doc_id in doc_list:
                # Use the index this model was built with, not a global.
                ext_doc_id, doc = self.index.document(int_doc_id)
                doc_representation = self.doc_representations_dict[int_doc_id]

                cos_similarity = self.cosine_similarity(query_representation, doc_representation)
                data[query_id].append((cos_similarity, ext_doc_id))

            data[query_id] = sorted(data[query_id], reverse=True)

        with open(run_out_path, 'w') as f_out:
            # Rankings are already sorted above; do not sort them again.
            write_run(
                model_name=self.model_name,
                data=data,
                out_f=f_out,
                max_objects_per_query=1000,
                skip_sorting=True)

        print('Retrieval run took {} seconds.'.format(time.time() - retrieval_start_time))

In [11]:
# Load the previously saved LSI model from disk. To train a new model
# instead, use the commented-out constructor call below.
# lsi_model = LatentSemanticIndexing(index, dictionary, num_topics=len(queries))
lsi_model = LatentSemanticIndexing(index, dictionary, load_model=True, fname='../models/new_lsi')

Model loaded.
Loading document representations from file...Success!


In [13]:
# Re-rank the TF-IDF candidates with LSI and write the result to LSI.run
# (the call returns early if that file already exists).
lsi_model.run_retrieval(tfidf_data)

Retrieving using LSI
Retrieval run took 102.59836983680725 seconds.


In [None]:
# lsi_model.save('../models/new_lsi')

### LDA

In [14]:
class LatentDirichletAllocation():
    """Latent Dirichlet Allocation re-ranker built on gensim's ``LdaModel``.

    Queries and documents are represented by their topic vectors and
    compared with cosine similarity. Document vectors are computed once and
    cached on disk (see ``load_documents_representation``).
    """

    # On-disk cache with the LDA vector of every indexed document.
    DOC_REPRESENTATIONS_PATH = '../pickles/LDA_DocRepresentations.pkl'

    def __init__(self, index: "pyndri.Index", dictionary, num_topics=200, load_model=False, fname=""):
        """Load or train the model, then load/compute document vectors.

        Args:
            index: pyndri index holding the document collection.
            dictionary: dictionary object providing ``doc2bow``,
                ``id2token`` and ``in`` (as used by the methods below).
            num_topics: number of topics; only used when training.
            load_model: load a saved model from ``fname`` instead of training.
            fname: path of the saved model; required when ``load_model``.
        Raises:
            ValueError: if ``load_model`` is set but ``fname`` is empty.
        """
        self.index = index
        self.dictionary = dictionary
        self.num_topics = num_topics

        if load_model:
            if fname == "":
                raise ValueError('File path not provided.')
            self.load(fname)
            # A loaded model dictates the vector size; the constructor
            # argument (default 200) may not match how it was trained.
            self.num_topics = self.model.num_topics
        else:
            self.train()

        self.doc_representations_dict = collections.defaultdict(list)
        self.load_documents_representation()

    @property
    def model_name(self):
        """Model name"""
        return "LDA"

    def load(self, fpath: str):
        """Load model from file.

        Args:
            fpath: file path to load model.
        """
        self.model = gensim.models.ldamodel.LdaModel.load(fpath)
        print("Model loaded.")

    def save(self, fpath: str):
        """Save current model to file for further use.

        Args:
            fpath: file path to save model.
        """
        self.model.save(fpath)
        print("Model saved.")

    def train(self):
        """Train LDA model given the index and dictionary.

        Uses ``self.num_topics`` topics and a single online pass over the
        corpus in chunks of 10000 documents.
        """
        print("Training started...")
        train_start_time = time.time()

        corpus = Sentences2Vec(self.index, self.dictionary)
        self.model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                     id2word=self.dictionary.id2token,
                                                     num_topics=self.num_topics,
                                                     update_every=1,
                                                     chunksize=10000,
                                                     passes=1)
        print("Model trained in {} seconds.".format(time.time() - train_start_time))

    def load_documents_representation(self):
        """Get and store document representations for future use.

        Loads the cache file if it exists; otherwise computes the vector of
        every document in the index and writes the cache.

        NOTE: the cache is not tied to a particular model. Delete it after
        retraining or changing ``num_topics``. Only load trusted pickles
        (``pickle.load`` can execute arbitrary code).
        """
        cache_path = self.DOC_REPRESENTATIONS_PATH
        try:
            print("Loading document representations from file...", end='')
            with open(cache_path, 'rb') as file:
                self.doc_representations_dict = pickle.load(file)
            print("Success!")
        except FileNotFoundError:
            print("Error!")
            print("Computing and loading documents' representations...")

            compute_start_time = time.time()
            for int_doc_id in range(self.index.document_base(), self.index.maximum_document()):
                ext_doc_id, doc = self.index.document(int_doc_id)
                self.doc_representations_dict[int_doc_id] = self.get_representation(doc)

            print("Documents successfully loaded in {} seconds.".format(time.time() - compute_start_time))

            # Make sure the cache directory exists so the work is not lost.
            os.makedirs(os.path.dirname(cache_path), exist_ok=True)
            with open(cache_path, 'wb') as file:
                pickle.dump(self.doc_representations_dict, file)

    def get_representation(self, tokens_list):
        """Build representation given list of token ids.

        Args:
            tokens_list: list of token ids.
        Return:
            Dense numpy vector of length ``self.num_topics``; topics that
            are absent from the model's output stay at zero.
        """
        tokens_bow = self.dictionary.doc2bow(tokens_list)
        doc_representation = [(token_id, weight)
                              for (token_id, weight) in tokens_bow
                              if token_id in self.dictionary and token_id > 0]
        lda_repr = np.zeros(self.num_topics)
        for topic_id, value in self.model[doc_representation]:
            lda_repr[topic_id] = value
        return lda_repr

    def cosine_similarity(self, vec1, vec2):
        """Compute cosine similarity of 2 vectors.

        Args:
            vec1: 1st vector.
            vec2: 2nd vector.
        Return:
            Cosine similarity, or 0.0 when either vector has zero norm
            (avoids NaN scores, which would corrupt the ranking order).
        """
        norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
        if norm_product == 0:
            return 0.0
        return np.dot(vec1, vec2) / norm_product

    def run_retrieval(self, tfidf_data, query_tokens=None):
        """
        Runs a retrieval method for all the queries and writes the TREC-friendly results in a file.

        Nothing is computed when ``<model_name>.run`` already exists.

        Args:
            tfidf_data: top-1000 query-document rankings from TF-IDF
                (query_id -> iterable of internal document ids).
            query_tokens: mapping query_id -> list of token ids; defaults to
                the notebook-level ``tokenized_queries``.
        """
        run_out_path = '{}.run'.format(self.model_name)

        if os.path.exists(run_out_path):
            print('RUN file already existing')
            return

        if query_tokens is None:
            query_tokens = tokenized_queries

        data = collections.defaultdict(list)

        print('Retrieving using {}'.format(self.model_name))
        retrieval_start_time = time.time()

        for query_id, doc_list in tfidf_data.items():
            query_representation = self.get_representation(query_tokens[query_id])

            for int_doc_id in doc_list:
                # Use the index this model was built with, not a global.
                ext_doc_id, doc = self.index.document(int_doc_id)
                doc_representation = self.doc_representations_dict[int_doc_id]

                cos_similarity = self.cosine_similarity(query_representation, doc_representation)
                data[query_id].append((cos_similarity, ext_doc_id))

            data[query_id] = sorted(data[query_id], reverse=True)

        with open(run_out_path, 'w') as f_out:
            # Rankings are already sorted above; do not sort them again.
            write_run(
                model_name=self.model_name,
                data=data,
                out_f=f_out,
                max_objects_per_query=1000,
                skip_sorting=True)

        print('Retrieval run took {} seconds.'.format(time.time() - retrieval_start_time))

In [15]:
# Load the previously saved LDA model from disk. The commented-out line
# below saves the current model to the same path.
lda_model = LatentDirichletAllocation(index, dictionary, load_model=True, fname='../models/lda_model')
# lda_model.save('../models/lda_model')

Model loaded.
Loading document representations from file...Success!


In [16]:
# Re-rank the TF-IDF candidates with LDA and write the result to LDA.run
# (the call returns early if that file already exists).
lda_model.run_retrieval(tfidf_data)

Retrieving using LDA
Retrieval run took 41.169721841812134 seconds.


### Play stuff

In [None]:
# lsi_scoring = collections.defaultdict(list)
# mapped_data = collections.defaultdict(list)

# for query_id, doc_list in data.items():
#     query_representation = lsi_model.get_representation(tokenized_queries[query_id])
    
#     for int_doc_id in doc_list:
#         ext_doc_id, doc = index.document(int_doc_id)        
#         doc_representation = lsi_model.get_representation(doc)
#         cos_similarity = cosine_similarity(query_representation, doc_representation)
        
#         lsi_scoring[query_id].append((cos_similarity, int_doc_id))
        
#     lsi_scoring[query_id] = sorted(lsi_scoring[query_id], reverse=True)
#     mapped_data[query_id] = list(map(lambda item: item[1], lsi_scoring[query_id]))
    
#     break

In [None]:
# csc_matrix = gensim.matutils.corpus2csc(sentences)
# corpus = gensim.matutils.Sparse2Corpus(csc_matrix)

In [None]:
# tfidf = gensim.models.tfidfmodel.TfidfModel(sentences)
# corpus_tfidf = tfidf[sentences]