In [1]:
version = "REPLACE_PACKAGE_VERSION"

---
# Assignment 2 Part 1: Vector Space Model and Relevance Feedback (50 pts)

In this assignment, we will improve our simple text retrieval system built in Assignment 1 by leveraging the Vector Space Model and relevance feedback. 

In [2]:
# Configure nltk

import nltk

nltk_data_path = "assets/nltk_data"
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)
    

In [3]:
# ignores warnings, should be on for submitting to the autograder but you may comment it out if needed while working.
import warnings
warnings.simplefilter("ignore")

## Question 1: Retrieve and rank documents with TF-IDF weighting scheme (10 pts)

Modify your `retrieve_n_rank_docs` function from Assignment 1 so that it now uses the TF-IDF weights to retrieve and rank documents as shown in the lecture slides. Specifically, for each *term* in a given query, we accumulate the TF-IDF weight for each document that contains the term. We apply **maximum frequency normalisation** to raw term frequencies to obtain the TF and follow the definition of IDF given in the lecture. After repeating this procedure for all terms, we rank the documents found in descending order of their total TF-IDF weights **rounded to three decimal places**. Again, if two documents tie, the document with a *lower* document number should be ranked higher. Refer to the lecture slides titled "Ranking Documents: Example" in the TF-IDF slide deck for more details. 

Your modified function should accept the same set of arguments as in Assignment 1 and likewise output a `dict` that contains an ordered `list` of retrieved documents for each query, similar to:

```
{
    'q1'  : ['d51', 'd486', ..., 'd876'], 
    'q2'  : ['d12', 'd51', ..., 'd486'],
    ...,
    'q225': ['d1188', 'd1380', ..., 'd173']
}
```

⚠️ Please remember to **round the total TF-IDF weights to three decimal places** before ranking the documents. ⚠️

**This function should return a `dict` of length `len(queries)`, where each key is a `str` representing a `q_id` and each value is a `list` that contains the `doc_id` of the retrieved documents. `max_docs` determines the maximum number of documents that can be retrieved for each query.**

In [4]:
import math

def retrieve_n_rank_docs(inverted_index, queries, max_docs=-1):
    """
    Retrieve documents in order of relevance from an inverted index based on some queries
    """
    
    ret_docs = None
    
    set_docs = []
    for word in inverted_index:
        for document in inverted_index[word]:
            set_docs.append(document)

    set_docs = set(set_docs)
    
    n = len(set_docs)

    max_freqs = {}

    for document in set_docs:
        freqs = []
        for word in inverted_index:
            if document in inverted_index[word]:
                freqs.append(inverted_index[word][document])

        max_freqs[document] = max(freqs)[0]
            
            
    
    import re
    ret_docs = {}
                 
    for query in queries:
        posting = []
        doc_list = []
        for word in queries[query]:
            if word in inverted_index:
                posting_dict = {}
                for i in range(len(inverted_index[word])):
                    
                    this_doc = list(inverted_index[word].items())[i][0]
                    word_freq = list(inverted_index[word].items())[i][1][0]
                    
                    tf_w = .5 + ((.5 * word_freq)/max_freqs[this_doc])
                    
                    k = len(inverted_index[word])
                    
                    idf_w = 1 + math.log(n/k)
                                 
                    
                    posting_dict[this_doc] = tf_w * idf_w
                    doc_list.append(this_doc)
                posting.append({word:posting_dict})

                        
                        
                        

#         print(posting)
        totals = {}
    
        for doc in set_docs:
            tot = []
            for post in range(len(posting)):
                for q_term in posting[post]:
                    if doc in posting[post][q_term]:
                        tot.append(posting[post][q_term][doc])
                totals[doc] = round(sum(tot),3)

        totals = dict(sorted(totals.items(), key = lambda x: (-x[1], int(re.search(r'\d+',x[0]).group()))))
        
        totals = {x:y for x, y in totals.items() if y != 0}
        
        print(totals)
        
        if max_docs == -1:
            total_list = list(totals.keys())
        else:
            total_list = list(totals.keys())[:max_docs]
        
        
        ret_docs[query] = total_list
    
    # YOUR CODE HERE
#     raise NotImplementedError()
    
    return ret_docs

In [5]:
# We import the reference implementations of some utility functions from Week 1 to be used in the autograder tests
# They should be used ONLY in the autograder tests, NOT in your solutions
# Ignore any ModuleNotFoundError message in the autograder feedback - it does not affect your grade
from utils import load_cranfield_docs, load_cranfield_queries, build_inverted_index

In [6]:
# The definitions of the three utility functions are hidden here - not for grading


In [7]:
# Autograder tests

min_df = 10 # min_df won't change in the hidden tests
stu_docs = load_cranfield_docs()
stu_inv_index = build_inverted_index(stu_docs, min_df=min_df)
stu_queries = load_cranfield_queries()

max_docs = 10 # max_docs may vary in the hidden tests
stu_ret_docs = retrieve_n_rank_docs(stu_inv_index, stu_queries, max_docs=max_docs)

# Some sanity checks
assert isinstance(stu_ret_docs, dict), "Q1: Your function should return a dictionary. "
assert len(stu_ret_docs) == len(stu_queries), "Q1: Your dictionary should have the same length as there are queries. "

for q_id in stu_ret_docs:
    
    assert q_id in stu_queries, f"Q1: When max_docs = {max_docs}, '{q_id}' in your dictionary is not a valid q_id. "
    
    assert len(stu_ret_docs[q_id]) <= max_docs, f"Q1: When max_docs = {max_docs}, your # retrieved docs ({len(stu_ret_docs[q_id])}) for {q_id} is bigger than max_docs. "

# Some hidden tests

del stu_inv_index, stu_queries, stu_ret_docs, min_df, max_docs

{'d51': 15.503, 'd486': 13.553, 'd878': 11.491, 'd1003': 9.154, 'd14': 9.053, 'd573': 8.945, 'd329': 8.941, 'd576': 8.831, 'd172': 8.47, 'd876': 8.152, 'd12': 8.029, 'd252': 7.391, 'd1361': 7.35, 'd374': 7.328, 'd1147': 7.307, 'd729': 7.136, 'd629': 7.134, 'd13': 7.027, 'd184': 6.966, 'd315': 6.854, 'd202': 6.822, 'd640': 6.806, 'd1263': 6.744, 'd1335': 6.719, 'd1328': 6.71, 'd359': 6.707, 'd982': 6.631, 'd747': 6.603, 'd879': 6.587, 'd526': 6.583, 'd25': 6.488, 'd1144': 6.433, 'd332': 6.407, 'd378': 6.406, 'd110': 6.378, 'd78': 6.357, 'd717': 6.355, 'd216': 6.264, 'd1268': 6.229, 'd917': 6.198, 'd811': 6.181, 'd1186': 6.161, 'd792': 6.159, 'd663': 6.126, 'd29': 6.063, 'd1219': 6.004, 'd601': 5.984, 'd240': 5.949, 'd305': 5.94, 'd908': 5.899, 'd606': 5.886, 'd1305': 5.881, 'd56': 5.788, 'd665': 5.776, 'd1300': 5.77, 'd721': 5.743, 'd280': 5.714, 'd711': 5.683, 'd638': 5.682, 'd328': 5.57, 'd253': 5.541, 'd746': 5.493, 'd1072': 5.469, 'd364': 5.461, 'd160': 5.454, 'd219': 5.452, 'd193':

## Question 2: Rocchio Relevance Feedback (40 pts)

Now comes the really fun stuff. Let's try to simulate an interactive Rocchio relevance feedback loop. The two main characters in the story are:

* an inquisitive `User` who never ceases issuing queries; and
* a diligent `RetrievalSystem` that always attends to the queries with extraordinary patience.

You will play the role of `RetrievalSystem`, represented by the `RetrievalSystem` class. An instance of the `User` class can be created as follows.  

In [8]:
from utils import User
test_user = User(random_state=0) # The User in the autograder tests uses a different seed

### Question 2a: Perform Latent Semantic Indexing (LSI) (10 pts)

As the first step of constructing a `RetrievalSystem` class, let's first complete the `__init__` method. It accepts a collection of documents, `docs`, as returned by the `load_cranfield_docs` function, and an argument `num_concepts` indicating the number of concepts/topics to keep for LSI. In addition to the argument `min_df` with its usual meaning, the method also accepts three other arguments, `alpha`, `beta` and `gamma`, that correspond to the $\alpha$, $\beta$ and $\gamma$ parameters in the Rocchio query update formula. 

A document-term matrix with TF-IDF weights, `doc_term_mat`, has been created for you. Your task at this step is to perform LSI on the document-term matrix and obtain the LSI-transformed vectors in the concept space for each document. The LSI-transformed vectors should be stored in the variable `self.doc_vecs` as a 2-D `np.ndarray` of the shape `(len(docs), num_concepts)`. You may (and probably need to) create additional class attributes to hold objects that may be shared with other class methods. Please use `random_state=42` whenever it is required. 


**This class method should fill the class attribute `self.doc_vecs` with a 2-D `np.ndarray` of the shape `(len(docs), num_concepts)`.**

In [9]:
from collections import defaultdict
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD


class RetrievalSystem:
    def __init__(self, docs, num_concepts, min_df=1, alpha=1.0, beta=0.75, gamma=0.15):
        self.alpha, self.beta, self.gamma = alpha, beta, gamma
        
        # create a doc-term matrix out of our doc collection
        self.vec = TfidfVectorizer(tokenizer=str.split, min_df=min_df)
        doc_term_mat = self.vec.fit_transform([" ".join(docs[doc_id]) for doc_id in docs])
        self.q_vecs = dict() # query vectors
        
        self.svd = TruncatedSVD(n_components = num_concepts, random_state = 42)
        
        self.doc_vecs = self.svd.fit_transform(doc_term_mat) # document vectors in a matrix
        
#         print(self.doc_vecs)
        
        # YOUR CODE HERE
#         raise NotImplementedError()

In [10]:
# Autograder tests

num_concepts, min_df = 100, 10 # These won't change in the hidden tests

stu_ret_sys = RetrievalSystem(stu_docs, num_concepts, min_df)

# Some sanity checks
assert isinstance(stu_ret_sys.doc_vecs, np.ndarray), "Q2a: The doc_vecs attribute of your RetrievalSystem should be a np.ndarray. "
assert stu_ret_sys.doc_vecs.shape == (len(stu_docs), num_concepts), "Q2a: The doc_vecs attribute of your RetrievalSystem has an incorrect shape. "
assert np.issubdtype(stu_ret_sys.doc_vecs.dtype, np.floating), "Q2a: The doc_vecs attribute of your RetrievalSystem should have a float dtype. "


# Some hidden tests

del stu_ret_sys, num_concepts, min_df

[[ 0.27155389  0.0570719  -0.15633433 ... -0.11597242 -0.00544305
  -0.02938602]
 [ 0.39302361 -0.03820294  0.20726782 ...  0.04821608  0.03822412
  -0.00955175]
 [ 0.22086864 -0.02285926  0.14173127 ...  0.13969519 -0.04252505
  -0.01721254]
 ...
 [ 0.2567331  -0.20176827 -0.11689396 ... -0.02044707 -0.11124338
  -0.05291053]
 [ 0.3876303  -0.22704584 -0.05591145 ...  0.02669777 -0.02952717
   0.02358047]
 [ 0.19341863 -0.14975643 -0.14570992 ... -0.0211641  -0.00980013
   0.03340227]]


### Question 2b: Retrieve and rank documents in the concept space (10 pts)

Next, let's complete the method `retrieve_n_rank_docs` for retrieving and ranking documents in the concept space generated by your LSI earlier. The method accepts the same arguments `queries` and `max_docs` as you have seen in other retrieval functions. The only novelty is that now we keep the LSI-transformed vectors for all the documents instead of an inverted index. 

The argument `queries` is a `dict` of queries as returned by the `load_cranfield_queries` function. However, it may contain an arbitrary subset of the Cranfield queries rather than necessarily all the Cranfield queries. In fact, you should not assume any knowledge about the possible queries included in `queries`, because what queries you will receive is at the discretion of the inquisitive `User`. Upon receiving `queries`, you should turn each **NEW** query into an LSI-tranformed vector, following what you have done with the documents; for queries that you have received before, fetch their vectors from the class attribute `self.q_vecs`. The reason why we need to treat new and old queries differently is that later we will perform Rocchio updates on some query vectors so that their final vector representations will differ from the LSI representations they started with. All query vectors start as an LSI vector but may end up differently depending on how often they are involved in the feedback loop. 

Once you have the correct vector representation for each query, retrieve documents in descending order of the cosine similarity between their vector representations. The maximum number of documents to retrieve for each query is again governed by the argument `max_docs`. As before, your method should finally output a `dict` containing the documents retrieved for each query, similar to:

```
{
    'q217': ['d983', 'd554', ..., 'd623'],
    'q99' : ['d716', 'd67', ..., 'd164'],
    ...
}
```

It is fine if you see a `RuntimeWarning`. That's because two document vectors are zero vectors. 


⚠️ **Always remember, the document numbers start at 1.** Namely, the first document is `d1`. ⚠️

**This method should return a `dict` of length `len(queries)`, where each key is a `str` representing a `q_id` and each value is a `list` that contains the `doc_id` of the retrieved documents in order. `max_docs` determines the maximum number of documents to retrieve for each query.**

In [104]:
from collections import defaultdict
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.metrics.pairwise import cosine_similarity
import re

class RetrievalSystem:
    def __init__(self, docs, num_concepts, min_df=1, alpha=1.0, beta=0.75, gamma=0.15):
        self.alpha, self.beta, self.gamma = alpha, beta, gamma
        
        # create a doc-term matrix out of our doc collection
        self.vec = TfidfVectorizer(tokenizer=str.split, min_df=min_df)
        doc_term_mat = self.vec.fit_transform([" ".join(docs[doc_id]) for doc_id in docs])
        self.q_vecs = dict() # query vectors
        
        self.svd = TruncatedSVD(n_components = num_concepts, random_state = 42)
        
        self.doc_vecs = self.svd.fit_transform(doc_term_mat) # document vectors in a matrix
        
        
        # YOUR CODE HERE
#         raise NotImplementedError()

    def retrieve_n_rank_docs(self, queries, max_docs=-1):
        """
        Retrieve and rank documents in the latent semantic (concept) space
        """
        ret_docs = {}
        
#         self.q_vecs = dict() # query vectors
        
        for query in queries:
            if query not in list(self.q_vecs.keys()):
                query_term_mat = self.vec.transform([" ".join(queries[query])])

                self.q_vecs[query] = self.svd.transform(query_term_mat)
        
        
#         for q_vecs in self.q_vecs:
            cos_sim_dict = {}
            document_count = 1
            for d_vecs in self.doc_vecs:
                cos_sim_dict['d' + str(document_count)] = cosine_similarity(self.q_vecs[query].reshape(1, -1), d_vecs.reshape(1, -1))
                document_count += 1
            cos_sim_dict = dict(sorted(cos_sim_dict.items(), key = lambda x: (-x[1], int(re.search(r'\d+',x[0]).group()))))

            cos_sim_dict = {x:y for x, y in cos_sim_dict.items() if y != 0}

#             print(cos_sim_dict)

            if max_docs == -1:
                total_list = list(cos_sim_dict.keys())
            else:
                total_list = list(cos_sim_dict.keys())[:max_docs]
                
            ret_docs[query] = total_list
            
                    
#         print(queries)
#         print(self.q_vecs)
            
        
        # YOUR CODE HERE
#         raise NotImplementedError()

        return ret_docs

In [105]:
import random
random.seed(0)

num_concepts, min_df = 100, 10
stu_docs = load_cranfield_docs()
stu_queries = load_cranfield_queries()
stu_ret_sys = RetrievalSystem(stu_docs, num_concepts, min_df)

stu_queries = dict(random.sample(stu_queries.items(), 100)) # Test on a random sample
stu_ret_docs = stu_ret_sys.retrieve_n_rank_docs(stu_queries, max_docs=10)
print(len(stu_ret_docs))

stu_queries = dict(random.sample(stu_queries.items(), 50))    
stu_ret_docs = stu_ret_sys.retrieve_n_rank_docs(stu_queries, max_docs=7)
print(len(stu_ret_docs))

100
50


In [30]:
import random
random.seed(0)

num_concepts, min_df = 100, 10 # These won't change in the hidden tests
max_docs = 7 # This may vary in the hidden tests

queries = load_cranfield_queries()
stu_queries = dict(random.sample(queries.items(), 100)) # Test on a random sample

stu_ret_sys = RetrievalSystem(stu_docs, num_concepts, min_df)
stu_ret_docs = stu_ret_sys.retrieve_n_rank_docs(stu_queries, max_docs)

In [28]:
len(stu_ret_docs)

100

In [15]:
# Autograder tests
import random
random.seed(0)

num_concepts, min_df = 100, 10 # These won't change in the hidden tests
max_docs = 10 # This may vary in the hidden tests

queries = load_cranfield_queries()
stu_queries = dict(random.sample(queries.items(), 100)) # Test on a random sample

stu_ret_sys = RetrievalSystem(stu_docs, num_concepts, min_df)
stu_ret_docs = stu_ret_sys.retrieve_n_rank_docs(stu_queries, max_docs)

# Some sanity checks
assert isinstance(stu_ret_docs, dict), "Q2b: Your method should return a dictionary. "
assert len(stu_ret_docs) == len(stu_queries), "Q2b: Your dictionary should have the same length as there are queries. "

for q_id in stu_ret_docs:
    
    assert q_id in stu_queries, f"Q2b: When max_docs = {max_docs}, '{q_id}' in your dictionary is not a valid q_id. "
    
    assert len(stu_ret_docs[q_id]) <= max_docs, f"Q2b: When max_docs = {max_docs}, your # retrieved docs ({len(stu_ret_docs[q_id])}) for {q_id} is bigger than max_docs. "


# Some hidden tests

del stu_ret_sys, num_concepts, min_df
del max_docs, queries, stu_queries, stu_ret_docs

### Question 2c: Perform Rocchio query updates (20 pts)

Finally it's time to engage with the inquisitive `User`. Complete the method `gather_feedback` for inviting a `user` to your Rocchio relevance feedback loop. Your interaction with the `user` proceeds in four steps. 

* **Step 1**: You request the `user` to issue queries by invoking the `user`'s `issue_queries` method. The `user` then returns to you a `dict` of queries represented by `queries`, and a number `max_docs` indicating the maximum number of documents the `user` wants to retrieve for each query. 


* **Step 2**: You retrieve the required number of documents based on `queries` as you did in the last question. 


* **Step 3**: You invite the `user` to give feedback to the documents retrieved, `ret_docs`, by invoking the `user`'s `give_feedback` method with `ret_docs` passed along. Normally in a relevance feedback loop, the `user` would mark each document retrieved as either relevant or irrelevant; however, the `user` you are engaging with prefers giving feedback in the form of **precisions at each rank**. 
    
    Specifically, suppose your `ret_docs` is as follows:

    ```
    {
        'q167': ['d553', 'd1100', 'd1096', 'd1279', 'd1099'], 
        'q54' : ['d123', 'd554', 'd623', 'd398', 'd102'], 
        'q197': ['d768', 'd884', 'd883', 'd909', 'd882']
    }
    ```
    from which we can deduce that the `user` must have issued three queries and for each query at most five documents should be retrieved. If you pass your `ret_docs` to the `user` for feedback, you would get back:
    
    ```
    {
        'q167': [0.0, 0.0, 0.0, 0.0, 0.0], 
        'q54' : [0.0, 0.0, 0.0, 0.0, 0.0], 
        'q197': [0.0, 0.5, 0.3333333333333333, 0.25, 0.2]
    }
    ```
    where each number indicates the precision at that rank. For example, the second number `0.5` from `q197` indicates that the precision at rank 2 is `0.5` for query `q197`. Similarly, the precision at rank 4 is `0.25` for query `q197`. For queries `q167` and `q54`, the precisions at top 5 ranks are, unfortunately, zero. The precisions at each rank are returned back to you in the variable `pre_at_n`, along with another variable `avg_ndcg` which indicates your average NDCG for all queries. 
    
Code for the first three steps has been provided to you. Your task is to complete the code for

* **Step 4**: You perform a Rocchio update on all the queries the `user` issued to you based on the feedback you received. As a result, the query vectors in `self.q_vecs` should be updated accordingly. In other words, the "modified query vector" for each query under consideration should be stored in the corresponding entry in `self.q_vecs`. 

⚠️ **Always remember, the document numbers start at 1.** Namely, the first document is `d1`. ⚠️

**This method should return a copy of `self.q_vecs` for grading.**

In [108]:
from collections import defaultdict
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD


class RetrievalSystem:
    def __init__(self, docs, num_concepts, min_df=1, alpha=1.0, beta=0.75, gamma=0.15):
        self.alpha, self.beta, self.gamma = alpha, beta, gamma
        
        # create a doc-term matrix out of our doc collection
        self.vec = TfidfVectorizer(tokenizer=str.split, min_df=min_df)
        doc_term_mat = self.vec.fit_transform([" ".join(docs[doc_id]) for doc_id in docs])
        self.q_vecs = dict() # query vectors
        
        # Copy and paste your code from the last question here
        
        self.svd = TruncatedSVD(n_components = num_concepts, random_state = 42)
        
        self.doc_vecs = self.svd.fit_transform(doc_term_mat) # document vectors in a matrix
        # YOUR CODE HERE
#         raise NotImplementedError()


    def retrieve_n_rank_docs(self, queries, max_docs=-1):
        """
        Retrieve and rank documents in the latent semantic (concept) space
        """
        
        # Copy and paste your code from the last question here
        
        ret_docs = {}
        
        for query in queries:
            if query not in list(self.q_vecs.keys()):
                query_term_mat = self.vec.transform([" ".join(queries[query])])

                self.q_vecs[query] = self.svd.transform(query_term_mat)
        
        
#         for q_vecs in self.q_vecs:
            cos_sim_dict = {}
            document_count = 1
            for d_vecs in self.doc_vecs:
                cos_sim_dict['d' + str(document_count)] = cosine_similarity(self.q_vecs[query].reshape(1, -1), d_vecs.reshape(1, -1))
                document_count += 1
            cos_sim_dict = dict(sorted(cos_sim_dict.items(), key = lambda x: (-x[1], int(re.search(r'\d+',x[0]).group()))))

            cos_sim_dict = {x:y for x, y in cos_sim_dict.items() if y != 0}

#             print(cos_sim_dict)

            if max_docs == -1:
                total_list = list(cos_sim_dict.keys())
            else:
                total_list = list(cos_sim_dict.keys())[:max_docs]
                
            ret_docs[query] = total_list
#         print(queries)
#         print(len(self.q_vecs))
            
        
        # YOUR CODE HERE
#         raise NotImplementedError()

        return ret_docs
    
    def gather_feedback(self, user):
        """
        This function models the interactive relevance feedback loop
        """
        
        # Step 1: Request the user to issue queries
        queries, max_docs = user.issue_queries()
        
        # Step 2: Retrieve the required number of docs in reponse to the queries
        ret_docs = self.retrieve_n_rank_docs(queries, max_docs=max_docs)
#         print(self.q_vecs.keys())

        # Step 3: Obtain feedback from the user in the form of precisions at each rank
        pre_at_n, avg_ndcg = user.give_feedback(ret_docs)
        
#         print(pre_at_n)
        
#         print(self.doc_vecs)
        
        # Step 4: Perform a Rocchio query update based on the feedback
        for query in pre_at_n:
            rel_docs = []
            irrel_docs = []
            start_pre = 0.1
            for idx, pre in enumerate(pre_at_n[query]):
                if (pre >= start_pre) & (pre != 0):
                    doc_num = ret_docs[query][idx]
                    doc_num = int(re.search(r'\d+',doc_num).group())
                    rel_docs.append(self.doc_vecs[doc_num-1])
                    start_pre = pre
                else:
                    doc_num = ret_docs[query][idx]
                    doc_num = int(re.search(r'\d+',doc_num).group())
                    irrel_docs.append(self.doc_vecs[doc_num-1])
                    start_pre = pre
                    
#             print(doc_num)
#             if query == 'q213':
#                 print(query)
#                 print(self.q_vecs[query])
#                 print(pre_at_n[query])
    
#                 print(rel_docs)
#                 print(irrel_docs)
                    
            mod_q_vec = (self.alpha * self.q_vecs[query]) + (self.beta * (np.nan_to_num(np.mean(rel_docs, axis = 0))) - (self.gamma * (np.nan_to_num(np.mean(irrel_docs, axis = 0)))))
            self.q_vecs[query] = mod_q_vec
            
#             if query == 'q213':
#                 print(self.q_vecs[query])
        
#         print(self.q_vecs.keys())
        # YOUR CODE HERE
#         raise NotImplementedError()
#         print(self.q_vecs.copy())
        return self.q_vecs.copy()

In [17]:
# The definition of the User class is hidden here - not for grading


In [109]:
import random
random.seed(0)

num_concepts, min_df = 100, 10 # These won't change in the hidden tests
max_docs = 10 # This may vary in the hidden tests

queries = load_cranfield_queries()
stu_queries = dict(random.sample(queries.items(), 100)) # Test on a random sample

stu_ret_sys = RetrievalSystem(stu_docs, num_concepts, min_df)
stu_ret_docs = stu_ret_sys.gather_feedback(test_user)

In [101]:
# stu_ret_docs['q150']

TypeError: tuple indices must be integers or slices, not str

In [38]:
# Autograder tests

num_concepts, min_df = 100, 10 # These won't change in the hidden tests

stu_ret_sys = RetrievalSystem(stu_docs, num_concepts, min_df)


# Some hidden tests

del stu_ret_sys, num_concepts, min_df

In [54]:
# test = 'd884'

# int(re.search(r'\d+',test).group())

884

In [67]:
# np.nan_to_num(np.mean([],axis = 0),0)

0.0