# Assignment: Information Retrieval (IR)

## Preparations
* Put all your imports, and path constants in the next cells

In [1]:
!pip install whoosh
!pip install pytrec_eval
!pip install wget



In [2]:
import wget
wget.download("https://github.com/MIE1513HS-2022/course-datasets/raw/main/government.zip", "government.zip")

'government (4).zip'

In [3]:
!unzip government.zip

Archive:  government.zip
replace government/topics-with-full-descriptions.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: None


In [4]:
# imports
# Put all your imports here
from whoosh import index, writing, qparser,scoring
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import *
from whoosh.qparser import QueryParser
import os.path
from pathlib import Path
import tempfile
import subprocess
import pytrec_eval
import wget
import abc
from abc import abstractmethod
from whoosh.analysis import Filter
from whoosh.writing import BufferedWriter
import nltk
from nltk.stem import *
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
class IRSystem(metaclass=abc.ABCMeta):
    """
    Abstract class which is inherited by other IR system
    """

    def __init__(self, data_dir):
        # DON'T change the following names,topic_file, qrels_file, document_dir, file_list
        self.topic_file = os.path.join(data_dir, "gov.topics")
        self.qrels_file = os.path.join(data_dir, "gov.qrels")
        self.document_dir = os.path.join(data_dir, "documents") 
        self.file_list = [str(filePath) for filePath in Path(self.document_dir).glob("**/*") if filePath.is_file()]

        self.create_index()
        self.add_files()
        self.create_parser_searcher()

    @abstractmethod
    def create_index(self):
        pass

    @abstractmethod
    def add_files(self):
        pass

    @abstractmethod
    def create_parser_searcher(self):
        pass

    @abstractmethod
    def perform_search(self, topic_phrase):
        pass

    @staticmethod
    def post_process_score(score):
        return score

    @staticmethod
    def print_trec_eval_result(results):
        if not results:
            print('empty results')
            return

        def print_line(name, scope, num):
            print('{:25s}{:8s}{:.4f}'.format(name, scope, num))

        for query_id, query_measures in results.items():
            for measure, value in query_measures.items():
                if measure == "runid":
                    continue
                print_line(measure, query_id, value)

        for measure in query_measures.keys():
            if measure == "runid":
                continue
            print_line(
                measure,
                'all',
                pytrec_eval.compute_aggregated_measure(
                    measure,
                    [query_measures[measure]
                     for query_measures in results.values()]))
            
    def print_rel_name(self, q_id):
        with open(self.topic_file, "r") as tf:
            topics = tf.read().splitlines()
        for topic in topics:
            topic_id, topic_phrase = tuple(topic.split(" ", 1))
            if topic_id == q_id:
                print("---------------------------Topic_id and Topic_phrase----------------------------------")
                print(topic_id, topic_phrase)
                 # get search result
                topic_results = self.perform_search(topic_phrase)
                print("---------------------------Return documents----------------------------------")
                for (docnum, result) in enumerate(topic_results):
                    score = topic_results.score(docnum)
                    score = self.post_process_score(score)
                    print("%s Q0 %s %d %lf test" % (topic_id, os.path.basename(result["file_path"]), docnum, score))
                print("---------------------------Relevant documents----------------------------------")
                with open(self.qrels_file, 'r') as f_qrel:
                    qrels = f_qrel.readlines()
                    for i in qrels:
                        qid, _, doc, rel = i.rstrip().split(" ")
                        if qid == q_id and rel == "1":
                            print(i.rstrip())

    def py_trec_eval(self):
        # Load topic file - a list of topics(search phrases) used for evalutation
        with open(self.topic_file, "r") as tf:
            topics = tf.read().splitlines()

            # create an output file to which we'll write our results
        temp_output_file = tempfile.mkstemp()[1]
        with open(temp_output_file, "w") as outputTRECFile:
            # for each evaluated topic:
            # build a query and record the results in the file in TREC_EVAL format
            for topic in topics:
                topic_id, topic_phrase = tuple(topic.split(" ", 1))
                # get search result
                topic_results = self.perform_search(topic_phrase)
                # format the result
                for (docnum, result) in enumerate(topic_results):
                    score = topic_results.score(docnum)
                    outputTRECFile.write(
                        "%s Q0 %s %d %lf test\n" % (topic_id, os.path.basename(result["file_path"]), docnum, score))

        with open(self.qrels_file, 'r') as f_qrel:
            qrel = pytrec_eval.parse_qrel(f_qrel)

        with open(temp_output_file, 'r') as f_run:
            run = pytrec_eval.parse_run(f_run)

        evaluator = pytrec_eval.RelevanceEvaluator(
            qrel, pytrec_eval.supported_measures)

        results = evaluator.evaluate(run)

        self.print_trec_eval_result(results)


In [6]:
# Dont change this! Use it as-is in your code
# This filter will run for both the index and the query
class CustomFilter(Filter):
    is_morph = True
    def __init__(self, filterFunc, *args, **kwargs):
        self.customFilter = filterFunc
        self.args = args
        self.kwargs = kwargs
    def __eq__(self):
        return (other
                and self.__class__ is other.__class__)
    def __call__(self, tokens):
        for t in tokens:
            if t.mode == 'query': # if called by query parser
                t.text = self.customFilter(t.text, *self.args, **self.kwargs)
                yield t
            else: # == 'index' if called by indexer
                t.text = self.customFilter(t.text, *self.args, **self.kwargs)
                yield t

## Question 1
Provide your text answers in the following two markdown cells

### Q1 (a): Provide answer to Q1 (a) here [markdown cell]
iprec_at_recall_1.00 all

### Q1 (b): Provide answer to Q1 (b) here [markdown cell]
iprec_at_recall_1.00 means Precision Averages at 1.00 recall, which meausres a score of once all the relevant documents are included in the search results, how many of those are actually relevant and how many are not relevant as precision is true positive out of all the positive predictions. For a government website search, the person who is searching likely wants a complete set of information regarding the search topics, not just some information. Therefore, the person will likely read all the top relevant documents in case the other ones contain some edge case regarding the topic he/she is trying to search too. Thus, it is important that there are as few irrelevant documents as possible until the person finishes reading through all the relevant documents. Because if the person reads an irrelevant document, he/she will assume the results after that one is also equally or less relevant and will likely stop reading at that point.

## Question 2

### Q2 (a): Write your code below

**1. The auto-grader will extract and use the following variables, DON'T change the their names:**

      self.topic_file  
      self.qrels_file  
      self.document_dir   
      self.file_list  
      self.index_sys  
      self.query_parser  
      self.searcher   



**2. DON'T change the names of the already defined funtions**  
**3. DON'T change the py_trec_eval function**  
**4. DON'T change the class names including CustomFilter, IRSystem, IRQ2, IRQ3, IRQ4**  
**5. DON'T change the CustomFilter class and DON'T create any new custom filter class that is used to define Whoosh schema**

In [7]:
class IRQ2(IRSystem):
    def create_index(self):
        """
        INPUT:
            None
        OUTPUT:
            None
        
        NOTE: Please update self.index_sys which should have type whoosh.index.FileIndex
        """
        # DON't change the name of 'index_sys'
        self.schema = Schema(file_path = ID(stored=True),
                             file_content = TEXT(analyzer = RegexTokenizer()))
        self.index_sys = index.create_in(tempfile.mkdtemp(), self.schema)

    def add_files(self):
        """
        INPUT:
            None
        OUTPUT:
            None
        
        NOTE: Add buffer to self.index_sys
        """
        # open writer
        writer = writing.BufferedWriter(self.index_sys, period=None, limit=1000)

        try:
            # write each file to index
            for docNum, filePath in enumerate(self.file_list):
                with open(filePath, "r", encoding="utf-8") as f:
                    fileContent = f.read()
                    writer.add_document(file_path = filePath,
                                        file_content = fileContent)

                    # print status every 1000 documents
                    if (docNum+1) % 1000 == 0:
                        print("already indexed:", docNum+1)
            print("done indexing.")

        finally:
            # close the index
            writer.close()

    def create_parser_searcher(self):
        """
        INPUT:
            None
        OUTPUT:
            None
        
        NOTE: Please update self.query_parser and self.searcher which should have type whoosh.qparser.default.QueryParser and whoosh.searching.Searcher respectively 
        """
         # DON't change the names of 'query_parser' and 'searcher'
        self.query_parser = QueryParser("file_content", schema=self.schema)
        self.searcher = self.index_sys.searcher()

    def perform_search(self, topic_phrase):
        """
        INPUT:
            topic_phrase: string
        OUTPUT:
            topic_results: whoosh.searching.Results
        
        NOTE: Utilize self.query_parser and self.searcher to calculate the result for topic_phrase
        """
        query = self.query_parser.parse(topic_phrase)
        topic_results = self.searcher.search(query, limit=None)
        return topic_results

In [8]:
q2 = IRQ2("government")

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.


In [9]:
q2.py_trec_eval()

num_q                    1       1.0000
num_ret                  1       1.0000
num_rel                  1       5.0000
num_rel_ret              1       0.0000
map                      1       0.0000
gm_map                   1       -11.5129
Rprec                    1       0.0000
bpref                    1       0.0000
recip_rank               1       0.0000
iprec_at_recall_0.00     1       0.0000
iprec_at_recall_0.10     1       0.0000
iprec_at_recall_0.20     1       0.0000
iprec_at_recall_0.30     1       0.0000
iprec_at_recall_0.40     1       0.0000
iprec_at_recall_0.50     1       0.0000
iprec_at_recall_0.60     1       0.0000
iprec_at_recall_0.70     1       0.0000
iprec_at_recall_0.80     1       0.0000
iprec_at_recall_0.90     1       0.0000
iprec_at_recall_1.00     1       0.0000
P_5                      1       0.0000
P_10                     1       0.0000
P_15                     1       0.0000
P_20                     1       0.0000
P_30                     1       0.000

In [10]:
q2.print_rel_name('16')

---------------------------Topic_id and Topic_phrase----------------------------------
16 Emergency and disaster preparedness assistance
---------------------------Return documents----------------------------------
16 Q0 G00-34-3591274 0 34.092076 test
16 Q0 G00-05-0719078 1 32.195486 test
16 Q0 G00-92-2053892 2 27.131764 test
16 Q0 G00-70-2681284 3 26.574622 test
16 Q0 G00-33-2857182 4 21.813916 test
16 Q0 G00-51-3264753 5 10.948533 test
16 Q0 G00-32-1907807 6 10.008862 test
---------------------------Relevant documents----------------------------------
16 0 G00-03-0589290 1
16 0 G00-21-0494028 1
16 0 G00-21-2114990 1
16 0 G00-32-0551737 1
16 0 G00-86-3719816 1
16 0 G00-92-2974327 1
16 0 G00-99-0140748 1


### Q2 (b): Provide answer to Q2 (b) here [markdown cell]
The metric I chose - iprec_at_recall_1.00 all - was 0.1869. 

### Q2 (c): Provide answer to Q2(c) here [markdown cell]
Yes it did well on topics 18 and 24 where the iprec_at_recall_1.00 was 1 for both of them. It did very badly for most of them, for example it got 0 on topics 1,2,4,6,7,9,16 and 26, and it got around 0.2 for all other topics. 

## Question 3

### Q3 (a): Provide answer to Q3 (a) here [markdown cell]
The query I investigated was query 16 "Emergency and disaster preparedness assistance", and the the false positive test case I investigated was G00-92-2053892 while the false negative test case I investigated was G00-21-0494028. I found that in the false negative case the word "emergency" did show up 3 times but the first letter was all capaitlized and one time it showed up as plural "Emergengies", the word "and" did not show up as many times as in the false positive document as in the false negative document but this word does not matter so it should be removed. The word "disaster" actually showed up many times in the false neagtive case but it showed up as plural sometimes too and also with first letter capitalized. For the word "preparedness", it showed up as "prepared" instead of the full noun in the negative case. 

**For proposed solution and resonaing:**
I think the changes of adding a lower-case filter, a stop words filter, a strip filter, and a stemmer will all improve Whoosh's performance on this test collection. This is because sometimes the first letters of these words show up as either uppercase or lowercase; therefore, it is important to make them all lower case. The word like "and" " can show up a lot of times in a document and not mean anything, and a stop filter can solve this issue. The strip filter can filter out any potential odd characters. The stemmers will return just the stem of the words which then can get rid of things like plurals and past tense etc which shows up in words like "diasters" and "preparedness".

### Q3 (b): Write your code below

**1. The auto-grader will extract and use the following variables, DON'T change the their names:**

      self.topic_file  
      self.qrels_file  
      self.document_dir   
      self.file_list  
      self.index_sys  
      self.query_parser  
      self.searcher   



**2. DON'T change the names of the already defined funtions**  
**3. DON'T change the py_trec_eval function**  
**4. DON'T change the class names including CustomFilter, IRSystem, IRQ2, IRQ3, IRQ4**  
**5. DON'T change the CustomFilter class and DON'T create any new custom filter class that is used to define Whoosh schema**

In [11]:
myAnalyzer1 = RegexTokenizer() | LowercaseFilter() |  StopFilter() | StripFilter() | CustomFilter(LancasterStemmer().stem)


In [12]:

class IRQ3(IRSystem):
    def create_index(self):
        """
        INPUT:
            None
        OUTPUT:
            None
        
        NOTE: Please update self.index_sys which should have type whoosh.index.FileIndex
        """
        # DON't change the name of 'index_sys'
        self.schema = Schema(file_path = ID(stored=True),
                             file_content = TEXT(analyzer = myAnalyzer1))
        self.index_sys = index.create_in(tempfile.mkdtemp(), self.schema)

    def add_files(self):
        """
        INPUT:
            None
        OUTPUT:
            None
        
        NOTE: Add buffer to self.index_sys
        """
        # open writer
        writer = writing.BufferedWriter(self.index_sys, period=None, limit=1000)

        try:
            # write each file to index
            for docNum, filePath in enumerate(self.file_list):
                with open(filePath, "r", encoding="utf-8") as f:
                    fileContent = f.read()
                    writer.add_document(file_path = filePath,
                                        file_content = fileContent)

                    # print status every 1000 documents
                    if (docNum+1) % 1000 == 0:
                        print("already indexed:", docNum+1)
            print("done indexing.")

        finally:
            # close the index
            writer.close()

    def create_parser_searcher(self):
        """
        INPUT:
            None
        OUTPUT:
            None
        
        NOTE: Please update self.query_parser and self.searcher which should have type whoosh.qparser.default.QueryParser and whoosh.searching.Searcher respectively 
        """
         # DON't change the names of 'query_parser' and 'searcher'
        self.query_parser = QueryParser("file_content", schema=self.schema)
        self.searcher = self.index_sys.searcher()

    def perform_search(self, topic_phrase):
        """
        INPUT:
            topic_phrase: string
        OUTPUT:
            topic_results: whoosh.searching.Results
        
        NOTE: Utilize self.query_parser and self.searcher to calculate the result for topic_phrase
        """
        query = self.query_parser.parse(topic_phrase)
        topic_results = self.searcher.search(query, limit=None)
        return topic_results

In [13]:
q3 = IRQ3("government")

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.


In [14]:
q3.py_trec_eval()

num_q                    1       1.0000
num_ret                  1       3.0000
num_rel                  1       5.0000
num_rel_ret              1       0.0000
map                      1       0.0000
gm_map                   1       -11.5129
Rprec                    1       0.0000
bpref                    1       0.0000
recip_rank               1       0.0000
iprec_at_recall_0.00     1       0.0000
iprec_at_recall_0.10     1       0.0000
iprec_at_recall_0.20     1       0.0000
iprec_at_recall_0.30     1       0.0000
iprec_at_recall_0.40     1       0.0000
iprec_at_recall_0.50     1       0.0000
iprec_at_recall_0.60     1       0.0000
iprec_at_recall_0.70     1       0.0000
iprec_at_recall_0.80     1       0.0000
iprec_at_recall_0.90     1       0.0000
iprec_at_recall_1.00     1       0.0000
P_5                      1       0.0000
P_10                     1       0.0000
P_15                     1       0.0000
P_20                     1       0.0000
P_30                     1       0.000

In [15]:
q3.print_rel_name('16')

---------------------------Topic_id and Topic_phrase----------------------------------
16 Emergency and disaster preparedness assistance
---------------------------Return documents----------------------------------
16 Q0 G00-68-3661801 0 24.587401 test
16 Q0 G00-34-3591274 1 24.315923 test
16 Q0 G00-70-2681284 2 23.245521 test
16 Q0 G00-03-2245885 3 23.203165 test
16 Q0 G00-05-0719078 4 23.178558 test
16 Q0 G00-88-2853984 5 23.084678 test
16 Q0 G00-45-0006211 6 23.084678 test
16 Q0 G00-84-2647789 7 22.750629 test
16 Q0 G00-20-3839216 8 21.989272 test
16 Q0 G00-92-2053892 9 21.895389 test
16 Q0 G00-93-0870338 10 21.537457 test
16 Q0 G00-14-0931254 11 20.573064 test
16 Q0 G00-46-3010333 12 20.466274 test
16 Q0 G00-33-2857182 13 18.790767 test
16 Q0 G00-75-3633903 14 18.190679 test
16 Q0 G00-56-2140972 15 17.853890 test
16 Q0 G00-49-2630728 16 17.747169 test
16 Q0 G00-23-1010771 17 17.250207 test
16 Q0 G00-30-1702552 18 14.587239 test
16 Q0 G00-15-3335359 19 14.492165 test
16 Q0 G00-51-32

### Q3 (c): Provide answer to Q3 (c) here [markdown cell]
The modifications I made are: adding a lower-case filter, a stop words filter, a strip filter, and a stemmer.
Yes there were improvements over some queries in performance using the iprec_at_recall_1.00 score; for example, the performance of topics 9, 10, 14 and 28 all improved in terms of iprec_at_recall_1.00 score.The false negative case still exists from part a but the score for it is lower this time compared to in part a, and the document is lower ranked on the relevant documents list; however, the false positive case from part a did not get improved.


### Q3 (d): Provide answer to Q3 (d) here [markdown cell]
Yes it overall improved the average iprec_at_recall_1.00     all    to  0.2418 from the original 0.1869. 

### Q3 (e): Provide answer to Q3 (e) here [markdown cell]
Yes. For example query 22 became 0.03 when it was 0.2 before, and query 14 became 1 when it was 0 before.

### Q3 (f): Provide answer to Q3 (f) here [markdown cell]
I think it means the idea is good overall as it did make an overall improvement on the iprec_at_recall_1.00 score. Also when I look at other scores like overall map, it also improved from 0.19 to 0.35. However as it made some queries performed worse, I believe there is still room for improvement on the searcher in order for it to do even better. 

## Question 4


### Q4 (a): Provide answer to Q4 (a) here [markdown cell]

I found that the first letter was all capaitlized in some words, but this should not make a difference verus an uncapitalized word in the search; distinguishing them would lead to a low performing model. Other common words like "and", "the" etc. which exists in the queries' topics can show up many times in a document, but these words are not importamt so they should be removed. Another cause of bad model performance is when there are many compound words that can show up with a "-" in between the subgroups; for example, "wireless" vs "wire-less" and this does not create a match anymore between the words. In order to avoid sparseness and allow words to show up in various forms like plural singular future and past tense etc, the compound words should be splitted and stripped down to their stem words. Otherwise this can cause a bad model result where relevant words are not picked up by the search. 

I also find that another source of bad model performance is caused by the searcher requring all the words in the topic query to be present inside the document. This is sometimes no the case because a relevant document does not need to contain all the words in the topic. A bad relevance scoring function can cause the relevance rank of the dcouments to be skewed as well.

### Q4 (b): Write your code below

**1. The auto-grader will extract and use the following variables, DON'T change the their names:**

      self.topic_file  
      self.qrels_file  
      self.document_dir   
      self.file_list  
      self.index_sys  
      self.query_parser  
      self.searcher   



**2. DON'T change the names of the already defined funtions**  
**3. DON'T change the py_trec_eval function**  
**4. DON'T change the class names including CustomFilter, IRSystem, IRQ2, IRQ3, IRQ4**  
**5. DON'T change the CustomFilter class and DON'T create any new custom filter class that is used to define Whoosh schema**

In [16]:
#tried but not the final proposed solution
myAnalyzer2 = RegexTokenizer() | LowercaseFilter() |  StopFilter() | StripFilter() | CompoundWordFilter(['bio', 'technology', 'cyber', 'crime', 'child', 'hood'], keep_compound=True) |CustomFilter(LancasterStemmer().stem)


In [17]:
#tried but not the final proposed solution
myAnalyzer3 = RegexTokenizer() | LowercaseFilter() |  StopFilter() | StripFilter() | CompoundWordFilter(wordnet, keep_compound=False) |CustomFilter(LancasterStemmer().stem)


In [18]:
# final proposed analyzer
myAnalyzer4 = RegexTokenizer() | LowercaseFilter() | IntraWordFilter() | StopFilter() | StripFilter() | CustomFilter(LancasterStemmer().stem)


In [19]:
#tried but not the final proposed solution
myAnalyzer5 = RegexTokenizer() | LowercaseFilter() | IntraWordFilter() | StopFilter() | StripFilter() | CustomFilter(WordNetLemmatizer().lemmatize)


In [20]:
#tried but not the final proposed solution
myAnalyzer6 = RegexTokenizer() | LowercaseFilter() | IntraWordFilter() | StopFilter() | StripFilter() | CustomFilter(WordNetLemmatizer().lemmatize, 'v')


In [21]:
#tried but not the final proposed solution
w_new1 = scoring.BM25F(B=0.6, K1=2)

In [22]:
#tried but not the final proposed solution
w_new2 = scoring.BM25F(B=0.6, K1=2.5)

In [23]:
#tried but not the final proposed solution
w_new3 = scoring.BM25F(B=0.6, K1=1.9)

In [24]:
# final proposed scoring function paramaters
w_new4 = scoring.BM25F(B=0.5, K1=1.9)

In [25]:
#tried but not the final proposed solution
w_new5 = scoring.BM25F(B=0.4, K1=1.9)

In [26]:
#tried but not the final proposed solution
w_new6 = scoring.BM25F(B=0.8, K1=1.9)

In [27]:
class IRQ4(IRSystem):
    def create_index(self):
        """
        INPUT:
            None
        OUTPUT:
            None
        
        NOTE: Please update self.index_sys which should have type whoosh.index.FileIndex
        """
        # DON't change the name of 'index_sys'
        self.schema = Schema(file_path = ID(stored=True),
                             file_content = TEXT(analyzer = myAnalyzer4))
        self.index_sys = index.create_in(tempfile.mkdtemp(), self.schema)

    def add_files(self):
        """
        INPUT:
            None
        OUTPUT:
            None
        
        NOTE: Add buffer to self.index_sys
        """
        # open writer
        writer = writing.BufferedWriter(self.index_sys, period=None, limit=1000)

        try:
            # write each file to index
            for docNum, filePath in enumerate(self.file_list):
                with open(filePath, "r", encoding="utf-8") as f:
                    fileContent = f.read()
                    writer.add_document(file_path = filePath,
                                        file_content = fileContent)

                    # print status every 1000 documents
                    if (docNum+1) % 1000 == 0:
                        print("already indexed:", docNum+1)
            print("done indexing.")

        finally:
            # close the index
            writer.close()

    def create_parser_searcher(self):
        """
        INPUT:
            None
        OUTPUT:
            None
        
        NOTE: Please update self.query_parser and self.searcher which should have type whoosh.qparser.default.QueryParser and whoosh.searching.Searcher respectively 
        """
         # DON't change the names of 'query_parser' and 'searcher'
         # changed parser to "or group" instead of the default "and"
        self.query_parser = QueryParser("file_content", schema=self.schema, group=qparser.OrGroup)
        self.searcher = self.index_sys.searcher(weighting = w_new4)

    def perform_search(self, topic_phrase):
        """
        INPUT:
            topic_phrase: string
        OUTPUT:
            topic_results: whoosh.searching.Results
        
        NOTE: Utilize self.query_parser and self.searcher to calculate the result for topic_phrase
        """
        query = self.query_parser.parse(topic_phrase)
        topic_results = self.searcher.search(query, limit=None)
        return topic_results

In [28]:
q4 = IRQ4("government")

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.


In [29]:
q4.py_trec_eval()

num_q                    1       1.0000
num_ret                  1       481.0000
num_rel                  1       5.0000
num_rel_ret              1       5.0000
map                      1       0.0624
gm_map                   1       -2.7741
Rprec                    1       0.0000
bpref                    1       0.0000
recip_rank               1       0.0526
iprec_at_recall_0.00     1       0.1034
iprec_at_recall_0.10     1       0.1034
iprec_at_recall_0.20     1       0.1034
iprec_at_recall_0.30     1       0.1034
iprec_at_recall_0.40     1       0.1034
iprec_at_recall_0.50     1       0.1034
iprec_at_recall_0.60     1       0.1034
iprec_at_recall_0.70     1       0.0455
iprec_at_recall_0.80     1       0.0455
iprec_at_recall_0.90     1       0.0391
iprec_at_recall_1.00     1       0.0391
P_5                      1       0.0000
P_10                     1       0.0000
P_15                     1       0.0000
P_20                     1       0.0500
P_30                     1       0.10

In [30]:
q4.print_rel_name('16')

---------------------------Topic_id and Topic_phrase----------------------------------
16 Emergency and disaster preparedness assistance
---------------------------Return documents----------------------------------
16 Q0 G00-34-3591274 0 29.326098 test
16 Q0 G00-68-3661801 1 28.709376 test
16 Q0 G00-70-2681284 2 27.665278 test
16 Q0 G00-03-2245885 3 26.842133 test
16 Q0 G00-05-0719078 4 26.343186 test
16 Q0 G00-84-2647789 5 26.212271 test
16 Q0 G00-21-2114990 6 26.051207 test
16 Q0 G00-77-1693859 7 25.521510 test
16 Q0 G00-21-0494028 8 25.291854 test
16 Q0 G00-88-2853984 9 25.115507 test
16 Q0 G00-45-0006211 10 25.115507 test
16 Q0 G00-92-2053892 11 24.687331 test
16 Q0 G00-86-3719816 12 24.686965 test
16 Q0 G00-33-2857182 13 24.402549 test
16 Q0 G00-53-0263242 14 24.268069 test
16 Q0 G00-60-2564326 15 24.264838 test
16 Q0 G00-93-0870338 16 23.636041 test
16 Q0 G00-75-3633903 17 22.642157 test
16 Q0 G00-46-3010333 18 22.341671 test
16 Q0 G00-56-2140972 19 22.288811 test
16 Q0 G00-09-13

### Q4 (b): Provide answer to Q4 (b) here [markdown cell]

The modications I made and the resoaning for each is listed below:

LowercaseFilter() - to get rid of distinction between upper and lower case letters in words 
IntraWordFilter() - to get rid of compound words connected by "-" "/" etc. 

StopFilter() - to get rid of common unimportant words like "and", "the" etc 

StripFilter() - to get rid of odd characters 

CustomFilter(LancasterStemmer().stem) - to return only the stem root of the words

For query parser, changed the parser criteria from the defualt "and" to "orgroup"; this is because maybe not all the words in the topic would show up in the document, but the more words that show up the better still. 

For BM25 paramaters, I changed B to 0.5 and K1 to 1.9. 

I refeered to this website to understand how changing B and K impact the searches. https://www.elastic.co/blog/practical-bm25-part-2-the-bm25-algorithm-and-its-variables. I decided to decrease B to minimze the effects of the length of the document compared to the average length because the length of the government document does not matter that much to the relevance of the topic. I also decided to increase K1 to so that the score for each term can continue to go up by relatively more for more instances of that term. 

### Q4 (d): Provide answer to Q4 (c) here [markdown cell]
Yes there is an improvement in the performance compared to question 3 and a significant improvement in performance compared to the baseline. It improved slightly from 0.35 in average map in Q3 to 0.40 in this section.

## Validation

#### Run the following cells to make sure your code returns the correct value types

In [31]:
from whoosh.index import FileIndex
from whoosh.qparser import QueryParser
from whoosh.searching import Searcher
import os.path

### Q2 Validation

In [32]:
assert(isinstance(q2.index_sys, FileIndex)), "Index Type"
assert(isinstance(q2.query_parser, QueryParser)), "Query Parser Type"
assert(isinstance(q2.searcher, Searcher)), "Searcher Type"
print("Q2 Types Validated")

Q2 Types Validated


### Q3 Validation

In [33]:
assert(isinstance(q3.index_sys, FileIndex)), "Index Type"
assert(isinstance(q3.query_parser, QueryParser)), "Query Parser Type"
assert(isinstance(q3.searcher, Searcher)), "Searcher Type"
print("Q3 Types Validated")

Q3 Types Validated


### Q4 Validation

In [34]:
assert(isinstance(q4.index_sys, FileIndex)), "Index Type"
assert(isinstance(q4.query_parser, QueryParser)), "Query Parser Type"
assert(isinstance(q4.searcher, Searcher)), "Searcher Type"
print("Q4 Types Validated")

Q4 Types Validated
