# Baseline System 

In [1]:
from elasticsearch  import Elasticsearch
from typing import Dict, List, Optional
import json

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources this notebook relies on: 'punkt' backs word_tokenize
# and 'stopwords' backs stopwords.words(). Without the second download a fresh
# environment raises LookupError on the stop-word lookup below.
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

from pygaggle.rerank.base import Query, Text
from pygaggle.rerank.transformer import MonoBERT

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/subankankarunakaran/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
2021-11-20 16:55:50 [INFO] loader: Loading faiss with AVX2 support.
2021-11-20 16:55:50 [INFO] loader: Could not load library with AVX2 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx2'")
2021-11-20 16:55:50 [INFO] loader: Loading faiss.
2021-11-20 16:55:50 [INFO] loader: Successfully loaded faiss.


In [3]:
# Index definition (mappings + settings) for the passage collection.
INDEX_SETTINGS = {
    "mappings": {
        "properties": {
            # Single text field, analysed with the "english" analyzer and
            # with "term_vector" set to "yes".
            "passage": {
                "type": "text",
                "term_vector": "yes",
                "analyzer": "english",
            },
        },
    },
    "settings": {
        "index": {
            "number_of_shards": 1,
            "number_of_replicas": 1,
            # Name BM25 explicitly as the default similarity so that it is
            # the one applied to every field of the index.
            "similarity": {"default": {"type": "BM25"}},
        },
    },
}

In [4]:
# Index name for the baseline system; CAsT_base stores the same string as self.INDEX_NAME.
INDEX_NAME = "cast_base"

In [5]:
# Module-level Elasticsearch client built with the library defaults (no host is passed).
es = Elasticsearch()

In [6]:
class CAsT_base():
    """Baseline passage retriever: Elasticsearch search with optional MonoBERT re-ranking.

    Attributes:
        INDEX_NAME: Name of the Elasticsearch index that is searched.
        es: Elasticsearch client used for every search issued by this instance.
        queries: List reset by ``clear_context``; not otherwise used in this class.
        responses: List reset by ``clear_context``; not otherwise used in this class.
        reranking: Whether retrieved passages are re-scored with ``reranker``.
        reranker: A ``MonoBERT`` instance when ``reranking`` is true, else ``None``.
        context_responses: Stored constructor argument; not otherwise used in this class.
    """

    def __init__(self, context_responses: int = 0, reranking: bool = False) -> None:
        """Create the search client and, if requested, the re-ranker.

        Args:
            context_responses: Stored on the instance as-is.
            reranking: When true, a ``MonoBERT`` re-ranker is constructed and
                applied to the hits of every query.
        """
        self.INDEX_NAME = "cast_base"
        self.es = Elasticsearch()
        self.queries = []
        self.responses = []
        self.reranking = reranking
        # The re-ranker is only constructed when it is actually going to be used.
        self.reranker = MonoBERT() if reranking else None
        self.context_responses = context_responses

    def clear_context(self, clear_queries: bool = True, clear_responses: bool = True) -> None:
        """Reset the stored queries and/or responses to empty lists."""
        if clear_queries:
            self.queries = []
        if clear_responses:
            self.responses = []

    def listToString(self, s: List) -> str:
        """Join the strings in ``s`` with single spaces."""
        return " ".join(s)

    def query(self, q: str, remove_stopwords: bool = True, size: int = 100) -> List[Dict]:
        """Search the index for ``q`` and return the cleaned (optionally re-ranked) hits.

        Args:
            q: Query text, passed to Elasticsearch through the ``q`` parameter.
            remove_stopwords: When true, English stop words are dropped from
                ``q`` before searching (matching is case-insensitive).
            size: Maximum number of hits requested from Elasticsearch.

        Returns:
            A list of dicts with the keys ``"passage"``, ``"_id"`` (prefixed
            with ``"MARCO_"`` when the hit's ``origin`` source field equals
            ``"msmarco"``, otherwise ``"CAR_"``) and ``"_score"``. The list is
            empty when the query is blank or nothing matched.
        """
        if remove_stopwords:
            # Load the stop-word list only when it is needed.
            english_stop_words = set(stopwords.words('english'))
            # NLTK's list is lower-case, so compare on the lower-cased token;
            # otherwise capitalised stop words ("What", "The") would survive.
            kept = [w for w in word_tokenize(q) if w.lower() not in english_stop_words]
            q = self.listToString(kept)

        # Nothing left to search for (blank input, or only stop words).
        if not q.strip():
            return []

        # Use the instance's own client rather than a notebook-global one.
        response = self.es.search(
            index=self.INDEX_NAME, q=q, _source=True, size=size
        )
        hits = response.get("hits", {}).get("hits") or []
        if not hits:
            return []

        # NOTE(review): the captured notebook output labels ids close to the
        # MS MARCO id range as "CAR_"; confirm that "origin" is indexed as expected.
        hits_cleaned = []
        for hit in hits:
            source = hit.get("_source") or {}
            prefix = "MARCO_" if source.get("origin") == "msmarco" else "CAR_"
            hits_cleaned.append({
                "passage": source.get("passage"),
                "_id": prefix + hit.get("_id"),
                "_score": hit.get("_score", "FAILED"),
            })

        if self.reranking:
            print("RERANKING")
            texts = [
                Text(hit.get("passage"), {'_id': hit.get("_id", "FAILED")}, 0)
                for hit in hits_cleaned
            ]
            # NOTE(review): the re-ranker receives the stop-word-stripped query;
            # confirm this is preferred over the original question text.
            reranked = self.reranker.rerank(Query(q), texts)
            hits_cleaned = [
                {"passage": hit.text, "_id": hit.metadata["_id"], "_score": hit.score}
                for hit in reranked
            ]

        print("Query: " + q)
        return hits_cleaned

In [7]:
# Smoke test: retrieve passages for one question and re-rank them with MonoBERT.
cast = CAsT_base(reranking=True)
results = cast.query("Tell me about Oslo?", remove_stopwords=True)
# Display only the top few passages; echoing every retrieved passage floods the cell output.
results[:5]

2021-11-20 16:56:05 [INFO] base: GET http://localhost:9200/ [status:200 request:0.025s]
2021-11-20 16:56:05 [INFO] base: POST http://localhost:9200/cast_base/_search?q=Tell+Oslo+%3F [status:200 request:0.069s]


RERANKING
Query: Tell Oslo ?


[{'passage': '1 Your doctor will probably tell you not to take paroxetine. If you stop taking paroxetine, you should wait at least 2 weeks before you start to take an MAO inhibitor.  tell your doctor and pharmacist what other prescription and nonprescription medications and vitamins you are taking or plan to take.',
  '_id': 'CAR_8841226',
  '_score': -11.096511840820312},
 {'passage': 'The Wolfpack, a documentary by Crystal Moselle, tells the tale of the Angulo brothers, raised in isolation, sustained by movies. Magnolia Pictures. Siblings, from left, Krsna Angulo (now Eddie), Jagadisa Angulo (now Glenn) and Mukunda Angulo.he Wolfpack, a documentary by Crystal Moselle, tells the tale of the Angulo brothers, raised in isolation, sustained by movies. Magnolia Pictures. Siblings, from left, Krsna Angulo (now Eddie), Jagadisa Angulo (now Glenn) and Mukunda Angulo.',
  '_id': 'CAR_8841162',
  '_score': -11.610649108886719},
 {'passage': "Tell a friend about us, add a link to this page, or 