# Baseline System 

In [53]:
from elasticsearch  import Elasticsearch
from typing import Dict, List, Optional
import json

In [54]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/subankankarunakaran/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [55]:
INDEX_SETTINGS = {
    "mappings": {  
    "properties": {
        "passage": {"type": "text", "term_vector": "yes", "analyzer": "english"},
    }
    },
    "settings" : {
        'index': {
            'number_of_shards': 1,
            'number_of_replicas': 1,

            # configure our default similarity algorithm explicitly to use bm25,
            # this allows it to use it for all the fields
            'similarity': {
                'default': {
                    'type': 'BM25'
                }
            }
        }
    }
}

In [56]:
INDEX_NAME = "cast_base"

In [57]:
es = Elasticsearch()

In [58]:
class CAsT():
    def __init__(self, context_responses: int = 0) -> None:
        self.INDEX_NAME = "cast_base"
        self.es = Elasticsearch()
        self.queries = []
        self.responses = []
        self.context_responses = context_responses

    def clear_context(self, clear_queries: bool = True, clear_responses: bool = True):
        if clear_queries:
            self.queries = []
        if clear_responses:
            self.responses = []

    def query(self, q: str,remove_stopwords: bool = True) -> str:
        """ 
        Preprocessing query and scoring using bm25
        """
        stop_words = set(stopwords.words('english'))
        
        if remove_stopwords:
            tokens = word_tokenize(q)
            q_list = []
            for w in tokens:
                if w not in stop_words:
                    q_list.append(w)
            print('query: ',q_list)
        
    
        hits = es.search(
            index=self.INDEX_NAME, q=q, _source=True, size=100
        ).get("hits", {}).get("hits")

        if len(hits) > 0:
            print("Query: " + q)
            self.responses.append(hits[0].get("_source").get("passage"))
            return hits[:1000]
        else:
            return []

In [59]:
cast = CAsT()
cast.query("Tell me about Oslo?",remove_stopwords=True)

query:  ['Tell', 'Oslo', '?']
Query: Tell me about Oslo?


[{'_index': 'cast_base',
  '_type': '_doc',
  '_id': '8841272',
  '_score': 9.102427,
  '_source': {'passage': "Tell a friend about us, add a link to this page, or visit the webmaster's page for free fun content. Link to this page: <a href=http://acronyms.thefreedictionary.com/South+African+Board+for+Personnel+Practice>SABPP</a>. Facebook."}},
 {'_index': 'cast_base',
  '_type': '_doc',
  '_id': '8841042',
  '_score': 8.981482,
  '_source': {'passage': 'Tell Me Something: The Songs of Mose Allison. Singles from How Long Has This Been Going On. How Long Has This Been Going On is the twenty-fourth studio album by Northern Irish singer-songwriter Van Morrison, with Georgie Fame and Friends, released in December 1995 (see 1995 in music) in the UK. It charted at #1 on Top Jazz Albums.'}},
 {'_index': 'cast_base',
  '_type': '_doc',
  '_id': '8841113',
  '_score': 8.887823,
  '_source': {'passage': "I've searched and searched but can't find a thread on it, so forgive me if its been discussed