# Goal: ask me anything about cricket, based on Wikipedia

## Prepare data

In [None]:
# Download the latest English Wikipedia dump.
# The compressed file is around 20 GB, so this is slow (it took me about a day).
! wget -c https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream.xml.bz2

# Extract the articles as JSON lines with wikiextractor (takes a few hours).
# The input file name must match the file downloaded by wget above.
! python3 -m wikiextractor.WikiExtractor enwiki-latest-pages-articles-multistream.xml.bz2 --json

# Collect every record that mentions cricket into a single file (takes a few minutes).
# -h suppresses grep's "filename:" prefix, which would otherwise make each line invalid JSON.
# The result goes to data/cricket.jsonl because that is the path the later cells read.
! mkdir -p data
! grep -hi cricket text/*/* > data/cricket.jsonl


## Analyze data

In [17]:
# Peek at the first cricket record to see which fields each JSON line carries.
import ujson

# Only the first line is needed, so pull a single line from the file iterator
# instead of loading every line into memory with readlines().
# NOTE(review): utf-8 is assumed for the extracted dump — confirm if the file was produced differently.
with open("data/cricket.jsonl", encoding="utf-8") as fp:
    line = next(fp, None)

# An empty file yields no line; in that case nothing is printed (same as before).
if line is not None:
    record = ujson.loads(line)
    print(record.keys())
        

dict_keys(['id', 'revid', 'url', 'title', 'text'])


## Load llm

In [18]:
from haystack.nodes import PromptNode
from getpass import getpass

In [19]:
# Ask for the Hugging Face token interactively so it is never stored in the notebook source.
HF_TOKEN = getpass(prompt="Your Hugging Face Token")

Your Hugging Face Token ········


In [20]:
# Stand-alone PromptNode for a quick sanity check of the model.
# Instruct fine-tuned model: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1
pn = PromptNode(
    model_name_or_path="mistralai/Mistral-7B-Instruct-v0.1",
    api_key=HF_TOKEN,
    max_length=800,
)

In [21]:
# Smoke test: send one instruction-formatted prompt straight to the model.
out = pn("[INST] Explain in a ironic way why Cricket rocks! [/INST]")

# Show the first element of the result.
print(out[0])

 Cricket is a sport that is widely popular in India and other parts of the world, and it's not hard to see why. With its slow-paced gameplay, long breaks, and endless amount of time spent on the field, it's the perfect sport for people who love to watch TV or play video games. And let's not forget the excitement of watching a team of players who are so slow and clumsy that they can barely run, let alone hit the ball. It's truly a spectacle to behold. So if you're looking for a sport that will keep you entertained for hours on end, while also providing a constant reminder of how much better other sports are, then Cricket is the perfect choice for you!


## Store data in a document store

In [22]:
import glob,json
from haystack import Document
from haystack.nodes import PreProcessor

# Build one Haystack Document per JSON line of the cricket extract.
# Only the "text" field becomes the indexed content; every other field is kept as metadata.
data_file = "data/cricket.jsonl"

docs = []

# NOTE(review): utf-8 is assumed for the extracted dump — confirm if the file was produced differently.
with open(data_file, "r", encoding="utf-8") as fp:
    # Iterate the file object directly so lines are streamed rather than
    # materialised all at once with readlines().
    for line in fp:
        # A blank line (e.g. a trailing newline at the end of the file) is not a record.
        if not line.strip():
            continue
        line_json = ujson.loads(line)
        # index only text, store rest as meta
        doc_json = {
            "content": line_json.get("text", ""),
            "meta": {key: value for key, value in line_json.items() if key != "text"},
        }
        doc = Document.from_json(doc_json)
        docs.append(doc)


In [23]:
# Clean the raw article text and cut it into passages of at most 200 words,
# without overlap and without breaking sentences apart.
processor = PreProcessor(
    # --- splitting ---
    split_by="word",
    split_length=200,
    split_overlap=0,
    split_respect_sentence_boundary=True,
    # --- cleaning ---
    clean_whitespace=True,
    clean_empty_lines=True,
    clean_header_footer=True,
    language="en",
)

# Run the preprocessing over every loaded document.
preprocessed_docs = processor.process(docs)

Preprocessing:   0%|▏                                    | 288/70781 [00:02<12:21, 95.05docs/s]We found one or more sentences whose word count is higher than the split length.
Preprocessing:   2%|▋                                  | 1496/70781 [00:09<05:39, 204.14docs/s]Document 7eb86f5c0c893931bd3117dedfdb61a2 is 23254 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000 chars recursively.
Document 86972c9084ba1f6cbbb0bf9173945c9f is 13254 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000 chars recursively.
Document 26ce52f0ad36bdb4616bbab44136debe is 40319 characters long after preprocessing, where the maximum length should be 1000

## Create an InMemoryDocumentStore and store data

In [32]:
from haystack.document_stores import InMemoryDocumentStore

# In-memory store with BM25 enabled; the preprocessed passages are written into it.
document_store = InMemoryDocumentStore(use_bm25=True)
document_store.write_documents(documents=preprocessed_docs)

Updating BM25 representation...: 100%|███████████| 244779/244779 [00:07<00:00, 34220.05 docs/s]


## Create a RAG Pipeline

In [33]:

from haystack import Pipeline
from haystack.nodes import BM25Retriever, PromptNode, PromptTemplate

In [34]:
# BM25 retriever over the document store, configured with top_k=4.
retriever = BM25Retriever(document_store=document_store, top_k=4)
     

In [35]:
# Question-answering prompt wrapped in the [INST] ... [/INST] instruction format
# (model card: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1).
# {join(documents)} and {query} are the template placeholders.
qa_template = PromptTemplate(
    prompt="""[INST] Using the information contained in the context, answer the question (using a maximum of two sentences).
  If the answer cannot be deduced from the context, answer \"I don't know.\"
  Context: {join(documents)};
  Question: {query}
  [/INST]"""
)

In [36]:
# PromptNode used inside the RAG pipeline: same Mistral instruct model as the smoke test,
# with the QA template as its default prompt.
# NOTE(review): max_length=5500 looks generous for a two-sentence answer — confirm it is intended.
prompt_node = PromptNode(
    model_name_or_path="mistralai/Mistral-7B-Instruct-v0.1",
    default_prompt_template=qa_template,
    api_key=HF_TOKEN,
    max_length=5500,
    model_kwargs={"model_max_length": 8000},
)

In [37]:
# Wire the pipeline: "Query" feeds the retriever, and the retriever feeds the prompt node.
rag_pipeline = Pipeline()
rag_pipeline.add_node(name="retriever", component=retriever, inputs=["Query"])
rag_pipeline.add_node(name="prompt_node", component=prompt_node, inputs=["retriever"])

## Time for the AMA (ask me anything)

In [38]:
from pprint import pprint
def print_answer(out):
    """Pretty-print the first entry of a pipeline result.

    Args:
        out: Mapping with a "results" key whose value is an indexable
            collection of strings; the first string is the one shown.

    Raises:
        KeyError: if ``out`` has no "results" key.
        IndexError: if ``out["results"]`` is empty.
    """
    # Strip leading/trailing whitespace from the first result before printing it.
    pprint(out["results"][0].strip())

In [68]:
# Ask the first question with debug enabled on both nodes of the pipeline.
x = rag_pipeline.run(
    query="How many centuries did Sachin Tendulkar hit in Test cricket?",
    params={
        "retriever": {"debug": True},
        "prompt_node": {"debug": True},
    },
)
print_answer(x)

'Sachin Tendulkar hit 51 centuries in Test cricket.'


In [48]:
# Run the pipeline with default parameters and print the first answer.
print_answer(
    rag_pipeline.run(query="Tell me something funny about Sachin Tendulkar?")
)

('Sachin Tendulkar scored his 100th international century on 16 March 2012, '
 'against Bangladesh in the Asia Cup, becoming the first person in history to '
 'achieve this feat. However, he found it mentally tough because everyone was '
 'talking about his 100th hundred, and nobody talked about his 99 hundreds.')


In [71]:
# Run the pipeline with default parameters and print the first answer.
print_answer(
    rag_pipeline.run(query="Who is Matthew Hayden?")
)

('Matthew Hayden is a former Australian international cricketer known for his '
 'powerful and aggressive left-handed opening batting style. He holds several '
 'records in international cricket, including the highest score made by an '
 'Australian batsman in Tests (380) and the record for the highest individual '
 'test score by an opening batsman in test history. Hayden retired from all '
 'forms of cricket in September 2012 and was inducted into the Australian '
 'Cricket Hall of Fame in 2017.')
