# **Install necessary libraries**

In [1]:
!pip install -q -U torch datasets transformers tensorflow langchain playwright html2text sentence_transformers faiss-cpu
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.41.3 trl==0.4.7
!pip install gdown
!pip install --upgrade --quiet  docx2txt

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m755.5/755.5 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.2/475.2 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.2/806.2 kB[0m [31m58.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.4/37.4 MB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.5/56.5 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.5/156.5 kB[0m [31m20.

# **Import necessary libraries**

In [2]:
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)
from datasets import load_dataset
from peft import LoraConfig, PeftModel

from langchain.text_splitter import CharacterTextSplitter
from langchain.document_transformers import Html2TextTransformer
from langchain.document_loaders import AsyncChromiumLoader

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain

# ***Using a quantized Mistral-7B Model***

## **Initializing the Tokenizer**

In [3]:
model_name='mistralai/Mistral-7B-Instruct-v0.1'

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

## **bitsandbytes parameters**

In [4]:
# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

## **Setting up quantization config**

In [5]:
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

## **Loading pre-trained config**

In [6]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

# **Trying out the Mistral 7B response without implementing RAG.**

In [7]:
question= "How should you position your hands and adjust your head when catching a flat trajectory ball aimed at head height?"
inputs_not_chat = tokenizer.encode_plus("[INST] "+ question + "[/INST]", return_tensors="pt")['input_ids'].to('cuda')

generated_ids = model.generate(inputs_not_chat,
                               max_new_tokens=1000,
                               do_sample=True)
decoded = tokenizer.batch_decode(generated_ids)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [8]:
print(decoded)

["<s> [INST] How should you position your hands and adjust your head when catching a flat trajectory ball aimed at head height?[/INST] When catching a flat trajectory ball aimed at head height, you should position your hands as if you are catching a basketball. Spread your fingers wide apart and extend your arms slightly in front of your body. Your thumbs should be tucked in towards your body. The palms of your hands should be facing upwards and be angled slightly to the side. \n\nYou should also adjust your head slightly upwards and turn your body so that your shoulders are facing towards the incoming ball. You should keep your eyes on the ball and focus on the position where it will land when it hits the ground. Make sure to keep enough space between your arms and body to allow the ball to land comfortably in your hands. \n\nWhen the ball is about to reach its maximum height, lift your arms up towards the sky and catch the ball with your palms facing towards it. Use your legs to help

## **Trainable parameters**

In [9]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 262410240
all model parameters: 3752071168
percentage of trainable model parameters: 6.99%


# **Createing the Vector Database**

In [10]:
import gdown
url = 'https://drive.google.com/uc?id=1ChZGjOwvgKVuUplQdpLHYSdL8a-o2GPa'
output = 'Cricket.docx'
gdown.download(url, output)


Downloading...
From: https://drive.google.com/uc?id=1ChZGjOwvgKVuUplQdpLHYSdL8a-o2GPa
To: /content/Cricket.docx
100%|██████████| 1.06M/1.06M [00:00<00:00, 132MB/s]


'Cricket.docx'

In [12]:
from langchain_community.document_loaders import Docx2txtLoader

loader = Docx2txtLoader("/content/Cricket.docx")

data = loader.load()

In [13]:
data

[Document(page_content='UNIT 6\n\n\n\nUNIT 7\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nTHE ROYAL MARINES\n\n\n\n\n\n\n\nCricket\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCONTENTS\n\n\n\n\n\n\tStudent Notes\tTeacher Notes\n\n\n\n\n\nPrepared for The Royal Navy in Association with The National Cricket Association and produced by Education & Youth Ltd, London.\n\nIf there is any support you feel The Royal Navy can give regarding this project please contact 0870 333 0423.\n\n\n\n\n\nAcknowledgements\n\nThese notes have been compiled by Bob Carter – National Coach, (Coach Education) The National Cricket Association\n\nTechnical Editor: P Edwards M.A.\n\n\t\nMESSAGE FROM THE NATIONAL CRICKET ASSOCIATION\t3\n\n\tINTRODUCTION TO THE MODULE\t3\n\n\t\n\nUNIT 1\n\nINTRODUCTION TO CRICKET MODULE\n\n5\n\nUNIT 2\n\nTECHNIQUES AND SKILLS\n\n6\n\n\n\n1 FIELDING\n\n6\n\n\n\n2 BOWLING\n\n9\n\n\n\n3 BATTING\n\n14\n\n\n\n4 WICKET KEEPING\n\n22\n\nUNIT 3\n\nTACTICS

In [14]:
# Chunk text
text_splitter = CharacterTextSplitter(chunk_size=100,
                                      chunk_overlap=0)
chunked_documents = text_splitter.split_documents(data)

# Load chunked documents into the FAISS index
db = FAISS.from_documents(chunked_documents,
                          HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2'))

retriever = db.as_retriever()



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# **Creating LLM Chain**

## **Setting up the pipeline**

In [16]:
text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.0,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=1500,
)

In [17]:
mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

## **Prompt and LLM Chain**

In [98]:
prompt_template = """
### [INST] Instruction: Answer the question based on your Cricket knowledge.
If the user explicitly asks for points only then make sure to add a sentence at the beginning like, "Here are the factors...". Otherwise write in a paragraph.
Make sure to use every information relevant to the context.
Here is context to help:

{context}

### QUESTION:
{question} [/INST]
 """

# Create prompt from prompt template
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Create llm chain
llm_chain = LLMChain(llm=mistral_llm, prompt=prompt)

## **Query without a context**

In [105]:
res = llm_chain.invoke({"context": "", "question": "What are some general points for fielders to keep in mind during a cricket match?"})

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [106]:
print(res['text'])


During a cricket match, fielders play an important role in supporting their team by preventing the opposing team from scoring runs. Here are some general points that fielders should keep in mind:

1. Communication: Effective communication between fielders is crucial for success. They need to communicate with each other about where the ball is going and who is best positioned to catch it.
2. Anticipation: Fielders should anticipate the direction and speed of the ball as it travels through the air. This will help them position themselves correctly to catch or stop the ball.
3. Positioning: Fielders should position themselves in such a way that they can cover all possible angles and directions that the ball may come from.
4. Focus: Fielders should stay focused on the task at hand and avoid getting distracted by anything else.
5. Confidence: Fielders should have confidence in their abilities and not let fear or doubt affect their performance.
6. Adaptability: Fielders should be able to ad

## **Query with Context from VectorDB[RAG Implementation]**

In [116]:
rag_chain = (
 {"context": retriever, "question": RunnablePassthrough()}
    | llm_chain
)

result = rag_chain.invoke("When were the modern Laws of cricket first written down and printed?")


# 1.How should you position your hands and adjust your head when catching a flat trajectory ball aimed at head height?
# 2.What are some general points for fielders to keep in mind during a cricket match?
# 3.When were the modern Laws of cricket first written down and printed?

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


# **Checking the context of the Query**

In [117]:
result['context']

[Document(page_content='1744\tThe Laws of the game first written down and printed. 1787\tThe formation of the Marylebone Cricket Club (MCC). 1788\tThe MCC revised the laws.', metadata={'source': '/content/Cricket.docx'}),
 Document(page_content='“The Laws of Cricket” can be obtained from the MCC. Address page 36.\n\nUNIT 1', metadata={'source': '/content/Cricket.docx'}),
 Document(page_content='History of Cricket', metadata={'source': '/content/Cricket.docx'}),
 Document(page_content='A History of Cricket, B. Green\tISBN 0 7126 2080 X The MCC Cricket Coaching Book\t\t(Fourth Edition)', metadata={'source': '/content/Cricket.docx'})]

# **Query Result**

In [118]:
print(result['text'])

 Here are the factors that determine when the modern Laws of cricket were first written down and printed:

  - The Laws of the game were first written down and printed in 1787.
  - This was done by the Marylebone Cricket Club (MCC), which was formed in 1787.
  - The MCC also revised the laws in 1788.
  - According to the document "A History of Cricket" by B. Green, the MCC Cricket Coaching Book (fourth edition) contains a copy of the Laws of Cricket.
  - The book can be obtained from the MCC, with the address given on page 36 of the document.


In [104]:
# '''
# ==========================================================================================================================================================

# Question 1: How should you position your hands and adjust your head when catching a flat trajectory ball aimed at head height?

# ==========================================================================================================================================================

# Without VectorDB context:

# When catching a flat trajectory ball aimed at head height, it's important to position your hands correctly to ensure that you can catch the ball cleanly and securely. Here are some tips to help you do this:

# 1. Keep your hands relaxed and open, with your palms facing forward. This will allow you to easily grab the ball as it comes towards you.
# 2. Position your hands slightly below the level of the ball, so that you can intercept it before it reaches its peak.
# 3. Adjust your head so that you are looking directly at the ball as it approaches. This will help you judge the timing of your catch more accurately.
# 4. Keep your body balanced and stable, with your feet shoulder-width apart and your knees slightly bent. This will help you maintain your balance as you catch the ball.
# 5. Practice your catching technique regularly to improve your skills and increase your chances of success.

# Remember, catching a flat trajectory ball aimed at head height requires good hand-eye coordination, timing, and balance. With practice and patience, you can become a skilled catcher and enjoy the thrill of playing cricket.

# ==========================================================================================================================================================

# With VectorDB context:

# When catching a flat trajectory ball aimed at head height, it is important to keep your eyes fixed on the ball and maintain a steady head position.
# Your palms should be facing the anticipated line of the ball, which means that they should be parallel to the ground and pointing towards the direction of the ball's trajectory.
# Additionally, your hands should be positioned slightly ahead of your body to allow for proper timing and reaction when the ball arrives.
# If the ball is caught directly in front of your head, you should move your head to the side as your hands give, following the natural movement of your arms.
# This will help ensure that you can properly secure the ball without getting hit by it.






# ==========================================================================================================================================================

# Question 2: What are some general points for fielders to keep in mind during a cricket match?

# ==========================================================================================================================================================

# Without VectorDB context:

# During a cricket match, fielders play an important role in supporting their team by preventing the opposing team from scoring runs. Here are some general points that fielders should keep in mind:

# 1. Communication: Effective communication between fielders is crucial for success. They need to communicate with each other about where the ball is going and who is best positioned to catch it.
# 2. Anticipation: Fielders should anticipate the direction and speed of the ball as it travels through the air. This will help them position themselves correctly to catch or stop the ball.
# 3. Positioning: Fielders should position themselves in such a way that they can cover all possible angles and directions that the ball may come from.
# 4. Focus: Fielders should stay focused on the task at hand and avoid getting distracted by anything else.
# 5. Confidence: Fielders should have confidence in their abilities and not let fear or doubt affect their performance.
# 6. Adaptability: Fielders should be able to adapt to changing situations and adjust their positioning and strategy accordingly.
# 7. Safety: Fielders should always prioritize safety and take necessary precautions to prevent injuries.
# 8. Teamwork: Fielders should work together as a team and support each other throughout the match.

# ==========================================================================================================================================================

# With VectorDB context:

# During a cricket match, fielders play an important role in preventing runs from being scored by the opposing team. Here are some general points that fielders should keep in mind:

# 1. Expect every ball to come to you, but do not anticipate its line. This means that fielders should always be alert and ready to catch the ball, regardless of where it is coming from.
# 2. If not fielding the ball, back up at either end to reduce height or speed. This will give the fielder more time to react and prevent them from getting injured.
# 3. Throw the ball immediately to the wicketkeeper or prevent injury and be ready for instant action. Fielders should always throw the ball as quickly as possible to avoid giving the batsman too much time to score runs.
# 4. Save one run by covering, mid on, square leg, etc. Fielders should position themselves strategically to save runs by blocking the path of the ball or catching it before it reaches the boundary.
# 5. Judgement and decision making; looking for gaps in the field; noting ability of fielders and exploiting weaknesses. Fielders should use their judgement and decision making skills to look for gaps in the field and exploit any weaknesses of the opposing team's fielders.






# ==========================================================================================================================================================

# Question 3: When were the modern Laws of cricket first written down and printed?

# ==========================================================================================================================================================

# Without VectorDB context:

# The modern Laws of cricket were first written down and printed in 1844.
# The laws were compiled by a committee appointed by the Marylebone Cricket Club (MCC) in England, which was the governing body of cricket at the time.
# The committee was tasked with standardizing the rules of the game, as there were significant variations between different counties and clubs.
# After several drafts and revisions, the final version of the laws was approved and published in 1844.
# These laws have since been updated and revised several times, but the basic principles and structure remain largely unchanged.

# ==========================================================================================================================================================

# With VectorDB context:

#  Here are the factors that determine when the modern Laws of cricket were first written down and printed:

#   - The Laws of the game were first written down and printed in 1787.
#   - This was done by the Marylebone Cricket Club (MCC), which was formed in 1787.
#   - The MCC also revised the laws in 1788.
#   - According to the document "A History of Cricket" by B. Green, the MCC Cricket Coaching Book (fourth edition) contains a copy of the Laws of Cricket.
#   - The book can be obtained from the MCC, with the address given on page 36 of the document.
# '''

