# Mr. Tydi bn corpus preprocessing

In [1]:
import datasets
import random


In [2]:
raw_corpus = datasets.load_dataset('castorini/mr-tydi-corpus', 'bengali', split='train')

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [3]:
raw_corpus

Dataset({
    features: ['docid', 'title', 'text'],
    num_rows: 304059
})

In [4]:
raw_corpus = raw_corpus.remove_columns('docid')
raw_corpus = raw_corpus.remove_columns('title')

In [5]:
raw_corpus

Dataset({
    features: ['text'],
    num_rows: 304059
})

In [6]:
text_corpus_list = [entry for entry in raw_corpus['text']]

In [7]:
len(text_corpus_list)

304059

In [8]:
half_length = len(text_corpus_list) // 5

# Shuffle the list randomly
random.seed(42)
random.shuffle(text_corpus_list)

# Remove elements randomly until half of the list remains
while len(text_corpus_list) > half_length:
    text_corpus_list.pop()

In [9]:
len(text_corpus_list)

60811

# Mr. Tydi bn dataset preprocessing

In [None]:
from tqdm import tqdm

In [None]:
import datasets

ds = datasets.load_dataset('castorini/mr-tydi', 'bengali', split='train')

In [None]:
ds

In [None]:
ds = ds.remove_columns('query_id')

In [None]:
ds

In [None]:
pairs = []
for data in tqdm(ds):
    pos = []
    neg = []
    query = data['query']
    
    for text1 in data['positive_passages']:
        pos.append(text1['text'])
    
    # pos = data['positive_passages'][0]['text']

    for text1 in data['negative_passages']:
        pos.append(text1['text'])
    
    # neg = data['negative_passages'][0]['text']
    pairs.append((query, pos, neg))

In [None]:
pairs[0]

# fine tuning

In [None]:
from ragatouille import RAGTrainer
# from ragatouille.utils import get_wikipedia_page

In [None]:
trainer = RAGTrainer(
    model_name = "mrtydi_bn_fine_tuned_colBERT",
    pretrained_model_name = "colbert-ir/colbertv2.0",
    language_code="bn"
)

In [None]:
trainer.prepare_training_data(
    raw_data=pairs,
    # all_documents=text_corpus_list,
    data_out_path="./data/",
    mine_hard_negatives	= False,
)

In [None]:
trainer.train(
    batch_size=8,
    maxsteps = 10000,
) # Train with the default hyperparams

# Indexing

In [14]:
!export CUDA_HOME=$CONDA_PREFIX

In [15]:
from ragatouille import RAGPretrainedModel

RAG = RAGPretrainedModel.from_pretrained(".ragatouille/colbert/none/2024-03/17/02.20.47/checkpoints/colbert")

In [16]:
# full_document = [
#     get_wikipedia_page("রুটি"),
#     get_wikipedia_page("মাইক্রোসফট_কর্পোরেশন"),
#     get_wikipedia_page("অ্যাপল_ইনকর্পোরেটেড"),
#     ]

full_document = text_corpus_list

# article_list = ["রুটি", "মাইক্রোসফট_কর্পোরেশন", "অ্যাপল_ইনকর্পোরেটেড"]
# # Iterate over the list and call the function with each set of parameters
# for params in article_list:
#     full_document += get_wikipedia_page(article_list)

In [17]:
len(full_document)

60811

In [18]:
import faiss

In [34]:
!conda list torch

# packages in environment at /home/turjo/anaconda3/envs/rag-demo2:
#
# Name                    Version                   Build  Channel
fast-pytorch-kmeans       0.2.0.1                  pypi_0    pypi
torch                     2.2.1                    pypi_0    pypi


In [None]:
RAG.index(
    collection=full_document,
    # document_ids=['',],
    index_name="mrtydi-corpus-bn-60k",
    # max_document_length=180,
    split_documents=True,
    )

In [None]:
k = 3 # How many documents you want to retrieve, defaults to 10, we set it to 3 here for readability
RAG.search(query="আবহাওয়াবিদ্যা বলতে কি বুঝায়?", k=k)

# Read index, query, retrieve

In [1]:
 # .ragatouille/colbert/indexes/mrtydi-corpus-bn-60k/

In [2]:
from ragatouille import RAGPretrainedModel



In [3]:
RAG = RAGPretrainedModel.from_pretrained(".ragatouille/colbert/none/2024-03/17/02.20.47/checkpoints/colbert", n_gpu=1)

In [4]:
path_to_index = ".ragatouille/colbert/indexes/mrtydi-corpus-bn-60k/"
RAG = RAGPretrainedModel.from_index(path_to_index, n_gpu=1)

In [5]:
query="বাংলাদেশ এর রাজধানী কোথায়?"
results = RAG.search(query, k=3)
results
context_list = [entry['content'] for entry in results]
context_list = context_list[0] + context_list[1] + context_list[2]

Loading searcher for index mrtydi-corpus-bn-60k for the first time... This may take a few seconds
[May 02, 21:53:09] #> Loading codec...
[May 02, 21:53:09] Loading decompress_residuals_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[May 02, 21:53:09] Loading packbits_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[May 02, 21:53:09] #> Loading IVF...
[May 02, 21:53:09] #> Loading doclens...


100%|██████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 2820.11it/s]

[May 02, 21:53:09] #> Loading codes and residuals...



100%|████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 24.84it/s]


Searcher loaded!

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . বাংলাদেশ এর রাজধানী কোথায়?, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  1368, 29914, 29882, 29909, 29914, 29900, 29917, 29910,
         1351, 29908,  1372, 29914, 29894, 29901, 29914, 29902, 29916,  1353,
        29917, 29914, 29899, 29914, 29907,  1029,   102,   103,   103,   103,
          103,   103], device='cuda:0')
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 0, 0, 0, 0, 0], device='cuda:0')



In [6]:
print("query: " + query + "\nretrieved context: ")
print(context_list)

query: বাংলাদেশ এর রাজধানী কোথায়?
retrieved context: 
বাংলাদেশের ভূ-প্রাকৃতিক গঠনই এমন যে, কোথাও কোথাও ভূভাগ যথেষ্ট ঢালু। খুলনার সুন্দরবনের অবস্থান এমন একটা জায়গায়, যা ত্রিভূজাকৃতির বঙ্গোপসাগরের শীর্ষবিন্দুতে গাঙ্গেয় মোহনায় অবস্থিত। এই গাঙ্গেয়কোথাও কেউ নেই, বাংলাদেশ টেলিভিশনে প্রচারিত,ও প্রিয়া তুমি কোথায় বাংলাদেশের জনপ্রিয় একটি গান। এই গানটি আসিফ আকবর এর প্রথম একক অ্যালবামের গান। এই গানটি ও প্রিয়া তুমি কোথায় অ্যালবাম এর কভার সঙ্গীত। এই গানটি ইউটিউব এ ৭০ লাখ বার দেখা হয়েছে।


In [1]:
# imports

import os
os.chdir("..")

import torch
from transformers import GenerationConfig, LlamaTokenizer, LlamaForCausalLM
from peft import PeftModel, PeftConfig

In [2]:
# Utils

def generate_prompt(instruction: str, input_ctxt: str = None) -> str:
    if input_ctxt:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input_ctxt}

### Response:"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:"""

In [4]:
MODEL_PATH = "/media/turjo/hdd/CSE499/llm-models/llama-7b-hf-prompt-answering/"
BASE_MODEL_PATH = "/media/turjo/hdd/CSE499/llm-models/llama-7b-hf-prompt-answering/decapoda-research/llama-7b-hf/"

config = PeftConfig.from_pretrained(MODEL_PATH)

model = LlamaForCausalLM.from_pretrained(
    BASE_MODEL_PATH,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

tokenizer = LlamaTokenizer.from_pretrained(MODEL_PATH)

model = PeftModel.from_pretrained(model, MODEL_PATH)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [6]:
generation_config = GenerationConfig(
    temperature=0.2,
    top_p=0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=32,
)

model.eval()
if torch.__version__ >= "2":
    model = torch.compile(model)


In [None]:
instruction = query
input_ctxt = context_list

# instruction = "বাংলাদেশ এর রাজধানীর নাম কি?"
# input_ctxt = 'বাংলাদেশের রাজধানী ঢাকায় তার নামে একটি এলাকার নামকরন করা হয়েছে শেরেবাংলা নগর (পূর্ববর্তী আইয়ুবনগর ও তারও পূর্বে মনিপুর) ,২০১৪ বাংলাদেশ স্থানীয় সময় শুক্রবার বেলা তিনটায় রাজধানীর ইউনাইটেড হাসপাতালে চিকিৎসাধীন অবস্থায় মারা যান বেবী মওদুদ। মৃত্যুর সময় তার বয়স হয়েছিল ৬৬ বছর।বাংলাদেশ জাতীয় জাদুঘর বাংলাদেশের রাজধানী ঢাকা শহরে অবস্থিত দেশের প্রধান জাদুঘর। এটি ২০, মার্চ, ১৯১৩ খ্রিস্টাব্দে প্রতিষ্ঠিত হয়, এবং ৭ আগস্ট,'

In [None]:
prompt = generate_prompt(instruction, input_ctxt)
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
input_ids = input_ids.to(model.device)

In [16]:
with torch.no_grad():
    outputs = model.generate(
        input_ids=input_ids,
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
    )

response = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
print(response)



OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 11.76 GiB of which 39.44 MiB is free. Including non-PyTorch memory, this process has 11.41 GiB memory in use. Of the allocated memory 10.63 GiB is allocated by PyTorch, and 651.05 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [1]:
import torch
import torch.nn as nn
import transformers
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [2]:
model = AutoModelForCausalLM.from_pretrained(
    "/media/turjo/hdd/CSE499/llm-models/BanglaLLM/bangla-llama-7b-instruct-v0.1/",
    # cache_dir="/data/yash/base_models",
    device_map='auto',
    # quantization_config=bnb_config,
)

tokenizer = AutoTokenizer.from_pretrained("/media/turjo/hdd/CSE499/llm-models/BanglaLLM/bangla-llama-7b-instruct-v0.1/",
                                          # cache_dir="/data/yash/base_models",
                                         )

ValueError: The current `device_map` had weights offloaded to the disk. Please provide an `offload_folder` for them. Alternatively, make sure you have `safetensors` installed if the model you are using offers the weights in this format.

In [5]:
def get_llama2_chat_reponse(prompt, max_new_tokens=50):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(**inputs,
                             max_new_tokens=max_new_tokens,
                             # temperature=0.2,
                             )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

In [19]:
question = query
context = context_list
prompt =f"""নিম্নে বর্ণিত ইন্সট্রাকশন এবং কন্টেক্সট হিসেবে দেওয়া ইনপুট অনুযাই সঠিকভাবে প্রশ্নের উত্তর দিন।

### ইন্সট্রাকশন:
{question}

### ইনপুট:
{context}

### উত্তর:
"""

print(get_llama2_chat_reponse(prompt, max_new_tokens=20))

নিম্নে বর্ণিত ইন্সট্রাকশন এবং কন্টেক্সট হিসেবে দেওয়া ইনপুট অনুযাই সঠিকভাবে প্রশ্নের উত্তর দিন।

### ইন্সট্রাকশন:
বাংলাদেশ এর রাজধানীর নাম কি?

### ইনপুট:
বাংলাদেশের রাজধানী ঢাকায় তার নামে একটি এলাকার নামকরন করা হয়েছে শেরেবাংলা নগর (পূর্ববর্তী আইয়ুবনগর ও তারও পূর্বে মনিপুর) ,২০১৪ বাংলাদেশ স্থানীয় সময় শুক্রবার বেলা তিনটায় রাজধানীর ইউনাইটেড হাসপাতালে চিকিৎসাধীন অবস্থায় মারা যান বেবী মওদুদ। মৃত্যুর সময় তার বয়স হয়েছিল ৬৬ বছর।বাংলাদেশ জাতীয় জাদুঘর বাংলাদেশের রাজধানী ঢাকা শহরে অবস্থিত দেশের প্রধান জাদুঘর। এটি ২০, মার্চ, ১৯১৩ খ্রিস্টাব্দে প্রতিষ্ঠিত হয়, এবং ৭ আগস্ট,

### উত্তর:
বাংলাদেশের রাজধানী বাংলাদেশের প্রধান জাদুঘর বাংলাদেশের


In [6]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
from transformers import pipeline

In [7]:
# model_name_or_path = "/media/turjo/hdd/CSE499/llm-models/Llama-2-7B-Chat-AWQ/"
model_name_or_path = "/media/turjo/hdd/CSE499/llm-models/Llama-2-13B-chat-AWQ/"

# Load model
model = AutoAWQForCausalLM.from_quantized(model_name_or_path, fuse_layers=True,
                                          trust_remote_code=True, safetensors=True)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)

Replacing layers...: 100%|█████████████████████████████████████████████████| 40/40 [00:06<00:00,  6.50it/s]
Fusing layers...: 100%|████████████████████████████████████████████████████| 40/40 [00:00<00:00, 58.16it/s]


In [None]:
question = query
context = context_list

prompt_template = f"""
You are a nice and helpful question-answering chatbot.
Use the following pieces of context to answer the question at the end. Answer in Bengali language.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

[INST]{question}[/INST]
"""

# prompt = "what is the capital of bangladesh?"
# prompt_template=f'''{prompt}

# '''

print("\n\n*** Generate:")

tokens = tokenizer(
    prompt_template,
    return_tensors='pt'
).input_ids.cuda()

# Generate output
generation_output = model.generate(
    tokens,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    top_k=40,
    max_new_tokens=100
)

print("Output: ", tokenizer.decode(generation_output[0]))

In [4]:
import requests

In [5]:
# question = query
# context = context_list

# prompt_template = f"""
# You are a nice and helpful question-answering chatbot.
# Use the following pieces of context to answer the question at the end. Answer in Bengali language.
# If you don't know the answer, just say that you don't know, don't try to make up an answer.

# {context}

# [INST]{question}[/INST]
# """

qatext = "how are you today?"

headers = {
    "Content-Type": "application/json",
}

data = {
    'inputs': qatext,
    'parameters': {
        
    },
}
response = requests.post('http://0.0.0.0:8080/generate', headers=headers, json=data)
print(response.json())

{'generated_text': "\n\nAnswer: I'm doing well, thank you for asking! I'm just an AI, I don't have feelings or emotions like humans do, but I'm always happy to help with any questions or tasks you may have. How can I assist you today?"}
