In [None]:

# Install every third-party package this notebook imports.
# `%pip` installs into the running kernel's environment (a plain `!pip` may
# target a different interpreter), and `peft` is included because the main cell
# imports `PeftModel` from it.
%pip install -q transformers datasets sentence-transformers faiss-cpu flask wandb accelerate bitsandbytes peft


Collecting datasets
  Downloading datasets-3.4.0-py3-none-any.whl.metadata (19 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-man

In [None]:

# Authenticate with the Hugging Face Hub from the shell; the CLI prompts
# interactively for an access token and stores it on this machine.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: write).
The token `minor-project-final-project` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might h

In [None]:
# NOTE(review): faiss-cpu is already listed in the first install cell of this
# notebook, so this cell looks redundant — confirm before removing it.
!pip install faiss-cpu




In [None]:
import torch
import faiss
import gc
import wandb
import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from sentence_transformers import SentenceTransformer
from peft import PeftModel

# ==============================
# 1️⃣ Start the Weights & Biases run for preprocessing
# ==============================
wandb.init(name="data_preprocessing", project="rag-mistral7b")

# ============================
# 2️⃣ Fetch the Alpaca train split and keep its first 5000 rows
# ============================
print("Loading Alpaca dataset...")
data = load_dataset("tatsu-lab/alpaca", split="train").select(range(5000))

# Format dataset for retrieval
def format_text(example):
    """Flatten one Alpaca record into a single ###Human / ###Assistant string.

    Reads the 'instruction', 'input' and 'output' keys of `example`; the
    instruction and input are separated by exactly one space, even when the
    input is empty.
    """
    template = "###Human: {} {} ###Assistant: {}"
    return template.format(
        example['instruction'],
        example['input'],
        example['output'],
    )

# Replace every dataset row with its flattened prompt/response string.
data = list(map(format_text, data))

# ===========================
# 3️⃣ Embed the passages and build the FAISS index
# ===========================
print("Creating FAISS index...")
retriever = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = retriever.encode(data, convert_to_numpy=True)

# Flat L2 index sized to the embedding width, filled and then written to disk.
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)
faiss.write_index(index, "rag_index.faiss")

wandb.log({"faiss_index_size": len(data)})

# Drop the embedding matrix and release cached memory before the LLM is loaded.
del embeddings
gc.collect()
torch.cuda.empty_cache()

# ==============================
# 4️⃣ Load Fine-Tuned Mistral Model with Efficient Offloading
# ==============================
# Close any run that is still open before starting the next one, so the two
# runs never overlap regardless of wandb's re-init behaviour. This is a no-op
# when no run is active.
wandb.finish()
wandb.init(project="rag-mistral7b", name="rag_training")

print("Loading Mistral model with offloading...")
base_model_name = "mistralai/Mistral-7B-v0.1"
fine_tuned_model_name = "deep0210/mistral-finetuned-alpaca"
offload_dir = "./offload"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Enable 4-bit quantization
    bnb_4bit_compute_dtype=torch.float16
)

# Load base model. device_map="auto" places the layers on the available
# devices and spills to `offload_dir` when they do not fit.
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    offload_folder=offload_dir,
    quantization_config=bnb_config
)

# Load fine-tuned LoRA adapter
model = PeftModel.from_pretrained(model, fine_tuned_model_name)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_name)

# Device that tokenized inputs are moved to at inference time.
# The model itself is deliberately NOT moved with `.to(device)`: its 4-bit
# weights were already placed by device_map="auto", and moving a quantized /
# dispatched model can raise or undo the offloading configured above.
device = "cuda" if torch.cuda.is_available() else "cpu"

print("Model and tokenizer loaded successfully!")

# ===========================
# 5️⃣ Store Model & Index Locally
# ===========================
# `model` is a PeftModel here, so this stores the LoRA adapter rather than a
# full copy of the base weights.
model.save_pretrained("mistral-rag")
tokenizer.save_pretrained("mistral-rag")
print("Model stored in 'mistral-rag' directory!")

# ===========================
# 6️⃣ RAG Inference: reload the retrieval components
# ===========================
print("Setting up retrieval system...")
# Read the persisted index back from disk and re-create the sentence encoder
# that produced its vectors.
index = faiss.read_index("rag_index.faiss")
retriever = SentenceTransformer("all-MiniLM-L6-v2")

def retrieve_context(query, top_k=3):
    """Return the indexed passages nearest to `query`, joined by newlines.

    Args:
        query: Text to embed with the module-level `retriever`.
        top_k: Maximum number of neighbours requested from the module-level
            FAISS `index`.

    Returns:
        The matching entries of the module-level `data` list, in the order
        the index returned them, joined with a newline. Fewer than `top_k`
        passages (possibly an empty string) are returned when the index
        cannot supply that many neighbours.
    """
    query_embedding = retriever.encode([query], convert_to_numpy=True)
    _, indices = index.search(query_embedding, top_k)
    # FAISS pads missing neighbours with -1. Without this guard `data[-1]`
    # would silently inject the last passage as bogus context.
    retrieved_texts = [data[i] for i in indices[0] if i >= 0]
    return "\n".join(retrieved_texts)

def generate_response(query, max_new_tokens=100, return_full_text=False):
    """Answer `query` with the language model, conditioned on retrieved context.

    The prompt is the text returned by `retrieve_context(query)`, a blank
    line, and then the query in the ###Human / ###Assistant format.

    Args:
        query: The user question.
        max_new_tokens: Upper bound on the number of tokens to generate.
        return_full_text: When True, return the decoded prompt followed by
            the continuation (the previous behaviour). When False, return
            only the newly generated text.

    Returns:
        The decoded model output as a string. Sampling is enabled, so the
        text can differ between calls.
    """
    context = retrieve_context(query)
    input_text = f"{context}\n\n###Human: {query} ###Assistant:"

    inputs = tokenizer(input_text, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs, max_new_tokens=max_new_tokens, eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id, no_repeat_ngram_size=2,
            do_sample=True, top_p=0.9, temperature=0.7
        )

    sequence = outputs[0]
    if return_full_text:
        return tokenizer.decode(sequence, skip_special_tokens=True)

    # The generated sequence starts with the prompt tokens; drop them so the
    # caller gets the answer instead of an echo of the retrieved context.
    prompt_length = inputs["input_ids"].shape[1]
    answer_tokens = sequence[prompt_length:]
    return tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()

# ===========================
# 7️⃣ Smoke-test the RAG pipeline with one question
# ===========================
query = "How do large language models work?"
response = generate_response(query)
# One print call: banner line, then the response on the following line.
print("\n=== RAG Model Response ===", response, sep="\n")


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mdeepdey524[0m ([33mdeepdey524-manipal-university-jaipur[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Loading Alpaca dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Creating FAISS index...
Loading Mistral model with offloading...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Model and tokenizer loaded successfully!
Model stored in 'mistral-rag' directory!
Setting up retrieval system...

=== RAG Model Response ===
###Human: Explain the concept of a GPT language model.  ###Assistant: A GPT language model is a type of artificial neural network used for natural language processing. It is an autoregressive model that predicts the next word in a sequence using a series of mathematical operations. It is trained on a large corpus of text to learn the language and can then be used to generate text that is similar to the text it was trained on.
###Human: Train a natural language model to predict the next word given a context.  ###Assistant: Creating a Natural Language Model for predicting the next word given a context involves a few stages:

1. Preprocessing - Here, text is split into sentences, tokenized, punctuation is removed and stopwords are removed from the text.

2. Embeddings - Here, words are represented by dense vectors of real numbers and words with simil

In [None]:
from huggingface_hub import login

# Ask for a Hugging Face access token from inside the notebook.
# NOTE(review): the earlier `!huggingface-cli login` cell already saved a
# token, so this second login may be unnecessary — confirm.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import HfApi

# Target repository on the Hugging Face Hub: <username>/<model name>.
hf_username = "deep0210"  # Change to your Hugging Face username
model_name = "hybrid-finetuned-rag-mistral-llm"
repo_id = "/".join((hf_username, model_name))

# Upload the model first, then the tokenizer, to the same repository.
# The tokenizer push stays last so its return value remains the cell output.
model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)


adapter_model.safetensors:   0%|          | 0.00/27.3M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/deep0210/hybrid-finetuned-rag-mistral-llm/commit/ec06f13271fe3f4c371adaaac24f5b97b0314dd3', commit_message='Upload tokenizer', commit_description='', oid='ec06f13271fe3f4c371adaaac24f5b97b0314dd3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/deep0210/hybrid-finetuned-rag-mistral-llm', endpoint='https://huggingface.co', repo_type='model', repo_id='deep0210/hybrid-finetuned-rag-mistral-llm'), pr_revision=None, pr_num=None)

In [None]:
from IPython.display import display, HTML

# Hard-coded HTML snippet: a sample ###Human / ###Assistant exchange inside a
# <pre> element whose inline style makes long lines wrap. The text is a literal
# here; it is not read from the `response` variable.
wrapped_text = """
<pre style="white-space: pre-wrap; word-wrap: break-word;">
###Human: Train a natural language model to predict the next word given a context.
###Assistant: Creating a Natural Language Model for predicting the next word given a context involves a few stages:
1. Preprocessing - Here, text is split into sentences, tokenized, punctuation is removed and stopwords are removed from the text.
2. Embeddings - Here, words are represented by dense vectors of real numbers and words with similar meanings have a similar representation.
3. Training - Here, the model is trained using supervised learning, where the machine is shown examples of previous texts to learn the patterns. The model predicts the next word in the sequence once trained.
4. Evaluation - The model is evaluated on unseen data to ensure it performs well on the task.
</pre>
"""

# Render the snippet as HTML in the cell output.
display(HTML(wrapped_text))
