<a href="https://colab.research.google.com/github/Adyypower/LLMs-Model/blob/main/RAGUsingLLAMA_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install -q "langchain-astradb>=0.3.2" langchain cassandra-driver pypdf sentence-transformers "transformers>=4.38" "torch>=2.1" requests

print("✅ Installation complete!")


✅ Installation complete!


In [5]:
import os
from google.colab import userdata

try:
    ASTRA_DB_API_ENDPOINT = userdata.get('ASTRA_DB_API_ENDPOINT')
    ASTRA_DB_APPLICATION_TOKEN = userdata.get('ASTRA_DB_APPLICATION_TOKEN')
    print("✅ Astra DB credentials loaded successfully from Colab Secrets!")
except (KeyError, TypeError):
    print("⚠️ Secure credentials not found.")
    # Fallback for direct pasting (NOT RECOMMENDED)
    # ASTRA_DB_API_ENDPOINT = "YOUR_API_ENDPOINT_HERE"
    # ASTRA_DB_APPLICATION_TOKEN = "YOUR_TOKEN_HERE (starts with AstraCS:...)"
    if 'ASTRA_DB_API_ENDPOINT' not in locals():
        raise ValueError("Astra DB credentials are not set.")

✅ Astra DB credentials loaded successfully from Colab Secrets!


In [10]:
import requests
pdf_url = "https://arxiv.org/pdf/1706.03762.pdf"
pdf_path = "attention_is_all_you_need.pdf"

try:
    response = requests.get(pdf_url)
    response.raise_for_status()
    with open(pdf_path, 'wb') as f:
        f.write(response.content)
    print(f"✅ Successfully downloaded '{pdf_path}'")
except requests.exceptions.RequestException as e:
    print(f"❌ Error downloading PDF: {e}")

✅ Successfully downloaded 'attention_is_all_you_need.pdf'


In [7]:
!pip install langchain-community



In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline


In [None]:
loader = PyPDFLoader(pdf_path)
documents = loader.load()
print(f"📄 Loaded {len(documents)} pages from the research paper.")

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
docs = text_splitter.split_documents(documents)
print(f" বিভক্ত করা হয়েছে {len(docs)} chunks of text.")

my_embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
print("✅ Embedding model loaded.")

model_id = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

In [23]:
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain_astradb.vectorstores import AstraDBVectorStore # Updated import
from langchain_community.llms import LlamaCpp
from langchain_community.embeddings import LlamaCppEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader

In [13]:
from huggingface_hub import notebook_login

# 3. Run the login function
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [1]:
# 1. Upgrade pip and other build tools to prevent common issues
!pip install --upgrade pip setuptools

# 2. Install a specific, known-stable version of llama-cpp-python with GPU acceleration
# This is the key to bypassing the build error.
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python==0.2.77 --force-reinstall --upgrade --no-cache-dir

Collecting llama-cpp-python==0.2.77
  Downloading llama_cpp_python-0.2.77.tar.gz (50.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 MB[0m [31m22.1 MB/s[0m  [33m0:00:02[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting typing-extensions>=4.5.0 (from llama-cpp-python==0.2.77)
  Downloading typing_extensions-4.15.0-py3-none-any.whl.metadata (3.3 kB)
Collecting numpy>=1.20.0 (from llama-cpp-python==0.2.77)
  Downloading numpy-2.3.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
Collecting diskcache>=5.6.1 (from llama-cpp-python==0.2.77)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting jinja2>=2.11.3 (from llama-cpp-python==0.2.77)
  Downloading jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting Ma

In [16]:
from huggingface_hub import hf_hub_download

# This will download the model file to your Colab environment
# It will show a progress bar and might take a few minutes as the file is large (~4 GB)
print("Downloading Llama 2 model...")

model_path = hf_hub_download(
    repo_id="TheBloke/Llama-2-7B-Chat-GGUF",
    filename="llama-2-7b-chat.Q4_K_M.gguf"
)

print(f"Model downloaded to: {model_path}")

Downloading Llama 2 model...


llama-2-7b-chat.Q4_K_M.gguf:   0%|          | 0.00/4.08G [00:00<?, ?B/s]

Model downloaded to: /root/.cache/huggingface/hub/models--TheBloke--Llama-2-7B-Chat-GGUF/snapshots/191239b3e26b2882fb562ffccdd1cf0f65402adb/llama-2-7b-chat.Q4_K_M.gguf


In [18]:
from langchain_community.llms import LlamaCpp

# Now, this will work because 'model_path' contains the real location
# of the downloaded file.
llm = LlamaCpp(
    model_path=model_path, # Use the path from the download step
    n_gpu_layers=40,
    n_batch=512,
    n_ctx=4096,
    f16_kv=True,
    verbose=True,
)

print("✅ LlamaCpp model loaded successfully!")

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /root/.cache/huggingface/hub/models--TheBloke--Llama-2-7B-Chat-GGUF/snapshots/191239b3e26b2882fb562ffccdd1cf0f65402adb/llama-2-7b-chat.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_co

✅ LlamaCpp model loaded successfully!


AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 1 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | 
Model metadata: {'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.eos_token_id': '2', 'general.architecture': 'llama', 'llama.context_length': '4096', 'general.name': 'LLaMA v2', 'llama.embedding_length': '4096', 'llama.feed_forward_length': '11008', 'llama.attention.layer_norm_rms_epsilon': '0.000001', 'llama.rope.dimension_count': '128', 'llama.attention.head_count': '32', 'tokenizer.ggml.bos_token_id': '1', 'llama.block_count': '32', 'llama.attention.head_count_kv': '32', 'general.quantization_version': '2', 'tokenizer.ggml.model': 'llama', 'general.file_type': '15'}
Using fallback chat format: llama-2


In [20]:
embedding = LlamaCppEmbeddings(model_path=model_path)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /root/.cache/huggingface/hub/models--TheBloke--Llama-2-7B-Chat-GGUF/snapshots/191239b3e26b2882fb562ffccdd1cf0f65402adb/llama-2-7b-chat.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_co

In [21]:
COLLECTION_NAME = "llma2_rag_app"

In [24]:
vector_store = AstraDBVectorStore( # Updated class name
    embedding=embedding,
    collection_name=COLLECTION_NAME,
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    token=ASTRA_DB_APPLICATION_TOKEN,
)


llama_print_timings:        load time =    3093.16 ms
llama_print_timings:      sample time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =    3466.33 ms /     7 tokens (  495.19 ms per token,     2.02 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =    3509.53 ms /     8 tokens


In [31]:
from langchain_community.document_loaders import PyPDFLoader # Import PyPDFLoader

def ingest_data(file_path):
    """
    Loads, splits, and ingests data from a PDF into AstraDB.
    """
    loader = PyPDFLoader(file_path) # Use PyPDFLoader
    documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    texts = text_splitter.split_documents(documents)

    vector_store.add_documents(texts)
    print(f"Ingested {len(texts)} text chunks from {file_path}")

In [27]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

prompt_template = """
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

# RetrievalQA Chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT},
)

In [29]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
qa_with_memory = ConversationalRetrievalChain.from_llm(
    llm, vector_store.as_retriever(), memory=memory
)

In [33]:
if __name__ == "__main__":
    # Ingest data from the downloaded PDF
    ingest_data(pdf_path) # Use the pdf_path variable

    # --- Q&A Example ---
    query_qa = "What are the main features of the Transformer model?"
    result_qa = qa_chain.invoke({"query": query_qa}) # Updated to use invoke
    print("\n--- Q&A Result ---")
    print(f"Query: {result_qa['query']}")
    print(f"Result: {result_qa['result']}")
    print(f"Source Documents: {result_qa['source_documents']}")

    # # --- Summarization Example (Temporarily commented out) ---
    # query_summary = "Summarize the provided document."
    # result_summary = qa_chain({"query": query_summary})
    # print("\n--- Summarization Result ---")
    # print(f"Query: {result_summary['query']}")
    # print(f"Result: {result_summary['result']}")

    # # --- Chat with Memory Example (Temporarily commented out) ---
    # print("\n--- Chat with Memory ---")
    # while True:
    #     user_input = input("You: ")
    #     if user_input.lower() in ["exit", "quit"]:
    #         break
    #     response = qa_with_memory({"question": user_input})
    #     print(f"AI: {response['answer']}")


llama_print_timings:        load time =    3093.16 ms
llama_print_timings:      sample time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =   54939.14 ms / 12693 tokens (    4.33 ms per token,   231.04 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =   59348.26 ms / 12694 tokens


Ingested 49 text chunks from attention_is_all_you_need.pdf



llama_print_timings:        load time =    3093.16 ms
llama_print_timings:      sample time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =    5095.68 ms /    12 tokens (  424.64 ms per token,     2.35 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =    5136.07 ms /    13 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =     580.70 ms
llama_print_timings:      sample time =      92.11 ms /   131 runs   (    0.70 ms per token,  1422.17 tokens per second)
llama_print_timings: prompt eval time =     986.68 ms /   811 tokens (    1.22 ms per token,   821.95 tokens per second)
llama_print_timings:        eval time =    3605.10 ms /   130 runs   (   27.73 ms per token,    36.06 tokens per second)
llama_print_timings:       total time =    4792.52 ms /   941 


--- Q&A Result ---
Query: What are the main features of the Transformer model?
Result: The main features of the Transformer model are:

* Based solely on attention mechanisms, dispensing with recurrence and convolutions
* The encoder and decoder are both composed of multiple identical layers, each of which consists of a self-attention mechanism followed by a position-wise feed-forward network (FFN)
* Each layer in the encoder and decoder also contains an embedding layer that maps the input sequence to a higher dimensional space
* The softmax function is used to compute attention weights for each token in the input sequence, which are then used to compute the output of the FFN.
Source Documents: [Document(id='5814c8666092480d8fd105fa0900505d', metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2

In [2]:
import os
from google.colab import drive
from huggingface_hub import hf_hub_download

# --- Step 1: Connect to Google Drive ---
print("Connecting to Google Drive...")
drive.mount('/content/drive')
print("✅ Google Drive connected!")

# --- Step 2: EDIT THIS LINE to change your save location ---
# This is the only line you need to change.
# Example: "/content/drive/My Drive/My Projects/AI Models"
drive_model_dir = "/content/drive/My Drive/Colab_AI_Models" # <--- YOUR NEW FOLDER PATH

os.makedirs(drive_model_dir, exist_ok=True)
print(f"Model will be saved in: {drive_model_dir}")

# --- Step 3: Download the Model Directly to Your New Location ---
print("Downloading Llama 2 model directly to your specified Google Drive folder...")

model_filename = "llama-2-7b-chat.Q4_K_M.gguf"
final_model_path = os.path.join(drive_model_dir, model_filename)

# Check if the model already exists to avoid re-downloading
if not os.path.exists(final_model_path):
    hf_hub_download(
        repo_id="TheBloke/Llama-2-7B-Chat-GGUF",
        filename=model_filename,
        local_dir=drive_model_dir, # Download to the specified folder
        local_dir_use_symlinks=False # Prevents complex folder structures
    )
    print(f"✅ Model successfully downloaded to: {final_model_path}")
else:
    print(f"✅ Model already exists at: {final_model_path}")

Connecting to Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Google Drive connected!
Model will be saved in: /content/drive/My Drive/Colab_AI_Models
Downloading Llama 2 model directly to your specified Google Drive folder...


For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


KeyboardInterrupt: 