In [1]:
# Install Dependencies & Check GPU
!pip install -q torch pymupdf llama-index llama-index-llms-llama-cpp llama-index-embeddings-huggingface

import torch
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Upload & Extract PDF Text
from google.colab import files
import fitz  # PyMuPDF
import os

# Upload PDF
uploaded = files.upload()
if not uploaded:
    # Cancelling the upload dialog leaves the mapping empty; fail with a clear
    # message instead of an IndexError on the lookup below.
    raise ValueError("No file was uploaded; re-run this cell and choose a PDF.")
# The first key of the upload mapping is used as the path of the PDF to open.
pdf_path = next(iter(uploaded))

# Extract text
# Open the document in a `with` block so the file handle is released as soon
# as the text has been read. `doc` is closed after this block.
with fitz.open(pdf_path) as doc:
    # One string per page, joined with newlines.
    text = "\n".join(page.get_text() for page in doc)
print(f"Extracted {len(text.split())} words.")

Saving sample_contract.pdf to sample_contract.pdf
Extracted 315 words.


In [3]:
# Download Mistral 7B in GGUF format
model_path = "/content/mistral.gguf"

if not os.path.exists(model_path):
    !wget https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf -O {model_path}

print("Model downloaded and ready.")

--2025-07-01 08:48:26--  https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf
Resolving huggingface.co (huggingface.co)... 3.166.152.44, 3.166.152.105, 3.166.152.110, ...
Connecting to huggingface.co (huggingface.co)|3.166.152.44|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs-us-1.hf.co/repos/72/62/726219e98582d16c24a66629a4dec1b0761b91c918e15dea2625b4293c134a92/3e0039fd0273fcbebb49228943b17831aadd55cbcbf56f0af00499be2040ccf9?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27mistral-7b-instruct-v0.2.Q4_K_M.gguf%3B+filename%3D%22mistral-7b-instruct-v0.2.Q4_K_M.gguf%22%3B&Expires=1751363306&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc1MTM2MzMwNn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzcyLzYyLzcyNjIxOWU5ODU4MmQxNmMyNGE2NjYyOWE0ZGVjMWIwNzYxYjkxYzkxOGUxNWRlYTI2MjViNDI5M2MxMzRhOTIvM2UwMDM5ZmQwMjczZmNiZWJiND

In [4]:
# Set Up LlamaIndex with Mistral + Embeddings
from llama_index.core import VectorStoreIndex, Document, get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.settings import Settings
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Fail fast before loading the model: if the PDF yielded no text (for example
# a scanned document without a text layer) there is nothing to index or query.
if not text.strip():
    raise ValueError("No text was extracted from the PDF; nothing to index.")

# Load Mistral 7B LLM
import torch  # already installed/imported in the first cell; repeated so this cell is self-contained

# Offload every layer to the GPU when CUDA is available (-1 means "all" in
# llama-cpp-python) and none otherwise. The previous hard-coded value of 1
# offloaded at most a single layer.
n_gpu_layers = -1 if torch.cuda.is_available() else 0

llm = LlamaCPP(
    model_path=model_path,
    temperature=0.7,
    max_new_tokens=512,
    context_window=2048,
    model_kwargs={"n_gpu_layers": n_gpu_layers},
)

# Embedding model for chunk search
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Register with LlamaIndex
Settings.llm = llm
Settings.embed_model = embed_model

# Load text into LlamaIndex
# The whole extracted PDF text is wrapped in a single Document and indexed.
documents = [Document(text=text)]
index = VectorStoreIndex.from_documents(documents)

llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from /content/mistral.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
# Configure Retrieval + QA Pipeline
# Step 1 - retrieval: look up the two most similar entries in the index.
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=2,
)

# Step 2 - synthesis: build the answer using the "tree_summarize" response mode.
response_synthesizer = get_response_synthesizer(
    response_mode="tree_summarize",
)

# Step 3 - wire retrieval and synthesis together into one query engine.
query_engine = RetrieverQueryEngine(retriever=retriever, response_synthesizer=response_synthesizer)

In [6]:
# Ask Questions from PDF
def ask(question):
    """Print the question, run it through `query_engine`, print and return the answer."""
    print(f"\nPrompt: {question}")
    answer = query_engine.query(question)
    print(f"Response:\n{answer}\n")
    return answer


# Questions to put to the uploaded document, asked in this order.
prompts = [
    "What are the penalties for late payments?",
    "Summarize the key terms in this contract.",
    "What is the refund policy?",
]

for prompt in prompts:
    response = ask(prompt)


Prompt: What are the penalties for late payments?


llama_perf_context_print:        load time =  197706.33 ms
llama_perf_context_print: prompt eval time =  197705.52 ms /   555 tokens (  356.23 ms per token,     2.81 tokens per second)
llama_perf_context_print:        eval time =   11670.42 ms /    16 runs   (  729.40 ms per token,     1.37 tokens per second)
llama_perf_context_print:       total time =  209387.90 ms /   571 tokens
Llama.generate: 541 prefix-match hit, remaining 15 prompt tokens to eval


Response:
1.5% interest per month from the due date until paid in full.


Prompt: Summarize the key terms in this contract.


llama_perf_context_print:        load time =  197706.33 ms
llama_perf_context_print: prompt eval time =    5961.87 ms /    15 tokens (  397.46 ms per token,     2.52 tokens per second)
llama_perf_context_print:        eval time =  155188.11 ms /   212 runs   (  732.02 ms per token,     1.37 tokens per second)
llama_perf_context_print:       total time =  161307.90 ms /   227 tokens
Llama.generate: 541 prefix-match hit, remaining 11 prompt tokens to eval


Response:

1. Effective Date: January 15, 2025
2. Parties: ABC Company Inc. (Service Provider) and XYZ Corporation (Client)
3. Services: Consulting services as described in Exhibit A.
4. Standards: Service Provider to perform Services in accordance with industry standards.
5. Payment: Client to pay Service Provider at rates specified in Exhibit B, monthly invoicing, net 30-day payment terms, and late payment interest.
6. Term: One-year term, renewable or terminated with 30 days' written notice.
7. Refund Policy: Dissatisfied Clients may request refunds within 14 days of service delivery, refunds issued at Service Provider's discretion, no refunds for completed projects.
8. Confidentiality: Parties to maintain confidentiality of each other's information and not disclose it to third parties without prior written consent.


Prompt: What is the refund policy?


llama_perf_context_print:        load time =  197706.33 ms
llama_perf_context_print: prompt eval time =    4486.16 ms /    11 tokens (  407.83 ms per token,     2.45 tokens per second)
llama_perf_context_print:        eval time =   58393.40 ms /    80 runs   (  729.92 ms per token,     1.37 tokens per second)
llama_perf_context_print:       total time =   62927.79 ms /    91 tokens


Response:
1. If a client is dissatisfied with the Services, they may request a refund within 14 days of service delivery. 2. Refunds are issued at the sole discretion of the Service Provider and will be processed within 30 days of approval. 3. No refunds will be issued for completed projects that meet the specifications outlined in Exhibit A.

