# Install required packages

In [1]:
!pip install fastai

!pip install -U pypdf langchain langchain-community sentence-transformers faiss-cpu accelerate unstructured

!pip install -U langchain-huggingface

!pip install jq

!pip install bitsandbytes

# Install latest version of transformers from GitHub
!pip install git+https://github.com/huggingface/transformers

!pip install torch torchvision torchaudio
!pip install einops  # needed for transformer models

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<2.7,>=1.10->fastai)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<2.7,>=1.10->fastai)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<2.7,>=1.10->fastai)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<2.7,>=1.10->fastai)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<2.7,>=1.10->fastai)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch<2.7,>=1.10->fastai)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

# Setup

In [2]:
import os  # imported here because this cell runs before the shared imports cell

# Central configuration for the RAG pipeline; later cells read their settings
# (model ids, document paths, chunking and generation parameters) from this dict.
CONFIG = {
    # "model_name": "mistralai/Mistral-7B-Instruct-v0.1", # LLM
    # "model_name": "mistralai/Mistral-7B-Instruct-v0.3", # edited by ali (testing purpose) -> (after testing v3, answers are more accurate)
    "model_name": "meta-llama/Llama-2-13b-chat-hf", # bestest for Now
    # "model_name": "braindao/Qwen2.5-14B",  # very bad
    # Hugging Face access token. Never commit a token to the notebook: export
    # HF_TOKEN (or set it with `%env HF_TOKEN=...` / Colab secrets) before
    # running this cell. The value is None when the variable is not set.
    # SECURITY: the token that used to be hardcoded here must be revoked.
    "hf_token": os.environ.get("HF_TOKEN"),
    # "embedding_model": "BAAI/bge-large-en-v1.5", #edit by ali (testing purpose) -> (after testing it, answers are more accurate)
    "embedding_model": "BAAI/bge-m3",
    "doc_paths": ["/content/data.json"],
    # Context budget used by get_response (counted in whitespace-separated words).
    "max_context_tokens": 1024,
    # Chunking parameters for the character-based text splitter.
    "chunk_size": 260,
    "chunk_overlap": 100,
    # Keyword arguments forwarded to the text-generation pipeline.
    "gen_params": {
        "max_new_tokens": 512,
        "temperature": 0.7,
        "do_sample": True
    }
}

# Organized Dependencies

In [3]:
# Core libraries
import json
import logging
import os
import re # regex

import jq # for loading json
import numpy as np
import torch

# LangChain components
from langchain.document_loaders import CSVLoader, PyPDFLoader, TextLoader, JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_text_splitters import RecursiveJsonSplitter
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain.docstore.document import Document

# Transformers pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig

# DataSet Processing

In [4]:
def load_and_chunk_documents(paths: list) -> list:
    """Load every supported file in ``paths`` and split it into chunk Documents.

    Supported extensions are ``.csv``, ``.pdf``, ``.txt`` and ``.json``
    (matched case-insensitively). JSON files are split structurally with
    ``RecursiveJsonSplitter``; all other types go through the character-based
    splitter configured by ``CONFIG["chunk_size"]`` / ``CONFIG["chunk_overlap"]``.

    Args:
        paths: File paths to ingest.

    Returns:
        A flat list of ``Document`` chunks from all files, in input order.
        Files with an unsupported extension are skipped with a printed notice;
        a JSON document that fails to parse or split is reported and skipped
        without contributing partial chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CONFIG["chunk_size"],
        chunk_overlap=CONFIG["chunk_overlap"]
    )

    # reference : https://python.langchain.com/docs/how_to/recursive_json_splitter/
    json_splitter = RecursiveJsonSplitter(max_chunk_size=300)

    loaders = {
        '.csv': CSVLoader,
        '.pdf': PyPDFLoader,
        '.txt': TextLoader,
        '.json': lambda path: JSONLoader(file_path=path, jq_schema='.', text_content=False) # added by layal
    }

    all_docs = []
    for path in paths:
        # splitext only looks at the final path component, so dots in directory
        # names are ignored; lower() makes "DATA.JSON" behave like "data.json".
        ext = os.path.splitext(path)[1].lower()
        if ext not in loaders:
            # Make the skip visible instead of silently dropping the file.
            print(f"Skipping {path}: unsupported file type '{ext}'")
            continue

        docs = loaders[ext](path).load()

        if ext != '.json':
            all_docs.extend(text_splitter.split_documents(docs))
            continue

        for doc in docs:
            try:
                json_data = json.loads(doc.page_content)

                # The splitter works on dicts, so wrap a top-level array.
                if isinstance(json_data, list):
                    json_data = {"items": json_data}

                # Convert chunks to strings for Document compatibility.
                # Built as a complete list first so a failure midway does not
                # leave partial chunks of this document in the result.
                doc_chunks = [
                    Document(
                        page_content=json.dumps(chunk),
                        metadata=doc.metadata.copy()
                    )
                    for chunk in json_splitter.split_json(json_data)
                ]
            except Exception as e:
                # Best-effort ingestion: report the bad document and move on.
                print(f"JSON error in {path}: {e}")
                continue
            all_docs.extend(doc_chunks)

    return all_docs

In [5]:
def create_vector_store(docs: list):
    """Embed ``docs`` and index them in an in-memory FAISS vector store.

    Args:
        docs: Non-empty list of ``Document`` chunks to index.

    Returns:
        The FAISS vector store built from ``docs``.

    Raises:
        ValueError: If ``docs`` is empty. This is checked before the embedding
            model is loaded so the failure is immediate and explicit.
    """
    if not docs:
        raise ValueError("Cannot build a vector store from an empty document list")

    # Same device selection as initialize_model(): use the GPU when present,
    # otherwise fall back to CPU instead of crashing on a hardcoded "cuda".
    device = "cuda" if torch.cuda.is_available() else "cpu"

    embeddings = HuggingFaceEmbeddings(
        model_name=CONFIG['embedding_model'],
        model_kwargs={"device": device},
        encode_kwargs={
            # Unit-length vectors, so distance ranking matches cosine ranking.
            "normalize_embeddings": True
        }
    )
    return FAISS.from_documents(documents=docs, embedding=embeddings)

# Model Loading

In [6]:
def initialize_model():
    """Load the chat model named in CONFIG and wrap it for LangChain.

    With a CUDA device available the weights are loaded 4-bit quantized through
    bitsandbytes and placed by ``device_map="auto"``. Without CUDA the model is
    loaded unquantized on the CPU, because the quantized path expects a GPU.

    Returns:
        A ``HuggingFacePipeline`` wrapping a ``text-generation`` pipeline that
        uses ``CONFIG["gen_params"]`` as its generation settings.
    """
    use_cuda = torch.cuda.is_available()

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_name"], token=CONFIG["hf_token"])

    model_kwargs = {"token": CONFIG["hf_token"]}
    if use_cuda:
        # 4-bit quantization to shrink the memory footprint of the weights.
        model_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True)
        # Placement is delegated to from_pretrained: calling .to(device) on a
        # bitsandbytes-quantized model is rejected by some transformers /
        # bitsandbytes version combinations.
        model_kwargs["device_map"] = "auto"

    model = AutoModelForCausalLM.from_pretrained(CONFIG["model_name"], **model_kwargs)

    # Create pipeline
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        # Reuse EOS as the padding token id for generation.
        pad_token_id=tokenizer.eos_token_id,
        **CONFIG["gen_params"]
    )

    return HuggingFacePipeline(pipeline=pipe)

# Query Processing

In [7]:
def build_prompt(context, query, separator="\n\n"):
    """Render the instruction prompt sent to the LLM.

    Args:
        context: Iterable of retrieved text chunks (a single string is treated
            as one chunk).
        query: The user's question.
        separator: Text placed between consecutive chunks. Defaults to a blank
            line so the end of one chunk does not run into the start of the
            next; pass ``""`` to concatenate chunks directly.

    Returns:
        The full prompt string, ending with ``"Final Answer:"``.
    """
    # A bare string would otherwise be joined character by character.
    if isinstance(context, str):
        context = [context]
    context_text = separator.join(context)

    return f"""You're an expert analyst. Follow these rules:
1. Base answers on the context below
2. Use technical language


Context:
{context_text}

Question: {query}

Final Answer:"""

In [12]:
def get_response(query: str, vector_store, confidence_percentile: int = 60, k: int = 200) -> str:
    """Answer ``query`` with retrieval-augmented generation.

    Steps: retrieve up to ``k`` scored chunks from ``vector_store``, keep those
    whose score is at or below the ``confidence_percentile``-th percentile of
    the retrieved scores, build a prompt with ``build_prompt`` and run it
    through the module-level ``llm``.

    Args:
        query: The user's question.
        vector_store: Object exposing ``similarity_search_with_score(query, k=...)``
            that returns ``(document, score)`` pairs.
        confidence_percentile: Percentile (0-100) of the retrieved scores used
            as the inclusion cut-off. Assumes a lower score means a closer
            match (distance-style scores) -- confirm for the store in use.
        k: Maximum number of candidates to retrieve.

    Returns:
        The generated answer text. If retrieval yields nothing, the prompt is
        sent with an empty context and a warning is logged.
    """
    # RAG core (Retrieval-Augmented Generation)
    # Retrieve relevant documents
    candidate_results = vector_store.similarity_search_with_score(query, k=k)

    # Build context
    context = []
    if candidate_results:
        scores = np.array([score for _, score in candidate_results])
        threshold = np.percentile(scores, confidence_percentile)

        current_tokens = 0
        for doc, score in candidate_results:
            # Budget is measured in whitespace-separated words; the chunk that
            # crosses the limit is still included, later ones are not.
            if current_tokens >= CONFIG["max_context_tokens"]:
                break
            # "<=" (not "<") so a single candidate, or candidates with tied
            # scores, are not all filtered out by their own percentile.
            if score <= threshold:
                context.append(doc.page_content)
                current_tokens += len(doc.page_content.split())
    else:
        logging.warning("No documents retrieved for query %r; prompting without context", query)

    # Generate response
    prompt = build_prompt(context, query)

    llm_output = llm.invoke(prompt).strip()

    # When the model output echoes the prompt, the answer is whatever follows
    # it. Cutting the prompt off by position is safer than searching, because
    # retrieved context may itself contain the words "final answer:".
    if llm_output.startswith(prompt):
        return llm_output[len(prompt):].strip()

    # Otherwise extract the text after a "Final Answer:" marker, if any.
    final_answer = re.search(
        r'final\s+answer\s*:\s*([\s\S]*)',
        llm_output,
        re.IGNORECASE
    )

    return final_answer.group(1).strip() if final_answer else llm_output

# TODO:
# 1. Input Validation:
#    - Ensure 'query' is a non-empty string.
#    - Check that 'vector_store' has the required methods (e.g., similarity_search_with_score).
#
# 2. Robust Retrieval:
#    - Handle the case when no candidate documents are returned.
#    - Log or raise an error if the candidate_results list is empty.
#
# 3. Context Building Enhancements:
#    - Consider refining the token counting logic (e.g., using a tokenizer instead of split()).
#    - Allow for different strategies to select context (e.g., based on score ranking or alternative thresholds).
#
# 4. Dynamic Prompt Construction:
#    - Externalize the prompt template for easier adjustments.
#    - Consider parameterizing the context separator (e.g., newline vs. space).
#
# 5. LLM Invocation Handling:
#    - Implement error handling around the LLM invocation (e.g., timeouts or connection issues).
#    - Provide fallback strategies if the LLM output is not as expected.
#
# 6. Response Extraction:
#    - Improve the regex pattern to handle more diverse LLM outputs.
#    - Consider alternative parsing strategies (e.g., using structured output or markers in the prompt).
#
# 7. Logging and Monitoring:
#    - Log key steps (e.g., candidate selection, prompt generation, LLM response) for debugging.
#
# 8. Testing:
#    - Create unit tests to cover edge cases, including:
#       * Empty or null query.
#       * No matching documents.
#       * LLM returning unexpected formats.
#
# 9. Performance Optimization:
#    - Optimize the loop for context building, perhaps by using batch processing of documents.
#    - Monitor and log the performance of similarity_search_with_score and context assembly.
#
# 10. Documentation:
#    - Enhance the function docstring with detailed explanations of parameters, process, and potential exceptions.

# Process Flow

In [11]:
# Load the model once and keep the LangChain wrapper in the global `llm`,
# which get_response() looks up at call time.
print("Initializing The LLM")
llm = initialize_model()

Initializing The LLM


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/587 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Device set to use cuda:0


In [9]:
# Load and chunk every path listed in CONFIG["doc_paths"].
# Note: the count printed below is the number of chunks returned, not the
# number of source files.
print("Loading documents...")
documents = load_and_chunk_documents(CONFIG["doc_paths"])
print(f"Loaded {len(documents)} documents")

Loading documents...
Loaded 3 documents


In [10]:
# Embed the chunks and build the FAISS index; `vector_store` is the global
# that LLM_PREDICTION() and the query loop pass to get_response().
vector_store = create_vector_store(documents)
print(f"Loaded {len(documents)} document chunks")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/15.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

Loaded 3 document chunks


In [16]:
# Manual smoke test: run each active query through the RAG pipeline and print
# the answer. Commented-out entries are earlier test questions kept for reference.
queries = [
    # 'highest gpa in the students?',
    # 'what is NLP',
    # """
    # # 'How many publications does Dr. Alice Johnson have?',
    # # 'How many publications does Dr. John Smith have?',
    # 'how many doctors we have?',
    # 'what are the publications of each Doctor'
    # 'Is there any publication about statistics',
    # 'list all the publications plz',
    # 'did any professor publish about ai'
    # 'do we have computer science major?',
    # 'what are the majors in the uni?',
    # 'what info do u have about every student?',
    # 'Calculate the average gpa of students?',
    #'How many students study computer science?',
    # 'How many faculties are there?',
    # 'list the members in well form',
    # 'What do you know about Dr. John Smith',
    # 'major of Emily Davis',
    # 'list the Members in Faculty of Computer Science only?',
    # 'introduce the student, Michael Brown' # all is about prompt enginnering now,
    # 'introduce all the publications of doctors'
    # 'list the faculities names'
    # 'give me total grades ',
    'how many credits per semester for the Computer Science'
]
# ali assi: to ensure more well-formed answers, we can achieve this by refining the response
# Requires `vector_store` and `llm` to have been created by the cells above.
for query in queries:
    print('******************************************************************')
    print(f"Query: {query}")
    print(f"Response: {get_response(query, vector_store)}")

******************************************************************
Query: how many credits per semester for the Computer Science
Response: Based on the provided context, the Computer Science major at the Lebanese University has 30 credits per semester.


# Deployment

### Steps:

1. **Create the Flask app**  
   - Install Flask if needed.
   - Initialize a basic Flask application.

2. **Implement the routes**  
   - Define a simple `/predict` route that processes requests.

3. **Create a tunnel between Colab and your local machine**  
   - Use `pyngrok` (the package installed and used in the cells below).

4. **Set up the tunnel routes**  
   - Ensure the tunnel is correctly routing requests from the public URL to your Colab-hosted Flask server.

In [None]:
!pip install flask pyngrok
!pip install flask-cors

Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Downloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.3
Collecting flask-cors
  Downloading flask_cors-5.0.1-py3-none-any.whl.metadata (961 bytes)
Downloading flask_cors-5.0.1-py3-none-any.whl (11 kB)
Installing collected packages: flask-cors
Successfully installed flask-cors-5.0.1


In [None]:
!ngrok config add-authtoken 2t4cKyto0m9BRS4ht7JfpFiftTR_6QvcFvTCuVfLGk87EGSjX

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
# Helper Function (abstract)
def LLM_PREDICTION(data):
    """Return the RAG answer for ``data``, or ``"no data"`` when it is falsy.

    Thin adapter used by the Flask route: forwards the input to
    ``get_response`` together with the global ``vector_store``.
    """
    return get_response(data, vector_store) if data else "no data"

In [None]:
# Flask application that exposes the RAG pipeline over HTTP.
from flask import Flask, jsonify, request
from flask_cors import CORS
import threading  # used later to start the ngrok tunnel in a separate thread

app = Flask(__name__)

# Configure CORS properly
# Only the /predict resource gets CORS headers; "*" lets any origin call it.
CORS(app, resources={r"/predict": {"origins": "*"}}, methods=["GET"])  # Allow only GET

<flask_cors.extension.CORS at 0x7db5a5b3ca10>

In [None]:
# USELESS BUT It is for testing purposes
@app.route('/', methods=['GET'])
def index():
    """Root endpoint that returns a fixed greeting; handy as a liveness check."""
    return "Hello World...."

In [None]:
@app.route('/predict', methods=['GET'])
def predict():
    """Answer the question passed in the ``input`` query parameter.

    Responds with JSON ``{"result", "data_received"}`` on success, or with
    ``{"error": "No input provided"}`` and HTTP 400 when ``input`` is missing
    or empty.
    """
    input_text = request.args.get('input')  # Extract query parameter

    # Guard clause: reject requests without a usable input.
    if not input_text:
        return jsonify({"error": "No input provided"}), 400

    print(f"Received input: {input_text}")
    payload = {
        "result": LLM_PREDICTION(input_text),
        "data_received": input_text
    }
    return jsonify(payload)

In [None]:
def start_ngrok(port: int = 5000):
    """Open a public ngrok tunnel to the local server and print its URL.

    The auth token is read from the ``NGROK_AUTHTOKEN`` environment variable.
    When the variable is unset no token is installed here, so a token already
    saved in ngrok's configuration (if any) is left untouched instead of being
    overwritten by a placeholder.

    Args:
        port: Local port to expose; must match the port Flask listens on.

    Returns:
        The public URL of the tunnel.
    """
    from pyngrok import ngrok

    auth_token = os.environ.get("NGROK_AUTHTOKEN")
    if auth_token:
        ngrok.set_auth_token(auth_token)

    ngrok.kill()  # Clean up existing tunnels
    public_url = ngrok.connect(port, bind_tls=True).public_url
    # this is the api
    print(f"Public URL: {public_url}")
    return public_url

In [None]:
# Start ngrok in a separate thread
# Start ngrok in a separate thread
# (app.run() below blocks this cell, so the tunnel must be opened elsewhere).
threading.Thread(target=start_ngrok).start()

# Run Flask with proper host and port configuration
# Port 5000 must match the port start_ngrok() tunnels to.
app.run(host='0.0.0.0', port=5000)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


Public URL: https://dc88-34-142-187-128.ngrok-free.app
