# Now combining the entire code into a single cell

The free Colab environment couldn't run the 16GB model and the embedding model at the same time without crashing. I solved this by implementing a memory-clearing step in the code. Once the embedding model is done creating the knowledge base, it's removed from the GPU, which frees up enough memory for the main model to work correctly.

# The Solution: Free Up Memory
The solution is to be smarter about memory management. Once we have finished creating our vector_store, we don't need the embeddings model anymore. We can safely delete it from memory to free up precious VRAM for the main LLM to use.



In [None]:
# app.py
# This script contains the full backend logic for our Financial Analyst application.

import os
import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader

# --- 1. Model Loading ---
def load_model_and_tokenizer(drive_model_path):
    """
    Load a previously saved causal language model and its tokenizer.

    Args:
        drive_model_path: Directory containing the saved model files
            (for example a folder on a mounted Google Drive).

    Returns:
        A ``(model, tokenizer)`` tuple, or ``(None, None)`` when the
        directory does not exist.
    """
    print(f"Checking for model at: {drive_model_path}")

    # Guard clause: without the directory there is nothing to load.
    model_dir_present = os.path.exists(drive_model_path)
    if not model_dir_present:
        print("❌ Model not found. Please ensure the model is downloaded and the path is correct.")
        return None, None

    print("✅ Model already exists on Google Drive. Loading...")
    tokenizer = AutoTokenizer.from_pretrained(drive_model_path)

    # Load the weights in bfloat16 and let the library choose device placement.
    load_options = {
        "torch_dtype": torch.bfloat16,
        "device_map": "auto",
    }
    model = AutoModelForCausalLM.from_pretrained(drive_model_path, **load_options)

    if model and tokenizer:
        print("✅ Llama 3.1 8B Model and Tokenizer are loaded and ready.")

    return model, tokenizer

# --- 2. LangChain Pipeline Creation ---
def create_llm_pipeline(model, tokenizer):
    """
    Wrap a loaded model and tokenizer in a LangChain text-generation LLM.

    Args:
        model: The loaded causal language model, or ``None`` if loading failed.
        tokenizer: The matching tokenizer, or ``None`` if loading failed.

    Returns:
        A ``HuggingFacePipeline`` instance, or ``None`` when either the model
        or the tokenizer is missing.
    """
    if not (model and tokenizer):
        print("❌ Cannot create pipeline because model or tokenizer was not loaded.")
        return None

    # Generation settings, gathered in one place so they are easy to tune.
    generation_settings = {
        "max_new_tokens": 512,
        "temperature": 0.1,
        "top_p": 0.95,
        "repetition_penalty": 1.15,
        # Return only the newly generated text, not the prompt.
        "return_full_text": False,
    }
    text_generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        **generation_settings,
    )

    llm = HuggingFacePipeline(pipeline=text_generator)
    print("✅ Llama 3.1 8B LLM Pipeline created and ready!")
    return llm

# --- 3. RAG Pipeline (Vector Store Creation) ---
def build_vector_store(pdf_file_path):
    """
    Load a PDF, split it into chunks, and index the chunks in a FAISS vector store.

    Args:
        pdf_file_path: Path to the PDF file to index.

    Returns:
        The populated FAISS vector store, or ``None`` when the file does not
        exist or no text chunks could be extracted from it.
    """
    import gc

    if not os.path.exists(pdf_file_path):
        print(f"❌ PDF file not found at '{pdf_file_path}'.")
        return None

    print("Loading PDF...")
    loader = PyPDFLoader(pdf_file_path)
    pages = loader.load()
    print(f"PDF loaded successfully. It has {len(pages)} pages.")

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=150)
    docs = text_splitter.split_documents(pages)
    print(f"Document split into {len(docs)} chunks.")

    # A PDF without extractable text (e.g. scanned images) produces zero chunks,
    # and building a FAISS index from an empty list fails. Report it the same
    # way as a missing file instead of crashing after loading the embedder.
    if not docs:
        print("❌ No text could be extracted from the PDF, so no vector store was built.")
        return None

    embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
    print("Embedding model loaded.")

    print("Creating FAISS vector store from document chunks... This may take a moment.")
    vector_store = FAISS.from_documents(docs, embeddings)
    print("✅ Vector store created successfully!")

    # The vector store holds on to the embedding model because it has to embed
    # every incoming query, so deleting the local name would not unload it.
    # What can be reclaimed here is the temporary memory used while encoding
    # the chunks: collect unreachable objects, then release PyTorch's unused
    # cached GPU blocks.
    gc.collect()
    torch.cuda.empty_cache()
    print("✅ Temporary embedding memory cleared.")

    return vector_store

# --- 4. Q&A Function ---
def ask_analyst(question, vector_store, llm):
    """
    Answer a question using the most relevant report chunks as context.

    Args:
        question: The user's question as plain text.
        vector_store: Vector store used to retrieve the three most similar chunks.
        llm: LangChain LLM that generates the final answer.

    Returns:
        The generated answer with surrounding whitespace stripped, or an
        error string when the vector store or the LLM is missing.
    """
    if not (vector_store and llm):
        return "Error: Vector store or LLM not initialized."

    print(f"\nAnalysing report for the question: '{question}'")

    # Retrieve the top three matching chunks and merge them into one context block.
    hits = vector_store.similarity_search(question, k=3)
    context = "\n\n".join(hit.page_content for hit in hits)

    # Prompt written in the Llama 3 chat format: system instructions, then the
    # user turn carrying the context and question, then the assistant header.
    template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are an expert financial analyst. Your task is to answer the user's question based *only* on the provided text from the company's annual report. Be precise and cite specific numbers or facts from the text. If the answer is not in the provided text, say 'The answer is not available in the provided context.'<|eot_id|><|start_header_id|>user<|end_header_id|>
CONTEXT FROM ANNUAL REPORT:
---
{context}
---

QUESTION: {question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

    chain = LLMChain(
        prompt=PromptTemplate(template=template, input_variables=["context", "question"]),
        llm=llm,
    )

    print("\nGenerating final answer with LangChain...")
    chain_inputs = {"context": context, "question": question}
    result = chain.invoke(chain_inputs)

    answer_text = result["text"]
    return answer_text.strip()

# --- Main Execution Block (for testing) ---
# This block will run if you execute the script directly.
# Tomorrow, we will replace this with our Streamlit UI code.
if __name__ == "__main__":
    # Locations of the saved model and the annual report on the mounted Drive.
    repo_id = "meta-llama/Llama-3.1-8B-Instruct"
    drive_model_path = f"/content/drive/MyDrive/llama_model/{repo_id.replace('/', '_')}"
    pdf_file_path = "/content/drive/MyDrive/Annual report financial analyst/microsoft_2024_annual_report.pdf"

    # Stage 1: load the model and tokenizer from disk.
    model, tokenizer = load_model_and_tokenizer(drive_model_path)

    # Stage 2: wrap them in a LangChain-compatible LLM.
    llm = create_llm_pipeline(model, tokenizer)

    # Stage 3: index the PDF into a vector store.
    vector_store = build_vector_store(pdf_file_path)

    # Stage 4: run a sample question, but only if both pieces initialised.
    if not (vector_store and llm):
        print("\nCould not run analysis due to initialization errors.")
    else:
        q1 = "What were the total revenues for the most recent fiscal year?"
        answer = ask_analyst(q1, vector_store, llm)
        print("\n--- Analyst's Answer ---")
        print(answer)


# As you can see, we got the answer as expected.
# In this way, we built a RAG-based model using Llama.