# Setting up Google Colab and Hugging Face API

Open this notebook in [colab](https://colab.research.google.com/github/Chair-of-Banking-and-Finance/Bachelor_thesis_24_25_Template/blob/main/Llama_RAG/LAMA_3_local_RAG_v2.ipynb).

## Getting a Hugging Face API Token
1. **Create a Hugging Face account**: Go to [Hugging Face](https://huggingface.co/) and create an account if you don’t already have one.
2. **Generate an API Token**: After logging in, click on your profile icon in the top right corner, and go to "Settings".
3. **Access Tokens**: On the settings page, navigate to the "Access Tokens" tab.
4. **Create a new token**: Click on "New Token", give it a name, and set the role to "write". This token will be used to authenticate and download models.
5. **Copy the Token**: Copy the generated token and replace the `Hugging_Face_Token` variable in the script with your token.

---


In [1]:
Hugging_face_token = "hf_XXXXXXXXXXXXXXXXXXXXXXXXXX"
from huggingface_hub import login
login(token=Hugging_face_token)
import os

os.environ["HF_TOKEN"] = Hugging_face_token


Visit [Hugging Face's model page for Llama 2](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) and request access to the model.

### Install Required Libraries

In [2]:
# Install basic dependencies
!pip install -q transformers
!pip install -q sentence-transformers
!pip install -q torch
!pip install -q PyPDF2
!pip install -q tqdm
!pip install -q hnswlib
!pip install -q bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m107.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m80.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m61.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

### Import Libraries

In [3]:
import logging
import os
import torch
if torch.cuda.is_available():
    torch.cuda.empty_cache()
import numpy as np
import hnswlib
from typing import List, Tuple, Optional
from dataclasses import dataclass
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader
from tqdm import tqdm

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

In [4]:
@dataclass
class RetrievedDocument:
    """Data class for storing retrieved documents and their metadata."""
    content: str
    similarity_score: float
    source: str = ""

class DocumentProcessor:
    """Handles document loading and preprocessing."""

    @staticmethod
    def load_document(file_path: str) -> Optional[str]:
        """Load document content from various file formats."""
        try:
            if file_path.endswith('.pdf'):
                with open(file_path, 'rb') as file:
                    reader = PdfReader(file)
                    return ' '.join(page.extract_text() for page in reader.pages)

            elif file_path.endswith('.txt'):
                with open(file_path, 'r', encoding='utf-8') as file:
                    return file.read()
            else:
                logger.warning(f"Unsupported file format: {file_path}")
                return None

        except Exception as e:
            logger.error(f"Error loading document {file_path}: {e}")
            return None

class HNSWRetriever:
    """Document retrieval system using HNSWLib for efficient similarity search."""

    def __init__(self,
                 embedding_model: str = 'BAAI/bge-small-en-v1.5',
                 space: str = 'cosine',
                 ef_construction: int = 200,
                 M: int = 16):
        """
        Initialize the retriever with HNSWLib index.

        Args:
            embedding_model: Name of the embedding model
            space: Distance metric ('cosine', 'l2', 'ip')
            ef_construction: Number of neighbors to consider during index construction
            M: Number of bi-directional links created for every new element
        """
        self.embedding_model = SentenceTransformer(embedding_model)
        self.documents = []
        self.document_sources = []
        self.index = None
        self.space = space
        self.ef_construction = ef_construction
        self.M = M

    def add_documents(self, documents: List[str], sources: List[str] = None) -> None:
        """Add documents to the retrieval system."""
        if not documents:
            logger.warning("No documents provided for indexing")
            return

        logger.info(f"Adding {len(documents)} documents to the index")

        # Store documents and their sources
        start_idx = len(self.documents)
        self.documents.extend(documents)
        if sources:
            self.document_sources.extend(sources)

        # Create embeddings
        embeddings = self.embedding_model.encode(
            documents,
            show_progress_bar=True,
            batch_size=32
        )

        # Initialize or update HNSWLib index
        if self.index is None:
            dimension = embeddings.shape[1]
            self.index = hnswlib.Index(space=self.space, dim=dimension)
            self.index.init_index(
                max_elements=len(documents) * 2,  # Allow for future additions
                ef_construction=self.ef_construction,
                M=self.M
            )
            self.index.add_items(embeddings, list(range(len(documents))))
        else:
            self.index.resize_index(len(self.documents))
            self.index.add_items(
                embeddings,
                list(range(start_idx, start_idx + len(documents)))
            )

        logger.info("Documents successfully indexed")

    def retrieve(self, query: str, top_k: int = 3) -> List[RetrievedDocument]:
        """Retrieve most relevant documents for a query."""
        if not self.documents:
            logger.warning("No documents in the index")
            return []

        # Encode query
        query_embedding = self.embedding_model.encode([query])

        # Search index
        try:
            # Get nearest neighbors
            labels, distances = self.index.knn_query(query_embedding, k=min(top_k, len(self.documents)))

            # Convert distances to similarities if using cosine space
            if self.space == 'cosine':
                similarities = 1 - distances[0]
            else:
                similarities = -distances[0]  # Convert distance to similarity

            # Package results
            results = []
            for idx, similarity in zip(labels[0], similarities):
                source = self.document_sources[idx] if self.document_sources else ""
                results.append(RetrievedDocument(
                    content=self.documents[idx],
                    similarity_score=float(similarity),
                    source=source
                ))

            return results

        except Exception as e:
            logger.error(f"Error during retrieval: {e}")
            return []

class HNSWRAGPipeline:
    """RAG pipeline using HNSWLib for retrieval."""

    def __init__(
        self,
        model_name: str = 'meta-llama/Llama-2-7b-chat-hf',
        embedding_model: str = 'BAAI/bge-small-en-v1.5',
        device: str = 'auto',
        load_in_4bit: bool = True
    ):
        """
        Initialize the RAG pipeline.

        Args:
            model_name: Name of the language model to use
            embedding_model: Name of the embedding model
            device: Device to use ('auto', 'cuda', 'cpu')
            load_in_4bit: Whether to load model in 4-bit precision
        """
        logger.info(f"Initializing HNSWRAGPipeline with model: {model_name}")

        # Initialize retriever
        self.retriever = HNSWRetriever(embedding_model)

        # Initialize tokenizer and model
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                use_fast=True,
                trust_remote_code=True
            )

            if device == 'auto':
                device = 'cuda' if torch.cuda.is_available() else 'cpu'

            # Configure model loading based on available resources
            model_kwargs = {
                "device_map": device,
                "torch_dtype": torch.float16 if device == 'cuda' else torch.float32,
                "low_cpu_mem_usage": True
            }

            if device == 'cuda' and load_in_4bit:
                try:
                    from transformers import BitsAndBytesConfig

                    model_kwargs.update({
                        "quantization_config": BitsAndBytesConfig(
                            load_in_4bit=True,
                            bnb_4bit_compute_dtype=torch.float16,
                            bnb_4bit_use_double_quant=True,
                            bnb_4bit_quant_type="nf4"
                        )
                    })
                except ImportError:
                    logger.warning("bitsandbytes not available, falling back to 16-bit")

            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                **model_kwargs
            )
        except Exception as e:
            logger.error(f"Error loading model: {e}")
            raise

        logger.info("Pipeline initialized successfully")

    def add_documents(self, file_paths: List[str]) -> None:
        """Add documents to the retrieval system."""
        documents = []
        valid_sources = []

        for path in tqdm(file_paths, desc="Loading documents"):
            content = DocumentProcessor.load_document(path)
            if content:
                documents.append(content)
                valid_sources.append(path)

        self.retriever.add_documents(documents, valid_sources)

    def _format_prompt(self, query: str, retrieved_docs: List[RetrievedDocument]) -> str:
        """Format the prompt with retrieved context."""
        context_str = "\n\n".join(
            f"[Document {i+1} (Relevance: {doc.similarity_score:.2f})]\n{doc.content}"
            for i, doc in enumerate(retrieved_docs)
        )

        return f"""[INST]
Using the following retrieved documents as context, please answer the question.
If the context doesn't contain relevant information, use your general knowledge
but indicate this in your response.

Context:
{context_str}

Question: {query}

Please provide a comprehensive and accurate answer based on the provided context.
If the context is insufficient, indicate what information comes from your general knowledge.[/INST]"""

    def generate_response(
        self,
        prompt: str,
        max_new_tokens: int = 512,
        temperature: float = 0.7,
        top_p: float = 0.9
    ) -> str:
        """Generate a response using the language model."""
        try:
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                max_length=2048
            ).to(self.model.device)

            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )

            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            response = response.split("[/INST]")[-1].strip()

            return response

        except Exception as e:
            logger.error(f"Error generating response: {e}")
            return "I apologize, but I encountered an error while generating the response."

    def query(
        self,
        query: str,
        top_k: int = 3,
        max_new_tokens: int = 512
    ) -> Tuple[str, List[RetrievedDocument]]:
        """Process a query through the complete RAG pipeline."""
        logger.info(f"Processing query: {query}")

        # Retrieve relevant documents
        retrieved_docs = self.retriever.retrieve(query, top_k=top_k)

        if not retrieved_docs:
            logger.warning("No relevant documents found")
            prompt = f"""[INST]Please answer this question using your general knowledge:
{query}[/INST]"""
        else:
            prompt = self._format_prompt(query, retrieved_docs)

        # Generate response
        response = self.generate_response(prompt, max_new_tokens=max_new_tokens)

        return response, retrieved_docs

In [7]:
# DELETE IF YOU WORK WITH THE REAL DATA, ONLY AN EXAMPLE
# Define the text to be written to the file
roman_empire_text = """
The Roman Empire: An Overview
The Roman Empire was one of the most influential civilizations in human history, spanning over a millennium and leaving a legacy that shaped the world in areas such as governance, architecture, engineering, and law. Officially beginning in 27 BCE with the rise of Augustus Caesar, Rome transitioned from a republic to an empire, dominating vast territories that stretched from Britain in the northwest to Egypt in the southeast.

Formation and Expansion
The Roman Empire's foundation was built on centuries of conquest during the Roman Republic. Under Augustus, the empire ushered in a period of peace and stability known as the Pax Romana (Roman Peace), lasting about 200 years. During this time, Rome expanded its borders, solidifying control over Europe, North Africa, and parts of the Middle East.

The empire was characterized by a vast network of cities connected by advanced roads and aqueducts, facilitating trade, military movements, and cultural exchange. Notable conquests include Gaul (modern-day France) under Julius Caesar, the annexation of Egypt after Cleopatra's defeat, and the consolidation of power in regions such as Spain and the Balkans.

Culture and Society
Roman society was highly stratified, with a clear distinction between the elite patricians, common plebeians, and enslaved individuals. Roman culture blended Latin traditions with influences from Greece and the regions it conquered. This fusion led to remarkable achievements in literature (Virgil’s Aeneid), philosophy (Cicero, Seneca), and architecture (the Colosseum, aqueducts, and the Pantheon).

The Roman Empire was also a melting pot of religions. Initially polytheistic, it later became a cradle for Christianity, with Emperor Constantine legalizing the faith in 313 CE and Emperor Theodosius I declaring it the state religion by 380 CE.

Governance and Law
Rome was renowned for its administrative prowess and legal systems. The empire was divided into provinces, each governed by an appointed official. Roman law, codified in the Twelve Tables and later expanded, formed the foundation for many modern legal systems. Concepts like innocent until proven guilty and legal representation have their roots in Roman jurisprudence.

Decline and Fall
The decline of the Roman Empire was a gradual process influenced by internal and external factors. Political instability, economic struggles, and military overreach weakened the empire. The division of the empire into Eastern and Western halves in 395 CE further strained its cohesion. While the Western Roman Empire fell in 476 CE after being overrun by Germanic tribes, the Eastern Roman Empire, known as the Byzantine Empire, endured for another thousand years until the fall of Constantinople in 1453.

Legacy
The Roman Empire profoundly shaped Western civilization. Its contributions to governance, infrastructure, and culture remain influential today. Latin, the language of Rome, evolved into the Romance languages (Italian, French, Spanish, etc.), and Roman architecture inspired countless generations. The very concept of a republic and the rule of law owe much to Rome’s enduring influence.

In essence, the Roman Empire stands as a testament to humanity’s capacity for organization, innovation, and adaptation, making it a cornerstone of global history.
"""

# # Specify the directory and file name
# output_dir = "./data"
# file_name = "roman_empire_overview.txt"
# file_path = os.path.join(output_dir, file_name)

# # Ensure the output directory exists; if not, create it
# os.makedirs(output_dir, exist_ok=True)

# # Write the text to the file with UTF-8 encoding
# with open(file_path, 'w', encoding='utf-8') as file:
#     file.write(roman_empire_text)

In [9]:
# Specify the directory and file name
output_dir = "./data"
#file_name = "roman_empire_overview.txt"
file_path = os.path.join(output_dir, file_name)

# Ensure the output directory exists; if not, create it
os.makedirs(output_dir, exist_ok=True)

In [10]:
# DELETE IF YOU WORK WITH THE REAL DATA, ONLY AN EXAMPLE
# Write the text to the file with UTF-8 encoding
with open(file_path, 'w', encoding='utf-8') as file:
    file.write(roman_empire_text)

In [11]:
# First, clear any existing CUDA memory
import torch
if torch.cuda.is_available():
  torch.cuda.empty_cache()

In [12]:
# Initialize pipeline forcing CPU mode for all components
rag = HNSWRAGPipeline(
    model_name='meta-llama/Llama-2-7b-chat-hf'
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [18]:
import os

# Document directory
document_dir = "data"

# Get document files
document_files = [
    os.path.join(document_dir, f)
    for f in os.listdir(document_dir)
    if os.path.isfile(os.path.join(document_dir, f))
]

# Add documents to RAG
rag.add_documents(document_files)

def generate_clean_answer(query, retrieved_docs):
    if retrieved_docs:
        # Sort and filter relevant documents
        docs_sorted = sorted(retrieved_docs, key=lambda d: d.similarity_score, reverse=True)
        relevant_docs = [doc for doc in docs_sorted if doc.similarity_score > 0.5]

        if not relevant_docs:
            return "No relevant information found."

        # Create context from relevant docs
        context = "\n".join([doc.content[:5000] for doc in relevant_docs])

        # Create prompt
        prompt = f"""[INST]Based on the provided context, answer this question concisely: {query}.
If the information is not in the context, just say 'Information not found in the available context.'[/INST]"""

        # Get response
        response = rag.generate_response(prompt)

        # Clean response
        response = response.replace('[INST]', '').replace('[/INST]', '')
        response = response.replace('Based on the provided context,', '')
        response = response.replace('Based on the context,', '')
        response = response.strip()

        return response
    else:
        return "No relevant information found."

# Query and get clean response
query = "What are the different segments in which Ameriprise financial operates and what amount does each segment constitute to the total allocated capital in percentage in Q4 2018"
response, retrieved_docs = rag.query(query)
answer = generate_clean_answer(query, retrieved_docs)
print(answer)


Loading documents: 100%|██████████| 2/2 [00:15<00:00,  7.61s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Ameriprise Financial operates in the following segments:

1. Advice & Wealth Management: This segment provides financial planning, investment management, and other wealth management services to individuals, businesses, and institutions. In Q4 2018, this segment constituted 74% of the total allocated capital.
2. Asset Management: This segment manages and distributes mutual funds, exchange-traded funds (ETFs), and other investment products. In Q4 2018, this segment constituted 17% of the total allocated capital.

Information not found in the available context.
