In [2]:
# Import necessary libraries
import os
import json
import re
from typing import List, Dict, Any, Tuple, Union
from dataclasses import dataclass
from PyPDF2 import PdfReader
import numpy as np
import faiss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm
import pickle
from langchain.text_splitter import RecursiveCharacterTextSplitter
from torch.utils.data import DataLoader, Dataset

# Import Transformers for Qwen2
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch

In [3]:
torch.cuda.empty_cache()

In [4]:
# Choose the model variant
model_name = "meta-llama/Llama-2-7b-hf"  # or "meta-llama/Llama-2-7b-chat-hf"


# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,  # Switch to Float32 for better compatibility
    device_map='auto'
)

# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

print(f"Using device: {device}")




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


Using device: cuda


In [5]:
batch_size = 128  # Adjust this based on your GPU memory
num_workers = 8 # Number of CPU workers for DataLoader

In [6]:
class TextDataset(Dataset):
    """Custom Dataset for batching text inputs"""
    def __init__(self, texts):
        self.texts = texts
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        return self.texts[idx]

In [7]:
def get_embedding(text: str) -> List[float]:
    """Get embeddings from LLaMA 2"""
    inputs = tokenizer(text, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    # Take the mean of the last hidden state as the embedding
    embedding = outputs.hidden_states[-1].mean(dim=1).squeeze().cpu().numpy().tolist()
    return embedding


def get_embeddings_batch(texts: List[str], batch_size: int = 50) -> List[List[float]]:
    """Get embeddings for multiple texts with progress bar"""
    embeddings = []
    
    with tqdm(total=len(texts), desc="Generating embeddings") as pbar:
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            batch_embeddings = [get_embedding(text) for text in batch]
            embeddings.extend(batch_embeddings)
            pbar.update(len(batch))
    return embeddings

def get_embeddings_parallel(texts, batch_size=64, num_workers=4):
    """Parallelized embedding generation using DataLoader and FP16"""
    dataset = TextDataset(texts)
    loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers)
    
    embeddings = []
    with torch.no_grad(), torch.cuda.amp.autocast():
        with tqdm(total=len(texts), desc="Generating embeddings") as pbar:
            for batch in loader:
                # Tokenize in batch for faster processing
                inputs = tokenizer(list(batch), return_tensors="pt", padding=True, truncation=True).to(device)
                outputs = model(**inputs, output_hidden_states=True)
                
                # Extract embeddings from the last hidden state
                batch_embeddings = outputs.hidden_states[-1].mean(dim=1).cpu().numpy().tolist()
                embeddings.extend(batch_embeddings)
                pbar.update(len(batch))
                
    return embeddings


def get_completion(
    messages: str,
    max_length: int = 200,
    temperature: float = 0.7
) -> str:
    """Get completion from LLaMA 2"""
    input_ids = tokenizer(messages, return_tensors="pt").input_ids.to(device)
    
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
    
    completion = tokenizer.decode(output[0], skip_special_tokens=True)
    return completion



In [8]:
# Document Class
@dataclass
class Document:
    content: str
    metadata: Dict[str, Any] = None


# Document Processor
class DocumentProcessor:
    @staticmethod
    def clean_text(text: str) -> str:
        text = re.sub(r"\.{2,}", "", text)
        text = re.sub(r"\s*\u2002\s*", " ", text)
        text = re.sub(r"\s+", " ", text)

        lines = []
        for line in text.split("\n"):
            line = line.strip()
            if not line or all(c in ".-" for c in line):
                continue
            if line.startswith("=== Page"):
                lines.append(line)
                continue
            if re.match(r"^\d+[-–]\d+$", line):
                continue
            lines.append(line)

        return "\n".join(lines)

    @staticmethod
    def load_pdf(file_path: str) -> str:
        text = ""
        with open(file_path, "rb") as file:
            reader = PdfReader(file)
            total_pages = len(reader.pages)

            with tqdm(total=total_pages, desc="Loading PDF pages") as pbar:
                for page_num, page in enumerate(reader.pages, 1):
                    page_text = page.extract_text()
                    text += f"\n=== Page {page_num} ===\n{page_text}"
                    pbar.update(1)

        print("Cleaning extracted text...")
        return DocumentProcessor.clean_text(text)

    @staticmethod
    def chunk_text(
        text: str, chunk_size: int = 1000, chunk_overlap: int = 200
    ) -> List[Document]:
        separators = ["\n=== Page", "\n\n", "\n", ". ", "? ", "! "]

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=separators,
            length_function=len,
            is_separator_regex=False,
        )

        chunks = splitter.split_text(text)
        processed_chunks = []

        with tqdm(chunks, desc="Processing chunks", unit="chunk") as pbar:
            for chunk in pbar:
                chunk = DocumentProcessor.clean_text(chunk)
                if not chunk.startswith("=== Page"):
                    sentence_boundaries = [". ", "? ", "! "]
                    first_boundary = float("inf")
                    for boundary in sentence_boundaries:
                        pos = chunk.find(boundary)
                        if pos != -1 and pos < first_boundary:
                            first_boundary = pos
                    if first_boundary < 100 and first_boundary != float("inf"):
                        chunk = chunk[first_boundary + 2 :]

                if chunk.strip():
                    processed_chunks.append(Document(content=chunk))
                pbar.set_postfix({"total_chunks": len(processed_chunks)})

        print(f"Final number of chunks: {len(processed_chunks)}")
        return processed_chunks

In [9]:
class VectorStore:
    def __init__(self, persist_directory: str = "rag_index"):
        self.index = None
        self.documents = []
        self.persist_directory = persist_directory
        os.makedirs(persist_directory, exist_ok=True)

    def _get_index_path(self) -> str:
        return os.path.join(self.persist_directory, "faiss.index")

    def _get_documents_path(self) -> str:
        return os.path.join(self.persist_directory, "documents.pkl")

    def load_local_index(self) -> bool:
        index_path = self._get_index_path()
        documents_path = self._get_documents_path()

        try:
            if os.path.exists(index_path) and os.path.exists(documents_path):
                self.index = faiss.read_index(index_path)
                with open(documents_path, "rb") as f:
                    self.documents = pickle.load(f)
                print(f"Loaded existing index with {len(self.documents)} documents")
                return True
        except Exception as e:
            print(f"Error loading index: {e}")
            self.index = None
            self.documents = []

        return False

    def save_local_index(self):
        if self.index is None or not self.documents:
            return
        try:
            faiss.write_index(self.index, self._get_index_path())
            with open(self._get_documents_path(), "wb") as f:
                pickle.dump(self.documents, f)
            print(f"Saved index with {len(self.documents)} documents")
        except Exception as e:
            print(f"Error saving index: {e}")

    def create_index(self, documents: List[Document], force_recreate: bool = False):
        if not force_recreate and self.load_local_index():
            return

        print("Creating new index...")
        self.documents = documents
        contents = [doc.content for doc in documents]
        embeddings = get_embeddings_batch(contents)

        embedding_dim = len(embeddings[0])
        # Use IndexFlatL2 instead of IndexFlatIP
        self.index = faiss.IndexFlatL2(embedding_dim)
        self.index.add(np.array(embeddings).astype("float32"))

        self.save_local_index()

    def search(self, query: str, k: int = 3) -> List[Tuple[Document, float]]:
        """Search for similar documents"""
        if self.index is None:
            raise ValueError("Index not initialized. Call create_index first.")

        query_embedding = get_embedding(query)
        distances, indices = self.index.search(
            np.array([query_embedding]).astype("float32"), k
        )

        # Convert distances to similarity scores
        similarities = 1 / (1 + distances)

        return [
            (self.documents[i], similarities[0][idx])
            for idx, i in enumerate(indices[0])
        ]

In [11]:
def load_documents(file_path):
    # Load and chunk document
    text = DocumentProcessor.load_pdf(file_path)
    documents = DocumentProcessor.chunk_text(text)
    
    # Create and return vector store
    vector_store = VectorStore()  # HybridSearcher()
    vector_store.create_index(documents, force_recreate=True)
    print(f"Processed {len(documents)} chunks")
    
    return vector_store

vector_store = load_documents("WF_benefits_book.pdf")


Loading PDF pages:   0%|          | 0/458 [00:00<?, ?it/s]

Cleaning extracted text...


Processing chunks:   0%|          | 0/2074 [00:00<?, ?chunk/s]

Final number of chunks: 2074
Creating new index...


Generating embeddings:   0%|          | 0/2074 [00:00<?, ?it/s]

Saved index with 2074 documents
Processed 2074 chunks


In [12]:
def query(query, top_k=5):
    contents = vector_store.search(query, k=top_k)
    contents = '\n'.join([x[0].content for x in contents])
    print(contents)
    
    prompt = f""" You are a helpful assistant.
    You have to answer the contents below:
    Context: {contents}
    Query: {query}
    
    Answer:"""
    
    answer = get_completion(prompt)
    print(answer)

query("What benefits are listed by Wells Fargo?")


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Changing coverage What changes can you make during the year
Address to submit an order or notice A Medical Child Support Order or National Medical Support Notice should be submitted to: Wells Fargo MAC: N9160-014 DCCOE Ben Admin 1801 Park View Dr. 1st Floor Shoreview, MN 55126-5030 Determining if the order or notice is qualified The plan administrator delegates Wells Fargo HR Operations Benefits Administration to receive and process the orders
For more information, see the “Orthodontia claims” section on page 3-10 . Chapter 3: Dental Plan 3-8 === Page 197 === What is not covered Charges for some types of dental work will not be covered by the dental plan
Agent for service of legal process The agent for service of legal process is ARAG. All correspondence should be directed to ARAG at: ARAG Attn: Legal Department 500 Grand Ave
Unless otherwise indicated, references i n this chapter to the HSA Plan and Copay Plan with HRA include the applicable Out of Area coverage. • For interns and fle

ValueError: Input length of input_ids is 376, but `max_length` is set to 200. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.