In [1]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer
import torch
import faiss
import numpy as np
import PyPDF2  # For PDF reading
import sentencepiece  # Ensure sentencepiece is imported

In [2]:
# Semantic segmentation model (BigBird)
# A single checkpoint name feeds both loaders so the tokenizer and the weights
# always come from the same pretrained model.
SEGMENTATION_CHECKPOINT = 'google/bigbird-roberta-base'
tokenizer_segmentation = AutoTokenizer.from_pretrained(SEGMENTATION_CHECKPOINT)
model_segmentation = AutoModel.from_pretrained(SEGMENTATION_CHECKPOINT)



pytorch_model.bin:   0%|          | 0.00/513M [00:00<?, ?B/s]

In [3]:
# Embedding model
# Sentence-embedding checkpoint used for both the document chunks and the question.
EMBEDDING_CHECKPOINT = 'all-MiniLM-L6-v2'
embedding_model = SentenceTransformer(EMBEDDING_CHECKPOINT)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [4]:
# Language model for generating answers
# The tokenizer and the seq2seq weights are loaded from the same checkpoint name.
LLM_CHECKPOINT = 'facebook/bart-large-cnn'
tokenizer_llm = AutoTokenizer.from_pretrained(LLM_CHECKPOINT)
model_llm = AutoModelForSeq2SeqLM.from_pretrained(LLM_CHECKPOINT)

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [9]:
# Function to read document
def read_document(file_path):
    """Return the text content of a ``.txt`` or ``.pdf`` file.

    Args:
        file_path: Path to the document. The extension is matched
            case-insensitively, so ``report.PDF`` and ``notes.TXT`` are accepted.

    Returns:
        The file's text as a single string, or ``None`` (after printing a
        message) when the extension is neither ``.txt`` nor ``.pdf``.
    """
    # Compare on a lower-cased copy only; the original path is what gets opened.
    lowered = str(file_path).lower()

    if lowered.endswith('.txt'):
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()

    if lowered.endswith('.pdf'):
        with open(file_path, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            # `or ''` keeps the join safe if a page yields no text at all;
            # joining once avoids rebuilding the string for every page.
            return ''.join(page.extract_text() or '' for page in pdf_reader.pages)

    print("Unsupported file type.")
    return None

# Semantic Segmentation (Chunking)
def semantic_segmentation(text, max_length=512, tokenizer=None):
    """Group sentences into chunks that stay under a token limit.

    The text is split on ``'. '`` (after newlines are replaced by spaces) and
    sentences are packed greedily: a sentence joins the current chunk while the
    combined text tokenizes to fewer than ``max_length`` tokens, otherwise it
    starts a new chunk.

    Args:
        text: Document text. ``None`` or an empty string yields ``[]``.
        max_length: Exclusive upper bound on tokens per chunk.
        tokenizer: Object with a ``tokenize(str) -> list`` method. Defaults to
            the notebook-level ``tokenizer_segmentation``.

    Returns:
        A list of non-empty chunk strings. A single sentence that is itself
        longer than ``max_length`` tokens is kept whole as its own chunk, so
        such a chunk can exceed the limit.
    """
    if not text:
        return []
    if tokenizer is None:
        tokenizer = tokenizer_segmentation

    chunks = []
    current_chunk = ''
    for sentence in text.replace('\n', ' ').split('. '):
        sentence = sentence.strip()
        if not sentence:
            continue
        # The split removes the period from every sentence but the last one;
        # restore it only where it is missing so the final sentence does not
        # end in "..".
        if not sentence.endswith(('.', '!', '?')):
            sentence += '.'

        candidate = f'{current_chunk} {sentence}' if current_chunk else sentence
        if len(tokenizer.tokenize(candidate)) < max_length:
            current_chunk = candidate
        else:
            # Nothing is emitted when the very first sentence is already over
            # the limit, which would otherwise produce an empty chunk.
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk)
    return chunks

# Embedding Chunks
def embed_chunks(chunks):
    """Encode text chunks with the notebook-level ``embedding_model``.

    Args:
        chunks: The chunk strings to embed.

    Returns:
        Whatever ``embedding_model.encode`` returns when called with
        ``convert_to_tensor=True``.
    """
    return embedding_model.encode(chunks, convert_to_tensor=True)

# Build FAISS Index
def build_faiss_index(embeddings):
    """Build an exact L2 FAISS index over a set of embedding vectors.

    Args:
        embeddings: A 2-D collection of vectors, one row per chunk. Torch
            tensors (as produced by ``embed_chunks``) and NumPy arrays or other
            array-likes are accepted.

    Returns:
        A ``faiss.IndexFlatL2`` containing every row of ``embeddings``.

    Raises:
        ValueError: If ``embeddings`` is empty or not two-dimensional.
    """
    # Tensors must be detached and moved to the CPU before NumPy can view them.
    if hasattr(embeddings, 'detach'):
        embeddings = embeddings.detach().cpu().numpy()
    # The index is fed a contiguous float32 matrix.
    embeddings_np = np.ascontiguousarray(embeddings, dtype='float32')

    if embeddings_np.ndim != 2 or embeddings_np.shape[0] == 0:
        raise ValueError(
            "embeddings must be a non-empty 2-D array of shape (n_chunks, dim); "
            f"got shape {embeddings_np.shape}."
        )

    dimension = embeddings_np.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings_np)
    return index

# Retrieve Relevant Chunks
def retrieve_relevant_chunks(question, index, chunks, top_k=5):
    """Return the chunks whose embeddings are closest to the question.

    Args:
        question: The user's question.
        index: FAISS index built over the embeddings of ``chunks``, in the
            same order.
        chunks: The chunk strings that were indexed.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunks in the order the index search returned them.
        Fewer are returned when the document has fewer chunks than ``top_k``;
        an empty list is returned when there are no chunks or ``top_k <= 0``.
    """
    if not chunks or top_k <= 0:
        return []

    # Never ask for more neighbours than there are chunks: FAISS pads missing
    # results with -1, and chunks[-1] would silently repeat the last chunk.
    k = min(top_k, len(chunks))

    question_embedding = embedding_model.encode([question], convert_to_tensor=True)
    question_embedding_np = question_embedding.cpu().detach().numpy().astype('float32')
    _, indices = index.search(question_embedding_np, k)

    # Drop any padding or out-of-range ids in case index and chunks disagree.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

# Generate Answer Using LLM
def generate_answer(question, context, max_input_tokens=1024):
    """Generate an answer to ``question`` from ``context`` with the seq2seq model.

    The prompt places the context before the question. Truncating the whole
    prompt therefore cuts from the end and can drop the question itself when
    the context is long. To prevent that, the context alone is trimmed to the
    token budget left over after the instructions and the question.

    Args:
        question: The user's question.
        context: Retrieved text the answer should be based on.
        max_input_tokens: Maximum number of tokens fed to the model.

    Returns:
        The decoded model output as a string.
    """
    def build_prompt(ctx):
        return f"""
You are an expert assistant. Use the context below to answer the question.

Context:
{ctx}

Question:
{question}

Answer:
"""

    # Tokens taken by everything except the context (instructions, question,
    # special tokens). A small margin absorbs tokenization differences at the
    # seams once the trimmed context is re-inserted into the prompt.
    overhead = len(tokenizer_llm.encode(build_prompt(''))) + 8
    budget = max_input_tokens - overhead

    if budget <= 0:
        # The question alone fills the input; no room is left for context.
        context = ''
    else:
        # Ask for one token more than the budget so an over-long context can
        # be detected without tokenizing all of it.
        context_ids = tokenizer_llm.encode(
            context, add_special_tokens=False, truncation=True, max_length=budget + 1
        )
        if len(context_ids) > budget:
            context = tokenizer_llm.decode(
                context_ids[:budget],
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )

    prompt = build_prompt(context)
    # The final truncation stays as a safety net. Calling the tokenizer
    # directly also returns the attention mask that generate() expects.
    inputs = tokenizer_llm(
        prompt, return_tensors='pt', max_length=max_input_tokens, truncation=True
    )
    summary_ids = model_llm.generate(**inputs, max_length=150, early_stopping=True)
    answer = tokenizer_llm.decode(summary_ids[0], skip_special_tokens=True)
    return answer

In [10]:
# Main Execution Flow
# 1. Read Document
file_path = '/Users/captn_jp/Downloads/project_charter.pdf'  # Replace with your document's path
document_text = read_document(file_path)
# read_document returns None for unsupported file types, and a document may
# also contain no extractable text. Stop with a clear message instead of
# failing later inside the chunking or indexing steps.
if not document_text or not document_text.strip():
    raise ValueError(f"No text could be read from {file_path!r}.")

# 2. Process Document
print("Processing document...")
chunks = semantic_segmentation(document_text)
embeddings = embed_chunks(chunks)
index = build_faiss_index(embeddings)
print("Document processed and indexed successfully.")

# 3. Ask a Question
question = input("Ask a question about the document: ").strip()
if not question:
    raise ValueError("The question must not be empty.")

# 4. Generate Answer
print("Generating answer...")
relevant_chunks = retrieve_relevant_chunks(question, index, chunks)
context = ' '.join(relevant_chunks)
answer = generate_answer(question, context)
print("Answer:")
print(answer)

Processing document...
Document processed and indexed successfully.
Generating answer...
Answer:
The user can perform 4 different types of analysis using the chat interface. Keeping in mind the scalability, response time, cost associated and ability to add more features in future. The different analysis the product must be able to perform are: Generate Plots for Key Variables • Conduct Regression Analysis • Identify Key Columns for Analysis (for problem specified by the user)
