In [1]:
!pip install -q transformers torch sentence-transformers PyPDF2 faiss-cpu numpy pandas scikit-learn
!pip install -q pdfplumber


[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import PyPDF2
import pdfplumber
import pandas as pd
import numpy as np
import re
import warnings
from typing import List, Dict, Any
import torch
from sentence_transformers import SentenceTransformer
import faiss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import time
import json

warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
class PDFProcessor:
    """Extracts the text of a PDF, normalises it and splits it into word chunks."""

    def __init__(self):
        pass

    def extract_text_from_pdf(self, pdf_path):
        """Return the text of every page of ``pdf_path``, one page per line block.

        Best-effort: if the file cannot be read the error is printed and the
        text collected so far (possibly ``""``) is returned.
        """
        page_texts = []
        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page in pdf_reader.pages:
                    # Guard against a missing/None extraction result so one
                    # text-less page does not abort the remaining pages.
                    page_texts.append((page.extract_text() or "") + "\n")
        except Exception as e:
            print(f"Error reading PDF: {e}")
        return "".join(page_texts)

    def clean_text(self, text):
        """Collapse whitespace and drop characters outside a small financial alphabet.

        Word characters, whitespace and ``$ % . , - ( )`` are kept; everything
        else becomes a space. ``$`` and ``%`` are then separated from the
        adjacent number (``$5`` -> ``$ 5``, ``5%`` -> ``5 %``).
        """
        # \s+ already covers runs of newlines, so a single pass is enough.
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s\$\%\.\,\-\(\)]', ' ', text)
        text = text.replace('$', '$ ')
        text = text.replace('%', ' %')
        return text.strip()

    def chunk_text(self, text, chunk_size=500, overlap=50):
        """Split ``text`` into chunks of at most ``chunk_size`` words.

        Consecutive chunks share ``overlap`` words. The last chunk is the first
        one that reaches the end of the text, so no chunk is fully contained in
        its predecessor.

        Raises:
            ValueError: if ``chunk_size`` is not positive or ``overlap`` is not
                in ``[0, chunk_size)`` (the window would not advance).
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if not 0 <= overlap < chunk_size:
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

        words = text.split()
        step = chunk_size - overlap
        chunks = []

        for start in range(0, len(words), step):
            chunks.append(' '.join(words[start:start + chunk_size]))
            if start + chunk_size >= len(words):
                # This window already reached the end of the text.
                break

        return chunks

In [None]:
class VectorStore:
    """Semantic index over text chunks: SentenceTransformer embeddings in a
    FAISS inner-product index (cosine similarity on unit-length vectors)."""

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        self.embeddings = None  # unit-normalised float32 matrix once built
        self.chunks = None
        self.index = None

    @staticmethod
    def _to_unit_float32(vectors):
        """Return a new C-contiguous float32 matrix with L2-normalised rows.

        ``faiss.normalize_L2`` works in place, so the array that is normalised
        must be the very array that is later indexed or searched with.
        """
        matrix = np.array(vectors, dtype='float32', order='C')
        faiss.normalize_L2(matrix)
        return matrix

    def create_embeddings(self, chunks):
        """Embed ``chunks`` and (re)build the index.

        Raises:
            ValueError: if ``chunks`` is empty (there is nothing to index).
        """
        chunks = list(chunks)
        if not chunks:
            raise ValueError("Cannot build a vector store from an empty list of chunks.")

        print(f"Creating embeddings for {len(chunks)} chunks...")
        self.chunks = chunks
        self.embeddings = self._to_unit_float32(
            self.model.encode(chunks, show_progress_bar=True)
        )

        dimension = self.embeddings.shape[1]
        self.index = faiss.IndexFlatIP(dimension)
        self.index.add(self.embeddings)

        print(f"Vector store created with {len(chunks)} chunks")

    def search(self, query, top_k=3):
        """Return up to ``top_k`` hits as ``{'chunk', 'score', 'rank'}`` dicts.

        Raises:
            ValueError: if ``create_embeddings`` has not been called yet.
        """
        if self.index is None:
            raise ValueError("Vector store not initialized. Call create_embeddings first.")
        if top_k <= 0:
            return []

        query_embedding = self._to_unit_float32(self.model.encode([query]))
        scores, indices = self.index.search(query_embedding, top_k)

        results = []
        for score, idx in zip(scores[0], indices[0]):
            if idx < 0:
                # Negative index = padding slot (fewer than top_k vectors stored);
                # indexing the chunk list with it would return the wrong chunk.
                continue
            results.append({
                'chunk': self.chunks[idx],
                'score': float(score),
                'rank': len(results) + 1
            })

        return results

In [None]:
class BasicGenerator:
    """Rule-based answer builder that pulls figures out of retrieved chunks with regexes."""

    # "36,455" / "36.46": digits with optional thousands groups and decimals.
    # Unlike a greedy [\d,]* this never swallows a trailing comma.
    _NUMBER = r'(\d+(?:,\d+)*(?:\.\d+)?)'
    _SCALE = r'(million|billion)'

    def __init__(self):
        pass

    def _labelled_amount(self, label, context):
        """Return "$<amount> <scale>" for the first dollar figure after ``label``, or None.

        When the text gives no scale word the figure is reported in millions,
        which is what this generator has always assumed for bare numbers.
        """
        match = re.search(
            label + r'.*?\$\s*' + self._NUMBER + r'(?:\s*' + self._SCALE + r')?',
            context,
            re.IGNORECASE,
        )
        if not match:
            return None
        scale = (match.group(2) or 'million').lower()
        return f"${match.group(1)} {scale}"

    def generate_answer(self, query, context_chunks, max_length=150):
        """Build an answer to ``query`` from the top three ``context_chunks``.

        Args:
            query: the user question.
            context_chunks: list of dicts with a ``'chunk'`` text entry.
            max_length: accepted for interface compatibility; not used.

        Returns:
            An answer string; a fixed "couldn't find" message when there is no context.
        """
        context = "\n".join(chunk['chunk'] for chunk in context_chunks[:3])
        query_lower = query.lower()

        if 'revenue' in query_lower and 'q1 2024' in query_lower:
            revenue_match = re.search(
                r'\$\s*' + self._NUMBER + r'\s*' + self._SCALE, context, re.IGNORECASE
            )
            if revenue_match:
                amount = revenue_match.group(1)
                # Report the scale that was actually matched instead of always "million".
                scale = revenue_match.group(2).lower()
                return f"Based on the financial report, Meta's revenue in Q1 2024 was ${amount} {scale}."

        if 'financial highlights' in query_lower or 'key highlights' in query_lower:
            highlights = []

            revenue = self._labelled_amount('Revenue', context)
            if revenue:
                highlights.append(f"Revenue: {revenue}")

            income = self._labelled_amount('Net income', context)
            if income:
                highlights.append(f"Net income: {income}")

            growth_match = re.search(r'(\d+)\s*%.*?(?:increase|growth)', context, re.IGNORECASE)
            if growth_match:
                highlights.append(f"Growth: {growth_match.group(1)}%")

            if highlights:
                return "Key financial highlights for Meta in Q1 2024: " + "; ".join(highlights)

        if context_chunks:
            best_chunk = context_chunks[0]['chunk']
            return f"Based on the financial report: {best_chunk[:200]}..."

        return "I couldn't find specific information to answer your question."

In [None]:
class BasicRAGPipeline:
    """Step-1 pipeline: PDF text -> chunks -> vector search -> rule-based answer."""

    def __init__(self):
        self.pdf_processor = PDFProcessor()
        self.vector_store = VectorStore()
        self.generator = BasicGenerator()

    def setup(self, pdf_path):
        """Extract, clean and chunk the PDF at ``pdf_path``, then index the chunks."""
        print("Setting up RAG pipeline...")

        processor = self.pdf_processor
        extracted = processor.extract_text_from_pdf(pdf_path)
        chunks = processor.chunk_text(processor.clean_text(extracted))
        print(f"Processed {len(chunks)} text chunks")

        self.vector_store.create_embeddings(chunks)
        print("RAG pipeline setup complete!")

    def query(self, question, top_k=3):
        """Retrieve ``top_k`` chunks for ``question``, print a trace and return the answer record."""
        print(f"Query: {question}")

        hits = self.vector_store.search(question, top_k)
        print(f"Retrieved {len(hits)} relevant chunks")
        for position, hit in enumerate(hits, start=1):
            preview = hit['chunk'][:100]
            print(f"Chunk {position} (score: {hit['score']:.3f}): {preview}...")

        answer = self.generator.generate_answer(question, hits)
        print(f"Answer: {answer}")

        return dict(question=question, answer=answer, retrieved_chunks=hits)

In [None]:
class EnhancedPDFProcessor:
    """Extracts page text (PyPDF2) and tables (pdfplumber) from a PDF and
    derives text, key/value and metric views of each table."""

    def __init__(self):
        self.text_chunks = []
        self.tables = []
        self.structured_data = {}

    def extract_text_and_tables(self, pdf_path):
        """Return ``(text, tables)`` for the PDF at ``pdf_path``."""
        print("Extracting text and tables from PDF...")

        text = self._extract_text_pypdf2(pdf_path)
        tables = self._extract_tables_pdfplumber(pdf_path)

        return text, tables

    def _extract_text_pypdf2(self, pdf_path):
        """Return the concatenated page text; on read errors print and return what was read."""
        page_texts = []
        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page in pdf_reader.pages:
                    # Guard against a missing/None extraction result so one
                    # text-less page does not abort the remaining pages.
                    page_texts.append((page.extract_text() or "") + "\n")
        except Exception as e:
            print(f"Error reading PDF with PyPDF2: {e}")
        return "".join(page_texts)

    def _extract_tables_pdfplumber(self, pdf_path):
        """Return a list of table records (page, table_id, dataframe, raw_data).

        A table needs a header row plus at least one data row. A table that
        cannot be turned into a DataFrame is reported and skipped instead of
        aborting extraction of all the others.
        """
        tables = []
        try:
            with pdfplumber.open(pdf_path) as pdf:
                for page_num, page in enumerate(pdf.pages, start=1):
                    for table_num, table in enumerate(page.extract_tables(), start=1):
                        if not table or len(table) <= 1:
                            continue

                        table_id = f"page_{page_num}_table_{table_num}"
                        try:
                            df = pd.DataFrame(table[1:], columns=table[0])
                            df = self._clean_table_dataframe(df)
                        except Exception as e:
                            print(f"Skipping {table_id}: {e}")
                            continue

                        tables.append({
                            'page': page_num,
                            'table_id': table_id,
                            'dataframe': df,
                            'raw_data': table
                        })

        except Exception as e:
            print(f"Error extracting tables with pdfplumber: {e}")

        print(f"Extracted {len(tables)} tables")
        return tables

    @staticmethod
    def _is_text_column(series):
        """True for object- or string-typed columns (i.e. columns that may hold text)."""
        return pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series)

    @staticmethod
    def _unique_column_names(columns):
        """Normalise header cells to unique strings.

        Empty/None headers become ``Column_<position>``; repeated names get a
        ``_2``, ``_3`` ... suffix so that ``df[name]`` always selects one column.
        """
        names = []
        used = set()
        for i, col in enumerate(columns):
            base = str(col).strip().replace('\n', ' ') if col else f"Column_{i}"
            candidate, n = base, 1
            while candidate in used:
                n += 1
                candidate = f"{base}_{n}"
            used.add(candidate)
            names.append(candidate)
        return names

    def _clean_table_dataframe(self, df):
        """Drop empty rows/columns, tidy headers and cell text, convert numeric columns."""
        df = df.dropna(how='all').dropna(axis=1, how='all').copy()

        df.columns = self._unique_column_names(df.columns)

        for col in df.columns:
            if self._is_text_column(df[col]):
                # Only touch real strings so missing cells stay missing
                # instead of turning into the literal text "None".
                df[col] = df[col].map(
                    lambda v: v.strip().replace('\n', ' ') if isinstance(v, str) else v
                )

        df = self._convert_financial_data(df)

        return df

    def _convert_financial_data(self, df):
        """Strip ``$`` and thousands separators from text columns and make
        fully numeric columns numeric; other columns keep their text."""
        for col in df.columns:
            if not self._is_text_column(df[col]):
                continue

            # regex=False: '$' must be a literal, not the end-of-string anchor.
            stripped = (
                df[col]
                .str.replace('$', '', regex=False)
                .str.replace(',', '', regex=False)
            )
            df[col] = stripped

            try:
                df[col] = pd.to_numeric(stripped)
            except (ValueError, TypeError):
                # At least one cell is not a number: leave the column as text.
                pass

        return df

    def process_structured_data(self, tables):
        """Map each table_id to its dataframe, page and derived representations."""
        structured_data = {}

        for table_info in tables:
            table_id = table_info['table_id']
            df = table_info['dataframe']

            structured_data[table_id] = {
                'dataframe': df,
                'page': table_info['page'],
                'text_representation': self._table_to_text(df),
                'key_value_pairs': self._extract_key_value_pairs(df),
                'financial_metrics': self._extract_financial_metrics(df)
            }

        return structured_data

    def _table_to_text(self, df):
        """Render the table as a header line plus one ``col: value | ...`` line per row."""
        text_parts = []

        text_parts.append("Table with columns: " + ", ".join(str(col) for col in df.columns))

        for _, row in df.iterrows():
            row_text = []
            for col, value in row.items():
                if pd.notna(value) and str(value).strip():
                    row_text.append(f"{col}: {value}")
            if row_text:
                text_parts.append(" | ".join(row_text))

        return "\n".join(text_parts)

    def _extract_key_value_pairs(self, df):
        """For two-column tables return ``{first column: second column}`` as strings."""
        kv_pairs = {}

        if len(df.columns) == 2:
            key_col, value_col = df.columns[0], df.columns[1]
            for _, row in df.iterrows():
                raw_key, raw_value = row[key_col], row[value_col]
                if pd.isna(raw_key) or pd.isna(raw_value):
                    continue
                key = str(raw_key).strip()
                value = str(raw_value).strip()
                if key and value and key != 'nan' and value != 'nan':
                    kv_pairs[key] = value

        return kv_pairs

    def _extract_financial_metrics(self, df):
        """Return ``{column: [number strings]}`` for columns whose name contains a financial term."""
        metrics = {}

        financial_terms = ['revenue', 'income', 'expense', 'margin', 'earnings', 'cost', 'growth']

        for col in df.columns:
            col_lower = str(col).lower()
            if not any(term in col_lower for term in financial_terms):
                continue

            numeric_values = []
            for value in df[col]:
                if pd.notna(value):
                    # Must start with a digit, so a lone "," is not a number.
                    numeric_values.extend(re.findall(r'\d[\d,]*(?:\.\d+)?', str(value)))

            if numeric_values:
                metrics[col] = numeric_values

        return metrics

In [None]:
class HybridRetriever:
    """Combines three retrieval channels over the same document:

    * semantic search over text chunks (FAISS inner product on unit vectors),
    * semantic search over the text rendering of each table,
    * TF-IDF keyword search over both.
    """

    def __init__(self, embedding_model_name='all-MiniLM-L6-v2'):
        self.embedding_model = SentenceTransformer(embedding_model_name)
        self.tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')

        self.text_embeddings = None
        self.text_chunks = []
        self.text_index = None

        self.structured_data = {}
        self.table_embeddings = None
        self.table_texts = []
        self.table_index = None

        self.tfidf_matrix = None
        self.all_texts = []

    @staticmethod
    def _unit_float32(vectors):
        """Return a new C-contiguous float32 matrix with L2-normalised rows.

        ``faiss.normalize_L2`` works in place, so the array that is normalised
        must be the very array that is later indexed or searched with.
        """
        matrix = np.array(vectors, dtype='float32', order='C')
        faiss.normalize_L2(matrix)
        return matrix

    @staticmethod
    def _build_index(unit_vectors):
        """Create an inner-product index holding ``unit_vectors``."""
        index = faiss.IndexFlatIP(unit_vectors.shape[1])
        index.add(unit_vectors)
        return index

    def _nearest(self, index, query, top_k):
        """Return ``[(score, position)]`` for the valid hits of ``query`` in ``index``."""
        if top_k <= 0:
            return []
        query_vec = self._unit_float32(self.embedding_model.encode([query]))
        scores, indices = index.search(query_vec, top_k)
        # Negative positions are padding slots (fewer than top_k vectors stored);
        # used as list indices they would silently select the last item.
        return [(float(s), int(i)) for s, i in zip(scores[0], indices[0]) if i >= 0]

    def setup_text_retrieval(self, text_chunks):
        """Embed and index ``text_chunks``; with no chunks the text channel is disabled."""
        print(f"Setting up text retrieval for {len(text_chunks)} chunks...")

        self.text_chunks = list(text_chunks)
        if not self.text_chunks:
            self.text_embeddings = None
            self.text_index = None
            return

        self.text_embeddings = self._unit_float32(
            self.embedding_model.encode(self.text_chunks, show_progress_bar=True)
        )
        self.text_index = self._build_index(self.text_embeddings)

    def setup_structured_retrieval(self, structured_data):
        """Embed and index the ``'text_representation'`` of every table in ``structured_data``."""
        print(f"Setting up structured data retrieval for {len(structured_data)} tables...")

        self.structured_data = structured_data
        self.table_texts = [info['text_representation'] for info in structured_data.values()]

        if not self.table_texts:
            # Do not keep an index built for a previous document.
            self.table_embeddings = None
            self.table_index = None
            return

        self.table_embeddings = self._unit_float32(
            self.embedding_model.encode(self.table_texts, show_progress_bar=True)
        )
        self.table_index = self._build_index(self.table_embeddings)

    def setup_keyword_search(self):
        """Fit TF-IDF over text chunks followed by table texts (call after the other setups)."""
        print("Setting up keyword search...")

        self.all_texts = self.text_chunks + self.table_texts
        self.tfidf_matrix = None

        if self.all_texts:
            try:
                self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(self.all_texts)
            except ValueError as e:
                # e.g. every document is empty or consists of stop words only.
                print(f"Keyword search disabled: {e}")

    def search_text(self, query, top_k=3):
        """Semantic search over text chunks; ``[]`` when the text channel is not set up."""
        if self.text_index is None:
            return []

        results = []
        for score, idx in self._nearest(self.text_index, query, top_k):
            results.append({
                'type': 'text',
                'content': self.text_chunks[idx],
                'score': score,
                'rank': len(results) + 1
            })

        return results

    def search_structured(self, query, top_k=3):
        """Semantic search over tables; ``[]`` when no tables are indexed."""
        if self.table_index is None:
            return []

        results = []
        table_ids = list(self.structured_data.keys())

        for score, idx in self._nearest(self.table_index, query, top_k):
            if idx < len(table_ids):
                table_id = table_ids[idx]
                results.append({
                    'type': 'structured',
                    'table_id': table_id,
                    'content': self.table_texts[idx],
                    'structured_data': self.structured_data[table_id],
                    'score': score,
                    'rank': len(results) + 1
                })

        return results

    def search_keyword(self, query, top_k=3):
        """TF-IDF cosine search over all texts; only documents with a positive score are returned."""
        if self.tfidf_matrix is None or top_k <= 0:
            # top_k == 0 would otherwise slice [-0:], i.e. every document.
            return []

        query_vector = self.tfidf_vectorizer.transform([query])
        similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten()

        top_indices = similarities.argsort()[-top_k:][::-1]

        results = []
        for i, idx in enumerate(top_indices):
            if similarities[idx] > 0:
                # all_texts is text chunks first, then table texts.
                content_type = 'text' if idx < len(self.text_chunks) else 'structured'
                content = self.all_texts[idx]

                results.append({
                    'type': content_type,
                    'content': content,
                    'score': float(similarities[idx]),
                    'rank': i + 1,
                    'search_method': 'keyword'
                })

        return results

    def hybrid_search(self, query, top_k=5):
        """Merge the three channels, keep the best-scoring entry per content, return ``top_k``.

        Results are de-duplicated on the first 100 characters of their content.
        Scores from the different channels are compared as-is.
        """
        text_results = self.search_text(query, top_k)
        structured_results = self.search_structured(query, top_k)
        keyword_results = self.search_keyword(query, top_k)

        all_results = text_results + structured_results + keyword_results

        seen_content = set()
        unique_results = []

        for result in sorted(all_results, key=lambda x: x['score'], reverse=True):
            content_key = result['content'][:100]
            if content_key not in seen_content:
                seen_content.add(content_key)
                unique_results.append(result)

        return unique_results[:top_k]

In [None]:
class EnhancedGenerator:
    """Rule-based answer builder that routes a query to a comparative,
    financial-metric, summary or general template based on its wording."""

    def __init__(self):
        pass

    def generate_answer(self, query, search_results, max_length=200):
        """Build an answer from hybrid ``search_results``.

        Args:
            query: the user question.
            search_results: dicts with at least ``'type'`` (``'text'`` or
                ``'structured'``) and ``'content'``.
            max_length: accepted for interface compatibility; not used.
        """
        query_lower = query.lower()

        text_results = [r for r in search_results if r['type'] == 'text']
        structured_results = [r for r in search_results if r['type'] == 'structured']

        # Order matters: a query can match several categories; the first wins.
        if self._is_comparative_query(query_lower):
            return self._generate_comparative_answer(query, text_results, structured_results)
        elif self._is_financial_metric_query(query_lower):
            return self._generate_financial_answer(query, text_results, structured_results)
        elif self._is_summary_query(query_lower):
            return self._generate_summary_answer(query, text_results, structured_results)
        else:
            return self._generate_general_answer(query, text_results, structured_results)

    def _is_comparative_query(self, query):
        """True if the (lower-cased) query contains a comparison phrase."""
        comparative_terms = ['compared to', 'vs', 'versus', 'difference', 'change from', 'growth']
        return any(term in query for term in comparative_terms)

    def _is_financial_metric_query(self, query):
        """True if the (lower-cased) query names a financial metric."""
        financial_terms = ['revenue', 'income', 'expense', 'margin', 'earnings', 'cost', 'profit']
        return any(term in query for term in financial_terms)

    def _is_summary_query(self, query):
        """True if the (lower-cased) query asks for a summary."""
        summary_terms = ['summarize', 'summary', 'overview', 'highlights', 'key points']
        return any(term in query for term in summary_terms)

    def _generate_comparative_answer(self, query, text_results, structured_results):
        """Answer from year-labelled table entries and year-over-year percentages in the top text hit."""
        answer_parts = []

        for result in structured_results:
            if 'structured_data' in result:
                kv_pairs = result['structured_data'].get('key_value_pairs', {})
                financial_metrics = result['structured_data'].get('financial_metrics', {})

                comparison_data = self._extract_comparison_data(kv_pairs, financial_metrics)
                if comparison_data:
                    answer_parts.append(comparison_data)

        if text_results:
            context = text_results[0]['content']
            yoy_patterns = re.findall(r'(\d+)%.*?(?:increase|decrease|growth|change).*?year-over-year', context, re.IGNORECASE)
            if yoy_patterns:
                answer_parts.append(f"Year-over-year changes include: {', '.join(yoy_patterns)}% growth rates")

        if answer_parts:
            return ". ".join(answer_parts) + "."
        if text_results:
            return "Based on the available data, I found the following comparison: " + text_results[0]['content'][:150] + "..."
        # No text hit to quote: use the general template, which copes with
        # structured-only and empty result sets.
        return self._generate_general_answer(query, text_results, structured_results)

    def _generate_financial_answer(self, query, text_results, structured_results):
        """Answer with up to three figures from matching table columns and the top text hit."""
        financial_figures = []

        for result in structured_results:
            if 'structured_data' in result:
                df = result['structured_data']['dataframe']

                query_terms = query.lower().split()
                for col in df.columns:
                    if any(term in col.lower() for term in query_terms):
                        values = df[col].dropna().tolist()
                        if values:
                            financial_figures.extend([str(v) for v in values[:3]])

        if text_results:
            text = text_results[0]['content']
            dollar_amounts = re.findall(r'\$\s*([\d,]+(?:\.\d+)?)\s*(?:million|billion)?', text, re.IGNORECASE)
            financial_figures.extend(dollar_amounts)

        if financial_figures:
            # dict.fromkeys removes duplicates while keeping first-seen order.
            unique_figures = list(dict.fromkeys(financial_figures))
            answer = f"Based on the financial data: {', '.join(unique_figures[:3])}"

            if text_results:
                context_snippet = text_results[0]['content'][:100]
                answer += f". Context: {context_snippet}..."

            return answer
        if text_results:
            return "I found the following financial information: " + text_results[0]['content'][:150] + "..."
        # No figures and no text hit to quote.
        return self._generate_general_answer(query, text_results, structured_results)

    def _generate_summary_answer(self, query, text_results, structured_results):
        """Answer with financial key/value pairs from tables plus two long sentences of the top text hit."""
        summary_parts = []

        for result in structured_results:
            if 'structured_data' in result:
                kv_pairs = result['structured_data'].get('key_value_pairs', {})

                relevant_pairs = []
                for key, value in list(kv_pairs.items())[:5]:
                    if any(term in key.lower() for term in ['revenue', 'income', 'expense', 'margin', 'growth']):
                        relevant_pairs.append(f"{key}: {value}")

                if relevant_pairs:
                    summary_parts.append("; ".join(relevant_pairs))

        if text_results:
            text = text_results[0]['content']
            sentences = text.split('.')
            key_sentences = [s.strip() for s in sentences if len(s.strip()) > 50][:2]
            summary_parts.extend(key_sentences)

        if summary_parts:
            return ". ".join(summary_parts) + "."
        if text_results:
            return "Summary based on available data: " + text_results[0]['content'][:200] + "..."
        # Nothing to summarise from text.
        return self._generate_general_answer(query, text_results, structured_results)

    def _generate_general_answer(self, query, text_results, structured_results):
        """Quote the top structured and/or text hit, or report that nothing was found."""
        if structured_results:
            structured_content = structured_results[0]['content']
            if text_results:
                text_content = text_results[0]['content'][:100]
                return f"Based on the data: {structured_content[:150]}... Additional context: {text_content}..."
            else:
                return f"Based on the structured data: {structured_content[:200]}..."
        elif text_results:
            return f"Based on the report: {text_results[0]['content'][:200]}..."
        else:
            return "I couldn't find specific information to answer your question."

    def _extract_comparison_data(self, kv_pairs, financial_metrics):
        """Join the key/value pairs whose key mentions 2024 or 2023; None if there are none.

        ``financial_metrics`` is accepted but not used.
        """
        comparison_text = []

        for key, value in kv_pairs.items():
            if any(year in key for year in ['2024', '2023']):
                comparison_text.append(f"{key}: {value}")

        if comparison_text:
            return "; ".join(comparison_text)

        return None

In [None]:
class EnhancedRAGPipeline:
    """Step-2 pipeline: text + table extraction, hybrid retrieval and template-based answers."""

    def __init__(self):
        self.pdf_processor = EnhancedPDFProcessor()
        self.retriever = HybridRetriever()
        self.generator = EnhancedGenerator()

        self.text_chunks = []
        self.structured_data = {}

    @staticmethod
    def _validate_chunking(chunk_size, overlap):
        """Raise ValueError unless the chunk window is non-empty and advances."""
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if not 0 <= overlap < chunk_size:
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    def setup(self, pdf_path, chunk_size=500, overlap=50):
        """Process the PDF at ``pdf_path`` and prepare all three retrieval channels.

        Raises:
            ValueError: if ``chunk_size``/``overlap`` do not describe a valid window.
        """
        # Fail before the expensive extraction rather than after it.
        self._validate_chunking(chunk_size, overlap)

        print("Setting up Enhanced RAG Pipeline")

        raw_text, tables = self.pdf_processor.extract_text_and_tables(pdf_path)

        clean_text = self._clean_text(raw_text)
        self.text_chunks = self._chunk_text(clean_text, chunk_size, overlap)

        self.structured_data = self.pdf_processor.process_structured_data(tables)

        self.retriever.setup_text_retrieval(self.text_chunks)
        self.retriever.setup_structured_retrieval(self.structured_data)
        self.retriever.setup_keyword_search()

        print(f"Pipeline setup complete!")
        print(f"   - Text chunks: {len(self.text_chunks)}")
        print(f"   - Tables: {len(self.structured_data)}")
        print(f"   - Ready for hybrid retrieval!")

    def query(self, question, top_k=5):
        """Run hybrid retrieval for ``question``, print a trace and return the answer record."""
        print(f"Query: {question}")

        search_results = self.retriever.hybrid_search(question, top_k)

        print(f"Retrieved {len(search_results)} results:")
        for i, result in enumerate(search_results[:3]):
            result_type = result['type']
            score = result['score']
            content_preview = result['content'][:80] + "..." if len(result['content']) > 80 else result['content']
            print(f"   {i+1}. [{result_type.upper()}] Score: {score:.3f} - {content_preview}")

        answer = self.generator.generate_answer(question, search_results)

        print(f"Answer: {answer}")

        return {
            'question': question,
            'answer': answer,
            'search_results': search_results,
            'num_text_results': len([r for r in search_results if r['type'] == 'text']),
            'num_structured_results': len([r for r in search_results if r['type'] == 'structured'])
        }

    def _clean_text(self, text):
        """Collapse whitespace, drop characters outside ``\\w \\s $ % . , - ( )``
        and separate ``$``/``%`` from the adjacent number."""
        # \s+ already covers runs of newlines, so a single pass is enough.
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s\$\%\.\,\-\(\)]', ' ', text)
        text = text.replace('$', '$ ')
        text = text.replace('%', ' %')
        return text.strip()

    def _chunk_text(self, text, chunk_size, overlap):
        """Split ``text`` into chunks of at most ``chunk_size`` words sharing ``overlap`` words.

        The last chunk is the first one that reaches the end of the text, so no
        chunk is fully contained in its predecessor.

        Raises:
            ValueError: if ``chunk_size``/``overlap`` do not describe a valid window.
        """
        self._validate_chunking(chunk_size, overlap)

        words = text.split()
        step = chunk_size - overlap
        chunks = []

        for start in range(0, len(words), step):
            chunks.append(' '.join(words[start:start + chunk_size]))
            if start + chunk_size >= len(words):
                break

        return chunks

    def get_structured_data_summary(self):
        """Return the table count and, per table, its id, page, shape, columns and extraction counts."""
        summary = {
            'total_tables': len(self.structured_data),
            'tables': []
        }

        for table_id, table_info in self.structured_data.items():
            df = table_info['dataframe']
            table_summary = {
                'table_id': table_id,
                'page': table_info['page'],
                'shape': df.shape,
                'columns': list(df.columns),
                'key_value_pairs_count': len(table_info['key_value_pairs']),
                'financial_metrics_count': len(table_info['financial_metrics'])
            }
            summary['tables'].append(table_summary)

        return summary

In [None]:
def upload_pdf_file():
    """Obtain the PDF to process and return its path, or None if none was provided.

    On Google Colab this opens the upload widget and returns the name of the
    first uploaded file. Anywhere else (``google.colab`` is not importable,
    e.g. a local Jupyter kernel) it asks for a path on disk instead.
    """
    try:
        from google.colab import files
    except ImportError:
        import os

        # Surrounding quotes are stripped so a pasted "C:\...\file.pdf" works.
        pdf_path = input("Path to the PDF file: ").strip().strip('"').strip("'")
        if pdf_path and os.path.isfile(pdf_path):
            print(f'Using local file: {pdf_path}')
            return pdf_path
        print(f'File not found: {pdf_path}')
        return None

    uploaded = files.upload()

    # Only the first uploaded file is used.
    filename = next(iter(uploaded), None)
    if filename is not None:
        print(f'Uploaded file: {filename}')
    return filename



In [None]:
# Cell 12: Test Step 1 (Basic RAG)
def test_step1():
    print("Testing Step 1: Basic RAG Pipeline")

    # Upload PDF
    pdf_filename = upload_pdf_file()

    if pdf_filename:
        # Initialize basic RAG
        basic_rag = BasicRAGPipeline()
        basic_rag.setup(pdf_filename)

        # Test queries
        test_queries = [
            "What was Meta's revenue in Q1 2024?",
            "What were the key financial highlights for Meta in Q1 2024?"
        ]

        results = []
        for query in test_queries:
            result = basic_rag.query(query)
            results.append(result)
            print("-" * 50)

        return basic_rag, results
    else:
        print("No file uploaded")
        return None, None

In [None]:
def test_step2():
    print("Testing Step 2: Enhanced RAG with Structured Data")

    # Upload PDF (or use existing)
    pdf_filename = upload_pdf_file()

    if pdf_filename:
        # Initialize enhanced RAG
        enhanced_rag = EnhancedRAGPipeline()
        enhanced_rag.setup(pdf_filename)

        # Display extraction summary
        summary = enhanced_rag.get_structured_data_summary()
        print(f"Extracted Structured Data Summary:")
        print(f"   Total tables: {summary['total_tables']}")
        for table in summary['tables']:
            print(f"   - {table['table_id']}: {table['shape']} on page {table['page']}")

        # Test queries
        test_queries = [
            "What was Meta's net income in Q1 2024 compared to Q1 2023?",
            "Summarize Meta's operating expenses in Q1 2024."
        ]

        results = []
        for query in test_queries:
            result = enhanced_rag.query(query)
            results.append(result)
            print("-" * 50)

        return enhanced_rag, results
    else:
        print("No file uploaded")
        return None, None

In [None]:
def run_complete_pipeline():
    """Run test_step1 and test_step2 back to back and print both sets of answers.

    Each step asks for its own PDF. The side-by-side comparison is printed
    only when both steps produced results.

    Returns:
        ``(basic_rag, enhanced_rag, step1_results, step2_results)``.
    """
    rule = "=" * 60

    def banner(title):
        # Blank line, then the title framed by two rules.
        print("\n" + rule)
        print(title)
        print(rule)

    print("Running complete Step 1 and Step 2 pipeline")

    banner("STEP 1: BASIC RAG PIPELINE")
    basic_rag, step1_results = test_step1()

    banner("STEP 2: ENHANCED RAG WITH STRUCTURED DATA")
    enhanced_rag, step2_results = test_step2()

    if step1_results and step2_results:
        banner("COMPARISON: STEP 1 vs STEP 2")

        print("Step 1 Results:")
        for entry in step1_results:
            print(f"Q: {entry['question']}")
            print(f"A: {entry['answer'][:100]}...")
            print()

        print("Step 2 Results:")
        for entry in step2_results:
            n_text = entry['num_text_results']
            n_structured = entry['num_structured_results']
            print(f"Q: {entry['question']}")
            print(f"A: {entry['answer'][:100]}...")
            print(f"Sources: {n_text} text + {n_structured} structured")
            print()

    return basic_rag, enhanced_rag, step1_results, step2_results

In [None]:
def export_results(step1_results, step2_results):
    """Write both result sets to ``step1_step2_results.json`` in the working directory.

    Values that are not JSON-serialisable are stored as their ``str()``
    (``default=str``). On Google Colab the file is additionally offered for
    download; elsewhere that step is skipped.
    """
    results_data = {
        "step1_basic_rag": {
            "pipeline": "Basic RAG",
            "features": ["Text extraction", "Vector search", "Basic generation"],
            "results": step1_results
        },
        "step2_enhanced_rag": {
            "pipeline": "Enhanced RAG with Structured Data",
            "features": ["Text + Table extraction", "Hybrid retrieval", "Enhanced generation"],
            "results": step2_results
        },
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
    }

    filename = "step1_step2_results.json"
    # Explicit encoding so the file does not depend on the platform default.
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(results_data, f, indent=2, default=str)

    print(f"Results exported to {filename}")

    # Download in Colab; outside Colab the export above is the whole job.
    try:
        from google.colab import files
    except ImportError:
        return
    files.download(filename)

# Usage hint printed when this cell runs: names the entry points defined in the cells above.
print("Step 1 and Step 2 implementation complete!")
print("Run test_step1() for basic RAG or test_step2() for enhanced RAG")
print("Run run_complete_pipeline() to test both steps")

In [None]:
# Cell 16: Test Step 1 Output (Run after Cell 12)
def test_step1_output():
    """Simple test to see Step 1 outputs"""
    print("TESTING STEP 1 - BASIC RAG")
    print("="*50)

    # Upload PDF
    pdf_filename = upload_pdf_file()

    if pdf_filename:
        # Initialize and setup Step 1
        basic_rag = BasicRAGPipeline()
        basic_rag.setup(pdf_filename)

        # Test with sample queries
        test_queries = [
            "What was Meta's revenue in Q1 2024?",
            "What were the key financial highlights for Meta in Q1 2024?"
        ]

        for i, query in enumerate(test_queries, 1):
            print(f"\nTEST {i}:")
            print(f"Query: {query}")
            result = basic_rag.query(query)
            print(f"Answer: {result['answer']}")
            print(f"Retrieved chunks: {len(result['retrieved_chunks'])}")
            print("-" * 30)

        return basic_rag
    else:
        print("No PDF uploaded")
        return None

# Run the Step 1 test as soon as this cell executes (it asks for a PDF) and
# keep the returned pipeline; `basic_rag` is None when no PDF was provided.
basic_rag = test_step1_output()

In [None]:
#step 3
!pip install -q nltk rouge-score rank_bm25
import nltk
from rouge_score import rouge_scorer
from rank_bm25 import BM25Okapi
from collections import defaultdict, Counter
import matplotlib.pyplot as plt

# Best-effort fetch of the NLTK resources. A failure is reported rather than
# silently ignored, and Ctrl-C / kernel interrupts are no longer swallowed.
try:
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
except Exception as exc:
    print(f"Warning: could not download NLTK data: {exc}")

In [None]:
class QueryOptimizer:
    """Rule-based query rewriting applied before retrieval.

    Expands a question with financial synonyms, splits comparative/summary
    questions into sub-queries, labels the question type and produces
    alternative phrasings.

    Note: this notebook defines ``QueryOptimizer`` again in a later cell; when
    both cells run, the later definition replaces this one.
    """

    def __init__(self):
        # Domain term -> synonyms appended to the query during expansion.
        self.financial_synonyms = {
            'revenue': ['sales', 'income', 'earnings', 'turnover'],
            'profit': ['income', 'earnings', 'gains'],
            'expenses': ['costs', 'expenditures', 'spending'],
            'growth': ['increase', 'rise', 'expansion'],
            'margin': ['profitability', 'profit margin'],
            'quarter': ['Q1', 'Q2', 'Q3', 'Q4', 'quarterly']
        }

        # Period phrase -> alternative spellings used to build search variations.
        self.temporal_patterns = {
            'Q1 2024': ['first quarter 2024', 'Q1 24', 'quarter 1 2024'],
            'Q1 2023': ['first quarter 2023', 'Q1 23', 'quarter 1 2023'],
            'year-over-year': ['YoY', 'annual growth', 'yearly comparison']
        }

    def optimize_query(self, query):
        """Optimize and expand query for better retrieval.

        Args:
            query: The user's question.

        Returns:
            dict with keys ``original``, ``expanded``, ``sub_queries``,
            ``query_type``, ``search_variations`` and ``optimization_applied``.
        """
        expanded_query = self._expand_with_synonyms(query)

        return {
            'original': query,
            'expanded': expanded_query,
            'sub_queries': self._decompose_query(query),
            'query_type': self._classify_query(query),
            # Variations are derived from the (lower-cased) expanded query.
            'search_variations': self._generate_variations(expanded_query),
            'optimization_applied': True
        }

    def _expand_with_synonyms(self, query):
        """Return the lower-cased query with up to two synonyms appended after each matching word."""
        expanded_words = []

        for word in query.lower().split():
            expanded_words.append(word)
            for term, synonyms in self.financial_synonyms.items():
                # Substring match so punctuation-attached words ("revenue?") still expand.
                if term in word:
                    expanded_words.extend(synonyms[:2])

        return ' '.join(expanded_words)

    def _decompose_query(self, query):
        """Split a comparative or summary question into simpler sub-queries.

        The original query is always the first element; duplicates are removed
        while preserving order so that results are deterministic.
        """
        sub_queries = [query]
        lowered = query.lower()

        if any(marker in lowered for marker in ['compared to', 'vs', 'versus', 'difference']):
            if '2024' in query and '2023' in query:
                # Drop the comparison clause to get the current-period question,
                # then tidy the whitespace the removal leaves behind.
                current_period = query.replace('compared to Q1 2023', '').replace('vs Q1 2023', '')
                current_period = re.sub(r'\s+', ' ', current_period)
                current_period = re.sub(r'\s+([?.!,])', r'\1', current_period).strip()
                sub_queries.append(current_period)
                # Derive the prior-period question from the stripped form so it does
                # not read "... Q1 2023 compared to Q1 2023".
                sub_queries.append(current_period.replace('Q1 2024', 'Q1 2023'))

        if 'summarize' in lowered and 'expenses' in lowered:
            sub_queries.extend([
                'operating expenses breakdown',
                'cost of revenue',
                'research and development expenses'
            ])

        # dict.fromkeys de-duplicates while keeping insertion order (set() would not).
        return list(dict.fromkeys(sub_queries))

    def _classify_query(self, query):
        """Label the query as comparative, summary, financial_metric, factual or general."""
        query_lower = query.lower()

        if any(word in query_lower for word in ['compared to', 'vs', 'versus', 'difference', 'change']):
            return 'comparative'
        elif any(word in query_lower for word in ['summarize', 'summary', 'overview', 'breakdown']):
            return 'summary'
        elif any(word in query_lower for word in ['revenue', 'income', 'expenses', 'margin', 'profit']):
            return 'financial_metric'
        elif '?' in query:
            return 'factual'
        else:
            return 'general'

    def _generate_variations(self, query):
        """Return up to five phrasings of ``query``, the query itself first.

        Matching and replacement are both case-insensitive, so the lower-cased
        expanded query passed in by ``optimize_query`` produces variations too.
        """
        variations = [query]

        for pattern, alternatives in self.temporal_patterns.items():
            matcher = re.compile(re.escape(pattern), re.IGNORECASE)
            if matcher.search(query):
                for alt in alternatives:
                    # Callable replacement: the alternative is inserted literally.
                    variations.append(matcher.sub(lambda _m, alt=alt: alt, query))

        lead_in = re.compile(r'what was', re.IGNORECASE)
        if lead_in.search(query):
            variations.append(lead_in.sub('Show me', query))
            variations.append(lead_in.sub('Find', query))

        # Order-preserving de-duplication keeps the output deterministic.
        return list(dict.fromkeys(variations))[:5]

In [None]:
class AdvancedRetriever:
    """Hybrid retriever: FAISS inner-product search plus BM25 keyword search.

    Text is re-chunked at several window sizes, table text representations are
    appended, and everything is indexed together.

    Note: this notebook defines ``AdvancedRetriever`` again in a later cell;
    when both cells run, the later definition replaces this one.
    """

    def __init__(self, embedding_model_name='all-MiniLM-L6-v2'):
        self.embedding_model = SentenceTransformer(embedding_model_name)
        self.bm25 = None

        # Storage for different retrieval methods
        self.text_chunks = []
        self.structured_data = {}
        self.text_embeddings = None
        self.text_index = None
        self.table_embeddings = None
        self.table_texts = []
        self.table_index = None

        # Advanced features
        self.chunk_metadata = []
        self.retrieval_history = []

    def setup_advanced_retrieval(self, text_chunks, structured_data, chunk_sizes=[300, 500, 800]):
        """Build the multi-scale chunk list, the FAISS index and the BM25 index.

        Args:
            text_chunks: List of text chunks; they are joined and re-split per scale.
            structured_data: Mapping of table id -> dict that has a
                ``'text_representation'`` entry.
            chunk_sizes: Window sizes (in words) used for multi-scale chunking.
                The default list is only read, never mutated.

        Raises:
            ValueError: If there is nothing to index.
        """
        print(f"Setting up advanced retrieval system...")

        self.text_chunks = text_chunks
        self.structured_data = structured_data

        # Create multi-scale chunks
        self.multi_scale_chunks = self._create_multi_scale_chunks(text_chunks, chunk_sizes)

        all_chunks = []
        chunk_metadata = []

        for scale, chunks in self.multi_scale_chunks.items():
            for i, chunk in enumerate(chunks):
                all_chunks.append(chunk)
                chunk_metadata.append({
                    'chunk_id': len(all_chunks) - 1,
                    'scale': scale,
                    'original_index': i,
                    'length': len(chunk),
                    'type': 'text'
                })

        # Add table texts
        for table_id, table_info in structured_data.items():
            table_text = table_info['text_representation']
            all_chunks.append(table_text)
            chunk_metadata.append({
                'chunk_id': len(all_chunks) - 1,
                'scale': 'table',
                'table_id': table_id,
                'length': len(table_text),
                'type': 'structured'
            })

        if not all_chunks:
            # Fail with a clear message instead of an obscure shape/division error below.
            raise ValueError("No text chunks or tables available to index.")

        self.all_chunks = all_chunks
        self.chunk_metadata = chunk_metadata

        # Create embeddings
        print("Creating embeddings for advanced retrieval...")
        embeddings = np.ascontiguousarray(
            self.embedding_model.encode(all_chunks, show_progress_bar=True),
            dtype='float32'
        )
        # normalize_L2 works in place, so it must be given the very array that is
        # added to the index (normalizing an ``astype`` copy has no effect).
        faiss.normalize_L2(embeddings)
        self.all_embeddings = embeddings

        # Create FAISS index (inner product on unit vectors == cosine similarity)
        dimension = embeddings.shape[1]
        self.advanced_index = faiss.IndexFlatIP(dimension)
        self.advanced_index.add(embeddings)

        # Setup BM25 for keyword search
        tokenized_chunks = [chunk.lower().split() for chunk in all_chunks]
        self.bm25 = BM25Okapi(tokenized_chunks)

        print(f"Advanced retrieval setup complete:")
        print(f"  Total chunks: {len(all_chunks)}")
        print(f"  Multi-scale chunks: {len(self.multi_scale_chunks)}")
        print(f"  BM25 index ready")

    def _create_multi_scale_chunks(self, text_chunks, chunk_sizes):
        """Re-split the joined text into word windows of each requested size.

        Each window overlaps the next by ``size // 10`` words; windows of 50
        characters or fewer are dropped. Returns ``{'scale_<size>': [chunks]}``.
        """
        words = ' '.join(text_chunks).split()

        multi_scale = {}

        for size in chunk_sizes:
            chunks = []
            overlap = size // 10

            for i in range(0, len(words), size - overlap):
                chunk = ' '.join(words[i:i + size])
                if len(chunk.strip()) > 50:
                    chunks.append(chunk.strip())

            multi_scale[f'scale_{size}'] = chunks

        return multi_scale

    def advanced_search(self, optimized_query, top_k=10):
        """Run vector and BM25 search for the query and its rewrites, then rank.

        Args:
            optimized_query: dict with ``'original'``, ``'expanded'`` and
                ``'sub_queries'`` entries.
            top_k: Maximum number of results returned.

        Returns:
            List of result dicts enriched by ``_add_metadata_and_filter``.
        """
        all_results = []

        queries_to_search = [
            optimized_query['original'],
            optimized_query['expanded']
        ] + optimized_query['sub_queries']

        # Each method contributes half of top_k, but always at least one hit.
        per_method_k = max(1, top_k // 2)

        for query in queries_to_search[:5]:
            all_results.extend(self._vector_search(query, per_method_k))
            all_results.extend(self._bm25_search(query, per_method_k))

        # Remove duplicates (first occurrence wins)
        unique_results = self._deduplicate_results(all_results)

        # Simple re-ranking based on score.
        # NOTE(review): vector scores are inner products while BM25 scores are
        # unbounded term weights, so the raw values are on different scales and
        # this ordering likely favours BM25 hits. Consider score normalization or
        # rank fusion if ranking quality matters.
        reranked_results = sorted(unique_results, key=lambda x: x.get('score', 0), reverse=True)

        # Final filtering and metadata addition
        final_results = self._add_metadata_and_filter(reranked_results[:top_k])

        # Store retrieval history
        self.retrieval_history.append({
            'query': optimized_query['original'],
            'num_results': len(final_results),
            'timestamp': time.time()
        })

        return final_results

    def _vector_search(self, query, top_k):
        """Return up to ``top_k`` nearest chunks from the FAISS index."""
        if top_k <= 0:
            return []

        query_embedding = np.ascontiguousarray(
            self.embedding_model.encode([query]), dtype='float32'
        )
        # In-place normalization of the array that is actually searched with.
        faiss.normalize_L2(query_embedding)

        scores, indices = self.advanced_index.search(query_embedding, top_k)

        results = []
        for score, idx in zip(scores[0], indices[0]):
            idx = int(idx)
            # FAISS pads missing neighbours with -1; a negative index would
            # otherwise silently select the last chunk.
            if 0 <= idx < len(self.all_chunks):
                results.append({
                    'content': self.all_chunks[idx],
                    'score': float(score),
                    'method': 'vector',
                    'chunk_id': idx,
                    'metadata': self.chunk_metadata[idx]
                })

        return results

    def _bm25_search(self, query, top_k):
        """Return up to ``top_k`` chunks with a positive BM25 score, best first."""
        if self.bm25 is None or top_k <= 0:
            # top_k == 0 must not fall through: ``scores[-0:]`` is the whole array.
            return []

        query_tokens = query.lower().split()
        scores = self.bm25.get_scores(query_tokens)

        top_indices = np.argsort(scores)[-top_k:][::-1]

        results = []
        for idx in top_indices:
            idx = int(idx)  # plain int keeps results JSON-friendly
            if scores[idx] > 0 and idx < len(self.all_chunks):
                results.append({
                    'content': self.all_chunks[idx],
                    'score': float(scores[idx]),
                    'method': 'bm25',
                    'chunk_id': idx,
                    'metadata': self.chunk_metadata[idx]
                })

        return results

    def _deduplicate_results(self, results):
        """Drop results whose first 100 characters were already seen (first wins)."""
        seen_content = set()
        unique_results = []

        for result in results:
            content_key = result['content'][:100]
            if content_key not in seen_content:
                seen_content.add(content_key)
                unique_results.append(result)

        return unique_results

    def _add_metadata_and_filter(self, results):
        """Copy each result and add ``type``, ``chunk_scale``, ``relevance_score``.

        Structured results additionally get ``structured_info`` when their table
        id is present in ``self.structured_data``.
        """
        enhanced_results = []

        for result in results:
            metadata = result['metadata']

            enhanced_result = {
                **result,
                'type': metadata['type'],
                'chunk_scale': metadata.get('scale', 'unknown'),
                'relevance_score': result.get('score', 0)
            }

            if metadata['type'] == 'structured':
                table_id = metadata.get('table_id')
                if table_id in self.structured_data:
                    enhanced_result['structured_info'] = self.structured_data[table_id]

            enhanced_results.append(enhanced_result)

        return enhanced_results

    def iterative_retrieval(self, optimized_query, max_iterations=3):
        """Search repeatedly, swapping in sub-queries, until enough results are found.

        The caller's ``optimized_query`` is left untouched; a shallow copy carries
        the per-iteration query so later steps still see the user's question.

        Returns:
            Up to 10 de-duplicated results.
        """
        all_results = []
        working_query = dict(optimized_query)
        sub_queries = optimized_query['sub_queries']

        for iteration in range(max_iterations):
            print(f"Iterative retrieval - iteration {iteration + 1}")

            results = self.advanced_search(working_query, top_k=5)
            all_results.extend(results)

            # Stop once ten distinct chunks (by 50-char prefix) were collected.
            if len(set(r['content'][:50] for r in all_results)) >= 10:
                break

            if iteration < max_iterations - 1:
                if iteration < len(sub_queries):
                    working_query['original'] = sub_queries[iteration]
                else:
                    break

        final_results = self._deduplicate_results(all_results)
        return final_results[:10]

In [None]:
# Cell 27: Additional imports for Step 3
!pip install -q nltk rouge-score rank_bm25

import nltk
from rouge_score import rouge_scorer
from rank_bm25 import BM25Okapi
from collections import defaultdict, Counter
import matplotlib.pyplot as plt

# Download NLTK data
# Best-effort: a failed download (e.g. no network) is reported but does not
# stop the notebook.
try:
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
except Exception as exc:
    print(f"Warning: could not download NLTK data: {exc}")

# Cell 28: Query Optimizer
class QueryOptimizer:
    """Rule-based query rewriting applied before retrieval.

    Expands a question with financial synonyms, splits comparative/summary
    questions into sub-queries, labels the question type and produces
    alternative phrasings.

    Note: this notebook also defines ``QueryOptimizer`` in an earlier cell; when
    both cells run, this later definition is the one in effect.
    """

    def __init__(self):
        # Domain term -> synonyms appended to the query during expansion.
        self.financial_synonyms = {
            'revenue': ['sales', 'income', 'earnings', 'turnover'],
            'profit': ['income', 'earnings', 'gains'],
            'expenses': ['costs', 'expenditures', 'spending'],
            'growth': ['increase', 'rise', 'expansion'],
            'margin': ['profitability', 'profit margin'],
            'quarter': ['Q1', 'Q2', 'Q3', 'Q4', 'quarterly']
        }

        # Period phrase -> alternative spellings used to build search variations.
        self.temporal_patterns = {
            'Q1 2024': ['first quarter 2024', 'Q1 24', 'quarter 1 2024'],
            'Q1 2023': ['first quarter 2023', 'Q1 23', 'quarter 1 2023'],
            'year-over-year': ['YoY', 'annual growth', 'yearly comparison']
        }

    def optimize_query(self, query):
        """Optimize and expand query for better retrieval.

        Args:
            query: The user's question.

        Returns:
            dict with keys ``original``, ``expanded``, ``sub_queries``,
            ``query_type``, ``search_variations`` and ``optimization_applied``.
        """
        expanded_query = self._expand_with_synonyms(query)

        return {
            'original': query,
            'expanded': expanded_query,
            'sub_queries': self._decompose_query(query),
            'query_type': self._classify_query(query),
            # Variations are derived from the (lower-cased) expanded query.
            'search_variations': self._generate_variations(expanded_query),
            'optimization_applied': True
        }

    def _expand_with_synonyms(self, query):
        """Return the lower-cased query with up to two synonyms appended after each matching word."""
        expanded_words = []

        for word in query.lower().split():
            expanded_words.append(word)
            for term, synonyms in self.financial_synonyms.items():
                # Substring match so punctuation-attached words ("revenue?") still expand.
                if term in word:
                    expanded_words.extend(synonyms[:2])

        return ' '.join(expanded_words)

    def _decompose_query(self, query):
        """Split a comparative or summary question into simpler sub-queries.

        The original query is always the first element; duplicates are removed
        while preserving order so that results are deterministic.
        """
        sub_queries = [query]
        lowered = query.lower()

        if any(marker in lowered for marker in ['compared to', 'vs', 'versus', 'difference']):
            if '2024' in query and '2023' in query:
                # Drop the comparison clause to get the current-period question,
                # then tidy the whitespace the removal leaves behind.
                current_period = query.replace('compared to Q1 2023', '').replace('vs Q1 2023', '')
                current_period = re.sub(r'\s+', ' ', current_period)
                current_period = re.sub(r'\s+([?.!,])', r'\1', current_period).strip()
                sub_queries.append(current_period)
                # Derive the prior-period question from the stripped form so it does
                # not read "... Q1 2023 compared to Q1 2023".
                sub_queries.append(current_period.replace('Q1 2024', 'Q1 2023'))

        if 'summarize' in lowered and 'expenses' in lowered:
            sub_queries.extend([
                'operating expenses breakdown',
                'cost of revenue',
                'research and development expenses'
            ])

        # dict.fromkeys de-duplicates while keeping insertion order (set() would not).
        return list(dict.fromkeys(sub_queries))

    def _classify_query(self, query):
        """Label the query as comparative, summary, financial_metric, factual or general."""
        query_lower = query.lower()

        if any(word in query_lower for word in ['compared to', 'vs', 'versus', 'difference', 'change']):
            return 'comparative'
        elif any(word in query_lower for word in ['summarize', 'summary', 'overview', 'breakdown']):
            return 'summary'
        elif any(word in query_lower for word in ['revenue', 'income', 'expenses', 'margin', 'profit']):
            return 'financial_metric'
        elif '?' in query:
            return 'factual'
        else:
            return 'general'

    def _generate_variations(self, query):
        """Return up to five phrasings of ``query``, the query itself first.

        Matching and replacement are both case-insensitive, so the lower-cased
        expanded query passed in by ``optimize_query`` produces variations too.
        """
        variations = [query]

        for pattern, alternatives in self.temporal_patterns.items():
            matcher = re.compile(re.escape(pattern), re.IGNORECASE)
            if matcher.search(query):
                for alt in alternatives:
                    # Callable replacement: the alternative is inserted literally.
                    variations.append(matcher.sub(lambda _m, alt=alt: alt, query))

        lead_in = re.compile(r'what was', re.IGNORECASE)
        if lead_in.search(query):
            variations.append(lead_in.sub('Show me', query))
            variations.append(lead_in.sub('Find', query))

        # Order-preserving de-duplication keeps the output deterministic.
        return list(dict.fromkeys(variations))[:5]

# Cell 29: Advanced Retriever with Re-ranking
class AdvancedRetriever:
    """Hybrid retriever: FAISS inner-product search plus BM25 keyword search.

    Text is re-chunked at several window sizes, table text representations are
    appended, and everything is indexed together.

    Note: this notebook also defines ``AdvancedRetriever`` in an earlier cell;
    when both cells run, this later definition is the one in effect.
    """

    def __init__(self, embedding_model_name='all-MiniLM-L6-v2'):
        self.embedding_model = SentenceTransformer(embedding_model_name)
        self.bm25 = None

        # Storage for different retrieval methods
        self.text_chunks = []
        self.structured_data = {}
        self.text_embeddings = None
        self.text_index = None
        self.table_embeddings = None
        self.table_texts = []
        self.table_index = None

        # Advanced features
        self.chunk_metadata = []
        self.retrieval_history = []

    def setup_advanced_retrieval(self, text_chunks, structured_data, chunk_sizes=[300, 500, 800]):
        """Build the multi-scale chunk list, the FAISS index and the BM25 index.

        Args:
            text_chunks: List of text chunks; they are joined and re-split per scale.
            structured_data: Mapping of table id -> dict that has a
                ``'text_representation'`` entry.
            chunk_sizes: Window sizes (in words) used for multi-scale chunking.
                The default list is only read, never mutated.

        Raises:
            ValueError: If there is nothing to index.
        """
        print(f"Setting up advanced retrieval system...")

        self.text_chunks = text_chunks
        self.structured_data = structured_data

        # Create multi-scale chunks
        self.multi_scale_chunks = self._create_multi_scale_chunks(text_chunks, chunk_sizes)

        all_chunks = []
        chunk_metadata = []

        for scale, chunks in self.multi_scale_chunks.items():
            for i, chunk in enumerate(chunks):
                all_chunks.append(chunk)
                chunk_metadata.append({
                    'chunk_id': len(all_chunks) - 1,
                    'scale': scale,
                    'original_index': i,
                    'length': len(chunk),
                    'type': 'text'
                })

        # Add table texts
        for table_id, table_info in structured_data.items():
            table_text = table_info['text_representation']
            all_chunks.append(table_text)
            chunk_metadata.append({
                'chunk_id': len(all_chunks) - 1,
                'scale': 'table',
                'table_id': table_id,
                'length': len(table_text),
                'type': 'structured'
            })

        if not all_chunks:
            # Fail with a clear message instead of an obscure shape/division error below.
            raise ValueError("No text chunks or tables available to index.")

        self.all_chunks = all_chunks
        self.chunk_metadata = chunk_metadata

        # Create embeddings
        print("Creating embeddings for advanced retrieval...")
        embeddings = np.ascontiguousarray(
            self.embedding_model.encode(all_chunks, show_progress_bar=True),
            dtype='float32'
        )
        # normalize_L2 works in place, so it must be given the very array that is
        # added to the index (normalizing an ``astype`` copy has no effect).
        faiss.normalize_L2(embeddings)
        self.all_embeddings = embeddings

        # Create FAISS index (inner product on unit vectors == cosine similarity)
        dimension = embeddings.shape[1]
        self.advanced_index = faiss.IndexFlatIP(dimension)
        self.advanced_index.add(embeddings)

        # Setup BM25 for keyword search
        tokenized_chunks = [chunk.lower().split() for chunk in all_chunks]
        self.bm25 = BM25Okapi(tokenized_chunks)

        print(f"Advanced retrieval setup complete:")
        print(f"  Total chunks: {len(all_chunks)}")
        print(f"  Multi-scale chunks: {len(self.multi_scale_chunks)}")
        print(f"  BM25 index ready")

    def _create_multi_scale_chunks(self, text_chunks, chunk_sizes):
        """Re-split the joined text into word windows of each requested size.

        Each window overlaps the next by ``size // 10`` words; windows of 50
        characters or fewer are dropped. Returns ``{'scale_<size>': [chunks]}``.
        """
        words = ' '.join(text_chunks).split()

        multi_scale = {}

        for size in chunk_sizes:
            chunks = []
            overlap = size // 10

            for i in range(0, len(words), size - overlap):
                chunk = ' '.join(words[i:i + size])
                if len(chunk.strip()) > 50:
                    chunks.append(chunk.strip())

            multi_scale[f'scale_{size}'] = chunks

        return multi_scale

    def advanced_search(self, optimized_query, top_k=10):
        """Run vector and BM25 search for the query and its rewrites, then rank.

        Args:
            optimized_query: dict with ``'original'``, ``'expanded'`` and
                ``'sub_queries'`` entries.
            top_k: Maximum number of results returned.

        Returns:
            List of result dicts enriched by ``_add_metadata_and_filter``.
        """
        all_results = []

        queries_to_search = [
            optimized_query['original'],
            optimized_query['expanded']
        ] + optimized_query['sub_queries']

        # Each method contributes half of top_k, but always at least one hit.
        per_method_k = max(1, top_k // 2)

        for query in queries_to_search[:5]:
            all_results.extend(self._vector_search(query, per_method_k))
            all_results.extend(self._bm25_search(query, per_method_k))

        # Remove duplicates (first occurrence wins)
        unique_results = self._deduplicate_results(all_results)

        # Simple re-ranking based on score.
        # NOTE(review): vector scores are inner products while BM25 scores are
        # unbounded term weights, so the raw values are on different scales and
        # this ordering likely favours BM25 hits. Consider score normalization or
        # rank fusion if ranking quality matters.
        reranked_results = sorted(unique_results, key=lambda x: x.get('score', 0), reverse=True)

        # Final filtering and metadata addition
        final_results = self._add_metadata_and_filter(reranked_results[:top_k])

        # Store retrieval history
        self.retrieval_history.append({
            'query': optimized_query['original'],
            'num_results': len(final_results),
            'timestamp': time.time()
        })

        return final_results

    def _vector_search(self, query, top_k):
        """Return up to ``top_k`` nearest chunks from the FAISS index."""
        if top_k <= 0:
            return []

        query_embedding = np.ascontiguousarray(
            self.embedding_model.encode([query]), dtype='float32'
        )
        # In-place normalization of the array that is actually searched with.
        faiss.normalize_L2(query_embedding)

        scores, indices = self.advanced_index.search(query_embedding, top_k)

        results = []
        for score, idx in zip(scores[0], indices[0]):
            idx = int(idx)
            # FAISS pads missing neighbours with -1; a negative index would
            # otherwise silently select the last chunk.
            if 0 <= idx < len(self.all_chunks):
                results.append({
                    'content': self.all_chunks[idx],
                    'score': float(score),
                    'method': 'vector',
                    'chunk_id': idx,
                    'metadata': self.chunk_metadata[idx]
                })

        return results

    def _bm25_search(self, query, top_k):
        """Return up to ``top_k`` chunks with a positive BM25 score, best first."""
        if self.bm25 is None or top_k <= 0:
            # top_k == 0 must not fall through: ``scores[-0:]`` is the whole array.
            return []

        query_tokens = query.lower().split()
        scores = self.bm25.get_scores(query_tokens)

        top_indices = np.argsort(scores)[-top_k:][::-1]

        results = []
        for idx in top_indices:
            idx = int(idx)  # plain int keeps results JSON-friendly
            if scores[idx] > 0 and idx < len(self.all_chunks):
                results.append({
                    'content': self.all_chunks[idx],
                    'score': float(scores[idx]),
                    'method': 'bm25',
                    'chunk_id': idx,
                    'metadata': self.chunk_metadata[idx]
                })

        return results

    def _deduplicate_results(self, results):
        """Drop results whose first 100 characters were already seen (first wins)."""
        seen_content = set()
        unique_results = []

        for result in results:
            content_key = result['content'][:100]
            if content_key not in seen_content:
                seen_content.add(content_key)
                unique_results.append(result)

        return unique_results

    def _add_metadata_and_filter(self, results):
        """Copy each result and add ``type``, ``chunk_scale``, ``relevance_score``.

        Structured results additionally get ``structured_info`` when their table
        id is present in ``self.structured_data``.
        """
        enhanced_results = []

        for result in results:
            metadata = result['metadata']

            enhanced_result = {
                **result,
                'type': metadata['type'],
                'chunk_scale': metadata.get('scale', 'unknown'),
                'relevance_score': result.get('score', 0)
            }

            if metadata['type'] == 'structured':
                table_id = metadata.get('table_id')
                if table_id in self.structured_data:
                    enhanced_result['structured_info'] = self.structured_data[table_id]

            enhanced_results.append(enhanced_result)

        return enhanced_results

    def iterative_retrieval(self, optimized_query, max_iterations=3):
        """Search repeatedly, swapping in sub-queries, until enough results are found.

        The caller's ``optimized_query`` is left untouched; a shallow copy carries
        the per-iteration query so later steps still see the user's question.

        Returns:
            Up to 10 de-duplicated results.
        """
        all_results = []
        working_query = dict(optimized_query)
        sub_queries = optimized_query['sub_queries']

        for iteration in range(max_iterations):
            print(f"Iterative retrieval - iteration {iteration + 1}")

            results = self.advanced_search(working_query, top_k=5)
            all_results.extend(results)

            # Stop once ten distinct chunks (by 50-char prefix) were collected.
            if len(set(r['content'][:50] for r in all_results)) >= 10:
                break

            if iteration < max_iterations - 1:
                if iteration < len(sub_queries):
                    working_query['original'] = sub_queries[iteration]
                else:
                    break

        final_results = self._deduplicate_results(all_results)
        return final_results[:10]

# Cell 30: Advanced Answer Generator
class AdvancedGenerator:
    """Template-based answer generator that dispatches on the query type.

    The comparative and summary templates look for specific figures via
    hard-coded regular expressions; other templates quote retrieved content.
    """

    def __init__(self):
        # Query type -> bound method producing the raw answer text.
        self.answer_templates = {
            'comparative': self._generate_comparative_answer,
            'summary': self._generate_summary_answer,
            'financial_metric': self._generate_financial_answer,
            'factual': self._generate_factual_answer,
            'general': self._generate_general_answer
        }

    def generate_enhanced_answer(self, optimized_query, search_results, max_length=300):
        """Generate an answer using the template for the query's type.

        Args:
            optimized_query: dict with ``'query_type'`` and ``'original'`` entries.
            search_results: List of result dicts, each with a ``'content'`` entry.
            max_length: Passed through to the templates; none of them currently
                applies it.

        Returns:
            The post-processed answer string.
        """
        query_type = optimized_query['query_type']
        original_query = optimized_query['original']

        # Unknown types fall back to the general template.
        generator_func = self.answer_templates.get(query_type, self._generate_general_answer)

        answer = generator_func(original_query, search_results, max_length)
        return self._post_process_answer(answer, search_results)

    @staticmethod
    def _format_billions(value):
        """Render a figure given in millions ('12369', '12,369' or '12.369') as billions ('12.369')."""
        digits = value.replace(',', '').replace('.', '')
        return f"{digits[:-3]}.{digits[-3:]}"

    def _generate_comparative_answer(self, query, results, max_length):
        """Build a year-over-year net income comparison when both figures are found.

        Falls back to quoting the first result (or a fixed message when there
        are no results).
        """
        if 'net income' in query.lower():
            all_content = ''.join(result['content'] + " " for result in results)

            # Look for specific net income figures
            income_2024_match = re.search(r'(?:net income|income).*?(?:2024).*?\$?\s*(12[,.]?369|12369)', all_content, re.IGNORECASE)
            income_2023_match = re.search(r'(?:net income|income).*?(?:2023).*?\$?\s*(5[,.]?709|5709)', all_content, re.IGNORECASE)

            # Alternative patterns (no capture group, so the whole match is used below)
            if not income_2024_match:
                income_2024_match = re.search(r'12[,.]?369.*?(?:million|billion)', all_content, re.IGNORECASE)
            if not income_2023_match:
                income_2023_match = re.search(r'5[,.]?709.*?(?:million|billion)', all_content, re.IGNORECASE)

            if income_2024_match and income_2023_match:
                income_2024 = income_2024_match.group(1).replace(',', '') if income_2024_match.lastindex else income_2024_match.group(0).replace(',', '').replace('million', '').replace('billion', '').strip()
                income_2023 = income_2023_match.group(1).replace(',', '') if income_2023_match.lastindex else income_2023_match.group(0).replace(',', '').replace('million', '').replace('billion', '').strip()

                # Extract just the numbers
                income_2024_num = re.search(r'(12\.?369|12369)', income_2024)
                income_2023_num = re.search(r'(5\.?709|5709)', income_2023)

                if income_2024_num and income_2023_num:
                    digits_2024 = income_2024_num.group(1).replace('.', '')
                    digits_2023 = income_2023_num.group(1).replace('.', '')

                    # The matched figures are in millions; show them as billions so
                    # the unit in the sentence is right ("$12.369 billion").
                    val_2024 = self._format_billions(digits_2024)
                    val_2023 = self._format_billions(digits_2023)

                    try:
                        change_pct = ((float(digits_2024) - float(digits_2023)) / float(digits_2023)) * 100
                    except (ValueError, ZeroDivisionError):
                        return f"Meta's net income was ${val_2024} billion in Q1 2024 compared to ${val_2023} billion in Q1 2023."
                    return f"Meta's net income was ${val_2024} billion in Q1 2024 compared to ${val_2023} billion in Q1 2023, representing a {change_pct:.0f}% increase year-over-year."

        # Fallback
        if results:
            return "Based on the financial data: " + results[0]['content'][:150] + "..."
        return "No comparison data available."

    def _generate_summary_answer(self, query, results, max_length):
        """Build an operating-expense breakdown when expense figures are found.

        Falls back to quoting the first result (or a fixed message when there
        are no results).
        """
        if 'expenses' in query.lower() or 'operating expenses' in query.lower():
            all_content = ''.join(result['content'] + " " for result in results)

            # Look for expense breakdown
            expense_patterns = {
                'Cost of revenue': r'(?:cost of revenue|Cost of revenue).*?\$?\s*(6[,.]?640|6640)',
                'Research and development': r'(?:research and development|R&D).*?\$?\s*(9[,.]?978|9978)',
                'Marketing and sales': r'(?:marketing and sales|Marketing).*?\$?\s*(2[,.]?564|2564)',
                'General and administrative': r'(?:general and administrative|G&A).*?\$?\s*(3[,.]?455|3455)'
            }

            expenses_found = {}
            for category, pattern in expense_patterns.items():
                match = re.search(pattern, all_content, re.IGNORECASE)
                if match:
                    amount = match.group(1).replace(',', '')
                    # Convert to billions
                    if '.' not in amount and len(amount) == 4:
                        amount = amount[:1] + '.' + amount[1:]
                    expenses_found[category] = amount

            if expenses_found:
                expense_list = [f"{cat}: ${amt} billion" for cat, amt in expenses_found.items()]
                total_match = re.search(r'(?:total costs|total expenses).*?\$?\s*(22[,.]?637|22637)', all_content, re.IGNORECASE)

                summary = f"Meta's Q1 2024 operating expenses breakdown: {'; '.join(expense_list)}"
                if total_match:
                    total = total_match.group(1).replace(',', '')
                    if '.' not in total:
                        total = total[:2] + '.' + total[2:]
                    summary += f". Total costs and expenses: ${total} billion"

                return summary + "."

        if results:
            return "Summary based on available data: " + results[0]['content'][:200] + "..."
        return "No summary data available."

    def _generate_financial_answer(self, query, results, max_length):
        """Financial-metric questions use the general template."""
        return self._generate_general_answer(query, results, max_length)

    def _generate_factual_answer(self, query, results, max_length):
        """Answer from the best-matching table key/value pair, else quote the first text result."""
        structured_results = [r for r in results if r.get('type') == 'structured']
        text_results = [r for r in results if r.get('type') == 'text']

        if structured_results and 'structured_info' in structured_results[0]:
            kv_pairs = structured_results[0]['structured_info'].get('key_value_pairs', {})
            if kv_pairs:
                query_words = set(query.lower().split())
                best_match = None
                best_score = 0

                # Pick the key sharing the most words with the question.
                for key, value in kv_pairs.items():
                    key_words = set(key.lower().split())
                    overlap = len(query_words.intersection(key_words))
                    if overlap > best_score:
                        best_score = overlap
                        best_match = (key, value)

                if best_match:
                    return f"According to the financial report: {best_match[0]}: {best_match[1]}"

        if text_results:
            return f"Based on the report: {text_results[0]['content'][:200]}..."

        return "I found relevant information in the financial report, but couldn't extract a specific answer."

    def _generate_general_answer(self, query, results, max_length):
        """Quote the leading structured and text snippets among the top three results."""
        if not results:
            return "I couldn't find relevant information to answer your question."

        text_parts = []
        structured_parts = []

        for result in results[:3]:
            if result.get('type') == 'text':
                text_parts.append(result['content'][:100])
            elif result.get('type') == 'structured':
                structured_parts.append(result['content'][:100])

        answer_parts = []
        if structured_parts:
            answer_parts.append(f"From structured data: {structured_parts[0]}...")
        if text_parts:
            answer_parts.append(f"Additional context: {text_parts[0]}...")

        if not answer_parts:
            # Results without a recognised 'type' would otherwise yield an empty answer.
            return f"Based on the report: {results[0]['content'][:200]}..."

        return " ".join(answer_parts)

    def _post_process_answer(self, answer, results):
        """Normalize whitespace, ensure a trailing period and append a source count."""
        answer = re.sub(r'\s+', ' ', answer)
        # "$ 12" -> "$12"
        answer = re.sub(r'\$\s+(\d)', r'$\1', answer)

        if answer and not answer.endswith('.'):
            answer += '.'

        # Sources are counted by distinct 50-character content prefix.
        num_sources = len(set(r['content'][:50] for r in results))
        if num_sources > 1:
            answer += f" (Based on {num_sources} sources from the financial report)"

        return answer

In [None]:
class AdvancedRAGPipeline:
    """Orchestrates PDF extraction, query optimisation, retrieval and answer generation.

    Call :meth:`setup` once with a PDF path, then :meth:`query` for each question.
    Every query is appended to ``query_history`` and its elapsed time and result
    count are appended to ``performance_metrics``.
    """

    # Query types for which iterative retrieval is used when the caller asks for it.
    ITERATIVE_QUERY_TYPES = ('comparative', 'summary')

    def __init__(self):
        self.pdf_processor = EnhancedPDFProcessor()
        self.query_optimizer = QueryOptimizer()
        self.advanced_retriever = AdvancedRetriever()
        self.advanced_generator = AdvancedGenerator()

        self.text_chunks = []
        self.structured_data = {}
        self.setup_complete = False

        self.query_history = []
        self.performance_metrics = defaultdict(list)

    def setup(self, pdf_path, chunk_sizes=None):
        """Setup advanced RAG pipeline with query optimization and re-ranking

        Args:
            pdf_path: Path of the PDF handed to the PDF processor.
            chunk_sizes: Sizes forwarded to the retriever's
                ``setup_advanced_retrieval``. Defaults to ``[300, 500, 800]``;
                a fresh list is built on every call so the default is never
                shared between calls, even if the retriever mutates it.
        """
        if chunk_sizes is None:
            chunk_sizes = [300, 500, 800]

        print("Setting up Advanced RAG Pipeline")

        # Extract text and tables
        raw_text, tables = self.pdf_processor.extract_text_and_tables(pdf_path)

        # Process text chunks
        clean_text = self._clean_text(raw_text)
        self.text_chunks = self._chunk_text(clean_text, 500, 50)

        # Process structured data
        self.structured_data = self.pdf_processor.process_structured_data(tables)

        # Setup advanced retrieval
        self.advanced_retriever.setup_advanced_retrieval(
            self.text_chunks,
            self.structured_data,
            chunk_sizes
        )

        self.setup_complete = True

        print("Advanced RAG Pipeline setup complete!")
        print(f"   - Text chunks: {len(self.text_chunks)}")
        print(f"   - Tables: {len(self.structured_data)}")
        print(f"   - Multi-scale chunks: {len(chunk_sizes)} scales")
        print("   - Query optimization: Enabled")
        print("   - Advanced retrieval: Ready")

    @staticmethod
    def _result_score(result):
        """Return a result's ``'relevance_score'``, falling back to ``'score'``, then 0."""
        return result.get('relevance_score', result.get('score', 0))

    def query(self, question, use_iterative=False):
        """Advanced query processing with optimization and re-ranking

        Args:
            question: The natural-language question.
            use_iterative: Request iterative retrieval. It is only honoured when
                the optimised query's type is one of ``ITERATIVE_QUERY_TYPES``.

        Returns:
            A dict with ``question``, ``answer``, ``optimized_query``,
            ``search_results``, ``query_time`` and a ``performance_metrics``
            dict (text/structured result counts and the average score).

        Raises:
            ValueError: If :meth:`setup` has not completed.
        """
        if not self.setup_complete:
            raise ValueError("Pipeline not setup. Call setup() first.")

        start_time = time.time()

        print(f"Advanced Query: {question}")

        # Query optimization
        optimized_query = self.query_optimizer.optimize_query(question)
        print(f"Query type: {optimized_query['query_type']}")
        print(f"Sub-queries: {len(optimized_query['sub_queries'])}")

        # Advanced retrieval. Track whether the iterative path actually ran so the
        # history records what happened rather than what was requested.
        used_iterative = bool(use_iterative) and optimized_query['query_type'] in self.ITERATIVE_QUERY_TYPES
        if used_iterative:
            search_results = self.advanced_retriever.iterative_retrieval(optimized_query)
            print("Used iterative retrieval")
        else:
            search_results = self.advanced_retriever.advanced_search(optimized_query)

        print(f"Retrieved {len(search_results)} results with advanced methods")

        # Display top results
        for i, result in enumerate(search_results[:3]):
            score = self._result_score(result)
            result_type = result.get('type', 'unknown')
            content_preview = result['content'][:80] + "..." if len(result['content']) > 80 else result['content']
            print(f"   {i+1}. [{result_type.upper()}] Score: {score:.3f} - {content_preview}")

        # Enhanced answer generation
        answer = self.advanced_generator.generate_enhanced_answer(optimized_query, search_results)

        query_time = time.time() - start_time

        print(f"Enhanced Answer: {answer}")
        print(f"Total time: {query_time:.3f}s")

        # Store performance metrics
        self.performance_metrics['query_time'].append(query_time)
        self.performance_metrics['num_results'].append(len(search_results))

        # Store query history
        query_record = {
            'timestamp': time.time(),
            'original_query': question,
            'optimized_query': optimized_query,
            'answer': answer,
            'num_results': len(search_results),
            'query_time': query_time,
            'used_iterative': used_iterative
        }
        self.query_history.append(query_record)

        # Average with the same score fallback used for the display above, so
        # results that only carry 'score' are not counted as zero.
        if search_results:
            avg_relevance_score = sum(self._result_score(r) for r in search_results) / len(search_results)
        else:
            avg_relevance_score = 0

        return {
            'question': question,
            'answer': answer,
            'optimized_query': optimized_query,
            'search_results': search_results,
            'query_time': query_time,
            'performance_metrics': {
                'num_text_results': len([r for r in search_results if r.get('type') == 'text']),
                'num_structured_results': len([r for r in search_results if r.get('type') == 'structured']),
                'avg_relevance_score': avg_relevance_score
            }
        }

    def _clean_text(self, text):
        """Normalise whitespace and punctuation of extracted PDF text.

        Whitespace runs (newlines included) become single spaces, characters other
        than word characters, whitespace and ``$ % . , - ( )`` are replaced by a
        space, a space is inserted after every ``$`` and before every ``%``, and
        the result is stripped.
        """
        # r'\s+' already matches newline runs, so no separate newline pass is needed.
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s\$\%\.\,\-\(\)]', ' ', text)
        text = text.replace('$', '$ ')
        text = text.replace('%', ' %')
        return text.strip()

    def _chunk_text(self, text, chunk_size, overlap):
        """Split ``text`` into word windows of ``chunk_size`` sharing ``overlap`` words.

        Args:
            text: Text to split on whitespace.
            chunk_size: Maximum number of words per chunk.
            overlap: Number of words shared by consecutive chunks.

        Returns:
            A list of space-joined chunks; empty when ``text`` has no words.

        Raises:
            ValueError: If ``chunk_size`` is not greater than ``overlap``, because
                the window could then never advance.
        """
        step = chunk_size - overlap
        if step <= 0:
            raise ValueError("chunk_size must be greater than overlap")

        words = text.split()
        # Every window starts inside `words`, so each joined chunk is non-empty.
        return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), step)]

In [None]:
def test_step3():
    """Smoke-test the Step 3 pipeline on a user-supplied PDF.

    Builds an AdvancedRAGPipeline for the file name returned by
    ``upload_pdf_file()``, runs two sample questions through it and prints each
    outcome.

    Returns:
        The configured pipeline, or None when no PDF file name was returned.
    """
    print("TESTING STEP 3 - ADVANCED RAG PIPELINE")
    print("=" * 60)

    pdf_filename = upload_pdf_file()
    if not pdf_filename:
        print("No PDF uploaded")
        return None

    pipeline = AdvancedRAGPipeline()
    pipeline.setup(pdf_filename)

    sample_questions = (
        "What was Meta's net income in Q1 2024 compared to Q1 2023?",
        "Summarize Meta's operating expenses in Q1 2024.",
    )

    for number, question in enumerate(sample_questions, start=1):
        print(f"\nADVANCED TEST {number}:")
        print(f"Query: {question}")

        # Iterative retrieval is requested for the first question only.
        outcome = pipeline.query(question, use_iterative=(number == 1))

        print(f"Answer: {outcome['answer']}")
        print(f"Query time: {outcome['query_time']:.3f}s")
        counts = outcome['performance_metrics']
        print(f"Sources: {counts['num_text_results']} text + {counts['num_structured_results']} structured")
        print("-" * 50)

    return pipeline

# Run the Step 3 test when this cell executes. The return value (the configured
# pipeline, or None when no PDF was supplied) is kept in the module-level name
# `advanced_rag`, which later cells look up via globals().
advanced_rag = test_step3()

In [None]:
# Compare All Three Steps
def compare_all_steps():
    """Print a comparison of the three pipeline steps for one fixed query.

    The Step 1 and Step 2 sections are fixed descriptive text; nothing is run for
    them. Step 3 is executed only when a truthy module-level ``advanced_rag``
    exists; otherwise a "not available" line is printed.
    """
    question = "What was Meta's net income in Q1 2024 compared to Q1 2023?"

    preamble = (
        "COMPARISON: STEP 1 vs STEP 2 vs STEP 3",
        "=" * 60,
        f"Test Query: {question}",
        "",
        # Step 1: static description only.
        "STEP 1 (Basic RAG):",
        "Answer: Generic text-based response without specific figures",
        "Features: Text chunking, vector search, basic generation",
        "",
        # Step 2: static description only.
        "STEP 2 (Enhanced RAG):",
        "Answer: Some structured data integration but incomplete extraction",
        "Features: + Table extraction, hybrid retrieval, enhanced generation",
        "",
    )
    for line in preamble:
        print(line)

    # Step 3: run the live pipeline if an earlier cell created one.
    pipeline = globals().get('advanced_rag')
    if not pipeline:
        print("STEP 3: Not available (run test_step3 first)")
        return

    print("STEP 3 (Advanced RAG):")
    outcome = pipeline.query(question)
    print(f"Answer: {outcome['answer']}")
    print("Features: + Query optimization, multi-scale chunks, BM25+vector fusion, iterative retrieval")
    print(f"Query time: {outcome['query_time']:.3f}s")
    print(f"Query type detected: {outcome['optimized_query']['query_type']}")

# Run the comparison when this cell executes. compare_all_steps() only prints;
# it has no return statement, so nothing is captured here.
compare_all_steps()

In [None]:
class EvaluationFramework:
    """Scores a RAG pipeline's answers against a small fixed set of test questions.

    Answers are scored with ROUGE-1/2/L F-measures, a figure-accuracy ratio
    (share of expected numeric strings found in the answer) and a length score.
    """

    def __init__(self):
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    def create_test_set(self):
        """Create test set for evaluation

        Returns:
            A list of dicts with ``query``, ``type``, ``expected_answer`` and
            ``expected_figures`` keys.
        """
        test_queries = [
            {
                "query": "What was Meta's revenue in Q1 2024?",
                "type": "factual",
                "expected_answer": "Meta's revenue was $36.455 billion in Q1 2024",
                "expected_figures": ["36.455", "36,455"]
            },
            {
                "query": "What was Meta's net income in Q1 2024?",
                "type": "factual",
                "expected_answer": "Meta's net income was $12.369 billion in Q1 2024",
                "expected_figures": ["12.369", "12,369"]
            },
            {
                "query": "What was Meta's net income in Q1 2024 compared to Q1 2023?",
                "type": "comparative",
                "expected_answer": "Meta's net income was $12.369 billion in Q1 2024 compared to $5.709 billion in Q1 2023, representing a 117% increase",
                "expected_figures": ["12.369", "5.709", "117"]
            },
            {
                "query": "Summarize Meta's operating expenses in Q1 2024",
                "type": "summary",
                "expected_answer": "Meta's Q1 2024 operating expenses included: cost of revenue $6.640 billion, R&D $9.978 billion, marketing $2.564 billion, G&A $3.455 billion",
                "expected_figures": ["6.640", "9.978", "2.564", "3.455"]
            },
            {
                "query": "What was Meta's operating margin in Q1 2024?",
                "type": "factual",
                "expected_answer": "Meta's operating margin was 38% in Q1 2024",
                "expected_figures": ["38"]
            }
        ]

        return test_queries

    def evaluate_answer_quality(self, generated_answer, expected_answer, expected_figures=None):
        """Evaluate answer quality using ROUGE and custom metrics

        Args:
            generated_answer: The answer text produced by the pipeline.
            expected_answer: The reference answer used as the ROUGE target.
            expected_figures: Optional list of numeric strings that should appear
                in the answer. Matching is by substring with commas removed from
                both the figure and the answer.

        Returns:
            A dict with ``rouge1_f``, ``rouge2_f``, ``rougeL_f``,
            ``figure_accuracy`` (0.0 when no figures are expected),
            ``length_score`` and ``word_count``.
        """
        # ROUGE scores
        rouge_scores = self.rouge_scorer.score(expected_answer, generated_answer)

        # Custom financial accuracy
        figure_accuracy = 0.0
        if expected_figures:
            # Strip thousands separators on BOTH sides. Stripping them from the
            # answer alone makes an expected figure such as "36,455" unmatchable.
            normalized_answer = generated_answer.replace(',', '')
            found_figures = sum(
                1 for figure in expected_figures
                if figure.replace(',', '') in normalized_answer
            )
            figure_accuracy = found_figures / len(expected_figures)

        # Length appropriateness: full marks for 50-300 words, otherwise a linear
        # penalty on the distance from 175 words, floored at zero.
        word_count = len(generated_answer.split())
        length_score = 1.0 if 50 <= word_count <= 300 else max(0.0, 1.0 - abs(word_count - 175) / 175)

        return {
            "rouge1_f": rouge_scores['rouge1'].fmeasure,
            "rouge2_f": rouge_scores['rouge2'].fmeasure,
            "rougeL_f": rouge_scores['rougeL'].fmeasure,
            "figure_accuracy": figure_accuracy,
            "length_score": length_score,
            "word_count": word_count
        }

    def run_evaluation(self, rag_pipeline):
        """Run evaluation on the pipeline

        Args:
            rag_pipeline: Object whose ``query(question)`` returns a dict with an
                ``'answer'`` key and, optionally, ``'query_time'``.

        Returns:
            A dict with ``individual_results`` (one record per test query),
            ``average_metrics`` (``avg_``-prefixed means) and ``total_queries``.
        """
        test_set = self.create_test_set()
        results = []

        print(f"Running evaluation on {len(test_set)} test queries...")

        for i, test_case in enumerate(test_set):
            print(f"Testing query {i+1}: {test_case['query'][:50]}...")

            # Run the pipeline
            result = rag_pipeline.query(test_case['query'])

            # Evaluate answer quality
            metrics = self.evaluate_answer_quality(
                result['answer'],
                test_case['expected_answer'],
                test_case.get('expected_figures')
            )

            results.append({
                "query": test_case['query'],
                "type": test_case['type'],
                "generated_answer": result['answer'],
                "expected_answer": test_case['expected_answer'],
                "query_time": result.get('query_time', 0),
                **metrics
            })

        # Calculate averages
        avg_metrics = {}
        for key in ['rouge1_f', 'rouge2_f', 'rougeL_f', 'figure_accuracy', 'length_score', 'query_time']:
            values = [r[key] for r in results if key in r]
            avg_metrics[f'avg_{key}'] = sum(values) / len(values) if values else 0

        return {
            'individual_results': results,
            'average_metrics': avg_metrics,
            'total_queries': len(test_set)
        }

In [None]:
def evaluate_step3():
    """Run the evaluation framework against the Step 3 pipeline and print a report.

    Returns:
        The dict returned by ``EvaluationFramework.run_evaluation``, or None when
        no truthy module-level ``advanced_rag`` exists.
    """
    pipeline = globals().get('advanced_rag')
    if not pipeline:
        print("Advanced RAG not available. Run test_step3 first.")
        return None

    print("EVALUATING STEP 3 - ADVANCED RAG")
    print("=" * 50)

    evaluation_results = EvaluationFramework().run_evaluation(pipeline)

    # Aggregate section
    averages = evaluation_results['average_metrics']

    print("\nEVALUATION RESULTS:")
    print(f"  Total Queries: {evaluation_results['total_queries']}")
    summary_rows = (
        ("Avg Query Time", 'avg_query_time', "s"),
        ("Avg ROUGE-1 F1", 'avg_rouge1_f', ""),
        ("Avg Figure Accuracy", 'avg_figure_accuracy', ""),
        ("Avg Length Score", 'avg_length_score', ""),
    )
    for label, metric_key, unit in summary_rows:
        print(f"  {label}: {averages[metric_key]:.3f}{unit}")

    # Per-query section
    print("\nINDIVIDUAL RESULTS:")
    for position, entry in enumerate(evaluation_results['individual_results'], start=1):
        print(f"\n{position}. Query: {entry['query']}")
        print(f"   Generated: {entry['generated_answer'][:100]}...")
        print(f"   ROUGE-1: {entry['rouge1_f']:.3f}, Figure Accuracy: {entry['figure_accuracy']:.3f}")

    return evaluation_results

# Run the Step 3 evaluation when this cell executes. `step3_evaluation` holds the
# evaluation results dict, or None when `advanced_rag` is not available.
step3_evaluation = evaluate_step3()

In [None]:
#Ablation Study
def run_ablation_study():
    """Run ablation study to measure component impact

    Runs one fixed query through the module-level ``advanced_rag`` pipeline twice,
    with and without iterative retrieval requested, then prints the difference in
    reported query time and number of search results.

    Returns:
        A list with one summary dict per configuration (``component``,
        ``query_time``, ``answer_length``, ``num_results``, ``answer``), or None
        when no truthy module-level ``advanced_rag`` exists.
    """
    if 'advanced_rag' not in globals() or not advanced_rag:
        print("Advanced RAG not available. Run test_step3 first.")
        return None

    print("RUNNING ABLATION STUDY")
    print("="*50)

    test_query = "What was Meta's net income in Q1 2024 compared to Q1 2023?"

    print(f"Test Query: {test_query}")
    print()

    # Component tests
    components = [
        ("Full System (Baseline)", {"use_iterative": True}),
        ("Without Iterative Retrieval", {"use_iterative": False}),
    ]

    ablation_results = []

    for component_name, config in components:
        print(f"Testing: {component_name}")

        # The timing reported below is the pipeline's own result['query_time'];
        # no separate wall-clock measurement is taken around the call.
        result = advanced_rag.query(test_query, use_iterative=config["use_iterative"])
        answer = result['answer']
        num_results = len(result['search_results'])

        ablation_results.append({
            "component": component_name,
            "query_time": result['query_time'],
            "answer_length": len(answer),
            "num_results": num_results,
            "answer": answer[:100] + "..." if len(answer) > 100 else answer
        })

        print(f"   Time: {result['query_time']:.3f}s")
        print(f"   Results: {num_results}")
        print(f"   Answer: {answer[:80]}...")
        print()

    # Calculate impact. `components` always has two entries: the baseline first,
    # then the variant without iterative retrieval.
    baseline, without_iterative = ablation_results

    time_impact = baseline['query_time'] - without_iterative['query_time']
    results_impact = baseline['num_results'] - without_iterative['num_results']

    print("COMPONENT IMPACT ANALYSIS:")
    print(f"  Iterative Retrieval Time Impact: {time_impact:+.3f}s")
    print(f"  Iterative Retrieval Results Impact: {results_impact:+d} results")

    return ablation_results

# Run the ablation study when this cell executes. `ablation_results` holds the
# per-configuration summaries, or None when `advanced_rag` is not available.
ablation_results = run_ablation_study()