In [None]:
# Mount Google Drive at /content/drive (Colab-only helper module).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install openai sentence-transformers faiss-cpu beautifulsoup4
!pip install chromadb langchain langchain_community langchain_openai

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import json
import uuid
import logging
import numpy as np
import faiss
from typing import List, Dict
from sentence_transformers import SentenceTransformer

class AdvancedDocumentChunker:
    def __init__(self, embedding_model='sentence-transformers/all-mpnet-base-v2'):
        """
        Load the sentence-embedding model and start with an empty index.

        Args:
            embedding_model: Model identifier handed to SentenceTransformer.
        """
        model = SentenceTransformer(embedding_model)
        self.embedding_model = model

        # Index state: nothing is indexed yet, so there is no vector index,
        # no per-vector metadata and no known embedding dimension.
        self.vector_index, self.metadata, self.dimension = None, [], None

    def _process_table(self, table) -> str:
        """
        Convert table to pure text paragraph without any markdown symbols
        Format: "Table [caption] shows: [header1] is [value1], [header2] is [value2]..."
        """
        try:
            # Skip navigation tables
            if table.get('class') and any(c in ['navbox', 'sidebar', 'infobox'] for c in table.get('class')):
                return ''

            # 1. Extract caption
            caption = table.find('caption')
            caption_text = caption.get_text(strip=True) if caption else "the data"

            # 2. Extract headers (th or first row's td)
            headers = []
            header_row = table.find('tr')
            if header_row:
                headers = [th.get_text(" ", strip=True)
                        for th in header_row.find_all(['th', 'td'])]

            # 3. Process all rows
            paragraphs = []
            for i, tr in enumerate(table.find_all('tr')[1 if headers else 0:]):
                cells = [td.get_text(" ", strip=True) for td in tr.find_all(['td', 'th'])]
                if not cells or not any(cell.strip() for cell in cells):
                    continue

                # Build sentence for each row
                if headers:
                    # With headers: "ColumnA is value1, ColumnB is value2"
                    row_desc = []
                    for j, val in enumerate(cells):
                        header = headers[j] if j < len(headers) else f"Column {j+1}"
                        if val.strip():
                            row_desc.append(f"{header} is {val}")
                else:
                    # Without headers: "Row 1: value1, value2..."
                    row_desc = [f"Row {i+1}: {', '.join(cells)}"]

                paragraphs.append(", ".join(row_desc) + ".")

            if not paragraphs:
                return ""

            # 4. Combine into final paragraph
            return f"Table '{caption_text}' shows: " + " ".join(paragraphs)

        except Exception as e:
            logging.error(f"Table processing error: {str(e)}")
            return ""

    def clean_text(self, text: str) -> str:
        """Clean text content"""
        try:
            soup = BeautifulSoup(text, 'html.parser')
            text = soup.get_text(separator=' ', strip=True)
        except Exception as e:
            logging.warning(f"HTML parsing warning: {e}")

        text = re.sub(r'\s+', ' ', text).strip()

        return text
    def extract_sections(self, text: str) -> List[Dict]:
        """
        Split an HTML document into titled sections.

        Citation markup and "[n]" reference markers are removed first. Three
        strategies are then tried in order until one yields sections:

        1. If the document has a ``.mw-parser-output`` container, walk its
           direct children and build "H2 > H3 > H4" title paths.
        2. Otherwise (or if step 1 produced nothing) walk every heading and
           content element of the document in order.
        3. As a last resort, return the text of all <p>/<li> elements as a
           single "Full Document" section.

        Args:
            text: HTML source; non-string input is converted with str().

        Returns:
            A list of dicts shaped
            ``{'document_id': <uuid4 string>,
               'section': {'title': str, 'content': str}}``.
            Headings without any content under them produce no section.
        """
        if not isinstance(text, str):
            text = str(text)

        soup = BeautifulSoup(text, 'html.parser')
        self._strip_references(soup)

        sections = []

        # Wikipedia-style pass: only possible when the MediaWiki content
        # container is present.
        main_content = soup.select_one('.mw-parser-output')
        if main_content is not None:
            sections = self._sections_from_wikipedia(main_content)

        # Generic pass for other pages, or when the pass above found nothing.
        if not sections:
            sections = self._sections_from_headings(soup)

        # Fallback: if no sections were found, return the entire document.
        if not sections:
            full_text = ' '.join(
                node.get_text(strip=True) for node in soup.find_all(['p', 'li'])
            )
            if full_text:
                self._emit_section(sections, ['Full Document'], [full_text])

        logging.info(f"Extracted {len(sections)} sections")
        return sections

    def _strip_references(self, soup) -> None:
        """Remove citation/footnote elements and "[n]" markers from soup in place."""
        reference_selectors = (
            # Wikipedia standard references
            'sup.reference', 'span.mw-cite-backlink',
            # General citation markers
            'span.citation', 'span.footnote', 'div.footnotes',
            # Reference blocks
            'ol.references', 'div.reflist', 'div.refbegin',
            # Hidden content
            'div.noprint', 'span.mw-editsection',
            # Citation links
            'a[href^="#cite"]', 'a[href*="wikisource"]',
            # Modern HTML5 notes
            '[role="doc-noteref"]', '[role="doc-endnotes"]',
        )
        for selector in reference_selectors:
            for ref in soup.select(selector):
                ref.decompose()

        # Additionally clean "[1][2]"-style markers left in plain text nodes.
        marker = re.compile(r'\[\d+\]')
        for node in soup.find_all(string=True):
            cleaned = marker.sub('', node)
            if cleaned != node:
                node.replace_with(cleaned)

    def _emit_section(self, sections, title_path, parts) -> None:
        """Append a section built from parts (if any) and empty parts in place."""
        if not parts:
            return
        sections.append({
            'document_id': str(uuid.uuid4()),
            'section': {
                'title': ' > '.join(title_path),
                'content': self.clean_text(' '.join(parts)),
            },
        })
        parts.clear()

    def _element_text(self, elem) -> str:
        """Return the text contributed by a <p>, <ul>/<ol> or <table> element."""
        if elem.name == 'p':
            return elem.get_text(strip=True)
        if elem.name in ('ul', 'ol'):
            return '; '.join(li.get_text(strip=True) for li in elem.find_all('li'))
        if elem.name == 'table':
            # _process_table itself skips navbox/infobox/sidebar tables.
            return self._process_table(elem)
        return ''

    def _sections_from_wikipedia(self, main_content) -> List[Dict]:
        """
        Build sections from the direct children of a .mw-parser-output element.

        Titles are "H2 > H3 > H4" paths. Content that appears before the first
        heading is emitted as its own "Introduction" section instead of being
        merged into the first headed section.
        """
        # NOTE(review): only headings that are direct children are recognised;
        # pages that wrap headings in a container element yield no headed
        # sections here and are handled by the generic pass instead.
        lead_title = 'Introduction'
        sections = []
        hierarchy = []      # current [h2, h3] titles
        subheading = None   # current h4 title, if any
        parts = []          # content collected for the section being built

        def current_path():
            path = list(hierarchy) if hierarchy else [lead_title]
            if subheading:
                path.append(subheading)
            return path

        for elem in main_content.children:
            tag = elem.name
            if tag in ('h2', 'h3'):
                self._emit_section(sections, current_path(), parts)
                level = int(tag[1])
                hierarchy = hierarchy[:level - 2] + [elem.get_text(strip=True)]
                # Always drop the old H4 so it cannot leak into the title of
                # the new heading, even if the H4 had no content of its own.
                subheading = None
            elif tag == 'h4':
                self._emit_section(sections, current_path(), parts)
                subheading = elem.get_text(strip=True)
            elif tag in ('p', 'ul', 'ol', 'table'):
                piece = self._element_text(elem)
                if piece:
                    parts.append(piece)

        # Add the last section.
        self._emit_section(sections, current_path(), parts)
        return sections

    def _sections_from_headings(self, soup) -> List[Dict]:
        """
        Build sections by walking all headings and content elements in order.

        H1-H3 start a new parent section; H4 starts a "parent > H4"
        subsection. Content that precedes the first H1-H3 is ignored.
        """
        # NOTE: find_all visits nested elements too, so e.g. a list inside a
        # list or a paragraph inside a table cell contributes text more than once.
        sections = []
        parent_title = None  # title of the enclosing H1-H3, tracked explicitly
        subheading = None    # current H4 title, if any
        parts = []

        def current_path():
            return [parent_title] + ([subheading] if subheading else [])

        for elem in soup.find_all(['h1', 'h2', 'h3', 'h4', 'p', 'ul', 'ol', 'table']):
            tag = elem.name
            if tag in ('h1', 'h2', 'h3'):
                self._emit_section(sections, current_path(), parts)
                parent_title = elem.get_text(strip=True)
                # Reset unconditionally so a content-less H4 is not carried over.
                subheading = None
            elif tag == 'h4':
                self._emit_section(sections, current_path(), parts)
                subheading = elem.get_text(strip=True)
            elif parent_title is not None:
                piece = self._element_text(elem)
                if piece:
                    parts.append(piece)

        self._emit_section(sections, current_path(), parts)
        return sections


    def chunk_and_index(self, documents, max_documents=100):
        """Improved version with better debugging"""
        documents = documents[:max_documents]
        all_sections = []

        for doc_index, doc in enumerate(documents):
            try:
                text_fields = ['document_text', 'html_content', 'text', 'content']
                text = next((doc[field] for field in text_fields if field in doc and doc[field]), str(doc))

                sections = self.extract_sections(text)
                print(f"\nDocument {doc_index} - Extracted {len(sections)} sections:")

                for i, section in enumerate(sections, 1):
                    print(f"  Section {i}: {section['section']['title']} "
                          f"(Length: {len(section['section']['content'])} chars)")
                    all_sections.append({
                        'document_id': f'Doc_{doc_index}_Sec_{i}',
                        'section': section['section']
                    })

            except Exception as e:
                print(f"Error processing document {doc_index}: {str(e)}")

        if not all_sections:
            print("Warning: No sections extracted - using fallback chunking")
            return self.fallback_chunking(documents)

        # Generate embeddings with title emphasis
        section_texts = [
            f"TITLE: {s['section']['title']}\nCONTENT: {s['section']['content']}"
            for s in all_sections
        ]

        print(f"\nGenerating embeddings for {len(section_texts)} sections...")
        embeddings = self.embedding_model.encode(section_texts, convert_to_numpy=True)

        # Create and populate index
        self.dimension = embeddings.shape[1]
        print(f"Embedding dimension: {self.dimension}")

        self.vector_index = faiss.IndexFlatIP(self.dimension)  # Using Inner Product for cosine similarity
        faiss.normalize_L2(embeddings)  # Normalize for cosine similarity
        self.vector_index.add(embeddings)
        self.metadata = all_sections

        print(f"Index built with {self.vector_index.ntotal} vectors")
        return all_sections

    def retrieve_sections(self, query: str, top_k: int = 3):
        """Improved retrieval with better scoring"""
        if self.vector_index is None:
            return []

        # Encode and normalize query
        query_embedding = self.embedding_model.encode([query], convert_to_numpy=True)[0]
        query_embedding = query_embedding / np.linalg.norm(query_embedding)

        # Search with cosine similarity
        D, I = self.vector_index.search(np.array([query_embedding]).astype('float32'), top_k)

        results = []
        for i, idx in enumerate(I[0]):
            meta = self.metadata[idx]
            similarity = (1 + D[0][i]) / 2  # Convert from [-1,1] to [0,1] range
            results.append({
                'document': meta['document_id'],
                'title': meta['section']['title'],
                'content': meta['section']['content'],
                'similarity': f"{similarity:.2%}"
            })

        return sorted(results, key=lambda x: float(x['similarity'].strip('%')), reverse=True)[:top_k]

    def save_to_vectorstore(self, persist_directory, embedding_model=None):
        """
        Save the processed sections to a Chroma vector store for later retrieval

        Args:
            persist_directory: Directory to save the vector store
            embedding_model: LangChain embedding model (default: OpenAIEmbeddings)
        """
        try:
            from langchain_community.vectorstores import Chroma
            from langchain.embeddings import OpenAIEmbeddings
        except ImportError:
            print("Need to install langchain packages: pip install langchain langchain-community")
            return None

        if embedding_model is None:
            try:
                embedding_model = OpenAIEmbeddings()
            except Exception as e:
                print(f"Error creating OpenAIEmbeddings: {e}")
                print("Using a dummy embedding function instead")
                class DummyEmbeddings:
                    def embed_documents(self, texts):
                        return [self.embed_query(text) for text in texts]
                    def embed_query(self, text):
                        return [0.0] * self.dimension
                embedding_model = DummyEmbeddings()
                embedding_model.dimension = self.dimension

        # Convert sections to LangChain documents
        from langchain.schema import Document
        documents = []
        for section in self.metadata:
            documents.append(
                Document(
                    page_content=section['section']['content'],
                    metadata={
                        "title": section['section']['title'],
                        "source": section['document_id']
                    }
                )
            )

        # Create and persist the vector store
        vectorstore = Chroma.from_documents(
            documents=documents,
            embedding=embedding_model,
            persist_directory=persist_directory
        )
        vectorstore.persist()

        print(f"Saved {len(documents)} sections to vector store at {persist_directory}")
        return vectorstore

# Example usage
def main():
    """
    Demo: fetch a Wikipedia article, section it, index it and run sample queries.

    Raises:
        requests.RequestException: If the page cannot be fetched (network
            error, timeout or non-success HTTP status).
    """
    # Wikipedia URL for The Avengers film
    url = 'https://en.wikipedia.org/wiki/The_Avengers_(2012_film)'

    # Fetch HTML content. Fail fast on timeouts and HTTP errors so an error
    # page is never chunked and indexed as if it were the article.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Create a test document dictionary
    test_document = [{
        'document_text': response.text
    }]

    # Initialize chunker
    chunker = AdvancedDocumentChunker()

    # Process the URL and get sections
    sections = chunker.extract_sections(response.text)

    # Print out the sections for verification
    print(f"Total Sections: {len(sections)}\n")
    for i, section in enumerate(sections, 1):
        print(f"Section {i} - Title: {section['section']['title']}")
        # Show only the start of the content, as in the retrieval output below.
        print(f"Content Preview: {section['section']['content'][:100]}...\n")

    # Chunk and index
    print("\n===== Chunk and Index =====")
    chunker.chunk_and_index(test_document)

    print("\n===== Saving to Vector Store =====")
    vector_store_path = "L1_vector_final"
    chunker.save_to_vectorstore(vector_store_path)

    # Test retrieval
    test_queries = [
        "Who are the main actors?",
        "When was the movie released?",
        "What is the plot of the movie?"
    ]

    print("\n===== Query Retrieval =====")
    for query in test_queries:
        print(f"\nQuery: {query}")
        results = chunker.retrieve_sections(query, top_k=2)
        for i, result in enumerate(results, 1):
            print(f"\nResult {i} (Similarity: {result['similarity']}):")
            print(f"Title: {result['title']}")
            print(f"Content: {result['content'][:100]}...")

# Run the demo only when this file is executed as the main module,
# not when it is imported.
if __name__ == '__main__':
    main()