In [None]:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin, urlparse
import time
from collections import deque
import random
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
import queue
import json
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import logging

class OptimizedWebCrawler:
    def __init__(self, base_url, max_depth=3, delay=1, max_workers=5, use_proxies=False):
        self.base_url = base_url
        self.base_domain = urlparse(base_url).netloc
        self.max_depth = max_depth
        self.delay = delay
        self.max_workers = max_workers
        self.use_proxies = use_proxies
        self.visited_urls = set()
        self.all_links = set()
        self.failed_urls = set()
        self.lock = threading.Lock()

        # Enhanced headers rotation for better stealth
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/120.0.0.0'
        ]

        # Oxylabs credentials (if using proxies)
        self.username = "wazirali_kmWml"
        self.password = "9291105=Shigar"
        self.api_url = "https://realtime.oxylabs.io/v1/queries"

        # Setup session with retry strategy
        self.session = self.create_session()

        # Setup logging
        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
        self.logger = logging.getLogger(__name__)

    def create_session(self):
        """Return a ``requests`` session with pooled adapters that retry failed requests.

        Retries (3 in total, backoff factor 1) apply to HEAD/GET/OPTIONS
        requests answered with 429, 500, 502, 503 or 504.
        """
        http_session = requests.Session()

        retryable_methods = ["HEAD", "GET", "OPTIONS"]
        shared_retry_options = {
            'total': 3,
            'status_forcelist': [429, 500, 502, 503, 504],
            'backoff_factor': 1,
        }

        # The keyword naming the retryable methods differs between urllib3
        # versions: try the new spelling and fall back to the old one.
        try:
            retry_policy = Retry(allowed_methods=retryable_methods, **shared_retry_options)
        except TypeError:
            retry_policy = Retry(method_whitelist=retryable_methods, **shared_retry_options)

        pooled_adapter = HTTPAdapter(max_retries=retry_policy, pool_connections=20, pool_maxsize=20)
        for scheme_prefix in ("http://", "https://"):
            http_session.mount(scheme_prefix, pooled_adapter)

        return http_session

    def get_random_headers(self):
        """Generate random headers to avoid detection"""
        return {
            'User-Agent': random.choice(self.user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Cache-Control': 'max-age=0'
        }

    def should_skip_link(self, href, full_url):
        """Determine if a link should be skipped based on various criteria"""
        if not href or not full_url:
            return True

        # Skip fragments and javascript links
        if href.startswith('#') or href.startswith('javascript:') or href.startswith('mailto:') or href.startswith('tel:'):
            return True

        # Skip malformed URLs that contain the domain name in the path
        if self.base_domain in full_url.replace('https://', '').replace('http://', '').replace(self.base_domain, '', 1):
            if full_url.count(self.base_domain) > 1:
                return True

        # Skip external links (different domain)
        parsed_url = urlparse(full_url)
        if parsed_url.netloc and parsed_url.netloc != self.base_domain:
            return True

        # Skip social media links
        social_media_patterns = [
            r'facebook\.com', r'instagram\.com', r'twitter\.com', r'linkedin\.com',
            r'pinterest\.com', r'youtube\.com', r'tiktok\.com', r'snapchat\.com',
            r'whatsapp\.com', r'telegram\.', r'discord\.', r'reddit\.com'
        ]

        # Skip authentication links
        auth_patterns = [
            r'signup', r'signin', r'login', r'logout', r'register', r'account',
            r'auth', r'authentication', r'password', r'reset'
        ]

        # Skip file downloads and images
        file_extensions = [
            r'\.jpg$', r'\.jpeg$', r'\.png$', r'\.gif$', r'\.svg$', r'\.webp$',
            r'\.pdf$', r'\.doc$', r'\.docx$', r'\.zip$', r'\.rar$', r'\.exe$',
            r'\.mp4$', r'\.mp3$', r'\.avi$', r'\.mov$', r'\.wmv$'
        ]

        # Skip admin and system pages
        admin_patterns = [
            r'admin', r'dashboard', r'cms', r'wp-admin', r'wp-content',
            r'\.xml$', r'\.json$', r'sitemap', r'robots\.txt'
        ]

        # Check all patterns
        all_patterns = social_media_patterns + auth_patterns + file_extensions + admin_patterns

        for pattern in all_patterns:
            if re.search(pattern, full_url, re.IGNORECASE):
                return True

        return False

    def extract_links_direct(self, url):
        """Fetch ``url`` through the shared session and return the links found on it.

        Any failure (network, HTTP status, parsing) is logged and reported as
        an empty set.
        """
        try:
            request_headers = self.get_random_headers()

            # Jittered pause before every request to mimic human pacing.
            pause_seconds = random.uniform(0.5, 1.5)
            time.sleep(pause_seconds)

            page = self.session.get(url, headers=request_headers, timeout=15)
            page.raise_for_status()

            return self.parse_links_from_soup(BeautifulSoup(page.text, 'html.parser'), url)

        except Exception as exc:
            self.logger.error(f"Direct fetch error for {url}: {exc}")
            return set()

    def extract_links_proxy(self, url):
        """Fetch ``url`` through the Oxylabs API and return the links found on it.

        The rendered HTML is taken from the API response (see
        ``_html_from_proxy_response``) and parsed with
        ``parse_links_from_soup``. Request failures and any other error are
        logged and reported as an empty set, so callers cannot distinguish
        "failed" from "page has no links".
        """
        try:
            payload = {
                "source": "universal",
                "url": url,
                "geo_location": "Pakistan",
                "render": "html",
                "parse": False,
                "user_agent_type": "desktop_chrome"
            }

            headers = {
                'Content-Type': 'application/json',
                'User-Agent': random.choice(self.user_agents)
            }

            self.logger.info(f"Using Oxylabs proxy for: {url}")

            response = requests.post(
                self.api_url,
                auth=(self.username, self.password),
                json=payload,
                headers=headers,
                timeout=60  # Increased timeout for proxy
            )
            response.raise_for_status()

            api_response = response.json()
            self.logger.info(f"Oxylabs response status: {response.status_code}")

            html_content = self._html_from_proxy_response(api_response)

            if not html_content:
                self.logger.error(f"No HTML content found in Oxylabs response for {url}")
                self.logger.debug(f"Response structure: {json.dumps(api_response, indent=2)}")
                return set()

            soup = BeautifulSoup(html_content, 'html.parser')
            links = self.parse_links_from_soup(soup, url)
            self.logger.info(f"Extracted {len(links)} links using proxy")
            return links

        except requests.exceptions.RequestException as e:
            self.logger.error(f"Proxy request error for {url}: {e}")
            return set()
        except Exception as e:
            self.logger.error(f"Proxy parsing error for {url}: {e}")
            return set()

    def _html_from_proxy_response(self, api_response):
        """Return the HTML carried by, or referenced from, an API response (or None).

        Handles the response layouts the original code handled: inline
        ``results[0]['content']``, a ``results[0]['url']`` to download, or a
        ``job['_links']`` entry with rel ``results-content-html`` exposing
        ``href`` or a non-empty ``href_list``.
        """
        if 'results' in api_response and api_response['results']:
            result = api_response['results'][0]
            if 'content' in result:
                self.logger.info("Got content directly from results")
                return result['content']
            if 'url' in result:
                # Sometimes content is in a separate URL
                return self._download_proxy_content(result['url'], "Fetching content from")
            return None

        if 'job' in api_response and '_links' in api_response['job']:
            for link in api_response['job']['_links']:
                if link.get('rel') != 'results-content-html':
                    continue
                if 'href' in link:
                    return self._download_proxy_content(
                        link['href'], "Fetching content from job link")
                # An empty href_list previously raised IndexError; skip it and
                # keep looking at the remaining links instead.
                if link.get('href_list'):
                    return self._download_proxy_content(
                        link['href_list'][0], "Fetching content from href_list")

        return None

    def _download_proxy_content(self, content_url, log_prefix):
        """Download a content URL with the proxy credentials and return its text."""
        self.logger.info(f"{log_prefix}: {content_url}")
        content_response = requests.get(
            content_url,
            auth=(self.username, self.password),
            timeout=30
        )
        content_response.raise_for_status()
        return content_response.text

    def parse_links_from_soup(self, soup, base_url):
        """Parse links from BeautifulSoup object"""
        page_links = set()

        # Extract links from <a> tags
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href'].strip()

            # Clean up malformed hrefs
            if href.startswith('www.') and not href.startswith('http'):
                href = f"https://{href}"
            elif href.startswith(self.base_domain) and not href.startswith('http'):
                href = f"https://{href}"

            full_url = urljoin(base_url, href)

            # Additional cleanup for malformed URLs
            if full_url.count(self.base_domain) > 1:
                continue

            if not self.should_skip_link(href, full_url):
                page_links.add(full_url)

        # Extract links from buttons with onclick redirects
        for button in soup.find_all('button'):
            onclick = button.get('onclick', '')
            patterns = [
                r"window\.location\.href\s*=\s*['\"]([^'\"]+)['\"]",
                r"location\.href\s*=\s*['\"]([^'\"]+)['\"]",
                r"window\.open\s*\(\s*['\"]([^'\"]+)['\"]",
                r"document\.location\s*=\s*['\"]([^'\"]+)['\"]"
            ]

            for pattern in patterns:
                href_match = re.search(pattern, onclick)
                if href_match:
                    href = href_match.group(1)
                    full_url = urljoin(base_url, href)

                    if not self.should_skip_link(href, full_url):
                        page_links.add(full_url)
                    break

        # Extract links from form actions
        for form in soup.find_all('form', action=True):
            action = form['action'].strip()
            if action and not action.startswith('#'):
                full_url = urljoin(base_url, action)

                if not self.should_skip_link(action, full_url):
                    page_links.add(full_url)

        return page_links

    def process_url(self, url_data):
        """Process a single URL (for multithreading)"""
        url, depth = url_data

        self.logger.info(f"Depth {depth}: Processing {url}")

        # Use proxy method primarily as requested
        if self.use_proxies or hasattr(self, 'username'):
            page_links = self.extract_links_proxy(url)
            # If proxy fails, try direct as fallback
            if not page_links:
                self.logger.info(f"Proxy method failed for {url}, trying direct...")
                page_links = self.extract_links_direct(url)
        else:
            # Try direct method first, fallback to proxy if available
            page_links = self.extract_links_direct(url)
            if not page_links and hasattr(self, 'username'):
                self.logger.info(f"Direct method failed for {url}, trying proxy...")
                page_links = self.extract_links_proxy(url)

        if not page_links:
            with self.lock:
                self.failed_urls.add(url)

        return url, depth, page_links

    def crawl_parallel(self):
        """Main crawling function using parallel processing"""
        queue = deque([(self.base_url, 0)])
        self.visited_urls.add(self.base_url)

        self.logger.info(f"Starting parallel crawl of {self.base_url}")
        self.logger.info(f"Max depth: {self.max_depth}, Workers: {self.max_workers}")
        self.logger.info("=" * 60)

        while queue:
            # Get current batch of URLs to process
            current_batch = []
            current_depth = None

            # Collect URLs of the same depth level
            while queue and len(current_batch) < self.max_workers:
                url, depth = queue.popleft()
                if current_depth is None:
                    current_depth = depth
                elif depth != current_depth:
                    # Put back the URL and break if different depth
                    queue.appendleft((url, depth))
                    break

                if depth <= self.max_depth:
                    current_batch.append((url, depth))

            if not current_batch:
                break

            # Process batch in parallel
            new_links_batch = []
            with ThreadPoolExecutor(max_workers=min(self.max_workers, len(current_batch))) as executor:
                future_to_url = {executor.submit(self.process_url, url_data): url_data for url_data in current_batch}

                for future in as_completed(future_to_url):
                    try:
                        url, depth, page_links = future.result()
                        new_links_batch.extend([(link, depth + 1) for link in page_links])

                        # Thread-safe update of all_links
                        with self.lock:
                            new_links = page_links - self.all_links
                            self.all_links.update(page_links)

                        self.logger.info(f"Found {len(page_links)} total links, {len(new_links)} new from {url}")

                    except Exception as e:
                        self.logger.error(f"Error processing URL: {e}")

            # Add new links to queue for next depth level
            if current_depth < self.max_depth:
                for link, next_depth in new_links_batch:
                    if link not in self.visited_urls and next_depth <= self.max_depth:
                        queue.append((link, next_depth))
                        self.visited_urls.add(link)

            # Add delay between batches
            if self.delay > 0 and queue:
                time.sleep(random.uniform(self.delay * 0.5, self.delay * 1.5))

        return self.all_links

    def crawl(self):
        """Main crawling function - choose between parallel and sequential"""
        if self.max_workers > 1:
            return self.crawl_parallel()
        else:
            return self.crawl_sequential()

    def crawl_sequential(self):
        """Sequential crawling (original method but optimized)"""
        queue = deque([(self.base_url, 0)])
        self.visited_urls.add(self.base_url)

        self.logger.info(f"Starting sequential crawl of {self.base_url}")
        self.logger.info("=" * 60)

        while queue:
            current_url, depth = queue.popleft()

            if depth > self.max_depth:
                continue

            url, _, page_links = self.process_url((current_url, depth))

            # Add new links to our collection
            new_links = page_links - self.all_links
            self.all_links.update(page_links)

            self.logger.info(f"Found {len(page_links)} total links, {len(new_links)} new links")

            # Add unvisited links to queue for next depth level
            if depth < self.max_depth:
                for link in new_links:
                    if link not in self.visited_urls:
                        queue.append((link, depth + 1))
                        self.visited_urls.add(link)

            # Random delay to avoid detection
            if self.delay > 0:
                time.sleep(random.uniform(self.delay * 0.5, self.delay * 1.5))

        return self.all_links

    def print_results(self):
        """Print the crawling results"""
        print("\n" + "=" * 60)
        print("CRAWLING COMPLETED")
        print("=" * 60)
        print(f"Total unique links found: {len(self.all_links)}")
        print(f"Total pages visited: {len(self.visited_urls)}")
        print(f"Failed URLs: {len(self.failed_urls)}")

        if self.failed_urls:
            print("\nFailed URLs:")
            for url in sorted(self.failed_urls):
                print(f"  - {url}")

        print("\nAll extracted links:")
        print("-" * 40)

        # Sort links for better readability
        sorted_links = sorted(self.all_links)
        for i, link in enumerate(sorted_links, 1):
            print(f"{i:3d}. {link}")

    def save_to_file(self, filename="extracted_links.txt"):
        """Save results to a text file"""
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(f"Website: {self.base_url}\n")
            f.write(f"Total links found: {len(self.all_links)}\n")
            f.write(f"Total pages visited: {len(self.visited_urls)}\n")
            f.write(f"Failed URLs: {len(self.failed_urls)}\n")
            f.write(f"Max depth: {self.max_depth}\n")
            f.write(f"Workers used: {self.max_workers}\n")
            f.write("=" * 50 + "\n\n")

            if self.failed_urls:
                f.write("Failed URLs:\n")
                for url in sorted(self.failed_urls):
                    f.write(f"  - {url}\n")
                f.write("\n")

            f.write("All Links:\n")
            for i, link in enumerate(sorted(self.all_links), 1):
                f.write(f"{i:3d}. {link}\n")

        print(f"\nResults saved to {filename}")

# Main execution
if __name__ == "__main__":
    # Script entry point: configure a crawl, run it, then print and save the
    # results. Partial results are still printed if the run is interrupted or
    # fails.

    # Configuration
    website_url = "https://ptcl.com.pk/"
    max_crawl_depth = 2
    request_delay = 1
    max_workers = 3       # Fewer workers for proxy usage
    use_proxies = True    # Enable proxies by default

    # Echo the effective configuration before starting.
    print("Optimized Web Link Crawler")
    print("=" * 30)
    print(f"Target website: {website_url}")
    print(f"Max depth: {max_crawl_depth}")
    print(f"Request delay: {request_delay} seconds")
    print(f"Max workers: {max_workers}")
    print(f"Use proxies: {use_proxies}")
    print()

    # Create and run crawler
    crawler = OptimizedWebCrawler(
        website_url,
        max_depth=max_crawl_depth,
        delay=request_delay,
        max_workers=max_workers,
        use_proxies=use_proxies
    )

    try:
        # Start crawling (wall-clock timed)
        start_time = time.time()
        all_links = crawler.crawl()
        end_time = time.time()

        print(f"\nCrawling completed in {end_time - start_time:.2f} seconds")

        # Display results
        crawler.print_results()

        # Save to file
        crawler.save_to_file("optimized_links.txt")

    except KeyboardInterrupt:
        # Manual stop: report whatever was collected so far (nothing is saved to file).
        print("\nCrawling interrupted by user")
        print(f"Partial results: {len(crawler.all_links)} links found so far")
        crawler.print_results()

    except Exception as e:
        # Top-level boundary: report the error and show partial results if any exist.
        print(f"An error occurred: {e}")
        if crawler.all_links:
            print("Partial results:")
            crawler.print_results()

Optimized Web Link Crawler
Target website: https://ptcl.com.pk/
Max depth: 2
Request delay: 1 seconds
Max workers: 3
Use proxies: True



ERROR:__main__:Proxy request error for https://ptcl.com.pk/Home/PageDetail?ItemId=617&linkId=5420: HTTPSConnectionPool(host='realtime.oxylabs.io', port=443): Read timed out. (read timeout=60)
ERROR:__main__:Proxy request error for https://ptcl.com.pk/Home/PageDetail?ItemId=694: HTTPSConnectionPool(host='realtime.oxylabs.io', port=443): Read timed out. (read timeout=60)
ERROR:__main__:Proxy request error for https://ptcl.com.pk/Home/PageDetail?ItemId=454&linkId=5507: HTTPSConnectionPool(host='realtime.oxylabs.io', port=443): Read timed out. (read timeout=60)
ERROR:__main__:Proxy request error for https://ptcl.com.pk/Search/Index: HTTPSConnectionPool(host='realtime.oxylabs.io', port=443): Read timed out. (read timeout=60)
ERROR:__main__:Proxy request error for https://ptcl.com.pk/SME/Product?type=36: HTTPSConnectionPool(host='realtime.oxylabs.io', port=443): Read timed out. (read timeout=60)
ERROR:__main__:Proxy request error for https://ptcl.com.pk/Home/PageDetailBusiness?ItemId=565&lin


Crawling interrupted by user
Partial results: 567 links found so far

CRAWLING COMPLETED
Total unique links found: 567
Total pages visited: 561
Failed URLs: 0

All extracted links:
----------------------------------------
  1. https://ptcl.com.pk/
  2. https://ptcl.com.pk/#0
  3. https://ptcl.com.pk/?linkId=12
  4. https://ptcl.com.pk/Business
  5. https://ptcl.com.pk/Complaint/CustomerComplaints?linkId=623
  6. https://ptcl.com.pk/Complaint/Status
  7. https://ptcl.com.pk/Complaint/Status?linkId=5022
  8. https://ptcl.com.pk/Complaint/customerordertracking
  9. https://ptcl.com.pk/Complaint/customerordertracking?linkId=5025
 10. https://ptcl.com.pk/ComplaintStatus
 11. https://ptcl.com.pk/ContactUs/Feedback
 12. https://ptcl.com.pk/ContactUs/Feedback?linkId=617
 13. https://ptcl.com.pk/ContactUs/Feedback?linkId=677
 14. https://ptcl.com.pk/Customer/BroadbandUsage
 15. https://ptcl.com.pk/Customer/BroadbandUsage?linkId=5011
 16. https://ptcl.com.pk/Customer/CharjiEvoUsage?linkId=5012


In [None]:
!pip install faiss-cpu
!pip install sentence_transformers
!pip install openai
!pip install transformers
!pip install torch

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m63.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


In [17]:


import pandas as pd
import numpy as np
import faiss
import pickle
import os
import re
from typing import List, Dict, Tuple, Optional
from sentence_transformers import SentenceTransformer
import openai
from transformers import AutoTokenizer, AutoModel
import torch
import json
from datetime import datetime
import logging

# Configure logging: INFO level with timestamped messages, plus the
# module-level logger used by the classes defined below.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class DocumentProcessor:
    """Handle document chunking and preprocessing"""

    def __init__(self, chunk_size=512, chunk_overlap=50, min_chunk_chars=50):
        """Configure the word-window chunker.

        Args:
            chunk_size: Maximum number of words per chunk.
            chunk_overlap: Number of words shared by consecutive chunks.
            min_chunk_chars: Chunks whose text is shorter than this many
                characters are dropped.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
                is not in ``[0, chunk_size)`` (the window could not advance).
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if not 0 <= chunk_overlap < chunk_size:
            raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_chunk_chars = min_chunk_chars

    def clean_text(self, text: str) -> str:
        """Clean and normalize text.

        Characters other than word characters, whitespace and ``. , ! ? - : ; ( )``
        are replaced by spaces; whitespace runs are then collapsed to single
        spaces and the result is stripped. Empty/NaN input yields ``""``.
        """
        if not text or pd.isna(text):
            return ""

        text = str(text)

        # Replace special characters first so the whitespace pass below also
        # collapses the gaps they leave behind.
        text = re.sub(r'[^\w\s\.\,\!\?\-\:\;\(\)]', ' ', text)

        # Remove excessive whitespace
        return re.sub(r'\s+', ' ', text).strip()

    def chunk_text(self, text: str, metadata: dict) -> List[Dict]:
        """Split text into overlapping word windows.

        Args:
            text: Text to split on whitespace.
            metadata: Base metadata copied into every chunk's metadata.

        Returns:
            List of ``{'text', 'metadata'}`` dicts; each metadata additionally
            carries ``chunk_id``, ``chunk_start``, ``chunk_end``,
            ``total_words`` and a copy of the chunk ``text``.
        """
        if not text:
            return []

        words = text.split()
        total_words = len(words)
        step = self.chunk_size - self.chunk_overlap
        chunks = []

        for start in range(0, total_words, step):
            end = min(start + self.chunk_size, total_words)
            chunk_text = ' '.join(words[start:end])

            if len(chunk_text) >= self.min_chunk_chars:  # Skip very short chunks
                chunk_metadata = metadata.copy()
                chunk_metadata.update({
                    'chunk_id': len(chunks),
                    'chunk_start': start,
                    'chunk_end': end,
                    'total_words': total_words,
                    'text': chunk_text  # Store text in metadata too
                })
                chunks.append({
                    'text': chunk_text,
                    'metadata': chunk_metadata
                })

            # Once a window reaches the last word, every later window would be
            # a pure subset of it; stop instead of emitting duplicate tails.
            if end >= total_words:
                break

        return chunks

    def process_dataframe(self, df: pd.DataFrame) -> List[Dict]:
        """Process entire dataframe into chunks.

        For each row the ``title``, ``meta_description``, ``headings`` and
        ``content`` fields that are present and non-empty after cleaning are
        concatenated (the first three with a label prefix) and chunked.
        Missing columns are tolerated.

        Returns:
            All chunks of all rows, in row order.
        """
        all_chunks = []

        # Clean the dataframe
        df = df.drop(columns=['word_count', 'status', 'extraction_date'], errors='ignore')

        logger.info(f"Processing {len(df)} documents...")

        # (column, label prefix) in the order they are concatenated.
        text_fields = (
            ('title', 'Title: '),
            ('meta_description', 'Description: '),
            ('headings', 'Headings: '),
            ('content', ''),
        )

        for idx, row in df.iterrows():
            # Combine all text fields. row.get() returns None for an absent
            # column, which clean_text maps to "" (previously an absent column
            # raised KeyError).
            combined_text_parts = []
            for column, prefix in text_fields:
                cleaned = self.clean_text(row.get(column))
                if cleaned:
                    combined_text_parts.append(f"{prefix}{cleaned}")

            combined_text = ' '.join(combined_text_parts)

            if not combined_text.strip():
                continue

            # Create metadata
            metadata = {
                'doc_id': idx,
                'url': row.get('url', ''),
                'title': self.clean_text(row.get('title', '')),
                'source': 'web_crawl'
            }

            # Create chunks
            all_chunks.extend(self.chunk_text(combined_text, metadata))

        logger.info(f"Created {len(all_chunks)} chunks from {len(df)} documents")
        return all_chunks

class EmbeddingGenerator:
    """Generate embeddings using different models"""

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """
        Initialize embedding model
        Popular options:
        - 'all-MiniLM-L6-v2' (384 dims, fast)
        - 'all-mpnet-base-v2' (768 dims, better quality)
        - 'multi-qa-MiniLM-L6-cos-v1' (384 dims, good for Q&A)
        """
        self.model_name = model_name
        self.model = SentenceTransformer(model_name)
        self.embedding_dim = self.model.get_sentence_embedding_dimension()

        logger.info(f"Loaded embedding model: {model_name} (dim: {self.embedding_dim})")

    def generate_embeddings(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
        """Generate embeddings for a list of texts.

        Args:
            texts: Texts to encode.
            batch_size: Number of texts passed to the model per ``encode`` call.

        Returns:
            Array with one row per text. For an empty ``texts`` an empty
            ``(0, embedding_dim)`` float32 array is returned (previously
            ``np.vstack`` raised on the empty batch list).

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")

        logger.info(f"Generating embeddings for {len(texts)} texts...")

        if len(texts) == 0:
            return np.empty((0, self.embedding_dim), dtype=np.float32)

        embeddings = []
        for i in range(0, len(texts), batch_size):
            batch_embeddings = self.model.encode(
                texts[i:i + batch_size],
                convert_to_numpy=True,
                show_progress_bar=(i == 0)  # progress bar for the first batch only
            )
            embeddings.append(batch_embeddings)

        all_embeddings = np.vstack(embeddings)
        logger.info(f"Generated embeddings shape: {all_embeddings.shape}")

        return all_embeddings

class FAISSVectorStore:
    """FAISS-based vector store for similarity search"""

    def __init__(self, embedding_dim: int, index_type: str = 'flat'):
        """
        Initialize FAISS index
        index_type options:
        - 'flat': Exact search (good for small datasets)
        - 'ivf': Inverted file index (faster for large datasets)
        - 'hnsw': Hierarchical NSW (good balance of speed/accuracy)

        Raises:
            ValueError: If ``index_type`` is not one of the options above.
        """
        self.embedding_dim = embedding_dim
        self.index_type = index_type
        self.index = None
        self.metadata = []
        self.texts = []  # Store texts separately for retrieval

        self._create_index()

    def _create_index(self):
        """Create the FAISS index for ``index_type``.

        Every index uses the inner-product metric; combined with the L2
        normalization applied on add/search this yields cosine similarity, so
        a higher score always means a closer match. (The IVF and HNSW indexes
        were previously created with FAISS's default L2 metric, which made
        their scores distances rather than similarities.)
        """
        if self.index_type == 'flat':
            self.index = faiss.IndexFlatIP(self.embedding_dim)  # Inner product (cosine similarity)
        elif self.index_type == 'ivf':
            quantizer = faiss.IndexFlatIP(self.embedding_dim)
            self.index = faiss.IndexIVFFlat(
                quantizer, self.embedding_dim, 100, faiss.METRIC_INNER_PRODUCT)  # 100 clusters
        elif self.index_type == 'hnsw':
            self.index = faiss.IndexHNSWFlat(
                self.embedding_dim, 32, faiss.METRIC_INNER_PRODUCT)
        else:
            raise ValueError(f"Unsupported index type: {self.index_type}")

        logger.info(f"Created FAISS index: {self.index_type} (dim: {self.embedding_dim})")

    def add_embeddings(self, embeddings: np.ndarray, metadata: List[Dict], texts: List[str]):
        """Add embeddings, metadata, and texts to the index.

        Args:
            embeddings: ``(n, embedding_dim)`` array; it is copied, so the
                caller's array is no longer normalized in place.
            metadata: One metadata dict per embedding row.
            texts: One source text per embedding row.

        Raises:
            ValueError: If the embedding shape does not match the index
                dimension, or the three inputs differ in length (which would
                make search results point at the wrong metadata/text).
        """
        # FAISS needs C-contiguous float32; np.array also gives us a private copy.
        vectors = np.array(embeddings, dtype=np.float32, order='C')
        if vectors.ndim != 2 or vectors.shape[1] != self.embedding_dim:
            raise ValueError(
                f"Expected embeddings of shape (n, {self.embedding_dim}), got {vectors.shape}")
        if not (len(vectors) == len(metadata) == len(texts)):
            raise ValueError("embeddings, metadata and texts must have the same length")

        # Normalize embeddings for cosine similarity
        faiss.normalize_L2(vectors)

        if self.index_type == 'ivf' and not self.index.is_trained:
            # NOTE(review): training 100 clusters presumably needs at least
            # that many vectors - confirm behaviour on small datasets.
            logger.info("Training IVF index...")
            self.index.train(vectors)

        self.index.add(vectors)
        self.metadata.extend(metadata)
        self.texts.extend(texts)  # Store texts for retrieval

        logger.info(f"Added {len(vectors)} embeddings to index. Total: {self.index.ntotal}")

    def search(self, query_embedding: np.ndarray, k: int = 5) -> List[Dict]:
        """Search for similar embeddings.

        Args:
            query_embedding: Single query vector (any shape reshaped to one row).
            k: Maximum number of results.

        Returns:
            Up to ``k`` dicts with ``score``, ``metadata``, ``text`` and
            1-based ``rank``; empty when the index is empty or ``k <= 0``.
        """
        if k <= 0 or self.index.ntotal == 0:
            return []

        # Normalize a private float32 copy so the caller's vector is untouched.
        query = np.array(query_embedding, dtype=np.float32).reshape(1, -1)
        faiss.normalize_L2(query)

        # Search
        if self.index_type == 'ivf':
            self.index.nprobe = 10  # Number of clusters to search

        scores, indices = self.index.search(query, k)

        # Get results
        results = []
        for rank, (score, idx) in enumerate(zip(scores[0], indices[0]), start=1):
            # FAISS pads missing neighbours with -1.
            if idx != -1 and idx < len(self.metadata):  # Valid result
                results.append({
                    'score': float(score),
                    'metadata': self.metadata[idx],
                    'text': self.texts[idx],  # Include the actual text
                    'rank': rank
                })

        return results

    def save_index(self, filepath: str):
        """Save FAISS index (``<filepath>.faiss``) and metadata/texts (``<filepath>.metadata``)."""
        # Save FAISS index
        faiss.write_index(self.index, f"{filepath}.faiss")

        # Save metadata and texts
        with open(f"{filepath}.metadata", 'wb') as f:
            pickle.dump({'metadata': self.metadata, 'texts': self.texts}, f)

        logger.info(f"Saved index to {filepath}")

    def load_index(self, filepath: str):
        """Load FAISS index, metadata, and texts written by ``save_index``."""
        # Load FAISS index
        self.index = faiss.read_index(f"{filepath}.faiss")
        # Keep the advertised dimension in sync with what was actually loaded.
        self.embedding_dim = self.index.d

        # Load metadata and texts
        # SECURITY: pickle executes arbitrary code on load; only open
        # ".metadata" files produced by save_index from a trusted location.
        with open(f"{filepath}.metadata", 'rb') as f:
            data = pickle.load(f)
        self.metadata = data['metadata']
        self.texts = data['texts']

        if len(self.metadata) != self.index.ntotal:
            logger.warning(
                f"Loaded {len(self.metadata)} metadata entries for {self.index.ntotal} vectors")

        logger.info(f"Loaded index from {filepath}")

class RAGSystem:
    """Complete RAG system combining retrieval and generation.

    Ties together a ``DocumentProcessor`` (turns a dataframe into text
    chunks), an ``EmbeddingGenerator`` (turns text into vectors) and a
    ``FAISSVectorStore`` (similarity search), and can ask an LLM to answer a
    question from the retrieved chunks.
    """

    def __init__(self,
                 embedding_model: str = 'all-MiniLM-L6-v2',
                 index_type: str = 'flat',
                 llm_provider: str = 'openai',  # 'openai' or 'huggingface'
                 llm_model: str = 'gpt-3.5-turbo'):
        """Create the processing, embedding and storage components.

        Args:
            embedding_model: Model name handed to ``EmbeddingGenerator``.
            index_type: Index type handed to ``FAISSVectorStore``.
            llm_provider: ``'openai'`` or ``'huggingface'``; only ``'openai'``
                is wired up in ``generate_answer``.
            llm_model: Model name passed to the LLM provider.
        """
        self.processor = DocumentProcessor()
        self.embedder = EmbeddingGenerator(embedding_model)
        self.vector_store = FAISSVectorStore(self.embedder.embedding_dim, index_type)

        self.llm_provider = llm_provider
        self.llm_model = llm_model

        # Initialize LLM (both branches are placeholders for now)
        if llm_provider == 'openai':
            # Make sure to set your OpenAI API key: openai.api_key = "your-key"
            pass
        elif llm_provider == 'huggingface':
            # You can load a local model here
            pass

    def build_index(self, df: pd.DataFrame):
        """Chunk ``df``, embed every chunk and add it to the vector store.

        Returns:
            The number of chunks that were indexed.

        Raises:
            ValueError: If the processor produces no chunks from ``df``.
        """
        logger.info("Building RAG index...")

        # Process documents into chunks
        chunks = self.processor.process_dataframe(df)

        if not chunks:
            raise ValueError("No valid chunks created from dataframe")

        # Generate embeddings
        texts = [chunk['text'] for chunk in chunks]
        embeddings = self.embedder.generate_embeddings(texts)

        # Add to vector store
        metadata = [chunk['metadata'] for chunk in chunks]
        self.vector_store.add_embeddings(embeddings, metadata, texts)

        logger.info("RAG index built successfully!")
        return len(chunks)

    def search(self, query: str, k: int = 5) -> List[Dict]:
        """Embed ``query`` and return the vector store's top-``k`` matches."""
        # The embedder works on batches, so wrap the query in a list and
        # take the single resulting row.
        query_embedding = self.embedder.generate_embeddings([query])

        return self.vector_store.search(query_embedding[0], k)

    def generate_answer(self, query: str, context_docs: List[Dict]) -> str:
        """Ask the configured LLM to answer ``query`` from ``context_docs``.

        Args:
            query: The user's question.
            context_docs: Search results; each needs ``metadata`` (a dict)
                and ``text``. Only the first 500 characters of each text are
                placed in the prompt.

        Returns:
            The model's answer, an ``"Error generating answer: ..."`` string
            if the provider call raises, or a fixed notice for providers
            other than ``'openai'``.
        """
        # Prepare context
        context_texts = []
        for doc in context_docs:
            metadata = doc['metadata']
            context_texts.append(f"Source: {metadata.get('title', 'Unknown')}\n{metadata.get('url', '')}\n{doc['text'][:500]}...")

        context = "\n\n".join(context_texts)

        # Create prompt
        prompt = f"""Based on the following context, please answer the question. If the answer cannot be found in the context, say so.

Context:
{context}

Question: {query}

Answer:"""

        if self.llm_provider == 'openai':
            try:
                # NOTE(review): ChatCompletion.create looks like the legacy
                # (pre-1.0) openai SDK interface - confirm the pinned version.
                response = openai.ChatCompletion.create(
                    model=self.llm_model,
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant that answers questions based on provided context."},
                        {"role": "user", "content": prompt}
                    ],
                    max_tokens=500,
                    temperature=0.1
                )
                return response.choices[0].message.content.strip()
            except Exception as e:
                # Deliberate best-effort: report the failure as the answer
                # text instead of aborting the whole query.
                return f"Error generating answer: {str(e)}"
        else:
            return "Answer generation not implemented for this LLM provider"

    def query(self, question: str, k: int = 5) -> Dict:
        """Complete RAG query: retrieve the top-``k`` chunks, then generate.

        Returns:
            A dict with ``question``, ``answer``, ``sources`` (the raw search
            results) and an ISO-formatted ``timestamp``.
        """
        logger.info(f"Processing query: {question}")

        # Retrieve relevant documents
        search_results = self.search(question, k)

        # Generate answer
        answer = self.generate_answer(question, search_results)

        return {
            'question': question,
            'answer': answer,
            'sources': search_results,
            'timestamp': datetime.now().isoformat()
        }

    def save_system(self, filepath: str):
        """Save the vector store plus a JSON config under ``filepath``.

        The store writes its own files via ``save_index``; the configuration
        is written to ``<filepath>.config``.
        """
        self.vector_store.save_index(filepath)

        # Save system configuration
        config = {
            'embedding_model': self.embedder.model_name,
            'embedding_dim': self.embedder.embedding_dim,
            'index_type': self.vector_store.index_type,
            'llm_provider': self.llm_provider,
            'llm_model': self.llm_model
        }

        with open(f"{filepath}.config", 'w') as f:
            json.dump(config, f, indent=2)

        logger.info(f"RAG system saved to {filepath}")

    def load_system(self, filepath: str):
        """Load a system previously written by ``save_system``.

        The configuration is read and validated before the vector store is
        touched, so a missing or incompatible config leaves this instance
        unchanged. After a successful load the saved ``index_type``,
        ``llm_provider`` and ``llm_model`` replace the current values; keys
        absent from the config keep their current value.

        Raises:
            ValueError: If the saved embedding dimension differs from the
                current embedder's, since queries could not be matched
                against the stored vectors.
            OSError: If ``<filepath>.config`` cannot be opened.
        """
        with open(f"{filepath}.config", 'r') as f:
            config = json.load(f)

        saved_dim = config.get('embedding_dim')
        if saved_dim is not None and saved_dim != self.embedder.embedding_dim:
            raise ValueError(
                f"Saved index uses embedding dimension {saved_dim}, but the "
                f"current embedder produces {self.embedder.embedding_dim}"
            )

        saved_model = config.get('embedding_model')
        if saved_model is not None and saved_model != self.embedder.model_name:
            # Same dimension but a different model: loadable, yet similarity
            # scores are unlikely to be meaningful.
            logger.warning(
                f"Index was built with embedding model '{saved_model}' but "
                f"'{self.embedder.model_name}' is loaded"
            )

        self.vector_store.load_index(filepath)

        # Apply saved settings only once the index itself loaded. The store
        # consults index_type at search time, so it must describe the index
        # that was actually loaded.
        self.vector_store.index_type = config.get('index_type', self.vector_store.index_type)
        self.llm_provider = config.get('llm_provider', self.llm_provider)
        self.llm_model = config.get('llm_model', self.llm_model)

        logger.info(f"RAG system loaded from {filepath}")

    def get_detailed_results(self, query: str, k: int = 5):
        """Print a report of the top-``k`` search results and return them.

        Each result's score, title, URL, document/chunk ids and text are
        printed; texts longer than 1000 characters are cut for display only.
        """
        print(f"\nDetailed Search Results for: '{query}'")
        print("=" * 80)

        # Get search results
        results = self.search(query, k)

        for i, result in enumerate(results, 1):
            meta = result['metadata']
            print(f"\n[Result {i}]")
            print(f"Relevance Score: {result['score']:.4f}")
            print(f"Document Title: {meta.get('title', 'Unknown')}")
            print(f"Source URL: {meta.get('url', 'Unknown')}")
            print(f"Document ID: {meta.get('doc_id', 'Unknown')}")
            print(f"Chunk ID: {meta.get('chunk_id', 'Unknown')}")

            print("\nFull Content:")
            print("-" * 50)

            # Display the text content
            full_text = result.get('text', '')
            if full_text:
                if len(full_text) > 1000:
                    print(full_text[:1000] + "\n\n[... Content truncated for display, showing first 1000 chars ...]")
                    print(f"\nTotal length: {len(full_text)} characters")
                else:
                    print(full_text)
            else:
                print("No text content available")

            print("-" * 50)

        return results

# Example usage and testing
if __name__ == "__main__":
    # Load your extracted content
    print("Loading extracted content...")
    df = pd.read_csv("extracted_content.csv")

    # Clean the dataframe as specified
    df = df.drop(columns=['word_count', "status", "extraction_date"], errors='ignore')
    print(f"Loaded {len(df)} documents")

    # Initialize RAG system
    print("\nInitializing RAG system...")
    rag = RAGSystem(
        embedding_model='all-MiniLM-L6-v2',  # Fast and good quality
        index_type='flat',  # Good for small to medium datasets
        llm_provider='openai',  # Change to 'huggingface' if you prefer
        llm_model='gpt-3.5-turbo'
    )

    # Try to load existing system, otherwise build new one
    try:
        rag.load_system("ptcl_rag_system")
    except Exception as exc:
        # Catch Exception rather than using a bare ``except`` so Ctrl-C and
        # SystemExit still propagate, and report why the load failed.
        print(f"Could not load existing RAG system ({exc})")
        print("Building new RAG index...")
        # A load that failed part-way may have left entries in the store;
        # start from an empty one so the rebuild does not append duplicates.
        rag.vector_store = FAISSVectorStore(rag.embedder.embedding_dim, rag.vector_store.index_type)
        num_chunks = rag.build_index(df)
        print(f"Index built with {num_chunks} chunks")
        rag.save_system("ptcl_rag_system")
    else:
        print("Loaded existing RAG system")

    # Test queries with full text display
    test_queries = [
        "What services does PTCL offer?",
        "How can I contact PTCL customer support?",
        "What are PTCL's internet packages?",
        "How to make a complaint to PTCL?"
    ]

    print("\nTesting RAG system with full text content...")
    for query in test_queries:
        rag.get_detailed_results(query, k=3)
        print("\n" + "="*80 + "\n")

Loading extracted content...
Loaded 1178 documents

Initializing RAG system...
Loaded existing RAG system

Testing RAG system with full text content...

Detailed Search Results for: 'What services does PTCL offer?'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[Result 1]
Relevance Score: 0.7785
Document Title: Another first by PTCL: Landline service available on mobile phones through SmartLink App
Source URL: https://ptcl.com.pk/Home/PressReleaseDetail/?ItemId=464&linkId=130
Document ID: 959
Chunk ID: 1

Full Content:
--------------------------------------------------
another first by PTCL and endorses our continuous efforts to bring innovative technology-led solutions to our customers , added Adnan Shahid. The application is currently available in 41 cities and can be easily downloaded from Google Playstore. Apple iOS App is also coming soon and will be made available for iphones. PTCL subscribers can benefit from this application through a simple registration process. All voice, video calls and messaging is free-of-cost from SmartLink to SmartLink, while standard landline tariff is applicable for calls to PTCL landline and mobile operators. PSX PTC 11.7 -0.01 Copyright 2021 PTCL. All Rights Reserved. All visual media by PTCL and or its me

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[Result 1]
Relevance Score: 0.7208
Document Title: PTCL holds 19th Annual General Meeting
Source URL: https://ptcl.com.pk/Home/PressReleaseDetail/?ItemId=417&linkId=130
Document ID: 283
Chunk ID: 1

Full Content:
--------------------------------------------------
and superior customer experience. PTCL is in a strong position to embrace the challenges facing the telecom industry, which has enabled it to continue to add value for its customers as well as the society at large and we shall continue to focus on providing a unique experience and superior service in the future. By putting customers at the heart of everything, we will further enhance our current success to achieve our future growth, added Walid Irshaid The shareholders unanimously approved all the agenda items of the meeting including financial statements for the year ended December 31, 2013. In the end, Chairman PTCL Board, Ikhlaq Ahmed Tarar thanked all the shareholders for their continuous support. PSX PTC 11.7 -0.01 Copyr

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[Result 1]
Relevance Score: 0.6840
Document Title: PTCL launches 2 Mbps Economy Package
Source URL: https://ptcl.com.pk/Home/PressReleaseDetail/?ItemId=395&linkId=130
Document ID: 786
Chunk ID: 1

Full Content:
--------------------------------------------------
pricing and affordability. PTCL broadband internet is empowering people across Pakistan to reach out to the world, enabling convenient access to knowledge and information nationwide. PSX PTC 11.7 -0.01 Copyright 2021 PTCL. All Rights Reserved. All visual media by PTCL and or its media providers. Terms and Conditions General Privacy Notice The website was last updated on 13-July-2021 Navigation: Menu Shop Business Solutions Support Coverage Areas Work at PTCL Customer Sign In Telephone Add-ons Packages Value Added Services Telephone Sets Call Tariff Dailing Codes Internet Packages for Voice Users Internet Wired Internet Products Packages Fast Path for Online Gamers Internet Static IP Lists: Business Solutions Support Coverage Ar

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[Result 1]
Relevance Score: 0.6481
Document Title: Whistle Blowing Policy
Source URL: https://ptcl.com.pk/Home/PageDetail?ItemId=653&linkId=5562
Document ID: 1070
Chunk ID: 1

Full Content:
--------------------------------------------------
in confidence and receive feedback on any action taken. In addition, it reinforces the value PTCL places on stakeholders to be honest and respected members of their individual professions. It provides a method of properly addressing bona-fide concerns that, individuals within PTCL might have, while also offering Whistle Blowers protection from victimization, harassment or disciplinary proceedings. The policy however discourages employees and other stakeholders from making complaints with a malicious intent. Other stakeholders, for this purpose, include vendors, customers and shareholders. Specific examples of circumstances where a Whistle Blower can raise concerns could include but shall not be limited to: Vendors and customers also can blow the wh