In [None]:
!pip install requests beautifulsoup4 reportlab pillow pytesseract

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [None]:
import os
import time
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import re
from reportlab.lib.pagesizes import letter, A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
from reportlab.lib.enums import TA_LEFT, TA_CENTER, TA_JUSTIFY
from PIL import Image
import io
import base64
import json
from datetime import datetime
import logging

# Optional imports for advanced text extraction.
# OCR_AVAILABLE gates the pytesseract call in extract_text_from_image().
# PIL.Image is already imported unconditionally above, so in practice this
# block fails only when pytesseract (or a PIL submodule) is missing.
try:
    import pytesseract
    from PIL import Image, ImageEnhance, ImageFilter
    OCR_AVAILABLE = True
except ImportError:
    OCR_AVAILABLE = False
    print("⚠️ OCR not available. Install pytesseract and Pillow for image text extraction")

# OpenCV / NumPy are probed here, but in the code visible in this file the
# resulting CV2_AVAILABLE flag is only reported by main(); nothing else
# references cv2 or np.
try:
    import cv2
    import numpy as np
    CV2_AVAILABLE = True
except ImportError:
    CV2_AVAILABLE = False
    print("⚠️ OpenCV not available. Install opencv-python for advanced image processing")

# Configure logging: INFO level with timestamp, level name and message.
# basicConfig configures the root logger, so this affects the whole process.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class MOSDACTextExtractor:
    def __init__(self, base_url, output_folder="mosdac_text_data", max_depth=2):
        self.base_url = base_url
        self.output_folder = output_folder
        self.max_depth = max_depth
        self.visited_urls = set()
        self.all_text_content = []
        self.temp_folder = os.path.join(output_folder, "temp")

        # Create directories
        os.makedirs(output_folder, exist_ok=True)
        os.makedirs(self.temp_folder, exist_ok=True)

    def clean_text(self, html_content):
        """Return the visible text of an HTML document as one normalized line.

        ``script``, ``style``, ``noscript``, ``nav``, ``footer`` and ``header``
        elements are removed before the text is collected.  Every run of
        whitespace, newlines included, is then collapsed to a single space, so
        the result contains no line breaks.

        Args:
            html_content: Raw HTML markup.

        Returns:
            The whitespace-normalized text.
        """
        soup = BeautifulSoup(html_content, 'lxml')

        # Remove elements that carry no readable page content
        for element in soup(["script", "style", "noscript", "nav", "footer", "header"]):
            element.extract()

        text = soup.get_text(separator='\n', strip=True)

        # A former blank-line normalisation pass (r'\n\s*\n' -> '\n\n') was dead
        # code: this substitution flattens all whitespace, newlines included,
        # so the earlier pass could never influence the result.
        return re.sub(r'\s+', ' ', text)

    def extract_text_from_image(self, img_url, img_element):
        """Download an image and describe it as text, adding OCR output when useful.

        Args:
            img_url: Absolute URL of the image.
            img_element: The ``<img>`` element; its ``alt`` and ``title``
                attributes are read for context.

        Returns:
            The description built by ``analyze_image_content`` (plus any OCR
            text longer than five characters), or a bracketed placeholder when
            the download or the processing fails.  Never raises.
        """
        try:
            # Download image.  Deliberately not streamed: the body is read in
            # full below anyway, and a streamed response that is abandoned on
            # the non-200 path would keep its connection checked out of the
            # pool until garbage collection.
            response = requests.get(img_url, timeout=10)
            if response.status_code != 200:
                return f"[Image: {img_url} - Could not download]"

            image = Image.open(io.BytesIO(response.content))

            # Context supplied by the surrounding HTML
            alt_text = img_element.get('alt', '')
            title_text = img_element.get('title', '')

            img_description = self.analyze_image_content(image, img_url, alt_text, title_text)

            # OCR only when it is installed and the context hints at textual content
            if OCR_AVAILABLE and self.might_contain_text(img_url, alt_text, title_text):
                try:
                    enhanced_image = self.enhance_image_for_ocr(image)
                    ocr_text = pytesseract.image_to_string(enhanced_image, lang='eng').strip()

                    # Very short results are treated as noise and dropped
                    if len(ocr_text) > 5:
                        img_description += f"\n\nText extracted from image: {ocr_text}"

                except Exception as ocr_error:
                    # OCR is best-effort; the description is still returned
                    logger.debug(f"OCR failed for {img_url}: {ocr_error}")

            return img_description

        except Exception as e:
            logger.error(f"Error processing image {img_url}: {e}")
            return f"[Image: {img_url} - Processing failed]"

    def might_contain_text(self, img_url, alt_text, title_text):
        """Determine if image might contain text worth extracting"""
        text_indicators = [
            'chart', 'graph', 'diagram', 'table', 'data', 'statistics',
            'figure', 'plot', 'map', 'infographic', 'screenshot',
            'document', 'text', 'caption', 'label', 'title'
        ]

        combined_text = f"{img_url} {alt_text} {title_text}".lower()

        return any(indicator in combined_text for indicator in text_indicators)

    def enhance_image_for_ocr(self, image):
        """Pre-process an image so OCR has a better chance of reading it.

        Steps, in order: convert to grayscale, double the contrast, double the
        sharpness, and upscale when either side is under 300 px.  If any step
        raises, the failure is logged at debug level and the image as it stood
        at that point is returned.
        """
        try:
            # Grayscale first; skipped when the image is already mode 'L'
            if image.mode != 'L':
                image = image.convert('L')

            # Contrast, then sharpness, each with factor 2.0
            image = ImageEnhance.Contrast(image).enhance(2.0)
            image = ImageEnhance.Sharpness(image).enhance(2.0)

            # Upscale small images; the larger of the two ratios is applied
            # to both sides, so the aspect ratio is kept
            width, height = image.size
            if min(width, height) < 300:
                scale = max(300/width, 300/height)
                target_size = (int(width * scale), int(height * scale))
                image = image.resize(target_size, Image.LANCZOS)

        except Exception as e:
            logger.debug(f"Image enhancement failed: {e}")

        return image

    def analyze_image_content(self, image, img_url, alt_text, title_text):
        """Analyze image content and generate descriptive text"""
        description_parts = []

        # Basic image info
        width, height = image.size
        mode = image.mode

        description_parts.append(f"[IMAGE ANALYSIS]")
        description_parts.append(f"Source: {img_url}")
        description_parts.append(f"Dimensions: {width}x{height} pixels")

        # Add alt text and title if available
        if alt_text:
            description_parts.append(f"Alt text: {alt_text}")
        if title_text:
            description_parts.append(f"Title: {title_text}")

        # Analyze image type based on URL and context
        img_type = self.classify_image_type(img_url, alt_text, title_text)
        description_parts.append(f"Likely content type: {img_type}")

        # Analyze colors and composition
        try:
            color_analysis = self.analyze_image_colors(image)
            description_parts.append(f"Color analysis: {color_analysis}")
        except:
            pass

        # Check for common patterns
        pattern_analysis = self.detect_image_patterns(img_url, alt_text, title_text, width, height)
        if pattern_analysis:
            description_parts.append(f"Pattern analysis: {pattern_analysis}")

        return "\n".join(description_parts)

    def classify_image_type(self, img_url, alt_text, title_text):
        """Classify image type based on available information"""
        combined_text = f"{img_url} {alt_text} {title_text}".lower()

        if any(word in combined_text for word in ['logo', 'brand', 'header']):
            return "Logo/Branding element"
        elif any(word in combined_text for word in ['chart', 'graph', 'plot']):
            return "Data visualization (chart/graph)"
        elif any(word in combined_text for word in ['map', 'satellite', 'geographic']):
            return "Geographic/Satellite imagery"
        elif any(word in combined_text for word in ['diagram', 'flowchart', 'schema']):
            return "Technical diagram/flowchart"
        elif any(word in combined_text for word in ['screenshot', 'interface', 'ui']):
            return "User interface screenshot"
        elif any(word in combined_text for word in ['photo', 'picture', 'image']):
            return "Photograph/Image"
        elif any(word in combined_text for word in ['icon', 'button', 'symbol']):
            return "Icon/Symbol"
        else:
            return "General image content"

    def analyze_image_colors(self, image):
        """Analyze dominant colors in the image"""
        try:
            # Convert to RGB if needed
            if image.mode != 'RGB':
                image = image.convert('RGB')

            # Get image data
            image_data = list(image.getdata())

            # Sample colors (take every 100th pixel to avoid performance issues)
            sampled_colors = image_data[::100]

            # Analyze brightness
            brightness_values = [sum(pixel)/3 for pixel in sampled_colors if len(pixel) >= 3]
            avg_brightness = sum(brightness_values) / len(brightness_values) if brightness_values else 128

            if avg_brightness > 200:
                return "Predominantly light/bright image"
            elif avg_brightness < 80:
                return "Predominantly dark image"
            else:
                return "Mixed brightness levels"

        except Exception as e:
            return "Color analysis unavailable"

    def detect_image_patterns(self, img_url, alt_text, title_text, width, height):
        """Detect common patterns in images"""
        patterns = []

        # Aspect ratio analysis
        aspect_ratio = width / height if height > 0 else 1

        if aspect_ratio > 2:
            patterns.append("Wide format (possibly banner or header)")
        elif aspect_ratio < 0.5:
            patterns.append("Tall format (possibly sidebar or vertical chart)")
        elif 0.9 <= aspect_ratio <= 1.1:
            patterns.append("Square format")

        # Size analysis
        if width * height > 1000000:  # > 1MP
            patterns.append("High resolution image")
        elif width * height < 10000:  # < 10K pixels
            patterns.append("Small icon or thumbnail")

        return "; ".join(patterns) if patterns else None

    def extract_table_text(self, soup, page_url):
        """Extract and format table content as readable text"""
        tables_text = []

        for i, table in enumerate(soup.find_all('table')):
            try:
                table_text = [f"\n[TABLE {i+1} FROM {page_url}]"]

                # Extract caption if available
                caption = table.find('caption')
                if caption:
                    table_text.append(f"Table Caption: {caption.get_text(strip=True)}")

                # Extract table rows
                rows = table.find_all('tr')
                for row_idx, row in enumerate(rows):
                    cells = row.find_all(['td', 'th'])
                    if cells:
                        row_data = []
                        for cell in cells:
                            cell_text = cell.get_text(strip=True)
                            if cell_text:
                                row_data.append(cell_text)

                        if row_data:
                            if row_idx == 0:  # Header row
                                table_text.append(f"Headers: {' | '.join(row_data)}")
                            else:
                                table_text.append(f"Row {row_idx}: {' | '.join(row_data)}")

                if len(table_text) > 1:  # Only add if we found actual content
                    tables_text.append("\n".join(table_text))

            except Exception as e:
                logger.error(f"Error extracting table: {e}")
                continue

        return "\n\n".join(tables_text) if tables_text else ""

    def extract_list_text(self, soup):
        """Extract and format list content"""
        lists_text = []

        # Extract ordered and unordered lists
        for list_type in ['ul', 'ol']:
            for i, list_elem in enumerate(soup.find_all(list_type)):
                list_items = list_elem.find_all('li')
                if list_items:
                    list_text = [f"\n[{list_type.upper()} LIST {i+1}]"]
                    for idx, item in enumerate(list_items, 1):
                        item_text = item.get_text(strip=True)
                        if item_text:
                            prefix = f"{idx}." if list_type == 'ol' else "•"
                            list_text.append(f"{prefix} {item_text}")

                    if len(list_text) > 1:
                        lists_text.append("\n".join(list_text))

        return "\n\n".join(lists_text) if lists_text else ""

    def extract_metadata_text(self, soup, page_url):
        """Extract metadata as readable text"""
        metadata_text = [f"\n[PAGE METADATA FOR {page_url}]"]

        # Title
        title = soup.find('title')
        if title:
            metadata_text.append(f"Page Title: {title.get_text(strip=True)}")

        # Meta description
        description_meta = soup.find('meta', attrs={'name': 'description'})
        if description_meta:
            metadata_text.append(f"Description: {description_meta.get('content', '')}")

        # Meta keywords
        keywords_meta = soup.find('meta', attrs={'name': 'keywords'})
        if keywords_meta:
            metadata_text.append(f"Keywords: {keywords_meta.get('content', '')}")

        # Headings structure
        headings = []
        for h_level in range(1, 7):
            h_tags = soup.find_all(f'h{h_level}')
            for h_tag in h_tags:
                heading_text = h_tag.get_text(strip=True)
                if heading_text:
                    headings.append(f"H{h_level}: {heading_text}")

        if headings:
            metadata_text.append("Page Structure:")
            metadata_text.extend(headings)

        return "\n".join(metadata_text) if len(metadata_text) > 1 else ""

    def crawl_static(self, url, visited, depth=0, max_depth=2):
        """Fetch ``url``, extract its content as text, then follow same-host links.

        For each successfully fetched page the method collects the cleaned body
        text, metadata, tables, lists and image descriptions, appends the
        result to ``self.all_text_content``, writes it to a ``.txt`` file in
        ``self.output_folder`` and recurses into internal links until
        ``max_depth`` is reached.  Any exception raised while handling a page
        is logged and swallowed so one bad page does not abort the crawl.

        Args:
            url: Absolute URL of the page to crawl; a ``#fragment`` is ignored.
            visited: Set of URLs already handled; updated in place.
            depth: Link distance of ``url`` from the start page.
            max_depth: Deepest level that is still crawled.
        """
        # Local import keeps this method self-contained
        from urllib.parse import urldefrag

        # "page#section" addresses the same document as "page": strip the
        # fragment so anchors are not fetched again as if they were new pages.
        url = urldefrag(url).url

        if url in visited or depth > max_depth:
            return

        visited.add(url)

        try:
            logger.info(f"Crawling: {url} (depth: {depth})")
            response = requests.get(url, timeout=15, headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            })

            if response.status_code != 200:
                logger.warning(f"Failed to fetch {url}: Status {response.status_code}")
                return

            soup = BeautifulSoup(response.text, 'lxml')

            # Relative references must be resolved against the page itself
            # (after redirects), not against the bare scheme://host: with the
            # latter, "img/a.png" on /docs/x/ would wrongly become /img/a.png.
            page_url = response.url or url
            site_netloc = urlparse(url).netloc

            # Collect all text content, starting with a page header
            page_content = [
                f"\n{'='*80}",
                f"PAGE: {url}",
                f"CRAWLED: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
                f"DEPTH: {depth}",
                f"{'='*80}\n",
            ]

            # Extract main text content
            main_text = self.clean_text(response.text)
            if main_text.strip():
                page_content.append("[MAIN CONTENT]")
                page_content.append(main_text)
                page_content.append("")

            # Extract metadata
            metadata_text = self.extract_metadata_text(soup, url)
            if metadata_text:
                page_content.append(metadata_text)
                page_content.append("")

            # Extract table content
            tables_text = self.extract_table_text(soup, url)
            if tables_text:
                page_content.append(tables_text)
                page_content.append("")

            # Extract list content
            lists_text = self.extract_list_text(soup)
            if lists_text:
                page_content.append(lists_text)
                page_content.append("")

            # Extract text from images
            images_text = []
            for img in soup.find_all('img'):
                img_src = img.get('src')
                if not img_src:
                    continue
                img_url = urljoin(page_url, img_src)
                if img_url.startswith('data:'):  # Skip inline data URLs
                    continue
                img_text = self.extract_text_from_image(img_url, img)
                if img_text:
                    images_text.append(img_text)
                    # Add delay to be respectful
                    time.sleep(0.5)

            if images_text:
                page_content.append("\n[IMAGES AND VISUAL CONTENT]")
                page_content.extend(images_text)
                page_content.append("")

            # Combine all content
            full_page_content = "\n".join(page_content)

            # Store the content
            self.all_text_content.append({
                'url': url,
                'content': full_page_content,
                'extracted_at': datetime.now().isoformat(),
                'depth': depth
            })

            # Save individual text file, named after the URL path
            filename = urlparse(url).path.replace("/", "_") or "home"
            text_path = os.path.join(self.output_folder, f"{filename}_depth{depth}.txt")
            with open(text_path, "w", encoding="utf-8") as f:
                f.write(full_page_content)

            logger.info(f"Extracted {len(full_page_content)} characters from {url}")

            # Recursively crawl other internal links
            if depth < max_depth:
                for link in soup.find_all('a', href=True):
                    abs_url = urldefrag(urljoin(page_url, link['href'])).url

                    # Only crawl internal http(s) links not handled yet.  The
                    # check runs before the delay so duplicates cost nothing.
                    if (urlparse(abs_url).netloc == site_netloc and
                            abs_url.startswith("http") and abs_url not in visited):

                        # Add delay between requests
                        time.sleep(2)
                        self.crawl_static(abs_url, visited, depth + 1, max_depth)

        except Exception as e:
            logger.error(f"Error crawling {url}: {e}")

    def generate_pdf_report(self, output_filename="MOSDAC.pdf"):
        """Write every collected page into a single text-only PDF.

        The document opens with a title page (generation time, base URL, page
        count, max depth, total word count) followed by the content of each
        entry in ``self.all_text_content``.  Bracketed lines become level-3
        headings, ``PAGE:`` lines level-2 headings, separator lines starting
        with ``=`` are dropped, and lines longer than 500 characters are split
        into chunks of roughly 400 characters.

        All crawled text is XML-escaped before it reaches ``Paragraph``, which
        interprets its input as markup; a stray ``<`` or ``&`` in a heading or
        URL would otherwise be read as a tag or entity.

        Args:
            output_filename: File name of the PDF inside ``self.output_folder``.

        Returns:
            Path of the generated PDF, or None when building it failed.
        """
        # Local import keeps this method self-contained; escapes &, < and >
        from xml.sax.saxutils import escape

        pdf_path = os.path.join(self.output_folder, output_filename)
        doc = SimpleDocTemplate(pdf_path, pagesize=A4)
        styles = getSampleStyleSheet()
        story = []

        # Custom styles
        title_style = ParagraphStyle(
            'CustomTitle',
            parent=styles['Title'],
            fontSize=20,
            spaceAfter=30,
            alignment=TA_CENTER
        )

        # Title page
        story.append(Paragraph("MOSDAC Complete Text Data Extraction", title_style))
        story.append(Spacer(1, 20))
        story.append(Paragraph(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", styles['Normal']))
        story.append(Paragraph(f"Base URL: {escape(str(self.base_url))}", styles['Normal']))
        story.append(Paragraph(f"Total Pages: {len(self.all_text_content)}", styles['Normal']))
        story.append(Paragraph(f"Max Depth: {self.max_depth}", styles['Normal']))
        story.append(Spacer(1, 30))

        # Summary
        total_words = sum(len(page['content'].split()) for page in self.all_text_content)
        story.append(Paragraph(f"Total Words Extracted: {total_words:,}", styles['Heading2']))
        story.append(Paragraph("This document contains all textual content extracted from the MOSDAC website, including text from images, tables, metadata, and visual content descriptions.", styles['Normal']))

        story.append(PageBreak())

        # Add all content
        for i, page_data in enumerate(self.all_text_content):
            try:
                for line in page_data['content'].split('\n'):
                    if not line.strip():
                        story.append(Spacer(1, 6))
                        continue

                    if line.startswith('='):
                        continue  # Skip separator lines

                    # Escape once, for every kind of line
                    safe_line = escape(line)

                    if line.startswith('[') and line.endswith(']'):
                        # Section headers
                        story.append(Paragraph(safe_line, styles['Heading3']))
                    elif line.startswith('PAGE:'):
                        story.append(Paragraph(safe_line, styles['Heading2']))
                    elif len(safe_line) > 500:
                        # Break very long lines on word boundaries; escaped
                        # entities contain no spaces, so they are never split
                        current_line = []
                        for word in safe_line.split():
                            current_line.append(word)
                            if len(' '.join(current_line)) > 400:
                                story.append(Paragraph(' '.join(current_line), styles['Normal']))
                                current_line = []
                        if current_line:
                            story.append(Paragraph(' '.join(current_line), styles['Normal']))
                    else:
                        story.append(Paragraph(safe_line, styles['Normal']))

                # Add page break after every few pages to manage PDF size
                if (i + 1) % 5 == 0:
                    story.append(PageBreak())

            except Exception as e:
                logger.error(f"Error adding content to PDF: {e}")
                # The URL is escaped too, so the fallback itself cannot fail
                # on markup characters
                story.append(Paragraph(
                    f"[Error processing content from {escape(str(page_data['url']))}]",
                    styles['Normal']
                ))

        # Build PDF
        try:
            doc.build(story)
            logger.info(f"PDF report generated successfully: {pdf_path}")
            return pdf_path
        except Exception as e:
            logger.error(f"Error generating PDF: {e}")
            return None

def main():
    """Crawl the MOSDAC site, then write the PDF report and a plain-text backup."""
    start_url = "https://www.mosdac.gov.in"

    # Banner describing this run
    print("Starting MOSDAC comprehensive text extraction...")
    print(f"Target URL: {start_url}")
    print(f"OCR Available: {OCR_AVAILABLE}")
    print(f"OpenCV Available: {CV2_AVAILABLE}")
    print(f"Timestamp: {datetime.now()}")
    print("-" * 60)

    extractor = MOSDACTextExtractor(start_url, max_depth=2)

    # Crawl; the timer covers everything up to the backup file being written
    started = time.time()
    visited = set()
    extractor.crawl_static(start_url, visited, max_depth=2)

    print("\n Generating comprehensive PDF report...")
    pdf_path = extractor.generate_pdf_report("MOSDAC.pdf")

    # Plain-text backup of everything that was collected
    pages = extractor.all_text_content
    combined_text = "\n\n".join(page['content'] for page in pages)
    backup_path = os.path.join(extractor.output_folder, "MOSDAC_complete_text.txt")
    with open(backup_path, 'w', encoding='utf-8') as backup_file:
        backup_file.write(combined_text)

    duration = time.time() - started
    total_words = sum(len(page['content'].split()) for page in pages)

    # Summary
    rule = "="*70
    summary_lines = [
        "\n" + rule,
        " TEXT EXTRACTION COMPLETED!",
        rule,
        f" Pages processed: {len(pages)}",
        f" Total words extracted: {total_words:,}",
        f" URLs visited: {len(visited)}",
        f"  Time taken: {duration:.2f} seconds",
        f" Output folder: {extractor.output_folder}",
    ]
    for summary_line in summary_lines:
        print(summary_line)

    # generate_pdf_report returns None on failure
    if pdf_path:
        print(f" Main PDF: {pdf_path}")

    closing_lines = (
        "\n Comprehensive text data ready for bot training!",
        "   - All visual content analyzed and converted to text",
        "   - Tables formatted as readable text",
        "   - Images analyzed with OCR where applicable",
        "   - Complete metadata extraction",
    )
    for closing_line in closing_lines:
        print(closing_line)


if __name__ == "__main__":
    main()

Starting MOSDAC comprehensive text extraction...
Target URL: https://www.mosdac.gov.in
OCR Available: True
OpenCV Available: True
Timestamp: 2025-08-22 15:44:49.701796
------------------------------------------------------------



Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(response.text, 'lxml')

Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(html_content, 'lxml')



 Generating comprehensive PDF report...

 TEXT EXTRACTION COMPLETED!
 Pages processed: 107
 Total words extracted: 429,954
 URLs visited: 108
  Time taken: 2367.43 seconds
 Output folder: mosdac_text_data
 Main PDF: mosdac_text_data/MOSDAC.pdf

 Comprehensive text data ready for bot training!
   - All visual content analyzed and converted to text
   - Tables formatted as readable text
   - Images analyzed with OCR where applicable
   - Complete metadata extraction
