In [1]:
import os

In [2]:
%pwd

'e:\\Text-Summarizer-Project\\research'

In [4]:
# Move from the notebook's folder up to the project root so the relative paths
# used below resolve. Guarded on the current folder name so that re-running
# this cell (or starting the kernel in the project root) does not climb one
# directory further each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir('../')

In [5]:
%pwd

'e:\\Text-Summarizer-Project'

In [6]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for the data-ingestion stage.

    Instances are built by ``ConfigurationManager.get_data_ingestion_config``
    from the ``data_ingestion`` section of the loaded config and are consumed
    by ``DataIngestion``. ``frozen=True`` makes attribute assignment after
    construction raise ``dataclasses.FrozenInstanceError``.

    Note: dataclasses do not enforce the annotations below at runtime; the
    fields hold whatever values the caller passes in.
    """
    # Working directory of the ingestion stage (created by the configuration manager).
    root_dir: Path
    # Location the dataset archive is downloaded from.
    source_URL: str
    # Path the downloaded archive is written to.
    local_data_file: Path
    # Directory the archive is extracted into.
    unzip_dir: Path

In [7]:
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml, create_directories

In [8]:
class ConfigurationManager:
    """Loads the YAML config/params files and builds per-stage config objects.

    Args:
        config_filepath: Path of the main config YAML (defaults to
            ``CONFIG_FILE_PATH``).
        params_filepath: Path of the params YAML (defaults to
            ``PARAMS_FILE_PATH``).

    Side effects:
        Creating an instance creates the directory named by the config's
        ``artifacts_root`` entry.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        # Loaded for other consumers; nothing in the data-ingestion config reads it.
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the ``data_ingestion`` section.

        Also creates the stage's ``root_dir`` on disk.

        Returns:
            DataIngestionConfig: settings whose path fields are ``Path``
            objects, matching the dataclass annotations (the raw config values
            are passed through ``Path(...)`` rather than forwarded as-is).
        """
        section = self.config.data_ingestion

        create_directories([section.root_dir])

        return DataIngestionConfig(
            root_dir=Path(section.root_dir),
            source_URL=section.source_URL,
            local_data_file=Path(section.local_data_file),
            unzip_dir=Path(section.unzip_dir),
        )

In [9]:
import os
from urllib.request import urlretrieve
import zipfile
from textSummarizer.logging import logger
from textSummarizer.utils.common import get_size

In [10]:
class DataIngestion:
    """Downloads a dataset ZIP archive and extracts it.

    The ``config`` object must expose ``source_URL`` (where to download from),
    ``local_data_file`` (where the archive is stored) and ``unzip_dir`` (where
    it is extracted). Paths may be ``str`` or ``os.PathLike``.
    """

    def __init__(self, config: DataIngestionConfig):
        self.config = config

    def _fix_github_url(self, url):
        """Convert a GitHub "blob" page URL into a raw-download URL.

        Args:
            url: URL string to inspect.

        Returns:
            The rewritten URL when ``url`` contains both ``github.com`` and
            ``/blob/``; otherwise ``url`` unchanged.
        """
        if "github.com" in url and "/blob/" in url:
            # Rewrite only the FIRST occurrence of each marker: a later
            # "/blob/" or "github.com" inside the file path belongs to the
            # path itself and must survive the rewrite.
            fixed_url = url.replace("github.com", "raw.githubusercontent.com", 1)
            fixed_url = fixed_url.replace("/blob/", "/", 1)
            logger.info(f"Fixed GitHub URL: {url} -> {fixed_url}")
            return fixed_url
        return url

    def _validate_zip_file(self, file_path):
        """Check that ``file_path`` exists, is non-empty and is a readable ZIP.

        Args:
            file_path: Path of the file to check.

        Returns:
            True when the file opens as a ZIP archive.

        Raises:
            FileNotFoundError: The file does not exist.
            ValueError: The file is empty, looks like an HTML page, or does
                not start with the ``PK`` signature.
            zipfile.BadZipFile: The file starts like a ZIP but cannot be opened.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        if os.path.getsize(file_path) == 0:
            raise ValueError("Downloaded file is empty")

        # Sniff the leading bytes to tell a ZIP from an HTML page.
        with open(file_path, 'rb') as f:
            first_bytes = f.read(100)

        # HTML is what comes back when a page URL is fetched instead of the
        # file itself. Ignore a UTF-8 BOM / leading whitespace and letter case
        # so that e.g. "\n<!doctype html>" is recognised too.
        html_probe = first_bytes.lstrip(b'\xef\xbb\xbf \t\r\n').lower()
        if html_probe.startswith((b'<!doctype', b'<html')):
            logger.error("Downloaded HTML page instead of ZIP file")
            # Log the start of the page to help debug the URL.
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                html_content = f.read(300)
            logger.error(f"HTML content: {html_content}")
            raise ValueError("Downloaded HTML page instead of ZIP file. Check the URL.")

        if not first_bytes.startswith(b'PK'):
            raise ValueError(f"File is not a ZIP file. First bytes: {first_bytes[:20].hex()}")

        # The signature matches; make sure the archive can actually be opened.
        try:
            with zipfile.ZipFile(file_path, 'r') as zip_test:
                file_list = zip_test.namelist()
        except zipfile.BadZipFile as e:
            raise zipfile.BadZipFile(f"Corrupted ZIP file: {e}") from e

        logger.info(f"ZIP file is valid with {len(file_list)} files")
        return True

    def download_file(self):
        """Ensure a valid ZIP archive exists at ``config.local_data_file``.

        An existing file is validated and reused. If it fails validation
        (``ValueError`` / ``zipfile.BadZipFile``) it is removed and downloaded
        again. Any other error while inspecting an existing file propagates
        WITHOUT deleting that file.

        A fresh download is validated immediately; if the download or its
        validation fails, the partial/invalid file is removed and the error is
        re-raised.

        Returns:
            None
        """
        target = self.config.local_data_file

        if os.path.exists(target):
            file_size = get_size(Path(target))
            logger.info(f"File already exists of size: {file_size}")
            try:
                self._validate_zip_file(target)
                return
            except (ValueError, zipfile.BadZipFile) as e:
                logger.warning(f"Existing file is invalid: {e}")
                logger.info("Removing invalid file and re-downloading...")
                os.remove(target)

        # A bare file name has no directory component; os.makedirs('') raises.
        parent_dir = os.path.dirname(target)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        download_url = self._fix_github_url(self.config.source_URL)
        logger.info(f"Downloading from: {download_url}")

        try:
            filename, headers = urlretrieve(
                url=download_url,
                filename=target
            )
            logger.info(f"{filename} downloaded! Headers: \n{headers}")

            self._validate_zip_file(target)
            logger.info("File validated successfully as ZIP")
        except Exception as e:
            # Only files produced by THIS download attempt are cleaned up here.
            if os.path.exists(target):
                os.remove(target)
                logger.error("Removed failed download")

            logger.error(f"Download failed: {e}")
            raise

    def extract_zip_file(self):
        """Validate the archive at ``config.local_data_file`` and extract it.

        The archive is extracted into ``config.unzip_dir``, which is created
        if missing.

        Returns:
            None

        Raises:
            zipfile.BadZipFile: The archive is corrupted.
            Exception: Any other validation or extraction error, re-raised
                after being logged.
        """
        try:
            self._validate_zip_file(self.config.local_data_file)

            unzip_path = self.config.unzip_dir
            os.makedirs(unzip_path, exist_ok=True)

            logger.info(f"Extracting ZIP file to: {unzip_path}")

            with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
                file_list = zip_ref.namelist()
                logger.info(f"Extracting {len(file_list)} files...")

                zip_ref.extractall(unzip_path)

                # Log a short preview instead of every member name.
                if file_list:
                    logger.info("Sample extracted files:")
                    for position, member in enumerate(file_list[:5], start=1):
                        logger.info(f"  {position}. {member}")
                    if len(file_list) > 5:
                        logger.info(f"  ... and {len(file_list) - 5} more files")

                logger.info("ZIP extraction completed successfully!")

        except zipfile.BadZipFile as e:
            logger.error(f"ZIP file error: {e}")
            logger.error("Possible causes:")
            logger.error("1. Downloaded file is corrupted")
            logger.error("2. Wrong URL (downloaded HTML instead of ZIP)")
            logger.error("3. File is not actually a ZIP file")
            raise
        except Exception as e:
            logger.error(f"Extraction failed: {e}")
            raise

In [11]:
# Run the data-ingestion stage end to end: load the configuration, make sure
# the dataset archive is present and valid, then extract it.
# (No try/except wrapper: catching an exception only to re-raise it unchanged
# adds nothing, and the stage methods already log their own failures.)
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.download_file()
data_ingestion.extract_zip_file()

[2025-09-28 10:25:02,258: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-09-28 10:25:02,264: INFO: common: yaml file: params.yaml loaded successfully]
[2025-09-28 10:25:02,266: INFO: common: created directory at: artifacts]
[2025-09-28 10:25:02,269: INFO: common: created directory at: artifacts/data_ingestion]
[2025-09-28 10:25:02,269: INFO: 2561972074: Fixed GitHub URL: https://github.com/AkibNayan/Project-Datasets/blob/main/summarizer-data.zip -> https://raw.githubusercontent.com/AkibNayan/Project-Datasets/main/summarizer-data.zip]
[2025-09-28 10:25:02,271: INFO: 2561972074: Downloading from: https://raw.githubusercontent.com/AkibNayan/Project-Datasets/main/summarizer-data.zip]
[2025-09-28 10:25:09,243: INFO: 2561972074: artifacts/data_ingestion/data.zip downloaded! Headers: 
Connection: close
Content-Length: 7903594
Cache-Control: max-age=300
Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; sandbox
Content-Type: application/zip
ETag: "d