In [1]:
# Move from the notebook's folder up to the project root so that relative
# paths such as "config/config.yaml" (the default used further down) resolve.
# The guard keeps this cell idempotent: re-running it, or starting with the
# kernel already at the project root, no longer climbs one directory too far.
import os

if not os.path.isfile("config/config.yaml"):
    os.chdir("../")

In [2]:
from dataclasses import dataclass
from pathlib import Path
import os
import urllib.request as request
import zipfile
from src.textSummarizer.logging.logging_config import setup_logger
from src.textSummarizer.utils.common import get_size
from src.textSummarizer.constants import *
from src.textSummarizer.utils.common import read_yaml, create_directories
from datasets import DatasetDict

[2025-07-02 23:30:41,918: INFO: config: PyTorch version 2.7.1+cu128 available.]
[2025-07-02 23:30:41,923: INFO: config: JAX version 0.4.30 available.]


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of settings consumed by the data-ingestion step.

    Instances are built by ``ConfigurationManager.get_data_ingestion_config``
    from the ``data_ingestion`` section of the YAML config and are read by
    ``DataIngestion``. ``frozen=True`` means fields cannot be reassigned
    after construction.
    """
    # Directory for the data-ingestion stage; handed to create_directories
    # by ConfigurationManager before this object is built.
    root_dir: Path
    # Dataset identifier passed as ``path`` to ``datasets.load_dataset``.
    dataset_name: str
    # Passed as ``name`` to ``datasets.load_dataset``.
    dataset_version: str
    # Directory used both as the ``load_dataset`` cache_dir and as the
    # destination of the exported CSV files.
    local_data_dir: Path

In [4]:
# Build the notebook-wide logger through the project's factory, passing the
# path of the log file it should use.
log_file = "logs/running_logs.log"
logger = setup_logger(log_file)

# Emit a first record so it is visible that the logger is wired up.
logger.info("Application started.")

2025-07-02 23:30:42,303 - TextSummarizer - INFO - Application started.


[2025-07-02 23:30:42,303: INFO: 3658843850: Application started.]


In [5]:
class ConfigurationManager:
    """Read the project's YAML settings and hand out typed config objects.

    Constructing an instance loads both YAML files with ``read_yaml`` and
    passes the main config's ``artifacts_root`` to ``create_directories``.
    """

    def __init__(
        self,
        config_filepath: str = "config/config.yaml",
        params_filepath: str = "config/params.yaml",
    ):
        """Load the config and params files.

        Args:
            config_filepath: Location of the main YAML config file.
            params_filepath: Location of the YAML params file.
        """
        # read_yaml is given Path objects, so wrap each incoming string first.
        config_path = Path(config_filepath)
        self.config = read_yaml(config_path)

        params_path = Path(params_filepath)
        self.params = read_yaml(params_path)

        # Root directory for artifacts, taken from the loaded config.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Assemble the settings for the data-ingestion step.

        The ``root_dir`` of the ``data_ingestion`` config section is passed to
        ``create_directories`` before the settings object is built.

        Returns:
            DataIngestionConfig: ``root_dir`` and ``local_data_dir`` wrapped
            in ``Path``; ``dataset_name`` and ``dataset_version`` copied as-is.
        """
        section = self.config.data_ingestion
        create_directories([section.root_dir])

        return DataIngestionConfig(
            root_dir=Path(section.root_dir),
            dataset_name=section.dataset_name,
            dataset_version=section.dataset_version,
            local_data_dir=Path(section.local_data_dir),
        )


In [6]:
class DataIngestion:
    """Fetch a Hugging Face dataset and export its splits as CSV files.

    Relies on the module-level ``logger`` for progress and error records.
    """

    def __init__(self, config: DataIngestionConfig):
        """Store the ingestion settings and log them.

        Args:
            config: Settings naming the dataset and the local directory to use.
        """
        self.config = config
        logger.info("DataIngestion initialized with config: %s", self.config)

    def download_data(self) -> DatasetDict:
        """Download the dataset from Hugging Face and save it locally.

        The ``train``, ``validation`` and ``test`` splits are each written to
        ``<local_data_dir>/<split>.csv``.

        Returns:
            DatasetDict: The loaded dataset containing train, validation and
            test splits.

        Raises:
            Exception: Any error raised while loading or exporting the dataset
                is logged and then re-raised unchanged, so a failure can no
                longer be mistaken for a successful run that returned ``None``.
        """
        # Create directories if they don't exist
        os.makedirs(self.config.local_data_dir, exist_ok=True)
        logger.info("Starting data download...")
        try:
            # Imported here, as in the original cell, so an import failure is
            # logged through the same error path as a download failure.
            from datasets import load_dataset

            # Announce the load before it starts rather than after it finished.
            logger.debug("Loading dataset from %s", self.config.dataset_name)
            dataset = load_dataset(
                path=self.config.dataset_name,
                name=self.config.dataset_version,
                cache_dir=self.config.local_data_dir
            )

            # Save each split next to the cache as <split>.csv
            for split in ("train", "validation", "test"):
                csv_path = os.path.join(self.config.local_data_dir, f"{split}.csv")
                dataset[split].to_csv(csv_path)
        except Exception as e:
            logger.error("Error downloading data: %s", str(e))
            # Propagate instead of swallowing: the declared return type is
            # DatasetDict, and callers cannot use an implicit None.
            raise

        logger.info("Data download completed successfully.")
        return dataset

    def get_data(self) -> DatasetDict:
        """Public entry point for obtaining the dataset.

        Returns:
            DatasetDict: The dataset dictionary produced by ``download_data``.

        Raises:
            Exception: Whatever ``download_data`` raises.
        """
        return self.download_data()


In [None]:
# Load the YAML configuration and derive the data-ingestion settings from it.
config_manager = ConfigurationManager()
data_ingestion_config = config_manager.get_data_ingestion_config()

# Show which dataset will be fetched and which local directory will be used.
print(
    data_ingestion_config.dataset_name,
    data_ingestion_config.local_data_dir,
    sep="\n",
)

# Run the ingestion step and keep the returned dataset in `dataset`.
data_ingestion = DataIngestion(config=data_ingestion_config)
dataset = data_ingestion.get_data()

[2025-07-02 23:33:41,757: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-07-02 23:33:41,760: INFO: common: yaml file: config\params.yaml loaded successfully]
[2025-07-02 23:33:41,761: INFO: common: created directory at: artifacts]
[2025-07-02 23:33:41,762: INFO: common: created directory at: artifacts/data_ingestion]


2025-07-02 23:33:41,763 - TextSummarizer - INFO - DataIngestion initialized with config: DataIngestionConfig(root_dir=WindowsPath('artifacts/data_ingestion'), dataset_name='abisee/cnn_dailymail', dataset_version='3.0.0', local_data_dir=WindowsPath('artifacts/data_ingestion/cnn_dailymail_data'))


abisee/cnn_dailymail
artifacts\data_ingestion\cnn_dailymail_data
[2025-07-02 23:33:41,763: INFO: 416219855: DataIngestion initialized with config: DataIngestionConfig(root_dir=WindowsPath('artifacts/data_ingestion'), dataset_name='abisee/cnn_dailymail', dataset_version='3.0.0', local_data_dir=WindowsPath('artifacts/data_ingestion/cnn_dailymail_data'))]


2025-07-02 23:33:41,766 - TextSummarizer - INFO - Starting data download...


[2025-07-02 23:33:41,766: INFO: 416219855: Starting data download...]


Generating train split: 100%|██████████| 287113/287113 [00:02<00:00, 114006.49 examples/s]
Generating validation split: 100%|██████████| 13368/13368 [00:00<00:00, 110683.42 examples/s]
Generating test split: 100%|██████████| 11490/11490 [00:00<00:00, 105166.74 examples/s]


[2025-07-02 23:33:57,734: DEBUG: 416219855: Loading dataset from abisee/cnn_dailymail]


Creating CSV from Arrow format: 100%|██████████| 288/288 [00:22<00:00, 12.80ba/s]
Creating CSV from Arrow format: 100%|██████████| 14/14 [00:01<00:00, 13.84ba/s]
Creating CSV from Arrow format: 100%|██████████| 12/12 [00:00<00:00, 13.70ba/s]
2025-07-02 23:34:22,141 - TextSummarizer - INFO - Data download completed successfully.


[2025-07-02 23:34:22,141: INFO: 416219855: Data download completed successfully.]
