## Data Ingestion module

In [1]:
import os

In [2]:
%pwd

'c:\\Users\\ambig\\jupiter_notebook\\Projects\\Text-Summarizer\\research'

In [3]:
# Step up one directory level from the notebook's folder (the cell outputs
# show `...\Text-Summarizer\research` before and `...\Text-Summarizer` after).
# NOTE: the path is relative, so re-running this cell moves up one more level
# each time; restart the kernel (or check the working directory) before
# executing it again.
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\ambig\\jupiter_notebook\\Projects\\Text-Summarizer'

In [5]:
from src.TextSummarizer.constants import *
from src.TextSummarizer.utils.common import read_yaml,create_directories

In [15]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataIngestionConfig:
    """Settings consumed by the data-ingestion step.

    The annotations are documentation only: a dataclass does not validate or
    convert the values it is given.
    """

    # Directory that is created for the ingestion artifacts.
    root_dir: Path
    # Remote location of the archive. This is a URL string handed to
    # ``urllib.request.urlretrieve``, not a filesystem path.
    source_URL: str
    # Local path the downloaded archive is written to.
    local_data_file: Path
    # Directory the archive is extracted into.
    unzip_dir: Path

In [16]:
class ConfigurationManager:
    """Read the project's YAML settings and hand out per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Load both YAML files and create the top-level artifacts directory.

        Args:
            config_filepath: Location of the main configuration YAML file.
            params_filepath: Location of the parameters YAML file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # The artifacts root is passed on as a plain string.
        artifacts_root = str(self.config.artifacts_root)
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the ``data_ingestion`` section.

        The stage's ``root_dir`` is created before the settings are returned.

        Returns:
            A ``DataIngestionConfig`` populated with the section's values.
        """
        section = self.config.data_ingestion
        create_directories([section.root_dir])

        return DataIngestionConfig(
            root_dir=section.root_dir,
            source_URL=section.source_URL,
            local_data_file=section.local_data_file,
            unzip_dir=section.unzip_dir,
        )

In [17]:
import os
import urllib.request as request
import zipfile
from src.TextSummarizer.logging import logger


In [18]:
class DataIngestion:
    """Download the source archive and unpack it as described by a config object.

    The config must expose ``source_URL``, ``local_data_file`` and
    ``unzip_dir``. Both public methods report failures through
    ``logger.error`` and return normally instead of raising.
    """

    def __init__(self, config: "DataIngestionConfig"):
        """Store the ingestion settings used by the other methods."""
        self.config = config

    @staticmethod
    def _discard_partial_download(path):
        """Delete whatever a failed download left behind at *path*, if anything."""
        try:
            os.remove(path)
        except OSError:
            # Nothing was written (or it is already gone); there is nothing
            # to clean up.
            pass

    def download_file(self):
        """Fetch ``source_URL`` into ``local_data_file`` unless it already exists.

        The parent directory of the target file is created when missing. If
        the transfer fails, any partially written file is removed so that a
        later run does not mistake it for a finished download. Errors are
        logged, not raised.
        """
        try:
            target = self.config.local_data_file
            if os.path.exists(target):
                logger.info(f"Local data file {self.config.local_data_file} already exists.")
                return

            # urlretrieve opens the target for writing directly, which fails
            # when the containing directory does not exist yet.
            parent_dir = os.path.dirname(target)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

            try:
                filename, _headers = request.urlretrieve(
                    url=self.config.source_URL,
                    filename=target,
                )
            except Exception:
                # The target did not exist before this attempt, so anything
                # present now is an incomplete download.
                self._discard_partial_download(target)
                raise

            logger.info(f"Downloaded file {filename} to {self.config.local_data_file}, size: {os.path.getsize(self.config.local_data_file)} bytes")
        except Exception as e:
            logger.error(f"Error downloading file from {self.config.source_URL}: {e}")

    def extract_zip_file(self):
        """Extract ``local_data_file`` into ``unzip_dir``, creating it if needed.

        Errors (corrupt archive, missing file, anything else) are logged, not
        raised.
        """
        try:
            unzip_path = self.config.unzip_dir
            os.makedirs(unzip_path, exist_ok=True)

            logger.info(f"Attempting to extract zip file: {self.config.local_data_file}")

            with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
                zip_ref.extractall(unzip_path)
                logger.info(f"Extracted zip file to {unzip_path}")
        except zipfile.BadZipFile as e:
            logger.error(f"Failed to unzip file {self.config.local_data_file}: {e}")
        except FileNotFoundError as e:
            logger.error(f"File not found: {self.config.local_data_file}: {e}")
        except Exception as e:
            logger.error(f"Unexpected error during file extraction: {e}")

In [20]:
# Run the ingestion stage end to end: load settings, then download and unpack.
# Constructing the manager reads the YAML files and creates the artifacts root.
config = ConfigurationManager()

# Also creates the ingestion stage's root directory as a side effect.
data_ingestion_config = config.get_data_ingestion_config()

data_ingestion = DataIngestion(data_ingestion_config)

# Skipped when the local archive already exists; failures are only logged.
data_ingestion.download_file()

# Runs even if the download step logged an error; failures are only logged.
data_ingestion.extract_zip_file()

[2024-11-07 13:03:48,089 - common.py - read_yaml - line 25 - INFO - YAML file: config\config.yaml loaded successfully]
[2024-11-07 13:03:48,091 - common.py - read_yaml - line 25 - INFO - YAML file: params.yaml loaded successfully]
[2024-11-07 13:03:48,093 - common.py - create_directories - line 46 - INFO - Directory: artifacts created successfully]
[2024-11-07 13:03:48,095 - common.py - create_directories - line 46 - INFO - Directory: artifacts/data_ingestion created successfully]
[2024-11-07 13:03:48,097 - 217046736.py - download_file - line 14 - INFO - Local data file data/data.zip already exists.]
[2024-11-07 13:03:48,100 - 217046736.py - extract_zip_file - line 23 - INFO - Attempting to extract zip file: data/data.zip]
[2024-11-07 13:03:48,297 - 217046736.py - extract_zip_file - line 28 - INFO - Extracted zip file to artifacts/data_ingestion]
