In [36]:
import os 

In [37]:
%pwd

'd:\\projects\\project2'

In [40]:
os.chdir(path='d:\\projects\\project2')

In [41]:
%pwd

'd:\\projects\\project2'

In [42]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class data_ingestionConfig:
    root_dir : Path
    source_URL :str
    local_data_file : Path
    unzip_dir : Path
    

In [44]:
    from src.Text_summarizer.constants import *
    from src.Text_summarizer.utils.common import create_directories,read_yaml
    from src.Text_summarizer.logging import logging

In [20]:
class ConfigurationManager:
    def __init__(self, config_file_path = CONFIG_FILE_PATH, params_file_path = PARAMS_FILE_PATH):
        self.config=read_yaml(config_file_path)
        self.params= read_yaml(params_file_path) 
        
        create_directories([self.config.artifacts_root])
        logging.info("created directory named artifacts")
        
        
        
    def get_data_ingestion_config(self)-> data_ingestionConfig:
        ## this is where we read the yaml file , to get the file path saved in the yaml file 
        config=self.config.data_ingestion
        
        create_directories([config.root_dir])
        logging.info("creating directory named root_dir")
        ## again reading the yaml file 
        data_ingestion_config=data_ingestionConfig(    
            root_dir=config.root_dir,
            source_URL=config.source_URL,
            local_data_file=config.local_data_file,
            unzip_dir=config.unzip_dir)
        
        return data_ingestion_config
            
        
        

In [45]:
##loading all the important libraries 
import os 
from pathlib import Path
import zipfile
from src.Text_summarizer.logging import logging 
from src.Text_summarizer.utils.common import get_size
from urllib.request import Request,urlretrieve


In [48]:
class DataIngestion:
    def __init__(self,config : data_ingestionConfig):
        self.config=config
        
    def download_data(self):
        ## created the directory if it does not exist
        directory = os.path.dirname(self.config.local_data_file)
        if not os.path.exists(directory):
            os.makedirs(directory)
        if not os.path.exists(self.config.local_data_file):
            filename,header = urlretrieve(
                url=self.config.source_URL,
                filename=self.config.local_data_file
            )    
            logging.info(f"{filename} downaloaded with following info : {header}")
        else:
            logging.info(f"file already exists of size : {get_size(Path(self.config.local_data_file))} ") 
            
    def extract_zip_file(self):
        unzip_path= self.config.unzip_dir
        os.makedirs(unzip_path,exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file,'r') as zip_ref:
            zip_ref.extractall(unzip_path)

In [49]:
try:
    config=ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion=DataIngestion(config=data_ingestion_config)
    data_ingestion.download_data()
    data_ingestion.extract_zip_file()
except Exception as e:
    raise e    

[ 2024-07-30 01:09:42,924 ] 16 root - INFO - yaml file:<_io.TextIOWrapper name='config\\config.yaml' mode='r' encoding='UTF-8'> loaded successfully
[ 2024-07-30 01:09:42,928 ] 16 root - INFO - yaml file:<_io.TextIOWrapper name='params.yaml' mode='r' encoding='UTF-8'> loaded successfully
[ 2024-07-30 01:09:42,928 ] 27 root - INFO - created directory at :artifacts
[ 2024-07-30 01:09:42,932 ] 27 root - INFO - created directory at :artifacts/data_ingestion
[ 2024-07-30 01:09:47,021 ] 14 root - INFO - artifacts/data_inegstion/data.zip downaloaded with following info : Connection: close
Content-Length: 7903594
Cache-Control: max-age=300
Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; sandbox
Content-Type: application/zip
ETag: "dbc016a060da18070593b83afff580c9b300f0b6ea4147a7988433e04df246ca"
Strict-Transport-Security: max-age=31536000
X-Content-Type-Options: nosniff
X-Frame-Options: deny
X-XSS-Protection: 1; mode=block
X-GitHub-Request-Id: F49E:16A6FB:3B9BE6:4D7ADD:6

In [33]:
create_directories(["artifacts/data_inegstion/data.zip"])

[ 2024-07-29 01:13:46,094 ] 27 root - INFO - created directory at :artifacts/data_inegstion/data.zip
