In [1]:
%pwd

'/workspaces/Howler-Monkey/research'

In [2]:
import os

# Move from the notebook folder to the project root so that relative paths
# (e.g. config/config.yaml, artifacts/) resolve against the repository root.
# The guard makes the cell idempotent: an unconditional os.chdir("../") climbs
# one directory higher every time the cell is re-run.
# NOTE(review): assumes the notebook lives in a folder named "research"
# (as shown by the %pwd output above) - adjust if the folder is renamed.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [13]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings describing one dataset archive to download and unpack."""
    # Directory for data-ingestion artifacts; created before the download starts.
    root_dir: Path
    # Identifier appended to the Google Drive download URL.
    data_id: str
    # Destination of the downloaded zip archive.
    local_data_file: Path
    # Directory that receives the extracted `images/` and `labels/` sub-folders.
    unzip_dir: Path

In [14]:
from HowlerMonkey.constants import *
from HowlerMonkey.utils.common import read_yaml, create_directories

In [19]:
class ConfigurationManager:
    """Loads the project YAML files and builds data-ingestion configs per split.

    On construction the config and params YAML files are read and the
    artifacts root directory named in the config is created.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH
    ):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def _build_data_ingestion_config(self, split: str) -> DataIngestionConfig:
        """Build the ingestion config for one dataset split.

        Both public getters share this helper; they differ only in which
        `data_ingestion` keys they read.

        Args:
            split: Split name used in the config keys, i.e. "train" or "val".
                The keys read are `<split>_data_id`, `local_<split>_data_file`
                and `unzip_<split>_dir`.

        Returns:
            A DataIngestionConfig populated from the `data_ingestion` section.
        """
        config = self.config.data_ingestion
        # Make sure the ingestion root exists before anything is written to it.
        create_directories([config.root_dir])
        return DataIngestionConfig(
            root_dir        = config.root_dir,
            data_id         = getattr(config, f"{split}_data_id"),
            local_data_file = getattr(config, f"local_{split}_data_file"),
            unzip_dir       = getattr(config, f"unzip_{split}_dir"),
        )

    def get_train_data_ingestion_config(self) -> DataIngestionConfig:
        """Return the ingestion config for the training archive."""
        return self._build_data_ingestion_config("train")

    def get_val_data_ingestion_config(self) -> DataIngestionConfig:
        """Return the ingestion config for the validation archive."""
        return self._build_data_ingestion_config("val")

In [27]:
import os
import zipfile
import shutil
import gdown
from HowlerMonkey import logger
from HowlerMonkey.utils.common import get_size

In [44]:
class DataIngestion:
    """Downloads a dataset archive from Google Drive and unpacks it.

    The archive is flattened on extraction: `.png`/`.jpg` members go to
    `<unzip_dir>/images`, `.txt` members go to `<unzip_dir>/labels`, and
    every other member is ignored.
    """

    def __init__(self, config: DataIngestionConfig):
        """Store the ingestion settings used by the other methods."""
        self.config = config

    def download_file(self) -> str:
        """Download the archive identified by `config.data_id`.

        Creates `config.root_dir` if needed and asks gdown to write the file
        to `config.local_data_file`. Any exception raised along the way
        propagates unchanged to the caller.

        Returns:
            The local archive path (`config.local_data_file`).
        """
        dataset_id = self.config.data_id
        zip_download_dir = self.config.local_data_file
        os.makedirs(self.config.root_dir, exist_ok=True)
        logger.info(f"Downloading data with id {dataset_id} into file {zip_download_dir}")

        # NOTE(review): the query string begins with "/export" rather than
        # "export"; downloads succeed with it as-is, so it is left untouched -
        # confirm before normalizing.
        prefix = 'https://drive.google.com/uc?/export=download&id='
        gdown.download(
            prefix+dataset_id,
            zip_download_dir
        )

        logger.info(f"Downloaded data with id {dataset_id} into file {zip_download_dir}")
        return zip_download_dir

    def extract_zip_file(self):
        """Unpack the downloaded archive into `images/` and `labels/` folders.

        Only the base name of each member is kept, so the directory layout
        inside the archive is discarded; members that share a base name
        overwrite one another in the destination folder.
        """
        unzip_path = self.config.unzip_dir
        images_dir = os.path.join(unzip_path, 'images')
        labels_dir = os.path.join(unzip_path, 'labels')
        # makedirs also creates unzip_path itself as the parent directory.
        os.makedirs(images_dir, exist_ok=True)
        os.makedirs(labels_dir, exist_ok=True)

        with zipfile.ZipFile(self.config.local_data_file) as zip_file:
            for member in zip_file.namelist():
                filename = os.path.basename(member)
                # Directory entries end with "/" and therefore have no base name.
                if not filename:
                    continue

                # Choose the destination BEFORE opening the member so that no
                # handle is left open for members that end up being skipped.
                if filename.endswith(('.png', '.jpg')):
                    target_dir = images_dir
                elif filename.endswith('.txt'):
                    target_dir = labels_dir
                else:
                    continue

                # Both handles are closed even if opening the target or the
                # copy itself fails.
                with zip_file.open(member) as source, \
                        open(os.path.join(target_dir, filename), "wb") as target:
                    shutil.copyfileobj(source, target)

In [45]:
# Run data ingestion for both splits: download each archive, then unpack it.
# No try/except wrapper: catching an exception only to re-raise it adds nothing,
# and errors surface with a cleaner traceback without it.
config = ConfigurationManager()

# Training split.
train_data_ingestion_config = config.get_train_data_ingestion_config()
# The variable name (sic) is kept as-is in case other cells reference it.
traning_data_ingestion = DataIngestion(train_data_ingestion_config)
traning_data_ingestion.download_file()
traning_data_ingestion.extract_zip_file()

# Validation split.
val_data_ingestion_config = config.get_val_data_ingestion_config()
val_data_ingestion = DataIngestion(val_data_ingestion_config)
val_data_ingestion.download_file()
val_data_ingestion.extract_zip_file()

[2024-06-14 13:41:47,017: INFO: common] yaml file: config/config.yaml loaded successfully
[2024-06-14 13:41:47,020: INFO: common] yaml file: params.yaml loaded successfully
[2024-06-14 13:41:47,021: INFO: common] Creating directory: artifacts
[2024-06-14 13:41:47,022: INFO: common] Creating directory: artifacts/data_ingestion
[2024-06-14 13:41:47,023: INFO: 1397511870] Downloading data with id 1JqD4SRcFq1Fuj0wPTvLd-mfWMlQlxR7T into file artifacts/data_ingestion/train.zip


Downloading...
From (original): https://drive.google.com/uc?/export=download&id=1JqD4SRcFq1Fuj0wPTvLd-mfWMlQlxR7T
From (redirected): https://drive.google.com/uc?/export=download&id=1JqD4SRcFq1Fuj0wPTvLd-mfWMlQlxR7T&confirm=t&uuid=a1257a32-d79c-4c0e-8f52-52aed40af5c5
To: /workspaces/Howler-Monkey/artifacts/data_ingestion/train.zip
100%|██████████| 653M/653M [00:18<00:00, 36.0MB/s] 

[2024-06-14 13:42:06,408: INFO: 1397511870] Downloaded data with id 1JqD4SRcFq1Fuj0wPTvLd-mfWMlQlxR7T into file artifacts/data_ingestion/train.zip





[2024-06-14 13:42:13,041: INFO: common] Creating directory: artifacts/data_ingestion
[2024-06-14 13:42:13,042: INFO: 1397511870] Downloading data with id 18l6fmuuQKKtLKULw2UF2vVqiQzFRe5g9 into file artifacts/data_ingestion/val.zip


Downloading...
From (original): https://drive.google.com/uc?/export=download&id=18l6fmuuQKKtLKULw2UF2vVqiQzFRe5g9
From (redirected): https://drive.google.com/uc?/export=download&id=18l6fmuuQKKtLKULw2UF2vVqiQzFRe5g9&confirm=t&uuid=1845c552-c91c-4c3d-a88c-a22e60e4a582
To: /workspaces/Howler-Monkey/artifacts/data_ingestion/val.zip
100%|██████████| 128M/128M [00:04<00:00, 30.7MB/s] 

[2024-06-14 13:42:18,406: INFO: 1397511870] Downloaded data with id 18l6fmuuQKKtLKULw2UF2vVqiQzFRe5g9 into file artifacts/data_ingestion/val.zip



