In [25]:
%pwd

'/workspaces/Howler-Monkey'

In [26]:
import os
os.chdir("/workspaces/Howler-Monkey/")

In [27]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    root_dir: Path
    data_id: str
    data_porcentage: int
    local_data_file: Path
    unzip_dir: Path

In [28]:
from HowlerMonkey.constants import *
from HowlerMonkey.utils.common import read_yaml, create_directories

In [29]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH
    ):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_train1_data_ingestion_config(self) -> DataIngestionConfig:
        config = self.config.data_ingestion
        create_directories([config.root_dir])
        data_ingestion_config = DataIngestionConfig(
            root_dir        = Path(config.root_dir),
            data_id         = config.train_data_id_1,
            data_porcentage = int(config.porcent_data_1),
            local_data_file = Path(config.local_train_data_file),
            unzip_dir       = Path(config.unzip_train_dir)
        )

        return data_ingestion_config
    
    def get_train2_data_ingestion_config(self) -> DataIngestionConfig:
        config = self.config.data_ingestion
        create_directories([config.root_dir])
        data_ingestion_config = DataIngestionConfig(
            root_dir        = Path(config.root_dir),
            data_id         = config.train_data_id_2,
            data_porcentage = int(config.porcent_data_2),
            local_data_file = Path(config.local_train_data_file),
            unzip_dir       = Path(config.unzip_train_dir)
        )

        return data_ingestion_config
    
    def get_val_data_ingestion_config(self) -> DataIngestionConfig:
        config = self.config.data_ingestion
        create_directories([config.root_dir])
        data_ingestion_config = DataIngestionConfig(
            root_dir        = Path(config.root_dir),
            data_id         = config.val_data_id,
            data_porcentage = 100,
            local_data_file = Path(config.local_val_data_file),
            unzip_dir       = Path(config.unzip_val_dir)
        )

        return data_ingestion_config

In [30]:
import os
import zipfile
import shutil
import gdown
import random
from HowlerMonkey import logger
from HowlerMonkey.utils.common import get_size

In [31]:
class DataIngestion:
    def __init__(self, config: DataIngestionConfig):
        self.config = config

    
    def download_file(self)-> str:
        '''
        Fetch data from the url
        '''

        try: 
            dataset_id = self.config.data_id
            zip_download_dir = self.config.local_data_file
            os.makedirs(self.config.root_dir, exist_ok=True)
            logger.info(f"Downloading data with id {dataset_id} into file {zip_download_dir}")

            prefix = 'https://drive.google.com/uc?/export=download&id='
            gdown.download(
                prefix+dataset_id,
                str(zip_download_dir)
            )

            logger.info(f"Downloaded data with id {dataset_id} into file {zip_download_dir}")

        except Exception as e:
            raise e
        
    

    def extract_zip_file(self):
        unzip_path = self.config.unzip_dir

        os.makedirs(unzip_path, exist_ok=True)
        os.makedirs(os.path.join(unzip_path, 'images'), exist_ok=True)
        os.makedirs(os.path.join(unzip_path, 'labels'), exist_ok=True)
        

        logger.info(f"Extracting data from {self.config.local_data_file} into {unzip_path}")
        with zipfile.ZipFile(self.config.local_data_file) as zip_file:

            all_images = [file_path for file_path in zip_file.namelist() if file_path.endswith('.png') or file_path.endswith('.jpg')]

            logger.info(f"Filtering {self.config.data_porcentage}% of {len(all_images)} samples")

            num_images = len(all_images)*(self.config.data_porcentage/100)
            filtered_images = random.sample(all_images, int(num_images))
            
            logger.info(f"Extracting {len(filtered_images)} samples")
            for image in filtered_images:

                image_name = os.path.basename(image)
                label_name = image_name.replace('.png', '.txt').replace('.jpg', '.txt')

                image_source = zip_file.open(image)
                label_source = zip_file.open(image.replace('images', 'labels').replace(image_name, label_name))

                image_target = open(os.path.join(unzip_path, 'images', image_name), "wb")
                label_target = open(os.path.join(unzip_path, 'labels', label_name), "wb")
                
                with image_source, image_target:
                    shutil.copyfileobj(image_source, image_target)

                with label_source, label_target:
                    shutil.copyfileobj(label_source, label_target)

            logger.info(f"Extracted {len(all_images)} samples into {unzip_path}")
                

In [32]:
try: 

    
    config = ConfigurationManager()
    
    train_data1_ingestion_config = config.get_train1_data_ingestion_config()
    traning_data_ingestion = DataIngestion(train_data1_ingestion_config)
    traning_data_ingestion.download_file()
    traning_data_ingestion.extract_zip_file()

    train_data2_ingestion_config = config.get_train2_data_ingestion_config()
    traning_data_ingestion = DataIngestion(train_data2_ingestion_config)
    traning_data_ingestion.download_file()
    traning_data_ingestion.extract_zip_file()

    val_data_ingestion_config = config.get_val_data_ingestion_config()
    val_data_ingestion = DataIngestion(val_data_ingestion_config)
    val_data_ingestion.download_file()
    val_data_ingestion.extract_zip_file()

except Exception as e:
    raise e

[2024-06-23 00:51:39,969: INFO: common] yaml file: config/config.yaml loaded successfully
[2024-06-23 00:51:39,972: INFO: common] yaml file: params.yaml loaded successfully
[2024-06-23 00:51:39,973: INFO: common] Creating directory: artifacts
[2024-06-23 00:51:39,974: INFO: common] Creating directory: artifacts/data_ingestion
[2024-06-23 00:51:39,975: INFO: 737725553] Downloading data with id 1RfwpW6tCbjgeQVXzXgvhZO2JThPKzdc8 into file artifacts/data_ingestion/train.zip


Downloading...
From (original): https://drive.google.com/uc?/export=download&id=1RfwpW6tCbjgeQVXzXgvhZO2JThPKzdc8
From (redirected): https://drive.google.com/uc?/export=download&id=1RfwpW6tCbjgeQVXzXgvhZO2JThPKzdc8&confirm=t&uuid=aa9bbe23-4a66-471d-875e-083ccebad5a2
To: /workspaces/Howler-Monkey/artifacts/data_ingestion/train.zip
100%|██████████| 128M/128M [00:05<00:00, 24.1MB/s] 

[2024-06-23 00:51:46,609: INFO: 737725553] Downloaded data with id 1RfwpW6tCbjgeQVXzXgvhZO2JThPKzdc8 into file artifacts/data_ingestion/train.zip
[2024-06-23 00:51:46,611: INFO: 737725553] Extracting data from artifacts/data_ingestion/train.zip into artifacts/data_ingestion/data/train
[2024-06-23 00:51:46,635: INFO: 737725553] Filtering 50% of 1800 samples
[2024-06-23 00:51:46,637: INFO: 737725553] Extracting 900 samples





[2024-06-23 00:51:48,230: INFO: 737725553] Extracted 1800 samples into artifacts/data_ingestion/data/train
[2024-06-23 00:51:48,231: INFO: common] Creating directory: artifacts/data_ingestion
[2024-06-23 00:51:48,232: INFO: 737725553] Downloading data with id 16Hwu7e5p_N5KYZUW_Fdad60_jRAcyCMS into file artifacts/data_ingestion/train.zip


Downloading...
From (original): https://drive.google.com/uc?/export=download&id=16Hwu7e5p_N5KYZUW_Fdad60_jRAcyCMS
From (redirected): https://drive.google.com/uc?/export=download&id=16Hwu7e5p_N5KYZUW_Fdad60_jRAcyCMS&confirm=t&uuid=7e68d3c4-ff32-4dfe-9166-7bbd81fc46d3
To: /workspaces/Howler-Monkey/artifacts/data_ingestion/train.zip
100%|██████████| 59.6M/59.6M [00:02<00:00, 27.5MB/s]

[2024-06-23 00:51:51,727: INFO: 737725553] Downloaded data with id 16Hwu7e5p_N5KYZUW_Fdad60_jRAcyCMS into file artifacts/data_ingestion/train.zip
[2024-06-23 00:51:51,729: INFO: 737725553] Extracting data from artifacts/data_ingestion/train.zip into artifacts/data_ingestion/data/train
[2024-06-23 00:51:51,748: INFO: 737725553] Filtering 50% of 1800 samples
[2024-06-23 00:51:51,749: INFO: 737725553] Extracting 900 samples





[2024-06-23 00:51:52,826: INFO: 737725553] Extracted 1800 samples into artifacts/data_ingestion/data/train
[2024-06-23 00:51:52,828: INFO: common] Creating directory: artifacts/data_ingestion
[2024-06-23 00:51:52,829: INFO: 737725553] Downloading data with id 1x96HUNENYCfvyH7F4A2u5f8rZoEPEGfg into file artifacts/data_ingestion/val.zip


Downloading...
From (original): https://drive.google.com/uc?/export=download&id=1x96HUNENYCfvyH7F4A2u5f8rZoEPEGfg
From (redirected): https://drive.google.com/uc?/export=download&id=1x96HUNENYCfvyH7F4A2u5f8rZoEPEGfg&confirm=t&uuid=3228ae61-38b1-4d01-9098-954a5761dda4
To: /workspaces/Howler-Monkey/artifacts/data_ingestion/val.zip
100%|██████████| 653M/653M [00:20<00:00, 31.7MB/s] 


[2024-06-23 00:52:14,782: INFO: 737725553] Downloaded data with id 1x96HUNENYCfvyH7F4A2u5f8rZoEPEGfg into file artifacts/data_ingestion/val.zip
[2024-06-23 00:52:14,790: INFO: 737725553] Extracting data from artifacts/data_ingestion/val.zip into artifacts/data_ingestion/data/val
[2024-06-23 00:52:14,823: INFO: 737725553] Filtering 100% of 1800 samples
[2024-06-23 00:52:14,825: INFO: 737725553] Extracting 1800 samples
[2024-06-23 00:52:22,650: INFO: 737725553] Extracted 1800 samples into artifacts/data_ingestion/data/val
