# IMPORT LIBRARIES

In [1]:
import sys
import requests
import zipfile

from tqdm import tqdm
from pathlib import Path
from SIBI_classifier.configuration.configuration import ConfigurationManager
from SIBI_classifier.exception import SIBIClassificationException
from SIBI_classifier.logger.logging import *

# SETUP CONSTANTS
In this section, we load the project configuration and define the constants that the rest of this notebook relies on.
The data ingestion configuration provides the dataset download URL, the path where the zip file is saved, and the directory where the dataset is stored; a logger is also created for this notebook's messages.
Later, these constants will be used in various parts of the code.

In [2]:
# Project configuration entry point. The recorded output of this cell shows
# artifact directories being created while it runs.
CONFIG = ConfigurationManager()
# Data-ingestion settings; later cells read `data_download_url`, `zip_file_path`
# and `data_download_store_dir_path` from this object.
DATA_INGESTION_CONFIG = CONFIG.get_data_ingestion_config()

# Notebook-level logger named "DataIngestionLogger".
# NOTE(review): `log_manager` is not imported by name — presumably it arrives via
# the star import from SIBI_classifier.logger.logging; confirm.
logger = log_manager.setup_logger("DataIngestionLogger")

[32m[ 2024-12-10 19:51:45 ] create_directories_logger[0m - [32mINFO[0m - created directory at: [96martifacts[0m
[32m[ 2024-12-10 19:51:45 ] create_directories_logger[0m - [32mINFO[0m - created directory at: [96martifacts/<model_name>/data_ingestion/SIBI_dataset[0m


# GETTING DATA FROM URL

## function

In [3]:
def download_zip(
    url: str,
    save_zip_file_path: Path,
    chunk_size: int = 1024
) -> None:
    """
    Stream a file from a URL to disk, showing a progress bar while it downloads.

    The response is streamed in chunks so the whole file is never held in
    memory. If the download fails after the target file has been opened, the
    truncated file is removed so a later step never mistakes it for a complete
    archive.

    Args:
        url (str): URL of the file to download.
        save_zip_file_path (Path): The path where the file will be saved.
        chunk_size (int): Number of bytes requested per chunk. Default is 1024 (1 KB).

    Raises:
        Exception: If `requests` reports an error (connection problem, timeout,
            HTTP error status). The original `RequestException` is attached as
            the cause.
        SIBIClassificationException: For any other failure (e.g. the target
            file cannot be written).
    """
    target_path = Path(save_zip_file_path)
    file_opened = False
    download_complete = False

    try:
        # Using the response as a context manager guarantees the streamed
        # connection is released even when the loop below is interrupted.
        with requests.get(url, stream=True, timeout=10) as response:
            response.raise_for_status()  # Check for HTTP errors

            # A missing/zero content-length means the size is unknown; passing
            # None lets tqdm show a running byte count instead of a 0-byte bar.
            total_size = int(response.headers.get('content-length', 0)) or None

            file_opened = True  # from here on the target file is truncated/created
            with open(target_path, "wb") as file, tqdm(
                    desc=f"Downloading {save_zip_file_path}",
                    total=total_size,
                    unit='B', unit_scale=True, unit_divisor=1024,
            ) as bar:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if not chunk:  # skip empty keep-alive chunks
                        continue
                    file.write(chunk)
                    bar.update(len(chunk))

        download_complete = True
        DOWNLOAD_ZIP_LOGGER.debug(f"File downloaded to {log_manager.color_text(save_zip_file_path, 'blue')}")

    except requests.exceptions.RequestException as e:
        raise Exception(f"Error downloading the file: {e}") from e

    except Exception as e:
        raise SIBIClassificationException(e, sys) from e

    finally:
        # Best-effort cleanup: only remove the file when this call started
        # writing it and did not finish, so a pre-existing file is never
        # deleted because of an early (pre-write) failure.
        if file_opened and not download_complete:
            try:
                target_path.unlink()
            except OSError:
                pass  # nothing useful to do if the partial file cannot be removed

In [4]:
def extract_zip(
    zip_file_path: Path,
    extract_dir: Path,
    is_file_removed: bool = True
) -> None:
    """
    Unpack every member of a zip archive into a target directory.

    Args:
        zip_file_path (Path): Location of the archive to unpack.
        extract_dir (Path): Directory that receives the extracted files.
        is_file_removed (bool): When True (the default), the archive is
            deleted once extraction has finished.

    Raises:
        Exception: If the archive is not a valid zip file.
        SIBIClassificationException: For any other error raised while
            extracting or deleting the archive.
    """
    try:
        archive = zipfile.ZipFile(zip_file_path, 'r')
        with archive:
            archive.extractall(extract_dir)
            EXTRACT_ZIP_LOGGER.debug(f"Files extracted to {log_manager.color_text(extract_dir, 'green')}")

        # Only touch the filesystem check when deletion was actually requested.
        should_delete = is_file_removed and zip_file_path.exists()
        if not should_delete:
            return

        zip_file_path.unlink()
        EXTRACT_ZIP_LOGGER.debug("Downloaded zip file removed.")

    except zipfile.BadZipFile:
        raise Exception("Error: The downloaded file is not a valid zip file.")

    except Exception as e:
        raise SIBIClassificationException(e, sys)

## main program

### Download the zip file to the specified path

In [5]:
# Fetch the dataset archive from the configured URL and save it at the
# configured zip path; chunk_size is left at its default (1024 bytes).
download_zip(
    url=DATA_INGESTION_CONFIG.data_download_url,
    save_zip_file_path=DATA_INGESTION_CONFIG.zip_file_path,
)

Downloading artifacts/<model_name>/data_ingestion/SIBI_dataset/SIBI_datasets.zip: 100%|██████████| 97.7M/97.7M [00:25<00:00, 4.10MB/s]
[36m[ 2024-12-10 19:52:12 ] download zip logger[0m - [36mDEBUG[0m - File downloaded to [94martifacts/<model_name>/data_ingestion/SIBI_dataset/SIBI_datasets.zip[0m


In [6]:
# Announce which archive is about to be unpacked
logger.info(f"Extracting the dataset from the downloaded zip file: {log_manager.color_text(DATA_INGESTION_CONFIG.zip_file_path, 'cyan')}")

# Unpack into the download-store directory. is_file_removed is left at its
# default (True), so the zip file is deleted once extraction succeeds.
extract_zip(
    zip_file_path=DATA_INGESTION_CONFIG.zip_file_path,
    extract_dir=DATA_INGESTION_CONFIG.data_download_store_dir_path
)

# Final confirmation, logging the URL the data was downloaded from
logger.info(f"Got the data from URL: {log_manager.color_text(DATA_INGESTION_CONFIG.data_download_url, 'blue')}")

[32m[ 2024-12-10 19:52:12 ] DataIngestionLogger[0m - [32mINFO[0m - Extracting the dataset from the downloaded zip file: [96martifacts/<model_name>/data_ingestion/SIBI_dataset/SIBI_datasets.zip[0m
[36m[ 2024-12-10 19:52:14 ] extract zip logger[0m - [36mDEBUG[0m - Files extracted to [92martifacts/<model_name>/data_ingestion/SIBI_dataset[0m
[36m[ 2024-12-10 19:52:14 ] extract zip logger[0m - [36mDEBUG[0m - Downloaded zip file removed.[0m
[32m[ 2024-12-10 19:52:14 ] DataIngestionLogger[0m - [32mINFO[0m - Got the data from URL: [94mhttps://huggingface.co/datasets/Anggads01/SIBI-datasets/resolve/main/SIBI_datasets.zip[0m


In [7]:
# NOTE(review): presumably tidies up the log file written during this run —
# the behavior is defined in the project's log manager; confirm there.
log_manager.clean_log_file()