In [1]:
import os

In [2]:
%pwd

'c:\\New_Project\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\New_Project'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    root_dir: Path
    source_URL: str
    local_data_file: Path
    unzip_dir: Path

In [6]:
from src.DocumindAI.constants import *
from src.DocumindAI.utils.common import read_yaml,create_directories

In [7]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        config = self.config.data_ingestion

        create_directories([config.root_dir])

        data_ingestion_config = DataIngestionConfig(
            root_dir=config.root_dir,
            source_URL=config.source_URL,
            local_data_file=config.local_data_file,
            unzip_dir=config.unzip_dir 
        )

        return data_ingestion_config

In [8]:
from huggingface_hub import snapshot_download
import zipfile
from src.DocumindAI.logging import logger
from src.DocumindAI.utils.common import get_size

In [None]:
# class DataIngestion:
#     def __init__(self, config: DataIngestionConfig):
#         self.config = config


    
#     def download_file(self):
#         if not os.path.exists(self.config.local_data_file):
#             filename, headers = request.urlretrieve(
#                 url = self.config.source_URL,
#                 filename = self.config.local_data_file
#             )
#             logger.info(f"{filename} download! with following info: \n{headers}")
#         else:
#             logger.info(f"File already exists of size: {get_size(Path(self.config.local_data_file))}")  

        
    
#     def extract_zip_file(self):
#         """
#         zip_file_path: str
#         Extracts the zip file into the data directory
#         Function returns None
#         """
#         unzip_path = self.config.unzip_dir
#         os.makedirs(unzip_path, exist_ok=True)
#         with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
#             zip_ref.extractall(unzip_path)

In [None]:
class DataIngestion:
    def __init__(self, config: DataIngestionConfig):
        self.config = config


    def download_file(self):
        """
        Downloads the dataset from Hugging Face Hub using snapshot_download().
        If the dataset already exists locally, it skips downloading.
        """
        dataset_dir = Path(self.config.root_dir)
        os.makedirs(dataset_dir, exist_ok=True)

        if not os.path.exists(self.config.unzip_dir) or len(os.listdir(self.config.unzip_dir)) == 0:
            logger.info(f"Downloading dataset from Hugging Face: {self.config.source_URL}")
            repo_id = self.config.source_URL.replace("https://huggingface.co/datasets/OCR_datset", "").strip("/")

            local_path = snapshot_download(
                repo_id=repo_id,
                repo_type="dataset",
                local_dir=dataset_dir,
                token=os.getenv("HF_TOKEN", None) 
            )

            logger.info(f"Dataset downloaded at: {local_path}")
        else:
            logger.info(f"Dataset already exists at {self.config.unzip_dir} (Size: {get_size(Path(self.config.unzip_dir))})")


    def extract_zip_file(self):
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)

        for file in os.listdir(self.config.root_dir):
            if file.endswith(".zip"):
                zip_path = os.path.join(self.config.root_dir, file)
                logger.info(f"Extracting {zip_path} to {unzip_path}")
                with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                    zip_ref.extractall(unzip_path)

        logger.info("✅ Extraction complete!")

In [10]:
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.download_file()
    data_ingestion.extract_zip_file()
except Exception as e:
    raise e

[2026-01-22 07:00:19,179: INFO: common: yaml file: config\config.yaml loaded successfully]
[2026-01-22 07:00:19,186: INFO: common: yaml file: params.yaml loaded successfully]
[2026-01-22 07:00:19,188: INFO: common: created directory at: artifacts_root]
[2026-01-22 07:00:19,190: INFO: common: created directory at: artifacts/data_ingestion]
[2026-01-22 07:00:19,193: INFO: 85418478: Dataset already exists at artifacts/data_ingestion (Size: ~ 0 KB)]


--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\anssa\AppData\Local\Programs\Python\Python311\Lib\logging\__init__.py", line 1113, in emit
    stream.write(msg + self.terminator)
  File "C:\Users\anssa\AppData\Local\Programs\Python\Python311\Lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeEncodeError: 'charmap' codec can't encode character '\u2705' in position 42: character maps to <undefined>
Call stack:


[2026-01-22 07:00:19,195: INFO: 85418478: ✅ Extraction complete!]


  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\New_Project\projectenv\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\New_Project\projectenv\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\New_Project\projectenv\Lib\site-packages\ipykernel\kernelapp.py", line 758, in start
    self.io_loop.start()
  File "c:\New_Project\projectenv\Lib\site-packages\tornado\platform\asyncio.py", line 211, in start
    self.asyncio_loop.run_forever()
  File "C:\Users\anssa\AppData\Local\Programs\Python\Python311\Lib\asyncio\base_events.py", line 608, in run_forever
    self._run_once()
  File "C:\Users\anssa\AppData\Local\Programs\Python\Python311\Lib\asyncio\base_events.py", line 1936, in _run_once
    handle._run()
  File "C:\Users\anssa\AppData\Local\Programs\Python\Python311\Lib\asyncio\events.py", line 84, in _run
  