In [1]:
import os   

In [2]:
%pwd

'c:\\Users\\akato\\Desktop\\MLOps\\datascienceproject_fullflow\\research'

In [3]:
os.chdir("../")
%pwd

'c:\\Users\\akato\\Desktop\\MLOps\\datascienceproject_fullflow'

In [None]:
# Code transferred to src entity.config.configuration.py

from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataIngestionConfig:
    """Settings consumed by the data-ingestion step.

    Instances are built by ``ConfigurationManager.get_data_ingestion_config``
    from the ``data_ingestion`` section of the loaded configuration.
    """
    # Directory that ConfigurationManager creates before building this object.
    root_dir: Path
    # URL the archive is downloaded from.
    source_URL: str
    # Local path the downloaded archive is written to.
    local_data_file: Path
    # Directory the ZIP archive is extracted into.
    unzip_dir: Path

# Configuration Update
We're transitioning from ZIP file handling to direct CSV handling. This requires updates to:
1. DataIngestionConfig class structure
2. Configuration file (config.yaml)
3. Data ingestion implementation

In [None]:
# Import constants and utility functions from the project
# Code transferred to src.datascienceproject.config.configuration.py
from src.datascienceproject.constant import *  # Import all constants
from src.datascienceproject.utils.common import read_yaml, create_directories  # Import specific utility functions

In [68]:
class ConfigurationManager:
    """Reads the project's YAML files and hands out per-stage config objects.

    The parsed config, params and schema documents are kept on the instance
    as ``config``, ``params`` and ``schema``.
    """

    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH,
                 schema_filepath=SCHEMA_FILE_PATH):
        """Load the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        # read_yaml is given plain strings, so every path is stringified first.
        yaml_sources = (config_filepath, params_filepath, schema_filepath)
        self.config, self.params, self.schema = [
            read_yaml(str(source)) for source in yaml_sources
        ]

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the ingestion settings from the ``data_ingestion`` section.

        The section's ``root_dir`` is created before the object is returned.

        Returns:
            A ``DataIngestionConfig`` populated from the loaded configuration.
        """
        section = self.config.data_ingestion
        create_directories([section.root_dir])

        # Copy the four fields across under the same names.
        field_names = ("root_dir", "source_URL", "local_data_file", "unzip_dir")
        return DataIngestionConfig(
            **{name: getattr(section, name) for name in field_names}
        )

In [None]:
# Transferred to src.components.data_ingestion.py
import os
import urllib.request as request
from src.datascienceproject import logger
import zipfile

In [69]:
class DataIngestion:
    """Downloads the configured archive and extracts it.

    The instance reads ``source_URL``, ``local_data_file`` and ``unzip_dir``
    from the config object it is given.
    """

    # The annotation is a forward reference so the class can be defined even
    # when the dataclass has not been created yet in the current kernel.
    def __init__(self, config: "DataIngestionConfig"):
        """Store the ingestion settings.

        Args:
            config: Object exposing ``source_URL``, ``local_data_file`` and
                ``unzip_dir``.
        """
        self.config = config

    def verify_zip_file(self, file_path: str) -> bool:
        """Report whether ``file_path`` is a usable ZIP archive.

        Uses ``zipfile.is_zipfile`` instead of comparing only the first four
        bytes, so a truncated download that still starts with ``PK\\x03\\x04``
        is rejected and an empty (but valid) archive is accepted.

        Args:
            file_path: Path of the file to check.

        Returns:
            True if the file is a ZIP archive; False if it is not or if it
            cannot be opened (the reason is logged).
        """
        try:
            with open(file_path, 'rb') as f:
                is_zip = zipfile.is_zipfile(f)
            if not is_zip:
                logger.error(f"File {file_path} does not have a valid ZIP signature")
            return is_zip
        except Exception as e:
            logger.error(f"Error verifying ZIP file: {str(e)}")
            return False

    def _fetch_source(self, file_path: str) -> None:
        """Download ``config.source_URL`` to ``file_path`` and log the outcome."""
        logger.info(f"Downloading from {self.config.source_URL} to {file_path}")
        filename, headers = request.urlretrieve(
            url=self.config.source_URL,
            filename=file_path
        )
        logger.info(f"File downloaded successfully to: {filename}")
        logger.debug(f"Download headers: {headers}")

    def download_file(self):
        """Make sure a valid ZIP archive exists at ``config.local_data_file``.

        Downloads the file when it is missing. The file (fresh or already on
        disk) is then validated; an invalid file is deleted and downloaded
        once more.

        Raises:
            Exception: If the download fails or the second attempt still does
                not produce a valid ZIP archive.
        """
        try:
            file_path = str(self.config.local_data_file)
            if not os.path.exists(file_path):
                # A bare file name has an empty dirname, and os.makedirs("")
                # raises, so only create the parent when there is one.
                parent_dir = os.path.dirname(file_path)
                if parent_dir:
                    os.makedirs(parent_dir, exist_ok=True)
                self._fetch_source(file_path)

            # Verify the file whether it was just downloaded or existed before
            if not self.verify_zip_file(file_path):
                logger.warning("Invalid ZIP file detected, attempting to download again...")
                if os.path.exists(file_path):
                    os.remove(file_path)
                self._fetch_source(file_path)
                if not self.verify_zip_file(file_path):
                    source_url = str(self.config.source_URL)
                    # GitHub "blob" links serve an HTML page, not the file.
                    if "github.com" in source_url and "/blob/" in source_url:
                        logger.warning(
                            "source_URL looks like a GitHub 'blob' page; "
                            "use the raw.githubusercontent.com URL instead"
                        )
                    raise Exception("Failed to download a valid ZIP file")

        except Exception as e:
            logger.error(f"Error downloading file: {str(e)}")
            # Bare raise keeps the original traceback intact.
            raise

    def extract_zip_file(self):
        """Extract ``config.local_data_file`` into ``config.unzip_dir``.

        The target directory is created if needed and the archive's member
        names are logged before extraction.

        Raises:
            Exception: If the file is not a valid ZIP archive or extraction
                fails.
        """
        try:
            file_path = str(self.config.local_data_file)
            unzip_path = str(self.config.unzip_dir)

            # Verify zip file before attempting to extract
            if not self.verify_zip_file(file_path):
                raise Exception("Cannot extract: Invalid ZIP file")

            os.makedirs(unzip_path, exist_ok=True)

            with zipfile.ZipFile(file_path, 'r') as zip_ref:
                # List contents before extracting
                logger.info(f"ZIP file contains: {zip_ref.namelist()}")
                zip_ref.extractall(unzip_path)
            logger.info(f"File extracted successfully to: {unzip_path}")
        except Exception as e:
            logger.error(f"Error extracting file: {str(e)}")
            raise

In [70]:
# End-to-end ingestion run: load the configuration, download the archive,
# then extract it. Any failure is logged once here and re-raised so the cell
# still fails visibly.
try:
    logger.info("Starting data ingestion process...")

    # Initialize configuration
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()

    # Create data ingestion object
    data_ingestion = DataIngestion(config=data_ingestion_config)

    # Download and extract file
    logger.info("Downloading file...")
    data_ingestion.download_file()

    logger.info("Extracting file...")
    data_ingestion.extract_zip_file()

    logger.info("Data ingestion completed successfully!")
except Exception as e:
    logger.error(f"Error in data ingestion process: {str(e)}")
    # Bare raise re-raises the active exception with its original traceback.
    raise

[2025-08-24 07:24:33,375] INFO in 2571272890: Starting data ingestion process...
[2025-08-24 07:24:33,376] INFO in common: YAML file configs\config.yaml loaded successfully.
[2025-08-24 07:24:33,376] INFO in common: YAML file configs\config.yaml loaded successfully.
[2025-08-24 07:24:33,382] INFO in common: YAML file params.yaml loaded successfully.
[2025-08-24 07:24:33,387] INFO in common: YAML file schema.yaml loaded successfully.
[2025-08-24 07:24:33,390] INFO in common: created directory at: artifacts
[2025-08-24 07:24:33,391] INFO in common: created directory at: artifacts/data_ingestion
[2025-08-24 07:24:33,382] INFO in common: YAML file params.yaml loaded successfully.
[2025-08-24 07:24:33,387] INFO in common: YAML file schema.yaml loaded successfully.
[2025-08-24 07:24:33,390] INFO in common: created directory at: artifacts
[2025-08-24 07:24:33,391] INFO in common: created directory at: artifacts/data_ingestion
[2025-08-24 07:24:33,394] INFO in 2571272890: Downloading file...
[

In [61]:
# Inspect configuration
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
print("Configuration values:")
print(f"Source URL: {data_ingestion_config.source_URL}")
print(f"Local file: {data_ingestion_config.local_data_file}")
print(f"Unzip dir: {data_ingestion_config.unzip_dir}")

# Verify URL is accessible
import requests
try:
    # HEAD returns only the response headers. requests has no default
    # timeout, so set one to keep an unreachable host from hanging the kernel.
    response = requests.head(data_ingestion_config.source_URL, timeout=10)
    print(f"\nURL Status: {response.status_code}")
    # A Content-Type of text/html here means the URL serves a web page,
    # not the archive itself.
    print("URL Headers:")
    for key, value in response.headers.items():
        print(f"{key}: {value}")
except Exception as e:
    print(f"Error checking URL: {str(e)}")

[2025-08-24 07:20:18,772] INFO in common: YAML file configs\config.yaml loaded successfully.
[2025-08-24 07:20:18,778] INFO in common: YAML file params.yaml loaded successfully.
[2025-08-24 07:20:18,780] INFO in common: YAML file schema.yaml loaded successfully.
[2025-08-24 07:20:18,782] INFO in common: created directory at: artifacts
[2025-08-24 07:20:18,785] INFO in common: created directory at: artifacts/data_ingestion
Configuration values:
Source URL: https://github.com/Dadaranger/dataset/blob/main/winequality-white.zip
Local file: artifacts/data_ingestion/data.zip
Unzip dir: artifacts/data_ingestion

URL Status: 200
URL Headers:
Date: Sat, 23 Aug 2025 23:20:19 GMT
Content-Type: text/html; charset=utf-8
Vary: X-PJAX, X-PJAX-Container, Turbo-Visit, Turbo-Frame, X-Requested-With,Accept-Encoding, Accept, X-Requested-With
x-repository-download: git clone https://github.com/Dadaranger/dataset.git
x-raw-download: https://raw.githubusercontent.com/Dadaranger/dataset/main/winequality-white

In [67]:
# Test raw GitHub URL: download it, print the response headers and check
# that the payload starts with the ZIP signature.
import requests
import os

raw_url = "https://raw.githubusercontent.com/Dadaranger/dataset/main/winequality-white.zip"
test_file = "test_download.zip"

try:
    # Download with requests to see response details. requests has no default
    # timeout, so set one to keep a stalled connection from hanging the cell.
    response = requests.get(raw_url, timeout=30)
    print(f"Status Code: {response.status_code}")
    print("Headers:")
    for key, value in response.headers.items():
        print(f"{key}: {value}")

    if response.status_code == 200:
        # Save the content
        with open(test_file, "wb") as f:
            f.write(response.content)

        # Check file signature: a ZIP file starts with PK\x03\x04 (hex 504b0304)
        with open(test_file, "rb") as f:
            header = f.read(4).hex()
            print(f"\nFile header: {header}")
            is_zip = header.startswith('504b0304')
            print(f"Is valid ZIP: {is_zip}")
    else:
        print(f"\nFailed to download. Status code: {response.status_code}")
except Exception as e:
    print(f"Error: {str(e)}")
finally:
    # Single clean-up point: runs on success, on failure and on interruption.
    if os.path.exists(test_file):
        os.remove(test_file)

Status Code: 200
Headers:
Connection: keep-alive
Content-Length: 73146
Cache-Control: max-age=300
Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; sandbox
Content-Type: application/zip
ETag: W/"6185468f1650cf78b9373e579b5619e55f60ad823ad68fa1ea3c31715584fdda"
Strict-Transport-Security: max-age=31536000
X-Content-Type-Options: nosniff
X-Frame-Options: deny
X-XSS-Protection: 1; mode=block
X-GitHub-Request-Id: 91B4:37C012:72F25:97C26:68AA4D00
Accept-Ranges: bytes
Date: Sat, 23 Aug 2025 23:23:17 GMT
Via: 1.1 varnish
X-Served-By: cache-pdk-kfty8610086-PDK
X-Cache: HIT
X-Cache-Hits: 0
X-Timer: S1755991398.882521,VS0,VE2
Vary: Authorization,Accept-Encoding
Access-Control-Allow-Origin: *
Cross-Origin-Resource-Policy: cross-origin
X-Fastly-Request-ID: 17315cd26f1013b57736b7ca0c9fdcff2e46ffa5
Expires: Sat, 23 Aug 2025 23:28:17 GMT
Source-Age: 101

File header: 504b0304
Is valid ZIP: True


# GitHub URL Format
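When downloading files from GitHub, use the raw-content URL; the `blob` URL serves an HTML page (`Content-Type: text/html`) rather than the file itself: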
- ❌ Don't use: `https://github.com/user/repo/blob/branch/file`
- ✅ Use: `https://raw.githubusercontent.com/user/repo/branch/file`

In [None]:
# Display current config file contents
import yaml

config_path = "configs/config.yaml"
print("Current config.yaml contents:")
print("-" * 50)
# Read as UTF-8 explicitly: the platform default encoding (e.g. cp1252 on
# Windows) would mis-decode any non-ASCII character in the YAML file.
with open(config_path, 'r', encoding="utf-8") as f:
    print(yaml.safe_dump(yaml.safe_load(f), default_flow_style=False))

In [None]:
# Update config with correct URL (the raw-content URL, not the GitHub page URL)
config_path = "configs/config.yaml"

# Read current config as UTF-8 rather than the platform default encoding
with open(config_path, 'r', encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Update the URL
config['data_ingestion']['source_URL'] = "https://raw.githubusercontent.com/Dadaranger/dataset/main/winequality-white.zip"

# Save updated config.
# sort_keys=False keeps the keys in the order they were loaded, so the
# rewritten file is not reshuffled alphabetically.
# NOTE: a load/dump round trip does not preserve YAML comments.
with open(config_path, 'w', encoding="utf-8") as f:
    yaml.safe_dump(config, f, default_flow_style=False, sort_keys=False)

print("Config updated successfully!")
print("\nNew config.yaml contents:")
print("-" * 50)
print(yaml.safe_dump(config, default_flow_style=False, sort_keys=False))

In [None]:
# Reset the ingestion artifacts so the next run starts from a clean slate.
import shutil

zip_path = "artifacts/data_ingestion/data.zip"
unzip_dir = "artifacts/data_ingestion"

# Drop the previously downloaded archive, if there is one.
if os.path.exists(zip_path):
    os.remove(zip_path)
    print(f"Removed old zip file: {zip_path}")

# Empty the extraction directory while leaving the directory itself in place.
if os.path.exists(unzip_dir):
    for item in os.listdir(unzip_dir):
        item_path = os.path.join(unzip_dir, item)
        # Pick the matching removal routine: files first, then directories;
        # anything else is left untouched.
        if os.path.isfile(item_path):
            remover = os.remove
        elif os.path.isdir(item_path):
            remover = shutil.rmtree
        else:
            remover = None
        if remover is not None:
            remover(item_path)
    print(f"Cleared contents of: {unzip_dir}")

print("Ready to try data ingestion again!")

In [65]:
# Debugging aid: report the size and leading bytes of the downloaded archive.
import os

zip_path = "artifacts/data_ingestion/data.zip"

if not os.path.exists(zip_path):
    print("File does not exist")
else:
    # Size on disk
    file_size = os.path.getsize(zip_path)
    print(f"File exists and its size is: {file_size} bytes")

    # The first four bytes of a ZIP file are PK\x03\x04 (hex: 504b0304).
    with open(zip_path, 'rb') as f:
        header = f.read(4).hex()
        print(f"File header (first 4 bytes): {header}")
        # At most four bytes were read, so a prefix match on the eight hex
        # digits is the same as an equality check.
        is_zip = header == '504b0304'
        print(f"Is valid ZIP header: {is_zip}")

File exists and its size is: 182952 bytes
File header (first 4 bytes): 0a0a0a0a
Is valid ZIP header: False
