In [1]:
import os
import boto3
import pandas as pd
from botocore.exceptions import NoCredentialsError

# MinIO Configuration
# The endpoint and credentials can be overridden through environment variables
# so that real secrets do not have to be written into this file. When a
# variable is not set, the value that was previously hard-coded here is used,
# so existing local setups keep working unchanged.
minio_endpoint = os.environ.get('MINIO_ENDPOINT', 'http://localhost:9000')
minio_access_key = os.environ.get('MINIO_ACCESS_KEY', 'minioadmin')
minio_secret_key = os.environ.get('MINIO_SECRET_KEY', 'minioadmin')
raw_bucket = 'raw'              # bucket the source files are read from
processed_bucket = 'processed'  # bucket the cleaned Parquet files are written to

# Temporary Local Folder for Processing
# Downloaded inputs and generated Parquet files are staged here before upload.
temp_folder = "C:/TempChinook/"
os.makedirs(temp_folder, exist_ok=True)

# Connect to MinIO
# boto3's S3 client is pointed at the MinIO endpoint instead of AWS.
s3_client = boto3.client(
    's3',
    endpoint_url=minio_endpoint,
    aws_access_key_id=minio_access_key,
    aws_secret_access_key=minio_secret_key,
)

# Function to List All Files in the Raw Zone
def list_raw_files():
    """Return the keys of every object stored in the raw bucket.

    The listing is paginated: a single ``list_objects_v2`` response carries
    only a limited batch of keys (up to 1,000 per the S3 API), so relying on
    one call would silently drop files from larger buckets.

    Returns:
        list[str]: Object keys found in ``raw_bucket``. An empty list is
        returned when the bucket holds no objects or when the listing fails;
        in the failure case the error is printed rather than raised.
    """
    print("Listing files in the Raw Zone...")
    try:
        paginator = s3_client.get_paginator('list_objects_v2')
        files = [
            obj['Key']
            for page in paginator.paginate(Bucket=raw_bucket)
            # 'Contents' is missing from a page that holds no objects.
            for obj in page.get('Contents', [])
        ]
    except Exception as e:
        # Best-effort by design: callers treat "nothing listed" and
        # "listing failed" the same way, so report the error and return empty.
        print(f"Error listing raw files: {e}")
        return []

    if files:
        print(f"Found raw files: {files}")
    else:
        print("No files found in the Raw Zone.")
    return files

# Function to Process Raw Data
def _process_single_file(raw_file):
    """Download one raw CSV, clean it, and upload it as Parquet.

    Args:
        raw_file: Object key of the CSV file inside ``raw_bucket``.

    Raises:
        Exception: Any error from the download, CSV parsing, Parquet writing
            or upload is propagated to the caller.
    """
    # Local staging files are named after the last path component only.
    # Using the full key would point into sub-folders of ``temp_folder`` that
    # do not exist whenever the key contains a prefix such as "dir/file.csv".
    file_name = os.path.basename(raw_file)
    stem = os.path.splitext(file_name)[0]

    # Download raw file from MinIO
    local_raw_file = os.path.join(temp_folder, file_name)
    print(f"Downloading {raw_file} from MinIO...")
    s3_client.download_file(raw_bucket, raw_file, local_raw_file)

    # Load raw data into Pandas
    print(f"Processing file: {raw_file}")
    raw_data = pd.read_csv(local_raw_file)

    # Apply light transformations: missing numbers become 0, missing text
    # becomes ''. The decision is "is this column numeric?" rather than a
    # comparison of the dtype with 'object', so text columns are cleaned as
    # text even when pandas stores them with a dedicated string dtype.
    processed_data = raw_data.apply(
        lambda col: col.fillna(0)
        if pd.api.types.is_numeric_dtype(col)
        else col.fillna('').astype(str)
    )

    # Save processed data to Parquet format
    local_processed_file = os.path.join(temp_folder, f"{stem}_cleaned.parquet")
    processed_data.to_parquet(local_processed_file, index=False)
    print(f"Saved processed data to: {local_processed_file}")

    # Upload processed file to MinIO Processed Zone. The object key keeps any
    # prefix of the raw key so the processed bucket mirrors the raw layout.
    processed_file_key = f"{os.path.splitext(raw_file)[0]}_cleaned.parquet"
    print(f"Uploading {processed_file_key} to MinIO Processed Zone...")
    s3_client.upload_file(local_processed_file, processed_bucket, processed_file_key)
    print(f"Uploaded processed file to MinIO: {processed_bucket}/{processed_file_key}")


def process_raw_data():
    """Clean every file in the raw bucket and upload the results as Parquet.

    Each file is handled independently: a failure is printed and recorded,
    and processing continues with the next file. A missing-credentials error
    aborts the run instead, because every remaining file would fail the same
    way. A summary line at the end states whether any file failed.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    raw_files = list_raw_files()
    if not raw_files:
        print("No raw files to process. Exiting.")
        return

    failed_files = []
    for raw_file in raw_files:
        print(f"Starting processing for {raw_file}...")
        try:
            _process_single_file(raw_file)
        except NoCredentialsError:
            # Must be caught before the generic handler below, otherwise it
            # would be reported as an ordinary per-file error for every file.
            print("Error: MinIO credentials are invalid.")
            return
        except Exception as e:
            print(f"Error processing {raw_file}: {e}")
            failed_files.append(raw_file)

    if failed_files:
        print(
            f"Processing finished with errors: {len(failed_files)} of "
            f"{len(raw_files)} files failed: {failed_files}"
        )
    else:
        print("All raw files processed and uploaded to the Processed Zone.")

# Main Workflow
if __name__ == "__main__":
    print("Starting processing of Raw Zone data from MinIO...")
    process_raw_data()
    # process_raw_data() reports failures by printing them and returns
    # normally either way, so this line only marks the end of the run and
    # must not claim that everything succeeded.
    print("Processing run finished. Review the messages above for any errors.")


Starting processing of Raw Zone data from MinIO...
Listing files in the Raw Zone...
Found raw files: ['Album.csv', 'Artist.csv', 'Customer.csv', 'Employee.csv', 'Genre.csv', 'Invoice.csv', 'InvoiceLine.csv', 'MediaType.csv', 'Playlist.csv', 'PlaylistTrack.csv', 'Track.csv', 'sysdiagrams.csv']
Starting processing for Album.csv...
Downloading Album.csv from MinIO...
Processing file: Album.csv
Saved processed data to: C:/TempChinook/Album_cleaned.parquet
Uploading Album_cleaned.parquet to MinIO Processed Zone...
Uploaded processed file to MinIO: processed/Album_cleaned.parquet
Starting processing for Artist.csv...
Downloading Artist.csv from MinIO...
Processing file: Artist.csv
Saved processed data to: C:/TempChinook/Artist_cleaned.parquet
Uploading Artist_cleaned.parquet to MinIO Processed Zone...
Uploaded processed file to MinIO: processed/Artist_cleaned.parquet
Starting processing for Customer.csv...
Downloading Customer.csv from MinIO...
Processing file: Customer.csv
Saved processed d