In [None]:
import pandas as pd
import numpy as np
import os
import logging

# Root logger setup: show INFO and above, each record prefixed with its
# timestamp and severity so the pipeline stages can be followed in the output.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

def run_professional_etl(input_file=None, output_file=None):
    """
    Executes the Extract, Transform, and Load (ETL) pipeline for Superstore data.

    The source CSV is read, its column names are normalised (spaces and
    hyphens become underscores), missing postal codes are filled with 0,
    the order/ship dates are parsed, and two derived columns are added
    (``Processing_Time`` and ``Profit_Status``) before the result is written
    to a new CSV file.

    Every failure (missing input file, read error, transformation error,
    write error) is logged and ends the run early; no exception is
    propagated to the caller.

    Args:
        input_file: Path of the raw CSV to read. When omitted, falls back to
            ``Superstore.csv`` inside the project folder defined below.
        output_file: Path the cleaned CSV is written to. When omitted, falls
            back to ``Superstore_Cleaned_Final.csv`` in that same folder.

    Returns:
        None.
    """
    # Define File Paths (the defaults keep the original project layout, but
    # callers can now point the pipeline at any location)
    base_path = r"E:\Cloud Data Engineering CDE\cde\Module_04(Amazon Web Servces AWS)\project\Dataflow Data Project"
    if input_file is None:
        input_file = os.path.join(base_path, "Superstore.csv")
    if output_file is None:
        output_file = os.path.join(base_path, "Superstore_Cleaned_Final.csv")

    logging.info("Starting ETL Pipeline...")

    # --- 1. EXTRACT ---
    if not os.path.exists(input_file):
        logging.error(f"File not found at: {input_file}")
        return

    try:
        # Using latin1 encoding to handle special characters common in retail datasets
        df = pd.read_csv(input_file, encoding='latin1')
        logging.info(f"Data extraction successful. Total Records: {len(df)}")
    except Exception as e:
        logging.error(f"Critical Error during Extraction: {e}")
        return

    # --- 2. TRANSFORM ---
    logging.info("Applying data transformations...")

    try:
        # A. Clean Column Names (Standardize for SQL/Cloud Database compatibility)
        df.columns = [col.strip().replace(' ', '_').replace('-', '_') for col in df.columns]

        # B. Handle Missing Values
        # Missing postal codes become the sentinel 0 so the column can be cast to int
        df['Postal_Code'] = df['Postal_Code'].fillna(0).astype(int)

        # C. Standardize Date Formats
        df['Order_Date'] = pd.to_datetime(df['Order_Date'])
        df['Ship_Date'] = pd.to_datetime(df['Ship_Date'])

        # D. Feature Engineering (Business Intelligence Metrics)
        # 1. Calculate Delivery Duration in days
        df['Processing_Time'] = (df['Ship_Date'] - df['Order_Date']).dt.days

        # 2. Categorize Profitability (vectorized; a profit of exactly zero or a
        #    missing profit is labelled 'Loss' because only values > 0 qualify)
        df['Profit_Status'] = np.where(df['Profit'] > 0, 'Profitable', 'Loss')

        # E. Data Validation & Summary
        avg_processing = df['Processing_Time'].mean()
        unique_products = df['Product_ID'].nunique()

        logging.info(f"Transformation Complete. Avg Shipping Time: {avg_processing:.2f} days")
        logging.info(f"Total Unique Products Identified: {unique_products}")

    except Exception as e:
        # A missing column surfaces here as a KeyError naming that column
        logging.error(f"Transformation Error: {e}")
        return

    # --- 3. LOAD ---
    try:
        # Saving the cleaned dataset without the index column
        df.to_csv(output_file, index=False)
        logging.info(f"Cleaned data successfully saved to: {output_file}")
    except Exception as e:
        logging.error(f"Failed to save the output file: {e}")

# Entry point: run the pipeline when this cell/script is executed directly
# (where __name__ is "__main__"), but not when the code is imported as a module.
if __name__ == "__main__":
    run_professional_etl()

2026-02-11 22:38:33,223 - INFO - ETL Pipeline Shuru ho rahi hai...
2026-02-11 22:38:34,567 - INFO - Data extract ho gaya. Total Rows: 9994
2026-02-11 22:38:34,571 - INFO - Transformations apply ho rahi hain...
2026-02-11 22:38:34,827 - INFO - Average Shipping Time: 3.96 days
2026-02-11 22:38:34,836 - INFO - Total Unique Products: 1862
2026-02-11 22:38:35,346 - INFO - Clean file successfully save ho gayi: E:\Cloud Data Engineering CDE\cde\Module_04(Amazon Web Servces AWS)\project\Dataflow Data Project\Superstore_Cleaned_Final.csv


In [6]:
! pip install boto3

Collecting boto3
  Downloading boto3-1.42.48-py3-none-any.whl.metadata (6.8 kB)
Collecting botocore<1.43.0,>=1.42.48 (from boto3)
  Downloading botocore-1.42.48-py3-none-any.whl.metadata (5.9 kB)
Collecting s3transfer<0.17.0,>=0.16.0 (from boto3)
  Downloading s3transfer-0.16.0-py3-none-any.whl.metadata (1.7 kB)
Downloading boto3-1.42.48-py3-none-any.whl (140 kB)
Downloading botocore-1.42.48-py3-none-any.whl (14.6 MB)
   ---------------------------------------- 0.0/14.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/14.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/14.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/14.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/14.6 MB ? eta -:--:--
    --------------------------------------- 0.3/14.6 MB ? eta -:--:--
    --------------------------------------- 0.3/14.6 MB ? eta -:--:--
    --------------------------------------- 0.3/14.6 MB ? eta -:--:--
    --------------

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
aiobotocore 2.19.0 requires botocore<1.36.4,>=1.36.0, but you have botocore 1.42.48 which is incompatible.


In [None]:
import boto3
from botocore.exceptions import NoCredentialsError

def upload_to_s3_with_keys(
    local_file='<File Path>',
    bucket_name='<S3 Name>',
    s3_destination='Superstore_Direct.csv',
    access_key=None,
    secret_key=None,
):
    """
    Upload one local file to an S3 bucket and report the outcome on stdout.

    Credentials should not be pasted into the notebook. When ``access_key``
    and ``secret_key`` are both supplied they are handed to the client;
    otherwise the client is created without explicit keys so that boto3
    resolves credentials itself (environment variables, shared credentials
    file, ...).

    Args:
        local_file: Path of the file to upload. The default is a placeholder
            that must be replaced (or overridden by the caller).
        bucket_name: Name of the target bucket. The default is a placeholder
            that must be replaced (or overridden by the caller).
        s3_destination: Object key the file is stored under in the bucket.
        access_key: Optional AWS access key id.
        secret_key: Optional AWS secret access key.

    Returns:
        None. Errors are caught and printed, never raised to the caller.
    """
    # Only override boto3's own credential lookup when both keys were supplied
    client_kwargs = {}
    if access_key and secret_key:
        client_kwargs['aws_access_key_id'] = access_key
        client_kwargs['aws_secret_access_key'] = secret_key

    try:
        # Initialize S3 Client (inside the try so setup errors are reported too)
        s3 = boto3.client('s3', **client_kwargs)

        print("Starting upload to S3...")
        s3.upload_file(local_file, bucket_name, s3_destination)
        print(f"Done! File '{s3_destination}' has been successfully uploaded to bucket '{bucket_name}'.")

    except NoCredentialsError:
        # Raised by botocore when no credentials could be located at all
        print("An error occurred: no AWS credentials were found. "
              "Pass access_key/secret_key or configure them for boto3.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Entry point: start the upload when this cell/script is executed directly
# (where __name__ is "__main__"), but not when the code is imported as a module.
if __name__ == "__main__":
    upload_to_s3_with_keys()