In [58]:
!jupyter nbconvert --to script upload_file_in_s3_testing.ipynb

[NbConvertApp] Converting notebook upload_file_in_s3_testing.ipynb to script
[NbConvertApp] Writing 3300 bytes to upload_file_in_s3_testing.py


In [24]:
import psycopg2
import pandas as pd
import boto3
from io import StringIO
import json
from sqlalchemy import create_engine
import urllib.parse
import logging

In [55]:
from configparser import ConfigParser

In [25]:
logging.basicConfig(level=logging.INFO)

In [95]:
# read the credential file

In [26]:
def credentialFile(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

In [94]:
# read the data from the s3 

In [103]:
def read_csv_from_s3(bucket_name, file_key, credentials):
    try:
        # Create an S3 client
        s3 = boto3.client('s3',
                          aws_access_key_id=credentials['aws_access_key_id'],
                          aws_secret_access_key=credentials['aws_secret_access_key'])
        
        # Read the CSV file from S3
        response = s3.get_object(Bucket=bucket_name, Key=file_key)
        csv_content = response['Body'].read().decode('utf-8')
        
        # Convert the CSV content to a DataFrame
        data = pd.read_csv(StringIO(csv_content))
        
        return data
    
    except s3.exceptions.NoSuchKey:
        logging.error(f"The file {file_key} does not exist in the bucket {bucket_name}.")
        raise
    except Exception as e:
        logging.error(f"An error occurred: {e}")
        raise

In [104]:
# dim table defination

In [105]:
def create_dim_table(data, id_column, dim_columns, key_column):
    dim_table = data[[id_column] + dim_columns].drop_duplicates(subset=[id_column]).reset_index(drop=True)
    dim_table[key_column] = dim_table.index + 1
    return dim_table

In [106]:
# fact table defination

In [107]:
def create_fact_table(data, fact_columns):
    fact_table = data[fact_columns]
    return fact_table

In [131]:
def upload_to_s3(dataframe, bucket_name, ETL_file_key, credentials):
    try:
        # Create an S3 client
        s3 = boto3.client('s3',
                          aws_access_key_id=credentials['aws_access_key_id'],
                          aws_secret_access_key=credentials['aws_secret_access_key'])

        # Convert the DataFrame to a CSV string
        csv_buffer = StringIO()
        dataframe.to_csv(csv_buffer, index=False)

        # Upload the CSV string to S3
        s3.put_object(Bucket=bucket_name, Key=ETL_file_key, Body=csv_buffer.getvalue())

        logging.info(f"Uploaded {ETL_file_key} to S3 bucket {bucket_name} successfully.")
    except Exception as e:
        logging.error(f"Failed to upload {ETL_file_key} to S3: {e}")

In [2]:
def main():
    
    bucket_name = 'bucket_name'
    file_key = 'file_name'
    credentials_file = 'credential_file_name'


    # Load credentials
    credentials = credentialFile(credentials_file)
    
    try:
        # calling of reading the data from s3
        data = read_csv_from_s3(bucket_name, file_key, credentials)

        # defining dim fact table schema
        if data is not None:
            dim_specs = {
                'product': {
                    'id_column': 'Product_ID',
                    'dim_columns': ['Product_Name', 'Category', 'Sub_Category'],
                    'key_column': 'ProductKey'
                },
                'customer': {
                    'id_column': 'Customer_ID',
                    'dim_columns': ['Customer_Name', 'Segment'],
                    'key_column': 'CustomerKey'
                },
                'location': {
                    'id_column': 'Postal_Code',
                    'dim_columns': ['Country', 'City', 'State', 'Region'],
                    'key_column': 'LocationKey'
                },
                'date': {
                    'id_column': 'Order_Date',
                    'dim_columns': [ 'Ship_Date', 'Ship_Mode'],
                    'key_column': 'DateKey'
                }
            }

            fact_columns = ['Order_ID', 'Customer_ID', 'Product_ID', 'Postal_Code', 'Order_Date', 'Ship_Date', 'Sales']


            # dim tables calling
            dim_tables = {}
            for dim_name, dim_spec in dim_specs.items():
                dim_table = create_dim_table(data, dim_spec['id_column'], dim_spec['dim_columns'], dim_spec['key_column'])
                merge_columns = [dim_spec['id_column']] + dim_spec['dim_columns']
                dim_tables[dim_name] = (dim_table, dim_spec['key_column'], merge_columns)
                logging.info(f"{dim_name.capitalize()} Dimension Table:")
                
            print(dim_tables)

            
            # fact table calling
            fact_table = create_fact_table(data, fact_columns)
            logging.info("Fact Table:")


            upload_keys = {
                'product': 'product_file_name',
                'customer': 'customer_file_name',
                'date': 'date_file_name',
                'location': 'location_file_name',
                'fact': 'fact_file_name'
            }

            # Upload dimension tables to S3
            for dim_name, dim_info in dim_tables.items():
                dim_table = dim_info[0]
                ETL_file_key = upload_keys[dim_name]
                upload_to_s3(dim_table, bucket_name, ETL_file_key, credentials)
        
            # Upload fact table to S3
            upload_to_s3(fact_table, bucket_name, upload_keys['fact'], credentials)

    
    except Exception as e:
        logging.error(f"Failed to read CSV from S3: {e}")

In [139]:
if __name__ == "__main__":
    main()

INFO:root:Product Dimension Table:
INFO:root:Customer Dimension Table:
INFO:root:Location Dimension Table:
INFO:root:Date Dimension Table:
INFO:root:Fact Table:


{'product': (           Product_ID                                       Product_Name  \
0     FUR-BO-10001798                  Bush Somerset Collection Bookcase   
1     FUR-CH-10000454  Hon Deluxe Fabric Upholstered Stacking Chairs,...   
2     OFF-LA-10000240  Self-Adhesive Address Labels for Typewriters b...   
3     FUR-TA-10000577      Bretford CR4500 Series Slim Rectangular Table   
4     OFF-ST-10000760                      Eldon Fold N Roll Cart System   
...               ...                                                ...   
1856  TEC-AC-10002380       Sony 8GB Class 10 Micro SDHC R40 Memory Card   
1857  TEC-PH-10002817                    RCA ViSYS 25425RE1 Corded phone   
1858  TEC-MA-10003589                       Cisco 8961 IP Phone Charcoal   
1859  OFF-AP-10003099                        Eureka Hand Vacuum, Bagless   
1860  TEC-PH-10002645                                              LG G2   

             Category Sub_Category  ProductKey  
0           Furniture    