In [1]:
import os

import pandas as pd
import psycopg2
from psycopg2.extras import execute_values

from dotenv import load_dotenv
# Load the settings stored in the local '.env_load' file into the process
# environment so that later os.getenv() lookups can read them.
load_dotenv('.env_load')

True

# Load data into PostgreSQL

In [2]:
# Input file location and PostgreSQL connection settings, all taken from
# environment variables (each one is None when its variable is not set).
csv_file_path = os.environ.get('csv_file_path')  # path of the CSV file whose rows are loaded
db_name = os.environ.get('db_name')
db_user = os.environ.get('db_user')
db_password = os.environ.get('db_password')
db_host = os.environ.get('db_host')
db_port = os.environ.get('db_port')
schema_name = os.environ.get('schema_name')
# The target table name is fixed here rather than read from the environment.
table_name = 'customers_table'

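### Register NumPy type adapters for psycopg2
psycopg2 cannot adapt NumPy types by default and raises `ProgrammingError: can't adapt type ...` when it is given one, so adapters must be registered before inserting data; see https://stackoverflow.com/questions/39564755/programmingerror-psycopg2-programmingerror-cant-adapt-type-numpy-ndarray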

In [3]:
import numpy as np
from psycopg2.extensions import register_adapter, AsIs

def addapt_numpy_float64(numpy_float64):
    """Adapt a ``numpy.float64`` for psycopg2 by wrapping it in ``AsIs``.

    ``AsIs`` makes psycopg2 place the value's string form into the query
    without any additional quoting.
    """
    wrapped = AsIs(numpy_float64)
    return wrapped

def addapt_numpy_int64(numpy_int64):
    """Adapt a ``numpy.int64`` for psycopg2 by wrapping it in ``AsIs``.

    ``AsIs`` makes psycopg2 place the value's string form into the query
    without any additional quoting.
    """
    wrapped = AsIs(numpy_int64)
    return wrapped

def addapt_numpy_float32(numpy_float32):
    """Adapt a ``numpy.float32`` for psycopg2 by wrapping it in ``AsIs``.

    ``AsIs`` makes psycopg2 place the value's string form into the query
    without any additional quoting.
    """
    wrapped = AsIs(numpy_float32)
    return wrapped

def addapt_numpy_int32(numpy_int32):
    """Adapt a ``numpy.int32`` for psycopg2 by wrapping it in ``AsIs``.

    ``AsIs`` makes psycopg2 place the value's string form into the query
    without any additional quoting.
    """
    wrapped = AsIs(numpy_int32)
    return wrapped

def addapt_numpy_array(numpy_array):
    """Adapt a ``numpy.ndarray`` for psycopg2 as a parenthesised value list.

    The array is converted to plain Python scalars with ``tolist()`` before
    being turned into a tuple. ``str()`` of a tuple uses ``repr()`` of every
    element, and NumPy 2 scalars repr as e.g. ``np.int64(1)``, which is not
    valid SQL; Python scalars render as ``1``. For NumPy 1.x the rendered
    text of numeric arrays is unchanged.

    Args:
        numpy_array: The array to adapt.

    Returns:
        An ``AsIs`` wrapper whose string form is the tuple's text, e.g.
        ``(1, 2, 3)``.
    """
    return AsIs(tuple(numpy_array.tolist()))

# Register one adapter per NumPy type so psycopg2 can accept these values
# as query parameters.
for _numpy_type, _adapter in (
    (np.float64, addapt_numpy_float64),
    (np.int64, addapt_numpy_int64),
    (np.float32, addapt_numpy_float32),
    (np.int32, addapt_numpy_int32),
    (np.ndarray, addapt_numpy_array),
):
    register_adapter(_numpy_type, _adapter)

In [4]:
def load_csv_to_postgres(csv_file_path, db_name, db_user, db_password, db_host, db_port, schema_name, table_name):
    """Load a customers CSV file into a freshly re-created PostgreSQL table.

    The CSV is read with pandas, the schema is created if it is missing, any
    existing ``schema_name.table_name`` is dropped (``DROP ... CASCADE``) and
    re-created with the fixed customer column layout below, and every CSV row
    is bulk-inserted with ``execute_values``. All statements are committed
    together by a single ``conn.commit()``.

    Row values are inserted positionally, so the CSV columns must appear in
    the same order as the column list of the INSERT statement.

    Args:
        csv_file_path: Path of the CSV file to load.
        db_name: Name of the database to connect to.
        db_user: Database user.
        db_password: Password of ``db_user``.
        db_host: Database host.
        db_port: Database port.
        schema_name: Schema that holds the table (created when missing).
        table_name: Table to drop, re-create and fill.

    Returns:
        None.

    Raises:
        Errors from ``pd.read_csv`` propagate to the caller. Errors raised
        after that point (connection, DDL, insert) are caught and reported
        with ``print``; they are not re-raised.
    """
    # Step 1: Read the CSV into a pandas DataFrame and convert the columns to
    # pandas' nullable dtypes.
    df = pd.read_csv(csv_file_path).convert_dtypes()

    # Step 2: Connect to PostgreSQL Database
    conn = None
    cur = None  # pre-bound so the cleanup below is safe if conn.cursor() fails
    try:
        conn = psycopg2.connect(
            dbname=db_name,
            user=db_user,
            password=db_password,
            host=db_host,
            port=db_port
        )
        cur = conn.cursor()
        print("Successfully connected to the database!")

        # NOTE: schema_name and table_name are interpolated into the SQL text
        # unquoted, so they must come from a trusted source (here: the
        # notebook's own configuration).

        # create schema if not exists
        create_schema_query = f"""
        CREATE SCHEMA IF NOT EXISTS {schema_name}
        """
        # drop old table if exists
        drop_table_query = f"""
        DROP TABLE IF EXISTS {schema_name}.{table_name} CASCADE
        """
        # Step 3: Create the table if it doesn't exist
        create_table_query = f"""
            CREATE TABLE IF NOT EXISTS {schema_name}.{table_name} (
            customer_id int8 NOT NULL PRIMARY KEY,
            "name" varchar(128) NULL,
            address varchar(128) NULL,
            zip_code varchar(100) NULL,
            credit_rating int8 NULL,
            age int8 NULL,
            gender varchar(50) NULL,
            marital_status varchar(50) NULL,
            profession varchar(100) NULL,
            nbr_years_cli int4 NULL,
            risk_score float4 NULL,
            state varchar(100) NULL,
            city varchar(100) NULL,
            profile_url varchar(500) NULL,
            ssn varchar(20),
            phone_number varchar(30),
            email varchar(50)
            );
        """

        cur.execute(create_schema_query)
        print(f"Schema {schema_name} created in the database")
        cur.execute(drop_table_query)
        cur.execute(create_table_query)

        print(f"Table `{table_name}` is ready in the database.")

        # Step 4: Prepare the data for bulk insert using `execute_values()`
        insert_query = f"""
            INSERT INTO {schema_name}.{table_name} (
                CUSTOMER_ID, NAME, ADDRESS, ZIP_CODE, CREDIT_RATING, AGE, 
                GENDER, MARITAL_STATUS, PROFESSION, NBR_YEARS_CLI, RISK_SCORE, 
                STATE, CITY, profile_url,SSN, PHONE_NUMBER, EMAIL
            ) VALUES %s;
        """

        # Step 5: Convert DataFrame rows into a list of tuples for bulk insert.
        # Missing values (pd.NA after convert_dtypes) cannot be adapted by
        # psycopg2, so they are replaced with None, which is sent as SQL NULL.
        data_to_insert = [
            tuple(None if pd.isna(value) else value for value in record)
            for record in df.to_records(index=False)
        ]

        # Step 6: Execute the bulk insert
        execute_values(cur, insert_query, data_to_insert)

        # Commit changes
        conn.commit()
        print("Data successfully inserted into the table.")

    except Exception as e:
        print(f"Error occurred: {e}")
    finally:
        # Release the cursor and the connection independently: either may be
        # missing if the failure happened before it was created.
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()
            print("Database connection closed.")

In [5]:
# Run the load with the CSV path and connection settings configured above.
# Note: this drops and re-creates the target table before inserting.
load_csv_to_postgres(
    csv_file_path=csv_file_path,
    db_name=db_name, db_user=db_user, db_password=db_password,
    db_host=db_host, db_port=db_port,
    schema_name=schema_name, table_name=table_name,
)

Successfully connected to the database!
Schema bankdemo created in the database
Table `customers_table` is ready in the database.
Data successfully inserted into the table.
Database connection closed.
