In [1]:
# Import required libraries
import traceback
from datetime import datetime

import boto3
import pandas as pd
import psycopg2
from IPython.display import display
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
from sqlalchemy import create_engine

print("Libraries imported successfully!")

Libraries imported successfully!


In [None]:
# Function to drop and recreate tables
def setup_databases(user="espinshalo", host="localhost", port="5432"):
    """Create the FDMS source/target databases and rebuild their tables.

    Steps, in order:
      1. Connect to the ``postgres`` database and create ``fdms_source`` and
         ``fdms_target`` if ``pg_database`` has no entry for them.
      2. In ``fdms_source``: drop and recreate ``departments``, ``employees``
         and ``employee_documents``, then insert the sample rows.
      3. In ``fdms_target``: drop and recreate ``document_metadata``.

    This is destructive: the listed tables are dropped on every call.

    Args:
        user: PostgreSQL role used for every connection.
        host: Server host name.
        port: Server port.

    Returns:
        None. Any exception is caught and reported on stdout together with
        its traceback instead of being propagated.
    """
    conn_params = {"user": user, "host": host, "port": port}
    # Every connection is tracked here so the ``finally`` clause can close
    # whatever is still open when a step fails part-way through.
    conn = source_conn = target_conn = None
    try:
        print("Starting database setup...")

        # First, connect to the postgres database to create our databases
        print("Connecting to postgres database...")
        conn = psycopg2.connect(dbname="postgres", **conn_params)
        # CREATE DATABASE cannot run inside a transaction block, hence autocommit.
        conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
        cur = conn.cursor()

        for label, dbname in (("source", "fdms_source"), ("target", "fdms_target")):
            print(f"Creating {label} database if it doesn't exist...")
            cur.execute("SELECT 1 FROM pg_database WHERE datname = %s", (dbname,))
            if not cur.fetchone():
                # Identifiers cannot be sent as bound parameters; dbname only
                # ever comes from the literal tuple above.
                cur.execute(f"CREATE DATABASE {dbname}")
                print(f"Created {label} database")

        cur.close()
        conn.close()
        print("Databases created successfully")

        # Now connect to source database
        print("Connecting to source database...")
        source_conn = psycopg2.connect(dbname="fdms_source", **conn_params)
        source_conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
        source_cur = source_conn.cursor()

        # Drop existing tables if they exist in source database
        print("Dropping existing tables in source database...")
        source_cur.execute("""
            DROP TABLE IF EXISTS employee_documents CASCADE;
            DROP TABLE IF EXISTS employees CASCADE;
            DROP TABLE IF EXISTS departments CASCADE;
            DROP TABLE IF EXISTS document_metadata CASCADE;
        """)
        print("Existing tables dropped from source database successfully!")

        # Create departments table
        print("Creating departments table...")
        source_cur.execute("""
        CREATE TABLE departments (
            department_id SERIAL PRIMARY KEY,
            department_name VARCHAR(100) NOT NULL
        );
        """)

        # Create employees table (references departments, so it comes second)
        print("Creating employees table...")
        source_cur.execute("""
        CREATE TABLE employees (
            employee_id SERIAL PRIMARY KEY,
            first_name VARCHAR(50) NOT NULL,
            last_name VARCHAR(50) NOT NULL,
            email VARCHAR(100) NOT NULL UNIQUE,
            department_id INTEGER REFERENCES departments(department_id)
        );
        """)

        # Create employee_documents table (references employees)
        print("Creating employee_documents table...")
        source_cur.execute("""
        CREATE TABLE employee_documents (
            document_id SERIAL PRIMARY KEY,
            employee_id INTEGER REFERENCES employees(employee_id),
            document_type VARCHAR(50) NOT NULL,
            file_path VARCHAR(255) NOT NULL,
            upload_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        );
        """)

        # Insert sample data into departments
        print("Inserting sample data into departments...")
        source_cur.execute("""
        INSERT INTO departments (department_name) VALUES 
            ('HR'),
            ('Engineering'),
            ('Finance');
        """)

        # Insert sample data into employees; the department_id values 1-3
        # rely on the SERIAL ids assigned to the three rows inserted above.
        print("Inserting sample data into employees...")
        source_cur.execute("""
        INSERT INTO employees (first_name, last_name, email, department_id) VALUES 
            ('John', 'Doe', 'john.doe@example.com', 1),
            ('Jane', 'Smith', 'jane.smith@example.com', 2),
            ('Bob', 'Johnson', 'bob.johnson@example.com', 3);
        """)

        # Insert sample data into employee_documents
        print("Inserting sample data into employee_documents...")
        source_cur.execute("""
        INSERT INTO employee_documents (employee_id, document_type, file_path) VALUES 
            (1, 'PASSPORT', '/tmp/docs/passport_1.pdf'),
            (1, 'CONTRACT', '/tmp/docs/contract_1.pdf'),
            (2, 'VISA', '/tmp/docs/visa_2.pdf'),
            (2, 'PAYSLIP', '/tmp/docs/payslip_2.pdf'),
            (3, 'CONTRACT', '/tmp/docs/contract_3.pdf');
        """)

        # Close the source connection
        source_cur.close()
        source_conn.close()
        print("Source database setup completed")

        # Connect to target database
        print("Connecting to target database...")
        target_conn = psycopg2.connect(dbname="fdms_target", **conn_params)
        target_conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
        target_cur = target_conn.cursor()

        # Drop existing document_metadata table if it exists in target database
        print("Dropping existing document_metadata table in target database...")
        target_cur.execute("""
            DROP TABLE IF EXISTS document_metadata CASCADE;
        """)

        # Create document_metadata table in target database
        print("Creating document_metadata table in target database...")
        target_cur.execute("""
        CREATE TABLE document_metadata (
            id SERIAL PRIMARY KEY,
            employee_id INTEGER NOT NULL,
            first_name VARCHAR(50) NOT NULL,
            last_name VARCHAR(50) NOT NULL,
            email VARCHAR(100) NOT NULL,
            department_name VARCHAR(100) NOT NULL,
            document_type VARCHAR(50) NOT NULL,
            document_category VARCHAR(50) NOT NULL,
            file_path VARCHAR(255) NOT NULL,
            upload_date TIMESTAMP NOT NULL,
            processed_date TIMESTAMP NOT NULL,
            status VARCHAR(20) NOT NULL,
            document_id VARCHAR(100) NOT NULL UNIQUE,
            s3_path VARCHAR(255)
        );
        """)

        # Close the target connection
        target_cur.close()
        target_conn.close()
        print("Target database setup completed")

        print("Database setup completed successfully!")

    except Exception as e:
        # Deliberate best-effort: report the failure in the cell output
        # instead of raising.
        print(f"Error during database setup: {str(e)}")
        print(f"Error type: {type(e)}")
        print("Full traceback:")
        print(traceback.format_exc())
    finally:
        # Close explicitly rather than relying on garbage collection; on the
        # success path every connection is already closed and this is a no-op.
        for leftover in (conn, source_conn, target_conn):
            if leftover is not None and not leftover.closed:
                leftover.close()

# Run the setup function.
# Destructive: it drops and recreates the source tables and the target
# document_metadata table on every run. Failures are printed by the function
# itself and are not raised here.
setup_databases()

Starting database setup...
Connecting to postgres database...
Creating source database if it doesn't exist...
Creating target database if it doesn't exist...
Databases created successfully
Connecting to source database...
Dropping existing tables in source database...


In [None]:
def test_connections(user="espinshalo", host="localhost", port="5432"):
    """Smoke-test both FDMS databases by counting the rows of one table each.

    Connects to ``fdms_source`` (counting ``departments``) and then to
    ``fdms_target`` (counting ``document_metadata``), printing the counts.

    Args:
        user: PostgreSQL role used for both connections.
        host: Server host name.
        port: Server port.

    Returns:
        None. Any exception is caught and reported on stdout together with
        its traceback instead of being propagated.
    """
    # (label used in messages, database name, count query, noun in the report)
    checks = (
        ("source", "fdms_source", "SELECT COUNT(*) FROM departments", "departments"),
        ("target", "fdms_target", "SELECT COUNT(*) FROM document_metadata", "documents"),
    )
    try:
        print("Testing database connections...")

        for label, dbname, count_query, noun in checks:
            print(f"\nTesting {label} database connection...")
            conn = psycopg2.connect(dbname=dbname, user=user, host=host, port=port)
            try:
                cur = conn.cursor()
                cur.execute(count_query)
                row_count = cur.fetchone()[0]
                print(f"Successfully connected to {label} database. Found {row_count} {noun}.")
                cur.close()
            finally:
                # These connections are not in autocommit mode, so psycopg2
                # starts a transaction on the first statement. Always close,
                # even when the query fails, so that transaction and any
                # table locks it holds are released; a lingering lock can
                # block the DROP TABLE statements in setup_databases().
                conn.close()

        print("\nAll connections tested successfully!")

    except Exception as e:
        # Deliberate best-effort: report the failure in the cell output
        # instead of raising.
        print(f"Error testing connections: {str(e)}")
        print(f"Error type: {type(e)}")
        print("Full traceback:")
        print(traceback.format_exc())

# Run the test function.
# It queries departments (source) and document_metadata (target), so those
# tables must already exist; failures are printed by the function, not raised.
test_connections()

In [None]:
def extract_data(user="espinshalo", host="localhost", port="5432"):
    """Load the three source tables of ``fdms_source`` into DataFrames.

    Reads ``departments``, ``employees`` and ``employee_documents`` in full
    with ``pd.read_sql`` and shows a ``head()`` preview of each.

    Args:
        user: PostgreSQL role used for the connection.
        host: Server host name.
        port: Server port.

    Returns:
        A 3-tuple ``(departments_df, employees_df, documents_df)``.
        If anything fails, the error and traceback are printed and
        ``(None, None, None)`` is returned instead.
    """
    # (wording for the progress line, noun for count/preview lines, query)
    extractions = (
        ("departments", "departments", "SELECT * FROM departments"),
        ("employees", "employees", "SELECT * FROM employees"),
        ("employee documents", "documents", "SELECT * FROM employee_documents"),
    )
    try:
        print("Starting data extraction...")

        # Connect to source database
        print("Connecting to source database...")
        source_conn = psycopg2.connect(
            dbname="fdms_source", user=user, host=host, port=port
        )

        frames = []
        try:
            for description, noun, query in extractions:
                print(f"\nExtracting {description} data...")
                # NOTE(review): recent pandas versions emit a UserWarning for
                # raw DBAPI connections other than sqlite3; a SQLAlchemy
                # engine would avoid it -- confirm before switching.
                frame = pd.read_sql(query, source_conn)
                print(f"Extracted {len(frame)} {noun}")
                print(f"{noun.capitalize()} preview:")
                display(frame.head())
                frames.append(frame)
        finally:
            # Close even if a read or a preview fails, so the connection and
            # the transaction psycopg2 opened for the SELECTs do not linger.
            source_conn.close()

        print("\nData extraction completed successfully!")

        departments_df, employees_df, documents_df = frames
        return departments_df, employees_df, documents_df

    except Exception as e:
        # Deliberate best-effort: report the failure and hand back
        # placeholders so the tuple unpacking at the call site still works.
        print(f"Error during data extraction: {str(e)}")
        print(f"Error type: {type(e)}")
        print("Full traceback:")
        print(traceback.format_exc())
        return None, None, None

# Run the extraction function.
# On failure extract_data() prints the error and returns (None, None, None),
# so all three names below are None in that case -- check before using them.
departments_df, employees_df, documents_df = extract_data()