In [None]:
# Import required libraries
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
from IPython.display import display

print("Libraries imported successfully!")

In [2]:
# Helpers and entry point that drop and recreate the demo tables
def _connect_autocommit(dbname, user, host, port):
    """Open a psycopg2 connection to ``dbname`` in autocommit mode.

    If switching the isolation level fails, the freshly opened connection is
    closed before the exception propagates, so nothing is leaked.
    """
    conn = psycopg2.connect(dbname=dbname, user=user, host=host, port=port)
    try:
        conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
    except Exception:
        conn.close()
        raise
    return conn


def _drop_source_tables(cur):
    """Drop the demo tables from the source database if they exist."""
    cur.execute("""
        DROP TABLE IF EXISTS employee_documents CASCADE;
        DROP TABLE IF EXISTS employees CASCADE;
        DROP TABLE IF EXISTS departments CASCADE;
        DROP TABLE IF EXISTS document_metadata CASCADE;
    """)


def _create_source_tables(cur):
    """Create departments, employees and employee_documents (in FK order)."""
    # Create departments table
    cur.execute("""
    CREATE TABLE departments (
        department_id SERIAL PRIMARY KEY,
        department_name VARCHAR(100) NOT NULL
    );
    """)

    # Create employees table (references departments)
    cur.execute("""
    CREATE TABLE employees (
        employee_id SERIAL PRIMARY KEY,
        first_name VARCHAR(50) NOT NULL,
        last_name VARCHAR(50) NOT NULL,
        email VARCHAR(100) NOT NULL UNIQUE,
        department_id INTEGER REFERENCES departments(department_id)
    );
    """)

    # Create employee_documents table (references employees)
    cur.execute("""
    CREATE TABLE employee_documents (
        document_id SERIAL PRIMARY KEY,
        employee_id INTEGER REFERENCES employees(employee_id),
        document_type VARCHAR(50) NOT NULL,
        file_path VARCHAR(255) NOT NULL,
        upload_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    );
    """)


def _insert_sample_data(cur):
    """Insert three departments, three employees and five documents.

    The literal ids 1-3 used as foreign keys are valid because the tables
    were just recreated, so their SERIAL counters start again at 1.
    """
    # Insert sample data into departments
    cur.execute("""
    INSERT INTO departments (department_name) VALUES 
        ('HR'),
        ('Engineering'),
        ('Finance');
    """)

    # Insert sample data into employees
    cur.execute("""
    INSERT INTO employees (first_name, last_name, email, department_id) VALUES 
        ('John', 'Doe', 'john.doe@example.com', 1),
        ('Jane', 'Smith', 'jane.smith@example.com', 2),
        ('Bob', 'Johnson', 'bob.johnson@example.com', 3);
    """)

    # Insert sample data into employee_documents
    cur.execute("""
    INSERT INTO employee_documents (employee_id, document_type, file_path) VALUES 
        (1, 'PASSPORT', '/tmp/docs/passport_1.pdf'),
        (1, 'CONTRACT', '/tmp/docs/contract_1.pdf'),
        (2, 'VISA', '/tmp/docs/visa_2.pdf'),
        (2, 'PAYSLIP', '/tmp/docs/payslip_2.pdf'),
        (3, 'CONTRACT', '/tmp/docs/contract_3.pdf');
    """)


def _recreate_document_metadata(cur):
    """Drop and recreate the document_metadata table in the target database."""
    # Drop existing document_metadata table if it exists in target database
    cur.execute("""
        DROP TABLE IF EXISTS document_metadata CASCADE;
    """)

    # Create document_metadata table in target database
    cur.execute("""
    CREATE TABLE document_metadata (
        id SERIAL PRIMARY KEY,
        employee_id INTEGER NOT NULL,
        first_name VARCHAR(50) NOT NULL,
        last_name VARCHAR(50) NOT NULL,
        email VARCHAR(100) NOT NULL,
        department_name VARCHAR(100) NOT NULL,
        document_type VARCHAR(50) NOT NULL,
        document_category VARCHAR(50) NOT NULL,
        file_path VARCHAR(255) NOT NULL,
        upload_date TIMESTAMP NOT NULL,
        processed_date TIMESTAMP NOT NULL,
        status VARCHAR(20) NOT NULL,
        document_id VARCHAR(100) NOT NULL UNIQUE,
        s3_path VARCHAR(255)
    );
    """)


def setup_databases(user="espinshalo", host="localhost", port="5432"):
    """Reset the source and target databases to a known demo state.

    In ``fdms_source`` the departments, employees and employee_documents
    tables are dropped, recreated and filled with sample rows. In
    ``fdms_target`` the document_metadata table is dropped and recreated
    empty. This is destructive: existing rows in those tables are lost.

    Args:
        user: PostgreSQL role used for both connections.
        host: Host name of the PostgreSQL server.
        port: Port of the PostgreSQL server.

    Raises:
        psycopg2.Error: if connecting or executing any statement fails. Each
            connection is closed before the error propagates.
    """
    # Source database: drop, recreate and seed the demo tables.
    source_conn = _connect_autocommit("fdms_source", user, host, port)
    try:
        # The cursor context manager closes the cursor even on error; the
        # connection itself must still be closed explicitly (finally below).
        with source_conn.cursor() as source_cur:
            _drop_source_tables(source_cur)
            print("Existing tables dropped from source database successfully!")
            _create_source_tables(source_cur)
            _insert_sample_data(source_cur)
    finally:
        source_conn.close()

    # Target database: recreate the (empty) document_metadata table.
    target_conn = _connect_autocommit("fdms_target", user, host, port)
    try:
        with target_conn.cursor() as target_cur:
            _recreate_document_metadata(target_cur)
    finally:
        target_conn.close()

    print("Database setup completed successfully!")

# Run the setup function.
# WARNING: this drops and recreates the tables in both fdms_source and
# fdms_target, so any data already stored in them is lost on every run.
setup_databases()

Existing tables dropped from source database successfully!
Database setup completed successfully!


In [3]:
# Create database connections
source_engine = create_engine('postgresql://espinshalo@localhost:5432/fdms_source')
target_engine = create_engine('postgresql://espinshalo@localhost:5432/fdms_target')

# create_engine() is lazy and does not contact the database. Open (and
# immediately release) one connection per engine so that an unreachable
# database fails here rather than after the success message is printed.
with source_engine.connect(), target_engine.connect():
    pass

print("Database connections established successfully!")

Database connections established successfully!


In [4]:
# Extract data from source database
# One row per employee document, enriched with the employee's name, email and
# department name. Both joins are inner joins, so employees without a
# department or without any document do not appear in the result.
# The ORDER BY makes the extract deterministic: without it PostgreSQL may
# return the joined rows in any order, which can change between runs.
query = """
SELECT 
    e.employee_id,
    e.first_name,
    e.last_name,
    e.email,
    d.department_name,
    doc.document_type,
    doc.file_path,
    doc.upload_date
FROM employees e
JOIN departments d ON e.department_id = d.department_id
JOIN employee_documents doc ON e.employee_id = doc.employee_id
ORDER BY e.employee_id, doc.document_type, doc.document_id;
"""

df = pd.read_sql(query, source_engine)
print("Data extracted successfully!")
print(f"Number of records: {len(df)}")
display(df)

Data extracted successfully!
Number of records: 5


Unnamed: 0,employee_id,first_name,last_name,email,department_name,document_type,file_path,upload_date
0,1,John,Doe,john.doe@example.com,HR,CONTRACT,/tmp/docs/contract_1.pdf,2025-03-24 18:48:19.414394
1,1,John,Doe,john.doe@example.com,HR,PASSPORT,/tmp/docs/passport_1.pdf,2025-03-24 18:48:19.414394
2,2,Jane,Smith,jane.smith@example.com,Engineering,PAYSLIP,/tmp/docs/payslip_2.pdf,2025-03-24 18:48:19.414394
3,2,Jane,Smith,jane.smith@example.com,Engineering,VISA,/tmp/docs/visa_2.pdf,2025-03-24 18:48:19.414394
4,3,Bob,Johnson,bob.johnson@example.com,Finance,CONTRACT,/tmp/docs/contract_3.pdf,2025-03-24 18:48:19.414394
