## Data Migration: SQL Server to Postgres

In [3]:
import os
import pandas as pd;
import pyodbc
import psycopg2
from psycopg2.extras import execute_values
from dotenv import load_dotenv;


## 1. Load credentials


In [4]:
load_dotenv()

True

In [5]:
sql_host = os.getenv("SQL_SERVER_HOST")
sql_db = os.getenv("SQL_SERVER_DB")

In [6]:
print(f"SQL SERVER HOST: {sql_host}")
print(f"SQL SERVER DB: {sql_db}")

SQL SERVER HOST: localhost\SQLEXPRESS
SQL SERVER DB: TransactionDB_UAT


In [7]:
pg_host = os.getenv("POSTGRES_HOST")
pg_port = os.getenv("POSTGRES_PORT")
pg_db = os.getenv("POSTGRES_DB")
pg_user = os.getenv("POSTGRES_USER")
pg_password = os.getenv("POSTGRES_PASSWORD")

In [8]:
print(f"POSTGRES HOST: {pg_host}")
print(f"POSTGRES PORT: {pg_port}")
print(f"POSTGRES DB: {pg_db}")
print(f"POSTGRES USER: {pg_user}")
print(f"POSTGRES USER: {pg_password}")

POSTGRES HOST: localhost
POSTGRES PORT: 5432
POSTGRES DB: teste
POSTGRES USER: airflow
POSTGRES USER: airflow


## Connect to SQL Server

In [9]:
print("Connecting to SQL Server")
print(f"  Server: {sql_host}")
print(f"  Database: {sql_db}")

Connecting to SQL Server
  Server: localhost\SQLEXPRESS
  Database: TransactionDB_UAT


In [None]:
try:
    sql_conn_string = (
        f"Driver={{ODBC Driver 18 for SQL Server}};"
        f"SERVER={sql_host};"
        f"DATABASE={sql_db};"
        f"Trusted_connection=yes;"
        f"Encrypt=no;"
    )

    sql_conn = pyodbc.connect(sql_conn_string)
    sql_cursor = sql_conn.cursor()
    print("[SUCCESS] -> Connection to SQL Server now live! ")
except Exception as e:
    print(f"SQL Server connection failed: {e}")
    print(""" How to troubleshoot
          > 1. Check server name is .env file correct
          > 2. Verify SQL Server is running
          > 3. Check Windows Authentication is enabled
          > 4. If certified is the problem, use Encrypt=no or TrustServerCertificate=yes
 """)

[SUCCESS] -> Connection to SQL Server now live! 


## 3. Connect to PostgreSQL

In [11]:
print("Connecting to PostgreSQL...")
print(f"  Server: {pg_host}")
print(f"  Database: {pg_db}")

Connecting to PostgreSQL...
  Server: localhost
  Database: teste


In [12]:
try:
    pg_conn= psycopg2.connect(
        host=pg_host,
        port=pg_port,
        database=pg_db,
        user=pg_user,
        password=pg_password
    )

    pg_cursor = pg_conn.cursor()
    pg_cursor.execute("SELECT version();")

    pg_version = pg_cursor.fetchone()[0]
    print("Connected to PostgreSQL")
    print(f" version: {pg_version[:50]}...\n")
except psycopg2.OperationalError as e:
      print(f"Postgres connection failed: {e}")
      print(""" How to troubleshoot
               > 1. Check Postgres is running
               > 2. Verify username + password
               > 3. Check database exists
""")

except Exception as e:
     print("Unexpected error: {e}")

Connected to PostgreSQL
 version: PostgreSQL 13.23 (Debian 13.23-1.pgdg13+1) on x86_...



## 4. Define the tables to migrate

### Migration order

- Categories (no dependencies)
- Suppliers (no dependencies)
- Customers (no dependencies)
- Products (dependencies on Categories and suppliers)

In [28]:
tables_to_migrate = ['Categories', 'Suppliers', 'Customers', 'Products']
print(tables_to_migrate)

['Categories', 'Suppliers', 'Customers', 'Products']


In [22]:
print("Table to migrate: ")
for i, table in enumerate(tables_to_migrate,1):
    print(f"  {i}. {table}")

total_no_tbls = len(tables_to_migrate)
print(f"\nTotal  no of tables to migrate: {total_no_tbls}")

Table to migrate: 
  1. Categories
  2. Suppliers
  3. Customers

Total  no of tables to migrate: 3


### 5. Run pre-migration checks


In [59]:
print("=" * 70)
print(">>> ROW COUNTS")
print("=" * 70)

>>> ROW COUNTS


In [17]:
test_query = "SELECT COUNT(*) AS total_rows FROM Categories"
sql_cursor.execute(test_query)

count = sql_cursor.fetchone()[0]
print(f"Results: {count}")

Results: 7


In [None]:
baseline_counts = {}

try:
    for table in tables_to_migrate:
        row_count_query = f"SELECT COUNT(*) AS total_rows FROM {table}"

        # Warning: Do not input SQL queries with f-strings in production (this is just for the tutorial)
        # Example
        ## table = "users; DROP TABLE users; --"
        ## query = f"SELECT COUNT(*) FROM {table}"

        sql_cursor.execute(row_count_query)
        count = sql_cursor.fetchone()[0]

        baseline_counts[table]= count
        print(f"{table:15} {count:>12} rows")

    total_rows = sum(baseline_counts.values())
    print(f"{'-' * 33}")
    print(f"{'TOTAL':15} {total_rows:>12,} rows")
    print("\n Baseline captured! ")
except Exception as e:
    print(f"Failed to get baseline counts: {e}")
    raise

Categories                 7 rows
Suppliers                 10 rows
Customers                 10 rows
Products                  11 rows
---------------------------------
TOTAL                     38 rows

 Baseline captured! 


In [57]:
print("=" * 50)
print(">>> Check 2: Null COUNTS (CustomerName)")
print("=" * 50)



>>> Check 2: Null COUNTS (CustomerName)


In [None]:
quality_issues = []

try:
    print("\nCHECK 2: NULL CHECKS (CustomerName)")
    sql_cursor.execute("""
        SELECT COUNT(*) AS null_count
        FROM Customers
        WHERE CustomerName IS NULL
        """)
    null_names = sql_cursor.fetchone()[0]
    if null_names > 0:
        quality_issues.append(f"  - {null_names:,} customers with Null names...")
    #print(quality_issues)


    print("\nCHECK 3: Invalid email formats")
    sql_cursor.execute(""" SELECT COUNT(*) AS invalid_email_count FROM Customers WHERE email LIKE '@Invalid' """)
    invalid_emails = sql_cursor.fetchone()[0]
    if invalid_emails > 0:
        quality_issues.append(f"  - {invalid_emails:,} emails with invalid email formats...")
    print(quality_issues)

except Exception as e:
    pass


CHECK 2: NULL CHECKS (CustomerName)

CHECK 3: Invalid email formats
