In [7]:
# Configuration and Setup
import os
os.environ['JAVA_HOME'] = 'C:/Users/anton/.conda/envs/conda-env/Library/lib/jvm'
os.environ['HADOOP_HOME'] = 'C:/Hadoop/hadoop-3.3.6'
os.environ['PATH'] += ';C:/Hadoop/hadoop-3.3.6/bin'
os.environ['SPARK_HOME'] = r"C:\Users\anton\.conda\envs\conda-env\Lib\site-packages\pyspark"

In [11]:
from pyspark.sql.functions import (
    when, col, row_number, current_timestamp, date_format, from_utc_timestamp, lit, regexp_replace
)
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pathlib import Path
import pyodbc

# Initialize Spark session
spark = SparkSession.builder.appName("ETL_Assignment2_Warehouse") \
    .config("spark.jars", r"C:\Spark\sqljdbc_12.10\enu\jars\mssql-jdbc-12.10.0.jre8.jar") \
    .getOrCreate()

# Create the database if it doesn't exist
conn = pyodbc.connect("DRIVER={ODBC Driver 17 for SQL Server};SERVER=PC-W11;UID=admin;PWD=sql", autocommit=True)
cursor = conn.cursor()

# SQL query to check if the database exists and create it if it doesn't
create_db_query = """
IF NOT EXISTS (SELECT name FROM master.dbo.sysdatabases WHERE name = 'ETL_Assignment2_Warehouse')
BEGIN
    CREATE DATABASE ETL_Assignment2_Warehouse
    PRINT 'Database ETL_Assignment2_Warehouse created successfully.'
END
ELSE
BEGIN
    PRINT 'Database ETL_Assignment2_Warehouse already exists.'
END
"""
cursor.execute(create_db_query)
conn.close()

# Configuration
CONFIG = {
    "base_path": "data",
    "db_url": "jdbc:sqlserver://PC-W11:1433;databaseName=ETL_Assignment2_Warehouse;user=admin;password=sql;encrypt=false;trustServerCertificate=true",
    "pyodbc_conn": "DRIVER={ODBC Driver 17 for SQL Server};SERVER=PC-W11;DATABASE=ETL_Assignment2_Warehouse;UID=admin;PWD=sql",
    "files": {
        "account": {
            "csv": "data/Raw_Loan_Account.csv",
            "parquet": "data/Loan_Account_cleansed.parquet",
            "table": "Loan_Account_cleansed"
        },
        "balance": {
            "csv": "data/Raw_Loan_Balance.csv",
            "parquet": "data/Loan_Balance_cleansed.parquet",
            "table": "Loan_Balance_cleansed"
        },
        "transactions": {
            "csv": "data/Raw_Loan_Transaction.csv",
            "parquet": "data/Loan_Transaction_cleansed.parquet",
            "table": "Loan_Transaction_cleansed"
        }
    }
}

In [None]:


#Schema definitions
account_schema = StructType([
    StructField("LoanAccountId", IntegerType(), False),
    StructField("SourceId", IntegerType(), False),
    StructField("AccountNumber", StringType(), False),
    StructField("IBAN", StringType(), True),
    StructField("BBAN", StringType(), True),
    StructField("AccountCurrencyId", IntegerType(), False),
    StructField("AccountCurrency", StringType(), False),
    StructField("OrganizationId", IntegerType(), False),
    StructField("OrganizationName", StringType(), False),
    StructField("ChannelID", IntegerType(), True),
    StructField("BrokerId", IntegerType(), False),
    StructField("OpenDateId", IntegerType(), False),
    StructField("OpenDate", DateType(), False),
    StructField("CancelledDateId", IntegerType(), True),
    StructField("CancelledDate", DateType(), True),
    StructField("ValueDate", DateType(), False),
    StructField("MaturityDate", DateType(), True),
    StructField("ProductId", IntegerType(), False),
    StructField("Product", StringType(), False),
    StructField("InvoiceDay", IntegerType(), False),
    StructField("CurrentInstallmentAmount", StringType(), False),
    StructField("CurrentInvoiceFee", StringType(), False),
    StructField("RepaymentRate", StringType(), True),
    StructField("NextInvoiceDate", DateType(), False),
    StructField("CalculatedMaturityDate", DateType(), True)
])

balance_schema = StructType([
    StructField("LoanAccountBalanceId", IntegerType(), False),
    StructField("SourceId", IntegerType(), False),
    StructField("BalanceDateId", IntegerType(), False),
    StructField("LoanAccountId", IntegerType(), False),
    StructField("ProductId", IntegerType(), False),
    StructField("AccountCurrencyId", IntegerType(), False),
    StructField("AccountStatusId", IntegerType(), False),
    StructField("NumOfTransactions", IntegerType(), False),
    StructField("NetTransactionAmount", StringType(), False),
    StructField("NetTransactionAmountSek", StringType(), False),
    StructField("AccruedInterest", StringType(), False),
    StructField("AccruedInterestSEK", StringType(), False),
    StructField("Balance", StringType(), False),
    StructField("BalanceSek", StringType(), False),
    StructField("LTV", IntegerType(), False),
    StructField("PrecedingId", IntegerType(), True)
])

transaction_schema = StructType([
    StructField("LoanAccountTransactionId", IntegerType(), False),
    StructField("SourceId", IntegerType(), False),
    StructField("TransactionDateId", IntegerType(), False),
    StructField("ValueDateId", IntegerType(), False),
    StructField("EntryDateID", IntegerType(), False),
    StructField("LoanAccountId", IntegerType(), False),
    StructField("TransactionTypeId", IntegerType(), False),
    StructField("TransactionStatus", StringType(), True),
    StructField("RectifyStatus", StringType(), True),
    StructField("TransactionCurrencyId", IntegerType(), False),
    StructField("TransactionAmount", StringType(), False),
    StructField("TransactionAmountSEK", StringType(), False),
    StructField("CounterpartClearingNumber", StringType(), True),
    StructField("CounterPartBic", StringType(), True),
    StructField("CounterPartIban", StringType(), True),
    StructField("TransactionReference", StringType(), True),
    StructField("ExchangeRateId", IntegerType(), False),
    StructField("TransactionText", StringType(), True),
    StructField("AccountServicerReference", StringType(), True),
    StructField("CounterPartId", IntegerType(), True),
    StructField("CounterPartAccountNumber", StringType(), True),
    StructField("CounterPartBankName", StringType(), True),
    StructField("TransactionDateTime", TimestampType(), True),
    StructField("IsDirectDebit", IntegerType(), True),
    StructField("GLAccount", StringType(), True),
    StructField("EventName", StringType(), True),
    StructField("InvoiceId", IntegerType(), True)
])

print("Configuration and schema loaded successfully.")


In [None]:
# Utility Functions
def convert_european_decimals(df, decimal_columns):
    """Convert European decimal format (comma) to US format (dot) and cast to decimal"""
    converted_df = df

    for col_name in decimal_columns:
        if col_name in df.columns:
            # Replace comma with dot and handle null/empty values
            converted_df = converted_df.withColumn(
                col_name,
                when(
                    (col(col_name).isNull()) |
                    (col(col_name) == "") |
                    (col(col_name) == "NULL"),
                    lit(0.0)
                ).otherwise(regexp_replace(col(col_name), ",", "."))
            )
            converted_df = converted_df.withColumn(
                col_name,
                converted_df[col_name].cast(DecimalType(12, 5))
            )

    return converted_df

def check_table_exists(table_name):
    """Check if table exists in database"""
    try:
        conn = pyodbc.connect(CONFIG["pyodbc_conn"])
        cursor = conn.cursor()
        cursor.execute(f"""
            SELECT COUNT(*)
            FROM INFORMATION_SCHEMA.TABLES
            WHERE TABLE_NAME = '{table_name}'
        """)
        exists = cursor.fetchone()[0] > 0
        cursor.close()
        conn.close()
        return exists
    except Exception as e:
        print(f"Error checking table existence: {e}")
        return False

def create_account_table(table_name):
    """Create loan account cleansed table"""
    try:
        conn = pyodbc.connect(CONFIG["pyodbc_conn"])
        cursor = conn.cursor()

        create_sql = f"""
        CREATE TABLE {table_name} (
            LoanAccountIdentityId INT IDENTITY(1,1) PRIMARY KEY,
            LoanAccountId INT NOT NULL,
            SourceId INT NOT NULL DEFAULT 8,
            AccountNumber NVARCHAR(50) NOT NULL,
            IBAN NVARCHAR(34),
            BBAN NVARCHAR(34),
            AccountCurrencyId INT NOT NULL,
            AccountCurrency NVARCHAR(3) NOT NULL CHECK (LEN(AccountCurrency) = 3),
            OrganizationId INT NOT NULL,
            OrganizationName NVARCHAR(50) NOT NULL,
            ChannelID INT,
            BrokerId INT NOT NULL,
            OpenDateId INT NOT NULL,
            OpenDate DATE NOT NULL,
            CancelledDateId INT,
            CancelledDate DATE,
            ValueDate DATE NOT NULL,
            MaturityDate DATE,
            ProductId INT NOT NULL,
            Product NVARCHAR(30) NOT NULL,
            InvoiceDay TINYINT NOT NULL DEFAULT 14,
            CurrentInstallmentAmount DECIMAL(12,2) NOT NULL,
            CurrentInvoiceFee DECIMAL(12,2) NOT NULL,
            RepaymentRate NVARCHAR(50),
            NextInvoiceDate DATE NOT NULL,
            CalculatedMaturityDate DATE,
            IsActive AS (CASE WHEN CancelledDate IS NULL THEN 1 ELSE 0 END) PERSISTED,
            CreatedDate DATETIME DEFAULT GETDATE(),
            UpdatedDate DATE DEFAULT GETDATE(),
            UpdatedTime TIME DEFAULT GETDATE()
        );
        """
        cursor.execute(create_sql)
        conn.commit()
        cursor.close()
        conn.close()
        print(f"Table {table_name} created successfully.")
        return True
    except Exception as e:
        print(f"Error creating table {table_name}: {e}")
        return False

def create_balance_table(table_name):
    """Create loan balance cleansed table"""
    print(f"Creating database table {table_name}...")
    try:
        conn = pyodbc.connect(CONFIG["pyodbc_conn"])
        cursor = conn.cursor()

        create_sql = f"""
        CREATE TABLE {table_name} (
            LoanAccountBalanceIdentityId INT IDENTITY(1,1) PRIMARY KEY,
            LoanAccountBalanceId INT NOT NULL,
            SourceId INT NOT NULL,
            BalanceDateId INT NOT NULL,
            BalanceDate AS CONVERT(DATE, CAST(BalanceDateId AS CHAR(8)), 112) PERSISTED,
            LoanAccountId INT NOT NULL,
            ProductId INT NOT NULL,
            AccountCurrencyId INT NOT NULL,
            AccountStatusId INT NOT NULL,
            NumOfTransactions INT NOT NULL DEFAULT 0,
            NetTransactionAmount DECIMAL(12,2) DEFAULT 0,
            NetTransactionAmountSek DECIMAL(12,2) DEFAULT 0,
            AccruedInterest DECIMAL(12,2) DEFAULT 0,
            AccruedInterestSEK DECIMAL(12,5) DEFAULT 0,
            Balance DECIMAL(12,2) NULL DEFAULT 0,
            BalanceSek DECIMAL(12,5) DEFAULT 0,
            LTV INT NOT NULL DEFAULT 0,
            PrecedingId INT,
            CreatedDate DATETIME DEFAULT GETDATE(),
            UpdatedDate DATE DEFAULT GETDATE(),
            UpdatedTime TIME DEFAULT GETDATE()
        );
        """
        cursor.execute(create_sql)
        conn.commit()
        cursor.close()
        conn.close()
        print(f"Table {table_name} created successfully.")
        return True
    except Exception as e:
        print(f"Error creating table {table_name}: {e}")
        return False

def create_transaction_table(table_name):
    """Create loan transaction cleansed table"""
    print(f"Creating database table {table_name}...")
    try:
        conn = pyodbc.connect(CONFIG["pyodbc_conn"])
        cursor = conn.cursor()

        create_sql = f"""
        CREATE TABLE {table_name} (
            LoanAccountTransactionIdentityId INT IDENTITY(1,1) PRIMARY KEY,
            LoanAccountTransactionId INT NOT NULL,
            SourceId INT NOT NULL DEFAULT 8,
            TransactionDateId INT NOT NULL,
            TransactionDate AS CONVERT(DATE, CAST(TransactionDateId AS CHAR(8)), 112) PERSISTED,
            ValueDateId INT NOT NULL,
            ValueDate AS CONVERT(DATE, CAST(TransactionDateId AS CHAR(8)), 112) PERSISTED,
            EntryDateID INT NOT NULL,
            EntryDate AS CONVERT(DATE, CAST(EntryDateID AS CHAR(8)), 112) PERSISTED,
            LoanAccountId INT NOT NULL,
            TransactionTypeId INT NOT NULL,
            TransactionStatus NVARCHAR(50),
            RectifyStatus NVARCHAR(50),
            TransactionCurrencyId INT NOT NULL,
            TransactionAmount DECIMAL(12,2) NOT NULL,
            TransactionAmountSEK DECIMAL(12,5) DEFAULT 0,
            CounterpartClearingNumber NVARCHAR(50),
            CounterPartBic NVARCHAR(50),
            CounterPartIban NVARCHAR(50),
            TransactionReference NVARCHAR(100),
            ExchangeRateId INT NOT NULL,
            TransactionText NVARCHAR(255),
            AccountServicerReference NVARCHAR(100),
            CounterPartId INT,
            CounterPartAccountNumber NVARCHAR(50),
            CounterPartBankName NVARCHAR(100),
            TransactionDateTime DATETIME2,
            IsDirectDebit INT,
            GLAccount NVARCHAR(50),
            EventName NVARCHAR(100),
            InvoiceId INT,
            CreatedDate DATETIME DEFAULT GETDATE(),
            UpdatedDate DATE DEFAULT GETDATE(),
            UpdatedTime TIME DEFAULT GETDATE()
        );
        """
        cursor.execute(create_sql)
        conn.commit()
        cursor.close()
        conn.close()
        print(f"Table {table_name} created successfully.")
        return True
    except Exception as e:
        print(f"Error creating table {table_name}: {e}")
        return False

def load_csv_data(csv_path, schema):
    """Load and clean CSV data"""
    try:
        df = spark.read.option("delimiter", ";") \
            .csv(csv_path, header=True, schema=schema)
        print(f"Loaded {df.count()} records from {csv_path}")
        return df
    except Exception as e:
        print(f"Error loading CSV {csv_path}: {e}")
        return None

def load_existing_data(table_name, file_key):
    """Load existing data from database or parquet"""
    try:
        # Try database first
        df = spark.read.format("jdbc") \
            .option("url", CONFIG["db_url"]) \
            .option("dbtable", table_name) \
            .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
            .load()
        print(f"Loaded existing data from database table {table_name}")
        return df
    except Exception as e:
        print(f"Could not load from database: {e}")

        # Fallback to parquet
        parquet_path = CONFIG["files"][file_key]["parquet"]
        if Path(parquet_path).exists():
            try:
                df = spark.read.parquet(parquet_path)
                print(f"Loaded existing data from parquet {parquet_path}")
                return df
            except Exception as pe:
                print(f"Could not load from parquet: {pe}")
        return None

print("Utility functions loaded successfully.")


Utility functions loaded successfully.


In [229]:
# Data Processing Functions
def clean_account_data(df):
    """Clean and transform account data"""
    # Convert European decimal format columns
    decimal_columns = [
        "CurrentInstallmentAmount",
        "CurrentInvoiceFee"
    ]

    # Convert decimals first
    converted_df = convert_european_decimals(df, decimal_columns)

    cleaned_df = converted_df \
        .withColumn("CancelledDateId",
                   when(col("CancelledDateId") == -1, None)
                   .otherwise(col("CancelledDateId")))

    # Deduplication
    window_spec = Window.partitionBy("LoanAccountId").orderBy(col("OpenDate").desc())
    deduped_df = cleaned_df \
        .withColumn("row_num", row_number().over(window_spec)) \
        .filter("row_num = 1") \
        .drop("row_num")

    print(f"Cleaned account data: {deduped_df.count()} records after deduplication")
    return deduped_df

# def clean_balance_data(df):
#     """Clean and transform balance data with decimal conversion"""
#     # Convert European decimal format columns
#     decimal_columns = [
#         "NetTransactionAmount",
#         "NetTransactionAmountSek",
#         "AccruedInterest",
#         "AccruedInterestSEK",
#         "Balance",
#         "BalanceSek"
#     ]

#     # Convert decimals first
#     converted_df = convert_european_decimals(df, decimal_columns)

#     # Clean other columns
#     cleaned_df = converted_df \
#         .withColumn("PrecedingId",
#                    when(col("PrecedingId") == -1, None)
#                    .otherwise(col("PrecedingId")))

#     # Deduplication based on LoanAccountId and BalanceDateId
#     window_spec = Window.partitionBy("LoanAccountId", "BalanceDateId").orderBy(col("LoanAccountBalanceId").desc())
#     deduped_df = cleaned_df \
#         .withColumn("row_num", row_number().over(window_spec)) \
#         .filter("row_num = 1") \
#         .drop("row_num")

#     print(f"Cleaned balance data: {deduped_df.count()} records after deduplication")
#     return deduped_df

# def clean_transaction_data(df):
#     """Clean and transform transaction data with decimal conversion"""
#     # Convert European decimal format columns
#     decimal_columns = [
#         "TransactionAmount",
#         "TransactionAmountSEK"
#     ]

#     # Convert decimals first
#     converted_df = convert_european_decimals(df, decimal_columns)

#     # Clean other columns
#     cleaned_df = converted_df \
#         .withColumn("CounterPartId",
#                    when(col("CounterPartId") == -1, None)
#                    .otherwise(col("CounterPartId"))) \
#         .withColumn("InvoiceId",
#                    when(col("InvoiceId") == -1, None)
#                    .otherwise(col("InvoiceId")))

#     # Deduplication based on transaction_id
#     window_spec = Window.partitionBy("LoanAccountTransactionId").orderBy(col("TransactionDateId").desc())
#     deduped_df = cleaned_df \
#         .withColumn("row_num", row_number().over(window_spec)) \
#         .filter("row_num = 1") \
#         .drop("row_num")

#     print(f"Cleaned transaction data: {deduped_df.count()} records after deduplication")
#     return deduped_df

# def add_metadata_columns(df):
#     """Add metadata columns (timestamps)"""
#     ts = from_utc_timestamp(current_timestamp(), "Europe/Stockholm")

#     return df \
#         .withColumn("CreatedDate", ts) \
#         .withColumn("UpdatedDate", date_format(ts, "yyyy-MM-dd")) \
#         .withColumn("UpdatedTime", date_format(ts, "HH:mm:ss"))

def prepare_for_initial_load(df, identity_col, sort_col):
    """Prepare DataFrame for initial load (add identity column)"""
    window_spec = Window.orderBy(sort_col)
    df_with_id = df.withColumn(identity_col, row_number().over(window_spec))

    # Reorder columns
    columns = [identity_col] + [col for col in df_with_id.columns if col != identity_col]
    return df_with_id.select(columns)

# def detect_changes(new_df, existing_df, compare_cols):
#     """Detect changes between new and existing data"""
#     new_business_df = new_df.select(compare_cols)
#     old_business_df = existing_df.select(compare_cols)

#     # Find differences
#     diff_df = new_business_df.exceptAll(old_business_df)

#     # Get full records for changed data
#     changed_df = new_df.join(diff_df, on=compare_cols, how="inner")
#     return changed_df, compare_cols

# print("Data processing functions loaded successfully.")


In [230]:
# Define account comparison columns (business keys)
compare_cols_acc = [
    "LoanAccountId", "SourceId", "AccountNumber", "IBAN", "BBAN",
    "AccountCurrencyId", "AccountCurrency", "OrganizationId", "OrganizationName",
    "ChannelID", "BrokerId", "OpenDateId", "OpenDate", "CancelledDateId",
    "CancelledDate", "ValueDate", "MaturityDate", "ProductId", "Product",
    "InvoiceDay", "CurrentInstallmentAmount", "CurrentInvoiceFee",
    "RepaymentRate", "NextInvoiceDate", "CalculatedMaturityDate"
]

# Enhanced Upsert Functions with Better Error Handling
def perform_comprehensive_upsert(new_df, existing_df, table_name, parquet_path, key_cols, compare_cols):
    """
    Comprehensive upsert that handles both parquet and database updates
    """
    try:
        print(f"Starting comprehensive upsert for {table_name}...")
        
        # 1. Detect changes
        if existing_df is not None:
            # Find new and changed records
            new_business_df = new_df.select(compare_cols)
            old_business_df = existing_df.select(compare_cols)
            
            # Get records that are different (new or changed)
            diff_df = new_business_df.exceptAll(old_business_df)
            changed_df = new_df.join(diff_df, on=compare_cols, how="inner")
            
            changed_count = changed_df.count()
            print(f"Found {changed_count} new/changed records")
            
            if changed_count == 0:
                print("No changes detected. Skipping upsert.")
                return True
                
        else:
            # No existing data - treat as initial load
            changed_df = new_df
            changed_count = changed_df.count()
            print(f"Initial load: {changed_count} records")
        
        # 2. Update Parquet (append new/changed records)
        if existing_df is not None:
            # For existing data, append only changes
            changed_df.write.mode("append").parquet(parquet_path)
            print(f"Appended {changed_count} records to parquet")
        else:
            # Initial load
            changed_df.write.mode("overwrite").parquet(parquet_path)
            print(f"Created parquet with {changed_count} records")
        
        # 3. Update Database using MERGE
        perform_database_merge(changed_df, table_name, key_cols)
        
        return True
        
    except Exception as e:
        print(f"Comprehensive upsert failed for {table_name}: {e}")
        return False

def perform_database_merge(df, table_name, key_cols):
    """
    Perform database MERGE operation for upsert
    """
    try:
        temp_table = f"{table_name}_staging"
        
        # Remove identity columns for database insert
        identity_cols = [col for col in df.columns if "IdentityId" in col or "Identity" in col]
        db_df = df
        for id_col in identity_cols:
            if id_col in db_df.columns:
                db_df = db_df.drop(id_col)

        # Write to staging table
        db_df.write \
            .format("jdbc") \
            .option("url", CONFIG["db_url"]) \
            .option("dbtable", temp_table) \
            .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
            .mode("overwrite") \
            .save()
        
        # Execute MERGE statement
        conn = pyodbc.connect(CONFIG["pyodbc_conn"])
        cursor = conn.cursor()
        
        # Build key matching condition
        key_conditions = " AND ".join([f"target.{key} = source.{key}" for key in key_cols])
        
        # Use only compare_cols (exclude audit fields and keys)
        audit_cols = ["CreatedDate", "UpdatedDate", "UpdatedTime"]
        update_cols = [col for col in compare_cols_acc if col not in key_cols and col not in audit_cols]
        
        # Build update set clause (exclude audit fields)
        update_set = ", ".join([f"target.{col} = source.{col}" for col in update_cols])

        # Build insert columns and values (include audit fields, but use GETDATE() for them)
        insert_cols = key_cols + update_cols + audit_cols
        insert_vals = [f"source.{col}" for col in key_cols + update_cols] + [
            "GETDATE()",           # CreatedDate
            "CONVERT(DATE, GETDATE())",  # UpdatedDate
            "CONVERT(TIME, GETDATE())",  # UpdatedTime
        ]

        merge_sql = f"""
        MERGE {table_name} AS target
        USING {temp_table} AS source
        ON {key_conditions}
        WHEN MATCHED THEN
            UPDATE SET {update_set},
                    UpdatedDate = CONVERT(DATE, GETDATE()),
                    UpdatedTime = CONVERT(TIME, GETDATE())
        WHEN NOT MATCHED THEN
            INSERT ({', '.join(insert_cols)})
            VALUES ({', '.join(insert_vals)});
"""
        
        print(f"Executing MERGE for {table_name}...")
        cursor.execute(merge_sql)
        rows_affected = cursor.rowcount
        conn.commit()
        
        # Clean up staging table
        cursor.execute(f"DROP TABLE IF EXISTS {temp_table}")
        conn.commit()
        cursor.close()
        conn.close()
        
        print(f"Database MERGE completed: {rows_affected} rows affected")
        return True
        
    except Exception as e:
        print(f"Database MERGE failed: {e}")
        return False

# Enhanced ETL Functions with Comprehensive Upsert
def run_account_etl_enhanced():
    """Enhanced ETL process for account data with comprehensive upsert"""
    print("\n" + "="*50)
    print("STARTING ENHANCED ACCOUNT ETL")
    print("="*50)
    
    # Load CSV data
    csv_path = CONFIG["files"]["account"]["csv"]
    raw_df = load_csv_data(csv_path, account_schema)
    
    if raw_df is None:
        print("Failed to load account CSV data.")
        return False
    
    # Clean data
    cleaned_df = clean_account_data(raw_df)
    cleaned_df = add_metadata_columns(cleaned_df)
    
    # Check if table exists
    table_name = CONFIG["files"]["account"]["table"]
    parquet_path = CONFIG["files"]["account"]["parquet"]
    table_exists = check_table_exists(table_name)
    
    if not table_exists:
        print(f"Creating new table {table_name}...")
        if not create_account_table(table_name):
            return False
    
    # Load existing data
    existing_df = load_existing_data(table_name, "account") if table_exists else None
    
    # Add identity column for new records
    if existing_df is not None:
        # Get max identity value
        max_id_result = existing_df.agg({"LoanAccountIdentityId": "max"}).collect()[0][0]
        max_id = max_id_result if max_id_result is not None else 0
        
        window_spec = Window.orderBy("LoanAccountId")
        final_df = cleaned_df.withColumn("LoanAccountIdentityId", 
                                       row_number().over(window_spec) + max_id)
    else:
        # Initial load
        final_df = prepare_for_initial_load(cleaned_df, "LoanAccountIdentityId", "LoanAccountId")
    
    # Perform comprehensive upsert
    success = perform_comprehensive_upsert(
        new_df=final_df,
        existing_df=existing_df,
        table_name=table_name,
        parquet_path=parquet_path,
        key_cols=["LoanAccountId"],
        compare_cols=compare_cols_acc
    )
    
    print(f"Account ETL completed: {'SUCCESS' if success else 'FAILED'}")
    return success

def run_balance_etl_enhanced():
    """Enhanced ETL process for balance data with comprehensive upsert"""
    print("\n" + "="*50)
    print("STARTING ENHANCED BALANCE ETL")
    print("="*50)
    
    # Load CSV data
    csv_path = CONFIG["files"]["balance"]["csv"]
    raw_df = load_csv_data(csv_path, balance_schema)
    
    if raw_df is None:
        print("Failed to load balance CSV data.")
        return False
    
    # Clean data
    cleaned_df = clean_balance_data(raw_df)
    cleaned_df = add_metadata_columns(cleaned_df)
    
    # Check if table exists
    table_name = CONFIG["files"]["balance"]["table"]
    parquet_path = CONFIG["files"]["balance"]["parquet"]
    table_exists = check_table_exists(table_name)
    
    if not table_exists:
        print(f"Creating new table {table_name}...")
        if not create_balance_table(table_name):
            return False
    
    # Load existing data
    existing_df = load_existing_data(table_name, "balance") if table_exists else None
    
    # Add identity column for new records
    if existing_df is not None:
        max_id_result = existing_df.agg({"LoanAccountBalanceIdentityId": "max"}).collect()[0][0]
        max_id = max_id_result if max_id_result is not None else 0
        
        window_spec = Window.orderBy("LoanAccountBalanceId")
        final_df = cleaned_df.withColumn("LoanAccountBalanceIdentityId", 
                                       row_number().over(window_spec) + max_id)
    else:
        final_df = prepare_for_initial_load(cleaned_df, "LoanAccountBalanceIdentityId", "LoanAccountBalanceId")
    
    # Define comparison columns
    compare_cols = [
        "LoanAccountBalanceId", "SourceId", "BalanceDateId", "LoanAccountId",
        "ProductId", "AccountCurrencyId", "AccountStatusId", "NumOfTransactions",
        "NetTransactionAmount", "NetTransactionAmountSek", "AccruedInterest",
        "AccruedInterestSEK", "Balance", "BalanceSek", "LTV", "PrecedingId"
    ]
    
    # Perform comprehensive upsert
    success = perform_comprehensive_upsert(
        new_df=final_df,
        existing_df=existing_df,
        table_name=table_name,
        parquet_path=parquet_path,
        key_cols=["LoanAccountBalanceId"],
        compare_cols=compare_cols
    )
    
    print(f"Balance ETL completed: {'SUCCESS' if success else 'FAILED'}")
    return success

def run_transaction_etl_enhanced():
    """Enhanced ETL process for transaction data with comprehensive upsert"""
    print("\n" + "="*50)
    print("STARTING ENHANCED TRANSACTION ETL")
    print("="*50)
    
    # Load CSV data
    csv_path = CONFIG["files"]["transactions"]["csv"]
    raw_df = load_csv_data(csv_path, transaction_schema)
    
    if raw_df is None:
        print("Failed to load transaction CSV data.")
        return False
    
    # Clean data
    cleaned_df = clean_transaction_data(raw_df)
    cleaned_df = add_metadata_columns(cleaned_df)
    
    # Check if table exists
    table_name = CONFIG["files"]["transactions"]["table"]
    parquet_path = CONFIG["files"]["transactions"]["parquet"]
    table_exists = check_table_exists(table_name)
    
    if not table_exists:
        print(f"Creating new table {table_name}...")
        if not create_transaction_table(table_name):
            return False
    
    # Load existing data
    existing_df = load_existing_data(table_name, "transactions") if table_exists else None
    
    # Add identity column for new records
    if existing_df is not None:
        max_id_result = existing_df.agg({"LoanAccountTransactionIdentityId": "max"}).collect()[0][0]
        max_id = max_id_result if max_id_result is not None else 0
        
        window_spec = Window.orderBy("LoanAccountTransactionId")
        final_df = cleaned_df.withColumn("LoanAccountTransactionIdentityId", 
                                       row_number().over(window_spec) + max_id)
    else:
        final_df = prepare_for_initial_load(cleaned_df, "LoanAccountTransactionIdentityId", "LoanAccountTransactionId")
    
    # Define comparison columns
    compare_cols = [
        "LoanAccountTransactionId", "SourceId", "TransactionDateId", "ValueDateId",
        "EntryDateID", "LoanAccountId", "TransactionTypeId", "TransactionStatus",
        "RectifyStatus", "TransactionCurrencyId", "TransactionAmount", "TransactionAmountSEK",
        "CounterpartClearingNumber", "CounterPartBic", "CounterPartIban", "TransactionReference",
        "ExchangeRateId", "TransactionText", "AccountServicerReference", "CounterPartId",
        "CounterPartAccountNumber", "CounterPartBankName", "TransactionDateTime",
        "IsDirectDebit", "GLAccount", "EventName", "InvoiceId"
    ]
    
    # Perform comprehensive upsert
    success = perform_comprehensive_upsert(
        new_df=final_df,
        existing_df=existing_df,
        table_name=table_name,
        parquet_path=parquet_path,
        key_cols=["LoanAccountTransactionId"],
        compare_cols=compare_cols
    )
    
    print(f"Transaction ETL completed: {'SUCCESS' if success else 'FAILED'}")
    return success

# Enhanced Full ETL Process
def run_enhanced_etl_process():
    """Run enhanced ETL for all tables with comprehensive upsert"""
    print("\n" + "="*80)
    print("STARTING ENHANCED FULL ETL PROCESS WITH COMPREHENSIVE UPSERT")
    print("="*80)
    
    success_count = 0
    total_tables = 3
    
    # Run Enhanced Account ETL
    if run_account_etl_enhanced():
        success_count += 1
    
    # Run Enhanced Balance ETL
    # if run_balance_etl_enhanced():
    #     success_count += 1
    
    # # Run Enhanced Transaction ETL
    # if run_transaction_etl_enhanced():
    #     success_count += 1
    
    print(f"\n" + "="*80)
    print(f"ENHANCED ETL PROCESS COMPLETED")
    print(f"SUCCESS: {success_count}/{total_tables} tables processed successfully")
    print("="*80)
    
    return success_count == total_tables

In [231]:
# Execute Enhanced ETL Process
run_enhanced_etl_process()


STARTING ENHANCED FULL ETL PROCESS WITH COMPREHENSIVE UPSERT

STARTING ENHANCED ACCOUNT ETL
Loaded 499 records from data/Raw_Loan_Account.csv


Cleaned account data: 499 records after deduplication
Loaded existing data from database table Loan_Account_cleansed
Starting comprehensive upsert for Loan_Account_cleansed...
Found 0 new/changed records
No changes detected. Skipping upsert.
Account ETL completed: SUCCESS

ENHANCED ETL PROCESS COMPLETED
SUCCESS: 1/3 tables processed successfully


False

In [26]:

# Read the Parquet file
df = spark.read.parquet(r"C:\Users\anton\Local Data\BI Local\7.Python\Py-class\SQL-ETL-2\data\Loan_Transaction_cleansed.parquet")
df.printSchema()

root
 |-- LoanAccountTransactionIdentityId: integer (nullable = true)
 |-- LoanAccountTransactionId: integer (nullable = true)
 |-- SourceId: integer (nullable = true)
 |-- TransactionDateId: integer (nullable = true)
 |-- ValueDateId: integer (nullable = true)
 |-- EntryDateID: integer (nullable = true)
 |-- LoanAccountId: integer (nullable = true)
 |-- TransactionTypeId: integer (nullable = true)
 |-- TransactionStatus: string (nullable = true)
 |-- RectifyStatus: string (nullable = true)
 |-- TransactionCurrencyId: integer (nullable = true)
 |-- TransactionAmount: double (nullable = true)
 |-- TransactionAmountSEK: double (nullable = true)
 |-- CounterpartClearingNumber: string (nullable = true)
 |-- CounterPartBic: string (nullable = true)
 |-- CounterPartIban: string (nullable = true)
 |-- TransactionReference: string (nullable = true)
 |-- ExchangeRateId: integer (nullable = true)
 |-- TransactionText: string (nullable = true)
 |-- AccountServicerReference: string (nullable = tru

In [29]:
df.orderBy(col("LoanAccountTransactionId").desc()).limit(10).show()
df.count()

+--------------------------------+------------------------+--------+-----------------+-----------+-----------+-------------+-----------------+-----------------+-------------+---------------------+-----------------+--------------------+-------------------------+--------------+---------------+--------------------+--------------+---------------+------------------------+-------------+------------------------+-------------------+-------------------+-------------+---------+---------+---------+--------------------+-----------+-----------+
|LoanAccountTransactionIdentityId|LoanAccountTransactionId|SourceId|TransactionDateId|ValueDateId|EntryDateID|LoanAccountId|TransactionTypeId|TransactionStatus|RectifyStatus|TransactionCurrencyId|TransactionAmount|TransactionAmountSEK|CounterpartClearingNumber|CounterPartBic|CounterPartIban|TransactionReference|ExchangeRateId|TransactionText|AccountServicerReference|CounterPartId|CounterPartAccountNumber|CounterPartBankName|TransactionDateTime|IsDirectDebit|

1475

In [None]:
def load_database_data(table_name):
    """Load data from database table"""
    try:
        df = spark.read.format("jdbc") \
            .option("url", CONFIG["db_url"]) \
            .option("dbtable", table_name) \
            .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
            .load()
        count = df.count()
        print(f"        Loaded {count} records from database table {table_name}")
        return df if count > 0 else None
    except Exception as e:
        print(f"        \nCould not load from database table {table_name}: {e}")
        return None
df = load_database_data("Loan_Transaction_cleansed")
df

        
Could not load from database table Loan_Transaction_cleansed: An error occurred while calling o90.load.
: com.microsoft.sqlserver.jdbc.SQLServerException: Invalid object name 'Loan_Transaction_cleansed'.
	at com.microsoft.sqlserver.jdbc.SQLServerException.makeFromDatabaseError(SQLServerException.java:276)
	at com.microsoft.sqlserver.jdbc.SQLServerStatement.getNextResult(SQLServerStatement.java:1787)
	at com.microsoft.sqlserver.jdbc.SQLServerPreparedStatement.doExecutePreparedStatement(SQLServerPreparedStatement.java:688)
	at com.microsoft.sqlserver.jdbc.SQLServerPreparedStatement$PrepStmtExecCmd.doExecute(SQLServerPreparedStatement.java:607)
	at com.microsoft.sqlserver.jdbc.TDSCommand.execute(IOBuffer.java:7745)
	at com.microsoft.sqlserver.jdbc.SQLServerConnection.executeCommand(SQLServerConnection.java:4700)
	at com.microsoft.sqlserver.jdbc.SQLServerStatement.executeCommand(SQLServerStatement.java:321)
	at com.microsoft.sqlserver.jdbc.SQLServerStatement.executeStatement(SQLSe

AttributeError: 'NoneType' object has no attribute 'printSchema'