In [None]:
# Star Schema Design for Loan Data Warehouse
import os
from pathlib import Path

import pyodbc
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    when, col, row_number, current_timestamp, date_format, from_utc_timestamp, lit, regexp_replace
)
from pyspark.sql.types import *
from pyspark.sql.window import Window

# Initialize Spark session.
# The location of the SQL Server JDBC driver jar is machine-specific, so it can
# be overridden with the MSSQL_JDBC_JAR environment variable; the default is the
# original local path, which keeps existing setups working unchanged.
MSSQL_JDBC_JAR = os.environ.get(
    "MSSQL_JDBC_JAR",
    r"C:\Spark\sqljdbc_12.10\enu\jars\mssql-jdbc-12.10.0.jre8.jar",
)

spark = (
    SparkSession.builder
    .appName("ETL_Assignment2_StarSchema")
    .config("spark.jars", MSSQL_JDBC_JAR)
    .getOrCreate()
)


In [696]:

# Create the star schema database (only if it does not exist yet) and build the
# connection settings used by the rest of the notebook.
#
# Server, user and password are defined once and can be overridden through
# environment variables, so real credentials do not have to be stored in the
# notebook. The defaults reproduce the original local-development values.
SQL_SERVER_HOST = os.environ.get("STAR_SQL_SERVER", "PC-W11")
SQL_SERVER_USER = os.environ.get("STAR_SQL_USER", "admin")
SQL_SERVER_PASSWORD = os.environ.get("STAR_SQL_PASSWORD", "sql")
STAR_DB_NAME = "ETL_Assignment2_Star"
ODBC_DRIVER = "{ODBC Driver 17 for SQL Server}"

create_db_query = """
IF NOT EXISTS (SELECT name FROM master.dbo.sysdatabases WHERE name = 'ETL_Assignment2_Star')
BEGIN
    CREATE DATABASE ETL_Assignment2_Star;
    PRINT 'Database ETL_Assignment2_Star created successfully.'
END
ELSE
BEGIN
    PRINT 'Database ETL_Assignment2_Star already exists.'
END
"""

# Server-level connection (no DATABASE=) because the target database may not
# exist yet. autocommit=True is kept from the original (presumably because
# CREATE DATABASE has to run outside a transaction).
conn = pyodbc.connect(
    f"DRIVER={ODBC_DRIVER};SERVER={SQL_SERVER_HOST};UID={SQL_SERVER_USER};PWD={SQL_SERVER_PASSWORD}",
    autocommit=True,
)
try:
    cursor = conn.cursor()
    cursor.execute(create_db_query)
finally:
    # Release the connection even when the statement fails.
    conn.close()

# Configuration for Star Schema
STAR_CONFIG = {
    # JDBC URL (for Spark's JDBC reader/writer).
    "db_url": (
        f"jdbc:sqlserver://{SQL_SERVER_HOST}:1433;databaseName={STAR_DB_NAME};"
        f"user={SQL_SERVER_USER};password={SQL_SERVER_PASSWORD};"
        "encrypt=false;trustServerCertificate=true"
    ),
    # ODBC connection string used by the pyodbc-based DDL/DML helpers.
    "pyodbc_conn": (
        f"DRIVER={ODBC_DRIVER};SERVER={SQL_SERVER_HOST};DATABASE={STAR_DB_NAME};"
        f"UID={SQL_SERVER_USER};PWD={SQL_SERVER_PASSWORD}"
    ),
}

print("Star Schema configuration loaded successfully.")


Star Schema configuration loaded successfully.


In [697]:
# Star Schema Table Creation Functions

def create_dim_date():
    """Create the ``Dim_Date`` dimension table.

    Connects with ``STAR_CONFIG["pyodbc_conn"]``, runs a plain ``CREATE TABLE``
    and commits. The statement is not guarded by an existence check, so the
    call reports an error and returns ``False`` when ``Dim_Date`` already exists.

    Returns:
        bool: ``True`` if the table was created, ``False`` if any error occurred
        (the error is printed, never raised).
    """
    try:
        create_sql = """
        CREATE TABLE Dim_Date (
            date_key INT PRIMARY KEY,
            full_date DATE NOT NULL,
            year INT NOT NULL,
            quarter INT NOT NULL,
            month INT NOT NULL,
            month_name NVARCHAR(20) NOT NULL,
            day INT NOT NULL,
            day_of_week INT NOT NULL,
            day_name NVARCHAR(20) NOT NULL,
            week_of_year INT NOT NULL,
            is_weekend BIT NOT NULL,
            is_holiday BIT DEFAULT 0,
            created_date DATETIME DEFAULT GETDATE(),
            updated_date DATE DEFAULT GETDATE(),
            updated_time TIME DEFAULT GETDATE()
        );
        """
        conn = pyodbc.connect(STAR_CONFIG["pyodbc_conn"])
        try:
            cursor = conn.cursor()
            cursor.execute(create_sql)
            conn.commit()
            cursor.close()
        finally:
            # Close on every path; previously the connection leaked whenever
            # execute()/commit() raised.
            conn.close()
        print("Dim_Date table created successfully.")
        return True
    except Exception as e:
        print(f"Error creating Dim_Date table: {e}")
        return False

def create_dim_account():
    """Create the ``Dim_Account`` dimension table.

    The table has an IDENTITY surrogate key (``account_key``) and a UNIQUE
    business key (``account_id``). The ``CREATE TABLE`` is not guarded by an
    existence check, so the call returns ``False`` when the table already exists.

    Returns:
        bool: ``True`` if the table was created, ``False`` if any error occurred
        (the error is printed, never raised).
    """
    try:
        create_sql = """
        CREATE TABLE Dim_Account (
            account_key INT IDENTITY(1,1) PRIMARY KEY,
            account_id INT NOT NULL UNIQUE,
            account_number NVARCHAR(50) NOT NULL,
            account_currency NVARCHAR(3) NOT NULL,
            organization_name NVARCHAR(50) NOT NULL,
            channel_id INT,
            broker_id INT NOT NULL,
            product_name NVARCHAR(30) NOT NULL,
            open_date DATE NOT NULL,
            cancelled_date DATE,
            value_date DATE NOT NULL,
            invoice_day TINYINT NOT NULL,
            current_installment_amount DECIMAL(12,2) NOT NULL,
            current_invoice_fee DECIMAL(12,2) NOT NULL,
            next_invoice_date DATE NOT NULL,
            calculated_maturity_date DATE,
            is_active BIT DEFAULT 0,
            created_date DATETIME NOT NULL,
            updated_date DATE NOT NULL,
            updated_time TIME NOT NULL
        );
        """
        conn = pyodbc.connect(STAR_CONFIG["pyodbc_conn"])
        try:
            cursor = conn.cursor()
            cursor.execute(create_sql)
            conn.commit()
            cursor.close()
        finally:
            # Close on every path; previously the connection leaked whenever
            # execute()/commit() raised.
            conn.close()
        print("Dim_Account table created successfully.")
        return True
    except Exception as e:
        print(f"Error creating Dim_Account table: {e}")
        return False

def create_dim_organization():
    """Create the ``Dim_Organization`` dimension table.

    The table has an IDENTITY surrogate key (``organization_key``) and a UNIQUE
    business key (``organization_id``). The ``CREATE TABLE`` is not guarded by
    an existence check, so the call returns ``False`` when the table already exists.

    Returns:
        bool: ``True`` if the table was created, ``False`` if any error occurred
        (the error is printed, never raised).
    """
    try:
        create_sql = """
        CREATE TABLE Dim_Organization (
            organization_key INT IDENTITY(1,1) PRIMARY KEY,
            organization_id INT NOT NULL UNIQUE,
            organization_name NVARCHAR(50) NOT NULL,
            source_id INT NOT NULL,
            created_date DATETIME NOT NULL,
            updated_date DATE NOT NULL,
            updated_time TIME NOT NULL
        );
        """
        conn = pyodbc.connect(STAR_CONFIG["pyodbc_conn"])
        try:
            cursor = conn.cursor()
            cursor.execute(create_sql)
            conn.commit()
            cursor.close()
        finally:
            # Close on every path; previously the connection leaked whenever
            # execute()/commit() raised.
            conn.close()
        print("Dim_Organization table created successfully.")
        return True
    except Exception as e:
        print(f"Error creating Dim_Organization table: {e}")
        return False

def create_dim_product():
    """Create the ``Dim_Product`` dimension table.

    The table has an IDENTITY surrogate key (``product_key``) and a UNIQUE
    business key (``product_id``). The ``CREATE TABLE`` is not guarded by an
    existence check, so the call returns ``False`` when the table already exists.

    Returns:
        bool: ``True`` if the table was created, ``False`` if any error occurred
        (the error is printed, never raised).
    """
    try:
        create_sql = """
        CREATE TABLE Dim_Product (
            product_key INT IDENTITY(1,1) PRIMARY KEY,
            product_id INT NOT NULL UNIQUE,
            product_name NVARCHAR(30) NOT NULL,
            -- ProductCategory NVARCHAR(50),
            created_date DATETIME NOT NULL,
            updated_date DATE NOT NULL,
            updated_time TIME NOT NULL
        );
        """
        conn = pyodbc.connect(STAR_CONFIG["pyodbc_conn"])
        try:
            cursor = conn.cursor()
            cursor.execute(create_sql)
            conn.commit()
            cursor.close()
        finally:
            # Close on every path; previously the connection leaked whenever
            # execute()/commit() raised.
            conn.close()
        print("Dim_Product table created successfully.")
        return True
    except Exception as e:
        print(f"Error creating Dim_Product table: {e}")
        return False

def create_dim_currency():
    """Create the ``Dim_Currency`` dimension table.

    The table has an IDENTITY surrogate key (``currency_key``) and a UNIQUE
    business key (``currency_id``). The ``CREATE TABLE`` is not guarded by an
    existence check, so the call returns ``False`` when the table already exists.

    Returns:
        bool: ``True`` if the table was created, ``False`` if any error occurred
        (the error is printed, never raised).
    """
    try:
        create_sql = """
        CREATE TABLE Dim_Currency (
            currency_key INT IDENTITY(1,1) PRIMARY KEY,
            currency_id INT NOT NULL UNIQUE,
            currency_code NVARCHAR(3) NOT NULL,
            -- is_base_currency AS (CASE WHEN currency_id IS 49 THEN 1 ELSE 0 END) PERSISTED,
            created_date DATETIME NOT NULL,
            updated_date DATE NOT NULL,
            updated_time TIME NOT NULL
        );
        """
        conn = pyodbc.connect(STAR_CONFIG["pyodbc_conn"])
        try:
            cursor = conn.cursor()
            cursor.execute(create_sql)
            conn.commit()
            cursor.close()
        finally:
            # Close on every path; previously the connection leaked whenever
            # execute()/commit() raised.
            conn.close()
        print("Dim_Currency table created successfully.")
        return True
    except Exception as e:
        print(f"Error creating Dim_Currency table: {e}")
        return False

def create_dim_transaction_type():
    """Create the ``Dim_Transaction_Type`` dimension table.

    The table has an IDENTITY surrogate key (``transaction_type_key``) and a
    UNIQUE business key (``transaction_type_id``). The ``CREATE TABLE`` is not
    guarded by an existence check, so the call returns ``False`` when the table
    already exists.

    Returns:
        bool: ``True`` if the table was created, ``False`` if any error occurred
        (the error is printed, never raised).
    """
    try:
        create_sql = """
        CREATE TABLE Dim_Transaction_Type (
            transaction_type_key INT IDENTITY(1,1) PRIMARY KEY,
            transaction_type_id INT NOT NULL UNIQUE,
            created_date DATETIME NOT NULL,
            updated_date DATE NOT NULL,
            updated_time TIME NOT NULL
        );
        """
        conn = pyodbc.connect(STAR_CONFIG["pyodbc_conn"])
        try:
            cursor = conn.cursor()
            cursor.execute(create_sql)
            conn.commit()
            cursor.close()
        finally:
            # Close on every path; previously the connection leaked whenever
            # execute()/commit() raised.
            conn.close()
        print("Dim_Transaction_Type table created successfully.")
        return True
    except Exception as e:
        print(f"Error creating Dim_Transaction_Type table: {e}")
        return False

# Cell-level confirmation that the dimension-table creator definitions in this cell were executed.
print("Dimension table creation functions loaded.")

Dimension table creation functions loaded.


In [698]:
# Fact Table Creation Functions

def create_fact_loan_balance():
    """Create the ``Fact_Balance`` fact table.

    The table holds dimension key columns, the original source ids, four
    balance/interest measures and audit columns. The ``CREATE TABLE`` is not
    guarded by an existence check, so the call returns ``False`` when the table
    already exists.

    Returns:
        bool: ``True`` if the table was created, ``False`` if any error occurred
        (the error is printed, never raised).
    """
    try:
        # NOTE(review): the FOREIGN KEY clauses are inside a SQL block comment,
        # so no referential constraints are created for this table. They also
        # reference PascalCase columns (AccountKey, DateKey, ...) while the
        # dimension tables in this notebook define snake_case columns
        # (account_key, date_key, ...), so they would need renaming before
        # being enabled.
        create_sql = """
        CREATE TABLE Fact_Balance (
            BalanceKey BIGINT IDENTITY(1,1) PRIMARY KEY,
            AccountKey INT NOT NULL,
            ProductKey INT NOT NULL,
            OrganizationKey INT NOT NULL,
            CurrencyKey INT NOT NULL,
            BalanceDateKey INT NOT NULL,

            -- Original IDs for reference
            LoanAccountBalanceId INT NOT NULL,
            LoanAccountId INT NOT NULL,
            AccountStatusId INT NOT NULL,

            -- Measures
            AccruedInterest DECIMAL(15,5) DEFAULT 0,
            AccruedInterestSEK DECIMAL(15,5) DEFAULT 0,
            Balance DECIMAL(15,5) DEFAULT 0,
            BalanceSek DECIMAL(15,5) DEFAULT 0,

            -- Metadata
            created_date DATETIME NOT NULL,
            updated_date DATE NOT NULL,
            updated_time TIME NOT NULL

            -- Foreign Keys
            /*
            FOREIGN KEY (AccountKey) REFERENCES Dim_Account(AccountKey),
            FOREIGN KEY (ProductKey) REFERENCES Dim_Product(ProductKey),
            FOREIGN KEY (OrganizationKey) REFERENCES Dim_Organization(OrganizationKey),
            FOREIGN KEY (CurrencyKey) REFERENCES Dim_Currency(CurrencyKey),
            FOREIGN KEY (BalanceDateKey) REFERENCES Dim_Date(DateKey)*/
        );
        """
        conn = pyodbc.connect(STAR_CONFIG["pyodbc_conn"])
        try:
            cursor = conn.cursor()
            cursor.execute(create_sql)
            conn.commit()
            cursor.close()
        finally:
            # Close on every path; previously the connection leaked whenever
            # execute()/commit() raised.
            conn.close()
        print("Fact_Balance table created successfully.")
        return True
    except Exception as e:
        print(f"Error creating Fact_Balance table: {e}")
        return False

def create_fact_loan_transaction():
    """Create the ``Fact_Transaction`` fact table.

    The table declares foreign keys to ``Dim_Date(date_key)``,
    ``Dim_Account(account_id)``, ``Dim_Transaction_Type(transaction_type_id)``
    and ``Dim_Currency(currency_id)``, so those dimension tables must exist
    first. The ``CREATE TABLE`` is not guarded by an existence check, so the
    call returns ``False`` when the table already exists.

    Returns:
        bool: ``True`` if the table was created, ``False`` if any error occurred
        (the error is printed, never raised).
    """
    try:
        # Only transaction_date is constrained against Dim_Date; value_date and
        # entry_date are plain INT columns without a foreign key.
        create_sql = """
        CREATE TABLE Fact_Transaction (
            transaction_key INT IDENTITY(1,1) PRIMARY KEY,
            transaction_id INT NOT NULL,
            transaction_date INT NOT NULL,
            value_date INT NOT NULL,
            entry_date INT NOT NULL,
            account_id INT NOT NULL,
            transaction_type_id INT NOT NULL,
            transaction_amount DECIMAL(12,5) NOT NULL,
            transaction_amount_SEK DECIMAL(12,5) NOT NULL,
            transaction_reference NVARCHAR(50),
            currency_id INT NOT NULL,
            exchange_rate_id INT,

            created_date DATETIME NOT NULL,
            updated_date DATE NOT NULL,
            updated_time TIME NOT NULL,

            FOREIGN KEY (transaction_date) REFERENCES Dim_Date(date_key),
            FOREIGN KEY (account_id) REFERENCES Dim_Account(account_id),
            FOREIGN KEY (transaction_type_id) REFERENCES Dim_Transaction_Type(transaction_type_id),
            FOREIGN KEY (currency_id) REFERENCES Dim_Currency(currency_id)
        );
        """
        conn = pyodbc.connect(STAR_CONFIG["pyodbc_conn"])
        try:
            cursor = conn.cursor()
            cursor.execute(create_sql)
            conn.commit()
            cursor.close()
        finally:
            # Close on every path; previously the connection leaked whenever
            # execute()/commit() raised.
            conn.close()
        print("Fact_Transaction table created successfully.")
        return True
    except Exception as e:
        print(f"Error creating Fact_Transaction table: {e}")
        return False


# Cell-level confirmation that the fact-table creator definitions in this cell were executed.
print("Fact table creation functions loaded.")

Fact table creation functions loaded.


In [699]:
# Create All Star Schema Tables
def create_star_schema():
    """Run every table creator of the star schema and report the outcome.

    Dimension tables are created first, then the fact tables. A banner is
    printed before the run and a "<created>/8" summary after it.

    Returns:
        bool: True only when all eight creator calls reported success.
    """
    banner = "=" * 60
    print(banner)
    print("CREATING STAR SCHEMA TABLES")
    print(banner)

    total_tables = 8
    outcomes = []

    # Dimensions (a counterpart dimension is not created yet).
    print("\nCreating Dimension Tables...")
    outcomes.append(create_dim_date())
    outcomes.append(create_dim_account())
    outcomes.append(create_dim_organization())
    outcomes.append(create_dim_product())
    outcomes.append(create_dim_currency())
    outcomes.append(create_dim_transaction_type())

    # Facts (an account snapshot fact is not created yet).
    print("\nCreating Fact Tables...")
    outcomes.append(create_fact_loan_balance())
    outcomes.append(create_fact_loan_transaction())

    success_count = sum(1 for created in outcomes if created)

    print(f"\nStar Schema Creation COMPLETED: \n{success_count}/{total_tables} tables created successfully")
    print(banner)

    return success_count == total_tables


In [700]:
# Function to populate star schema tables from source tables
def populate_analysis_table(populate_sql, table_to, table_from):
    """Run one population statement against the star schema database.

    Args:
        populate_sql (str): SQL statement to execute (typically an
            ``INSERT ... SELECT``).
        table_to (str): Target table name, used only in the printed messages.
        table_from (str): Source table name, used only in the error message.

    Returns:
        bool: ``True`` if the statement executed and was committed, ``False``
        if any error occurred (the error is printed, never raised).
    """
    try:
        conn = pyodbc.connect(STAR_CONFIG["pyodbc_conn"])
        try:
            cursor = conn.cursor()
            cursor.execute(populate_sql)
            conn.commit()
            cursor.close()
        finally:
            # pyodbc's ``with connection:`` / ``with cursor:`` only commit on
            # exit; they do not close anything, so the previous implementation
            # left one open connection behind per call. Close explicitly.
            conn.close()
        print(f"Table {table_to} populated successfully.")
        return True
    except Exception as e:
        print(f"Error populating table {table_to} from {table_from}:\nError: {e}")
        return False

# Populate Date Dimension (Required for all facts)
def populate_dim_date():
    """Populate ``Dim_Date`` with one row per calendar day.

    A recursive CTE generates every date from 2017-01-01 through 2025-12-31
    (inclusive); ``date_key`` is the date encoded as the integer ``yyyymmdd``.
    The load is a plain ``INSERT``, so running it again on an already
    populated table fails on the primary key and returns ``False``.

    Returns:
        bool: ``True`` if the rows were inserted and committed, ``False`` if
        any error occurred (the error is printed, never raised).
    """
    try:
        # is_weekend is computed as (weekday + @@DATEFIRST) % 7, which maps
        # Saturday to 0 and Sunday to 1 regardless of the session's DATEFIRST
        # setting. The previous test ``DATEPART(weekday) IN (1, 7)`` is only
        # correct when DATEFIRST is 7 (week starting on Sunday); for that
        # setting both expressions give the same result.
        # day_of_week, day_name and month_name are still taken directly from
        # DATEPART/DATENAME and therefore follow the session settings.
        populate_sql = """
        WITH DateRange AS (
            SELECT CAST('2017-01-01' AS DATE) AS DateValue
            UNION ALL
            SELECT DATEADD(day, 1, DateValue)
            FROM DateRange
            WHERE DateValue < '2025-12-31'
        )
        INSERT INTO Dim_Date (
            date_key, full_date, year, quarter, month, month_name,
            day, day_of_week, day_name, week_of_year, is_weekend
        )
        SELECT
            YEAR(DateValue) * 10000 + MONTH(DateValue) * 100 + DAY(DateValue) AS date_key,
            DateValue AS full_date,
            YEAR(DateValue) AS year,
            DATEPART(quarter, DateValue) AS quarter,
            MONTH(DateValue) AS month,
            DATENAME(month, DateValue) AS month_name,
            DAY(DateValue) AS day,
            DATEPART(weekday, DateValue) AS day_of_week,
            DATENAME(weekday, DateValue) AS day_name,
            DATEPART(week, DateValue) AS week_of_year,
            CASE WHEN (DATEPART(weekday, DateValue) + @@DATEFIRST) % 7 IN (0, 1) THEN 1 ELSE 0 END AS is_weekend
        FROM DateRange
        OPTION (MAXRECURSION 0);
        """

        conn = pyodbc.connect(STAR_CONFIG["pyodbc_conn"])
        try:
            cursor = conn.cursor()
            cursor.execute(populate_sql)
            conn.commit()
            cursor.close()
        finally:
            # Close on every path; previously the connection leaked whenever
            # execute()/commit() raised (e.g. on a duplicate-key error).
            conn.close()
        print("Table [Dim_Date] populated successfully.")
        return True
    except Exception as e:
        print(f"Error populating Dim_Date: {e}")
        return False

# Dim_Account load: one INSERT ... SELECT that copies the account attributes
# from the cleansed warehouse table into the star schema dimension.
table_to_acc, table_from_acc = "[Dim_Account]", "[Loan_Account_cleansed]"
populate_sql_acc = """
INSERT INTO [ETL_Assignment2_Star].[dbo].{target}(
    account_id,
    account_number,
    account_currency,
    organization_name,
    channel_id,
    broker_id,
    product_name,
    open_date,
    cancelled_date,
    value_date,
    invoice_day,
    current_installment_amount,
    current_invoice_fee,
    next_invoice_date,
    calculated_maturity_date,
    is_active,
    created_date,
    updated_date,
    updated_time
)
SELECT
    LoanAccountId,
    AccountNumber,
    AccountCurrency,
    OrganizationName,
    ChannelID,
    BrokerId,
    Product,
    OpenDate,
    CancelledDate,
    ValueDate,
    InvoiceDay,
    CurrentInstallmentAmount,
    CurrentInvoiceFee,
    NextInvoiceDate,
    CalculatedMaturityDate,
    IsActive as is_active,
    CreatedDate,
    UpdatedDate,
    UpdatedTime
FROM [ETL_Assignment2_Warehouse].[dbo].{source};
""".format(target=table_to_acc, source=table_from_acc)

# Populate Dim_Organization from Loan_Account_cleansed.
# The source holds account rows, so one organization can appear many times.
# SELECT DISTINCT over the id *and* the audit columns only removes fully
# identical rows, which can still yield several rows per OrganizationId and
# break the UNIQUE constraint on organization_id. ROW_NUMBER() keeps exactly
# one row per OrganizationId (the most recently updated one).
table_to_org = "[Dim_Organization]"
table_from_org = "[Loan_Account_cleansed]"
populate_sql_org = f"""
INSERT INTO [ETL_Assignment2_Star].[dbo].{table_to_org}(
    organization_id,
    organization_name,
    source_id,
    created_date,
    updated_date,
    updated_time
)
SELECT
    OrganizationId,
    OrganizationName,
    SourceId,
    CreatedDate,
    UpdatedDate,
    UpdatedTime
FROM (
    SELECT
        OrganizationId,
        OrganizationName,
        SourceId,
        CreatedDate,
        UpdatedDate,
        UpdatedTime,
        ROW_NUMBER() OVER (
            PARTITION BY OrganizationId
            ORDER BY UpdatedDate DESC, UpdatedTime DESC
        ) AS rn
    FROM [ETL_Assignment2_Warehouse].[dbo].{table_from_org}
) AS ranked
WHERE rn = 1;
"""

# Populate Dim_Product from Loan_Account_cleansed.
# The source holds account rows, so one product can appear many times.
# SELECT DISTINCT over the id *and* the audit columns only removes fully
# identical rows, which can still yield several rows per ProductId and break
# the UNIQUE constraint on product_id. ROW_NUMBER() keeps exactly one row per
# ProductId (the most recently updated one).
table_to_prod = "[Dim_Product]"
table_from_prod = "[Loan_Account_cleansed]"
populate_sql_prod = f"""
INSERT INTO [ETL_Assignment2_Star].[dbo].{table_to_prod}(
    product_id,
    product_name,
    created_date,
    updated_date,
    updated_time
)
SELECT
    ProductId,
    Product,
    CreatedDate,
    UpdatedDate,
    UpdatedTime
FROM (
    SELECT
        ProductId,
        Product,
        CreatedDate,
        UpdatedDate,
        UpdatedTime,
        ROW_NUMBER() OVER (
            PARTITION BY ProductId
            ORDER BY UpdatedDate DESC, UpdatedTime DESC
        ) AS rn
    FROM [ETL_Assignment2_Warehouse].[dbo].{table_from_prod}
) AS ranked
WHERE rn = 1;
"""

# Populate Dim_Currency from Loan_Account_cleansed.
# The source holds account rows, so one currency can appear many times.
# SELECT DISTINCT over the id *and* the audit columns only removes fully
# identical rows, which can still yield several rows per AccountCurrencyId and
# break the UNIQUE constraint on currency_id. ROW_NUMBER() keeps exactly one
# row per AccountCurrencyId (the most recently updated one).
table_to_currency = "[Dim_Currency]"
table_from_currency = "[Loan_Account_cleansed]"
populate_sql_currency = f"""
INSERT INTO [ETL_Assignment2_Star].[dbo].{table_to_currency}(
    currency_id,
    currency_code,
    created_date,
    updated_date,
    updated_time
)
SELECT
    AccountCurrencyId,
    AccountCurrency,
    CreatedDate,
    UpdatedDate,
    UpdatedTime
FROM (
    SELECT
        AccountCurrencyId,
        AccountCurrency,
        CreatedDate,
        UpdatedDate,
        UpdatedTime,
        ROW_NUMBER() OVER (
            PARTITION BY AccountCurrencyId
            ORDER BY UpdatedDate DESC, UpdatedTime DESC
        ) AS rn
    FROM [ETL_Assignment2_Warehouse].[dbo].{table_from_currency}
) AS ranked
WHERE rn = 1;
"""

# Populate Dim_Transaction_Type from Loan_Transaction_cleansed.
# The source holds transaction rows, so one transaction type can appear many
# times. SELECT DISTINCT over the id *and* the audit columns only removes fully
# identical rows, which can still yield several rows per TransactionTypeId and
# break the UNIQUE constraint on transaction_type_id. ROW_NUMBER() keeps
# exactly one row per TransactionTypeId (the most recently updated one).
table_to_transaction_type = "[Dim_Transaction_Type]"
table_from_transaction_type = "[Loan_Transaction_cleansed]"
populate_sql_transaction_type = f"""
INSERT INTO [ETL_Assignment2_Star].[dbo].{table_to_transaction_type}(
    transaction_type_id,
    created_date,
    updated_date,
    updated_time
)
SELECT
    TransactionTypeId,
    CreatedDate,
    UpdatedDate,
    UpdatedTime
FROM (
    SELECT
        TransactionTypeId,
        CreatedDate,
        UpdatedDate,
        UpdatedTime,
        ROW_NUMBER() OVER (
            PARTITION BY TransactionTypeId
            ORDER BY UpdatedDate DESC, UpdatedTime DESC
        ) AS rn
    FROM [ETL_Assignment2_Warehouse].[dbo].{table_from_transaction_type}
) AS ranked
WHERE rn = 1;
"""

In [701]:
# Populate All Star Schema Tables
def populate_star_schema():
    """Load data into the star schema tables and report the outcome.

    Runs ``populate_dim_date()`` and then one ``populate_analysis_table`` call
    per remaining dimension. No fact-table loads are defined yet.

    The reported total is derived from the loads that actually ran. It used to
    be hard-coded to 8 while only 6 loads exist, so the function could never
    return ``True``.

    Returns:
        bool: ``True`` only when every executed load reported success.
    """
    banner = "=" * 60
    print(banner)
    print("POPULATING STAR SCHEMA TABLES")
    print(banner)

    results = []

    # Populate dimension tables: the date dimension has its own generator,
    # the others are INSERT ... SELECT statements run by the generic helper.
    print("\nPopulating Dimension Tables...")
    results.append(populate_dim_date())

    dimension_loads = (
        (populate_sql_acc, table_to_acc, table_from_acc),
        (populate_sql_org, table_to_org, table_from_org),
        (populate_sql_prod, table_to_prod, table_from_prod),
        (populate_sql_currency, table_to_currency, table_from_currency),
        (populate_sql_transaction_type, table_to_transaction_type, table_from_transaction_type),
    )
    for load_sql, target_table, source_table in dimension_loads:
        results.append(
            populate_analysis_table(
                populate_sql=load_sql,
                table_to=target_table,
                table_from=source_table,
            )
        )

    # Populate fact tables: nothing to run yet. Append the results of future
    # fact loads to `results` so they are counted automatically.
    print("\nPopulating Fact Tables...")

    success_count = sum(1 for ok in results if ok)
    total_tables = len(results)

    print(f"\nStar Schema Data Load COMPLETED: \n{success_count}/{total_tables} tables populated successfully")
    print("=" * 60 + "\n")

    return success_count == total_tables

In [702]:
# Execute star schema creation.
# NOTE(review): every creator issues an unguarded CREATE TABLE, so running this
# cell against a database that already contains the tables reports one error
# per table and the call evaluates to False (as in the recorded output).
create_star_schema()

CREATING STAR SCHEMA TABLES

Creating Dimension Tables...
Error creating Dim_Date table: ('42S01', "[42S01] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]There is already an object named 'Dim_Date' in the database. (2714) (SQLExecDirectW)")
Error creating Dim_Account table: ('42S01', "[42S01] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]There is already an object named 'Dim_Account' in the database. (2714) (SQLExecDirectW)")
Error creating Dim_Organization table: ('42S01', "[42S01] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]There is already an object named 'Dim_Organization' in the database. (2714) (SQLExecDirectW)")
Error creating Dim_Product table: ('42S01', "[42S01] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]There is already an object named 'Dim_Product' in the database. (2714) (SQLExecDirectW)")
Error creating Dim_Currency table: ('42S01', "[42S01] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]There is already an object named 'Dim_Curr

False

In [703]:

# Execute star schema population.
# NOTE(review): the loads are plain INSERT statements, so running this cell
# again on already populated tables violates the primary/unique key
# constraints (as in the recorded output).
populate_star_schema()

POPULATING STAR SCHEMA TABLES

Populating Dimension Tables...
Error populating Dim_Date: ('23000', "[23000] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Violation of PRIMARY KEY constraint 'PK__Dim_Date__67370B44ED0C8EB9'. Cannot insert duplicate key in object 'dbo.Dim_Date'. The duplicate key value is (20170101). (2627) (SQLExecDirectW); [23000] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]The statement has been terminated. (3621)")
Error populating table [Dim_Account] from [Loan_Account_cleansed]:
Error: ('23000', "[23000] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Violation of UNIQUE KEY constraint 'UQ__Dim_Acco__46A222CCA1CC813F'. Cannot insert duplicate key in object 'dbo.Dim_Account'. The duplicate key value is (1). (2627) (SQLExecDirectW); [23000] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]The statement has been terminated. (3621)")
Error populating table [Dim_Organization] from [Loan_Account_cleansed]:
Error: ('23000', "[23000] [Microsoft

False

In [704]:
# NOTE(review): this message is printed unconditionally; it does not check the
# results of the creation/population cells, and Fact_Balance is created with
# its FOREIGN KEY clauses commented out.
print("- Proper foreign key relationships established")

- Proper foreign key relationships established
