In [1]:
# Configuration and Setup
import os
os.environ['JAVA_HOME'] = r'C:\Users\anton\.conda\envs\conda-env\Library\lib\jvm'
os.environ['HADOOP_HOME'] = r'C:\Hadoop\hadoop-3.3.6'
os.environ['PATH'] += r';C:\Hadoop\hadoop-3.3.6\bin'
os.environ['SPARK_HOME'] = r'C:\Users\anton\.conda\envs\conda-env\Lib\site-packages\pyspark'

from pyspark.sql import SparkSession
import pyodbc

In [2]:
sql_server = "PC-W11"
database_name = "ETL_Assignment2_Star" # Do not change this
user = "admin"
password = "sql"

In [3]:
spark = SparkSession.builder.appName(database_name).getOrCreate()

In [4]:
STAR_CONFIG = {
    "db_url": f"jdbc:sqlserver://{sql_server}:1433;"
              f"databaseName={database_name};"
              f"user={user};"
              f"password={password};"
              "encrypt=false;"
              "trustServerCertificate=true",

    "pyodbc_conn": f"DRIVER={{ODBC Driver 17 for SQL Server}};"
                   f"SERVER={sql_server};"
                   f"DATABASE={database_name};"
                   f"UID={user};"
                   f"PWD={password};"
}

In [5]:
# Create the star schema database
conn = pyodbc.connect(STAR_CONFIG["pyodbc_conn"], autocommit=True)
cursor = conn.cursor()

create_db_query = """
IF NOT EXISTS (SELECT name FROM master.dbo.sysdatabases WHERE name = 'ETL_Assignment2_Star')
BEGIN
    CREATE DATABASE ETL_Assignment2_Star;
    PRINT 'Database ETL_Assignment2_Star created successfully.'
END
ELSE
BEGIN
    PRINT 'Database ETL_Assignment2_Star already exists.'
END
"""
cursor.execute(create_db_query)
# cursor.close()
# conn.close()

print("\n" + "=" * 60)
print("ETL ASSIGNMENT 2: STAR SCHEMA DESIGN")
print("=" * 60)
print("\nSTEP 1: Star Schema configuration loaded successfully.")


ETL ASSIGNMENT 2: STAR SCHEMA DESIGN

STEP 1: Star Schema configuration loaded successfully.


In [None]:
# Star Schema Dim Table Creation Functions
def create_dim_date():
    """Create Date Dimension Table"""
    try:

        create_sql = """
        CREATE TABLE Dim_Date (
            date_key INT PRIMARY KEY,
            full_date DATE NOT NULL,
            year INT NOT NULL,
            quarter INT NOT NULL,
            month INT NOT NULL,
            month_name NVARCHAR(20) NOT NULL,
            day INT NOT NULL,
            day_of_week INT NOT NULL,
            day_name NVARCHAR(20) NOT NULL,
            week_of_year INT NOT NULL,
            is_weekend BIT NOT NULL,
            is_holiday BIT DEFAULT 0,
            created_date DATETIME DEFAULT GETDATE(),
            updated_date DATE DEFAULT GETDATE(),
            updated_time TIME DEFAULT GETDATE()
        );
        """
        cursor.execute(create_sql)
        # conn.commit()
        # cursor.close()
        # conn.close()
        print("        Dim_Date table created successfully.")
        return True
    except Exception as e:
        print(f"        \nError creating Dim_Date table: {e}")
        return False

def create_dim_account():
    """Create Account Dimension Table"""
    try:

        create_sql = """
        CREATE TABLE Dim_Account (
            account_key INT IDENTITY(1,1) PRIMARY KEY,
            account_id INT NOT NULL,
            account_number NVARCHAR(50) NOT NULL,
            currency_code NVARCHAR(3) NOT NULL,
            organization_name NVARCHAR(50) NOT NULL,
            channel_id INT,
            broker_id INT NOT NULL,
            product_name NVARCHAR(30) NOT NULL,
            open_date DATE NOT NULL,
            cancelled_date DATE,
            value_date DATE NOT NULL,
            invoice_day TINYINT NOT NULL,
            current_installment_amount DECIMAL(12,2) NOT NULL,
            current_invoice_fee DECIMAL(12,2) NOT NULL,
            next_invoice_date DATE NOT NULL,
            calculated_maturity_date DATE,
            is_active BIT DEFAULT 0,
            created_date DATETIME NOT NULL,
            updated_date DATE NOT NULL,
            updated_time TIME NOT NULL
        );
        """
        cursor.execute(create_sql)
        # conn.commit()
        # cursor.close()
        # conn.close()
        print("        Dim_Account table created successfully.")
        return True
    except Exception as e:
        print(f"        \nError creating Dim_Account table: {e}")
        return False

def create_dim_organization():
    """Create Organization Dimension Table"""
    try:

        create_sql = """
        CREATE TABLE Dim_Organization (
            organization_key INT IDENTITY(1,1) PRIMARY KEY,
            organization_id INT NOT NULL,
            organization_name NVARCHAR(50) NOT NULL,
            source_id INT NOT NULL,
            created_date DATETIME NOT NULL,
            updated_date DATE NOT NULL,
            updated_time TIME NOT NULL
        );
        """
        cursor.execute(create_sql)
        # conn.commit()
        # cursor.close()
        # conn.close()
        print("        Dim_Organization table created successfully.")
        return True
    except Exception as e:
        print(f"        \nError creating Dim_Organization table: {e}")
        return False

def create_dim_product():
    """Create Product Dimension Table"""
    try:

        create_sql = """
        CREATE TABLE Dim_Product (
            product_key INT IDENTITY(1,1) PRIMARY KEY,
            product_id INT NOT NULL,
            product_name NVARCHAR(30) NOT NULL,
            -- ProductCategory NVARCHAR(50),
            created_date DATETIME NOT NULL,
            updated_date DATE NOT NULL,
            updated_time TIME NOT NULL
        );
        """
        cursor.execute(create_sql)
        # conn.commit()
        # cursor.close()
        # conn.close()
        print("        Dim_Product table created successfully.")
        return True
    except Exception as e:
        print(f"\n        Error creating Dim_Product table: {e}")
        return False

def create_dim_currency():
    """Create Currency Dimension Table"""
    try:

        create_sql = """
        CREATE TABLE Dim_Currency (
            currency_key INT IDENTITY(1,1) PRIMARY KEY,
            currency_id INT NOT NULL,
            currency_code NVARCHAR(3) NOT NULL,
            -- is_base_currency AS (CASE WHEN currency_id IS 49 THEN 1 ELSE 0 END) PERSISTED,
            created_date DATETIME NOT NULL,
            updated_date DATE NOT NULL,
            updated_time TIME NOT NULL
        );
        """
        cursor.execute(create_sql)
        # conn.commit()
        # cursor.close()
        # conn.close()
        print("        Dim_Currency table created successfully.")
        return True
    except Exception as e:
        print(f"        \nError creating Dim_Currency table: {e}")
        return False

def create_dim_transaction_type():
    """Create Transaction Type Dimension Table"""
    try:

        create_sql = """
        CREATE TABLE Dim_Transaction_Type (
            transaction_type_key INT IDENTITY(1,1) PRIMARY KEY,
            transaction_type_id INT NOT NULL,
            created_date DATETIME NOT NULL,
            updated_date DATE NOT NULL,
            updated_time TIME NOT NULL
        );
        """
        cursor.execute(create_sql)
        # conn.commit()
        # cursor.close()
        # conn.close()
        print("        Dim_Transaction_Type table created successfully.")
        return True
    except Exception as e:
        print(f"\n        Error creating Dim_Transaction_Type table: {e}")
        return False

print("        Dimension table creation functions loaded.")


        Dimension table creation functions loaded.


In [None]:
# Star Schema Fact Table Creation Functions
def create_fact_loan_balance():
    """Create Loan Balance Fact Table"""
    try:

        create_sql = """
        CREATE TABLE Fact_Balance (
            balance_key BIGINT IDENTITY(1,1) PRIMARY KEY,
            balance_id INT NOT NULL,
            preceding_id INT,
            account_id INT NOT NULL,
            account_status_id INT NOT NULL,
            product_id INT NOT NULL,
            organization_id INT NOT NULL,
            currency_id INT NOT NULL,
            accrued_interest_eur DECIMAL(15,5) DEFAULT 0,
            accrued_interest_sek DECIMAL(15,5) DEFAULT 0,
            balance_eur DECIMAL(15,5) DEFAULT 0,
            balance_sek DECIMAL(15,5) DEFAULT 0,
            balance_date DATE NOT NULL,
            created_date DATETIME NOT NULL,
            updated_date DATE NOT NULL,
            updated_time TIME NOT NULL,
            FOREIGN KEY (account_id) REFERENCES Dim_Account(account_id),
            FOREIGN KEY (product_id) REFERENCES Dim_Product(product_id),
            FOREIGN KEY (organization_id) REFERENCES Dim_Organization(organization_id),
            FOREIGN KEY (currency_id) REFERENCES Dim_Currency(currency_id)
        );
        """
        cursor.execute(create_sql)
        # conn.commit()
        # cursor.close()
        # conn.close()
        print("        Fact_Balance table created successfully.")
        return True
    except Exception as e:
        print(f"\n        Error creating Fact_Balance table: {e}")
        return False

def create_fact_loan_transaction():
    """Create Loan Transaction Fact Table"""
    try:

        create_sql = """
        CREATE TABLE Fact_Transaction (
            transaction_key INT IDENTITY(1,1) PRIMARY KEY,
            transaction_id INT NOT NULL,
            transaction_date DATE NOT NULL,
            value_date DATE NOT NULL,
            entry_date DATE NOT NULL,
            account_id INT NOT NULL,
            transaction_type_id INT NOT NULL,
            transaction_amount DECIMAL(12,5) NOT NULL,
            transaction_amount_SEK DECIMAL(12,5) NOT NULL,
            transaction_reference NVARCHAR(50),
            currency_id INT NOT NULL,
            exchange_rate_id INT,
            created_date DATETIME NOT NULL,
            updated_date DATE NOT NULL,
            updated_time TIME NOT NULL,
            FOREIGN KEY (account_id) REFERENCES Dim_Account(account_id),
            FOREIGN KEY (transaction_type_id) REFERENCES Dim_Transaction_Type(transaction_type_id),
            FOREIGN KEY (currency_id) REFERENCES Dim_Currency(currency_id)
        );
        """
        cursor.execute(create_sql)
        # conn.commit()
        # cursor.close()
        # conn.close()
        print("        Fact_Transaction table created successfully.")
        return True
    except Exception as e:
        print(f"\n        Error creating Fact_Transaction table: {e}")
        return False

print("        Fact table creation functions loaded.")

        Fact table creation functions loaded.


In [8]:
# Create All Star Schema Tables
def create_star_schema():
    """Create complete star schema"""
    print("\n" + "=" * 60)
    print("CREATING STAR SCHEMA TABLES")
    print("=" * 60)
    
    success_count = 0
    total_tables = 8
    
    # Create Dimension Tables
    print("\nSTEP 2: Creating Dimension Tables...")
    if create_dim_date():
        success_count += 1
    if create_dim_account():
        success_count += 1
    if create_dim_organization():
        success_count += 1
    if create_dim_product():
        success_count += 1
    if create_dim_currency():
        success_count += 1
    if create_dim_transaction_type():
        success_count += 1

    # Create Fact Tables
    print("\nSTEP 3: Creating Fact Tables...")
    if create_fact_loan_balance():
        success_count += 1
    if create_fact_loan_transaction():
        success_count += 1
    
    print(f"\n        Star Schema Tables COMPLETED: \
          {success_count}/{total_tables} tables created successfully\n")
    
    return success_count == total_tables

In [9]:
# Function to populate star schema tables from source tables
def populate_star_tables(populate_sql, table_to, table_from):
    """Populate a star schema table from a source table"""
    try:
        with pyodbc.connect(STAR_CONFIG["pyodbc_conn"]) as conn:
            with conn.cursor() as cursor:
                cursor.execute(populate_sql)
                conn.commit()
        print(f"        Table {table_to} populated successfully.")
        return True
    except Exception as e:
        print(f"\n        Error populating table {table_to} from {table_from}:")
        print(f"\n        Error: {e}")
           #  SQL Query:\n{populate_sql}\nError:\n{e}")
        return False

In [None]:
# Populate Date Dimension
def populate_dim_date():
    """Populate date dimension with date range"""
    try:
        # cursor = conn.cursor()

        populate_sql = """
        WITH DateRange AS (
            SELECT CAST('2017-01-01' AS DATE) AS DateValue
            UNION ALL
            SELECT DATEADD(day, 1, DateValue)
            FROM DateRange
            WHERE DateValue < '2025-12-31'
        )
        INSERT INTO Dim_Date (
            date_key, full_date, year, quarter, month, month_name,
            day, day_of_week, day_name, week_of_year, is_weekend
        )
        SELECT
            YEAR(DateValue) * 10000 + MONTH(DateValue) * 100 + DAY(DateValue) AS date_key,
            DateValue AS full_date,
            YEAR(DateValue) AS year,
            DATEPART(quarter, DateValue) AS quarter,
            MONTH(DateValue) AS month,
            DATENAME(month, DateValue) AS month_name,
            DAY(DateValue) AS day,
            DATEPART(weekday, DateValue) AS day_of_week,
            DATENAME(weekday, DateValue) AS day_name,
            DATEPART(week, DateValue) AS week_of_year,
            CASE WHEN DATEPART(weekday, DateValue) IN (1, 7) THEN 1 ELSE 0 END AS is_weekend
        FROM DateRange
        OPTION (MAXRECURSION 0);
        """

        cursor.execute(populate_sql)
        # conn.commit()
        # cursor.close()
        # conn.close()
        print("        Table [Dim_Date] populated successfully.")
        return True
    except Exception as e:
        print(f"\n        Error populating Dim_Date: {e}")
        return False


# Populate Dim_Account from Loan_Account_cleansed
table_to_acc = "[Dim_Account]"
table_from_acc = "[Loan_Account_cleansed]"
populate_sql_acc = f"""
INSERT INTO [ETL_Assignment2_Star].[dbo].{table_to_acc}(
    account_id,
    account_number,
    currency_code,
    organization_name,
    channel_id,
    broker_id,
    product_name,
    open_date,
    cancelled_date,
    value_date,
    invoice_day,
    current_installment_amount,
    current_invoice_fee,
    next_invoice_date,
    calculated_maturity_date,
    is_active,
    created_date,
    updated_date,
    updated_time
)
SELECT
    LoanAccountId,
    AccountNumber,
    AccountCurrency,
    OrganizationName,
    ChannelID,
    BrokerId,
    Product,
    OpenDate,
    CancelledDate,
    ValueDate,
    InvoiceDay,
    CurrentInstallmentAmount,
    CurrentInvoiceFee,
    NextInvoiceDate,
    CalculatedMaturityDate,
    IsActive,
    CreatedDate,
    UpdatedDate,
    UpdatedTime
FROM [ETL_Assignment2_Warehouse].[dbo].{table_from_acc};
"""

# Populate Dim_Organization from Loan_Account_cleansed
table_to_org = "[Dim_Organization]"
table_from_org = "[Loan_Account_cleansed]"
populate_sql_org = f"""
INSERT INTO [ETL_Assignment2_Star].[dbo].{table_to_org}(
    organization_id,
    organization_name,
    source_id,
    created_date,
    updated_date,
    updated_time
)
SELECT DISTINCT
    OrganizationId,
    OrganizationName,
    SourceId,
    CreatedDate,
    UpdatedDate,
    UpdatedTime
FROM [ETL_Assignment2_Warehouse].[dbo].{table_from_org};
"""

# Populate Dim_Product from Loan_Account_cleansed
table_to_prod = "[Dim_Product]"
table_from_prod = "[Loan_Account_cleansed]"
populate_sql_prod = f"""
INSERT INTO [ETL_Assignment2_Star].[dbo].{table_to_prod}(
    product_id,
    product_name,
    created_date,
    updated_date,
    updated_time
)
SELECT DISTINCT
    ProductId,
    Product,
    CreatedDate,
    UpdatedDate,
    UpdatedTime
FROM [ETL_Assignment2_Warehouse].[dbo].{table_from_prod};
"""

# Populate Dim_Currency from Loan_Account_cleansed
table_to_currency = "[Dim_Currency]"
table_from_currency = "[Loan_Account_cleansed]"
populate_sql_currency = f"""
INSERT INTO [ETL_Assignment2_Star].[dbo].{table_to_currency}(
    currency_id,
    currency_code,
    created_date,
    updated_date,
    updated_time
)
SELECT DISTINCT
    AccountCurrencyId,
    AccountCurrency,
    CreatedDate,
    UpdatedDate,
    UpdatedTime
FROM [ETL_Assignment2_Warehouse].[dbo].{table_from_currency};
"""

# Populate Dim_Transaction_Type from Loan_Transaction_cleansed
table_to_transaction_type = "[Dim_Transaction_Type]"
table_from_transaction_type = "[Loan_Transaction_cleansed]"
populate_sql_transaction_type = f"""
INSERT INTO [ETL_Assignment2_Star].[dbo].{table_to_transaction_type}(
    transaction_type_id,
    created_date,
    updated_date,
    updated_time
)
SELECT DISTINCT
    TransactionTypeId,
    CreatedDate,
    UpdatedDate,
    UpdatedTime
FROM [ETL_Assignment2_Warehouse].[dbo].{table_from_transaction_type};
"""

# Populate Fact_Balance from Loan_Balance_cleansed
table_to_fact_balance = "[Fact_Balance]"
table_from_fact_balance = "[Loan_Balance_cleansed]"
populate_sql_fact_balance = f"""
INSERT INTO [ETL_Assignment2_Star].[dbo].{table_to_fact_balance}(
    balance_id,
    preceding_id,
    account_id,
    account_status_id,
    product_id,
    organization_id,
    currency_id,
    accrued_interest_eur,
    accrued_interest_sek,
    balance_eur,
    balance_sek,
    balance_date,
    created_date,
    updated_date,
    updated_time
)
SELECT
    lbc.LoanAccountBalanceId,
    lbc.PrecedingId,
    da.account_id,
    lbc.AccountStatusId,
    dp.product_id,
    do.organization_id,
    dc.currency_id,
    lbc.AccruedInterest,
    lbc.AccruedInterestSEK,
    lbc.Balance,
    lbc.BalanceSek,
    lbc.BalanceDate,
    lbc.CreatedDate,
    lbc.UpdatedDate,
    lbc.UpdatedTime
FROM [ETL_Assignment2_Warehouse].[dbo].{table_from_fact_balance} lbc
JOIN Dim_Account da ON lbc.LoanAccountId = da.account_id
JOIN Dim_Product dp ON lbc.ProductId = dp.product_id
JOIN Dim_Organization do ON da.organization_name = do.organization_name
JOIN Dim_Currency dc ON lbc.AccountCurrencyId = dc.currency_id;
"""

# Populate Fact_Transaction from Loan_Transaction_cleansed
table_to_fact_transaction = "[Fact_Transaction]"
table_from_fact_transaction = "[Loan_Transaction_cleansed]"
populate_sql_fact_transaction = f"""
INSERT INTO [ETL_Assignment2_Star].[dbo].{table_to_fact_balance}(
    balance_id,
    preceding_id,
    account_id,
    account_status_id,
    product_id,
    currency_id,
    accrued_interest_eur,
    accrued_interest_sek,
    balance_eur,
    balance_sek,
    balance_date,
    created_date,
    updated_date,
    updated_time
)
SELECT
    lbc.LoanAccountBalanceId,
    lbc.PrecedingId,
    lbc.LoanAccountId,
    lbc.AccountStatusId,
    lbc.ProductId,
    lbc.AccountCurrencyId,
    lbc.AccruedInterest,
    lbc.AccruedInterestSEK,
    lbc.Balance,
    lbc.BalanceSek,
    lbc.BalanceDate,
    lbc.CreatedDate,
    lbc.UpdatedDate,
    lbc.UpdatedTime
FROM [ETL_Assignment2_Warehouse].[dbo].{table_from_fact_balance}
"""


In [11]:
# Create All Star Schema Tables
def populate_star_schema():
    """Load data into star schema"""

    print("=" * 60)
    print("POPULATING STAR SCHEMA TABLES")
    print("=" * 60)

    success_count = 0
    total_tables = 8

    # Populate Dimension Tables
    print("\nSTEP 4: Populating Dimension Tables...")
    if populate_dim_date():
        success_count += 1
    if populate_star_tables(
        populate_sql=populate_sql_acc,
        table_to=table_to_acc,
        table_from=table_from_acc):
        success_count += 1
    if populate_star_tables(
        populate_sql=populate_sql_org,
        table_to=table_to_org,
        table_from=table_from_org):
        success_count += 1
    if populate_star_tables(
        populate_sql=populate_sql_prod,
        table_to=table_to_prod,
        table_from=table_from_prod):
        success_count += 1
    if populate_star_tables(
        populate_sql=populate_sql_currency,
        table_to=table_to_currency,
        table_from=table_from_currency):
        success_count += 1
    if populate_star_tables(
        populate_sql=populate_sql_transaction_type,
        table_to=table_to_transaction_type,
        table_from=table_from_transaction_type):
        success_count += 1

    # Populate Fact Tables
    print("\nSTEP 5: Populating Fact Tables...")
    if populate_star_tables(
        populate_sql=populate_sql_fact_balance,
        table_to=table_to_fact_balance,
        table_from=table_from_fact_balance):
        success_count += 1

    if populate_star_tables(
        populate_sql=populate_sql_fact_transaction,
        table_to=table_to_fact_transaction,
        table_from=table_from_fact_transaction):
        success_count += 1

    print(f"\n        Star Schema Data Load COMPLETED: \
          {success_count}/{total_tables} tables populated successfully")
    print("\n" + "=" * 60 + "\n")
    print("\n" + "=" * 60)
    print("ETL ASSIGNMENT 2: STAR SCHEMA DESIGN")
    print("=" * 60)
    print("\nSTEP 1: Star Schema configuration loaded successfully.")
    return success_count == total_tables


In [12]:
create_star_schema()


CREATING STAR SCHEMA TABLES

STEP 2: Creating Dimension Tables...
        Dim_Date table created successfully.
        Dim_Account table created successfully.
        Dim_Organization table created successfully.
        Dim_Product table created successfully.
        Dim_Currency table created successfully.
        Dim_Transaction_Type table created successfully.

STEP 3: Creating Fact Tables...
        Fact_Balance table created successfully.
        Fact_Transaction table created successfully.

        Star Schema Tables COMPLETED:           8/8 tables created successfully



True

In [13]:
populate_star_schema()

POPULATING STAR SCHEMA TABLES

STEP 4: Populating Dimension Tables...
        Table [Dim_Date] populated successfully.
        Table [Dim_Account] populated successfully.
        Table [Dim_Organization] populated successfully.
        Table [Dim_Product] populated successfully.
        Table [Dim_Currency] populated successfully.
        Table [Dim_Transaction_Type] populated successfully.

STEP 5: Populating Fact Tables...
        Table [Fact_Balance] populated successfully.
        Table [Fact_Transaction] populated successfully.

        Star Schema Data Load COMPLETED:           8/8 tables populated successfully



ETL ASSIGNMENT 2: STAR SCHEMA DESIGN

STEP 1: Star Schema configuration loaded successfully.


True