ETL: Raw to Cleanse

In [1]:
!pip install pyspark



In [62]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session shared by every cell below.
spark = (
    SparkSession.builder
    .appName("ColabSparkSession")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

In [63]:
from pyspark.sql.functions import (
    when, col, row_number, current_timestamp, date_format,
    from_utc_timestamp, lit, regexp_replace
)
from pyspark.sql.window import Window
from pyspark.sql.types import (
    StructType, StructField, IntegerType, StringType, DateType,
    DecimalType, TimestampType
)
from pathlib import Path
import pyodbc

In [82]:
# Central configuration: for each dataset, the raw CSV input path, the
# cleansed parquet output path and the target table name.
# NOTE(review): check_table_exists reads CONFIG["pyodbc_conn"] and the
# commented-out JDBC helpers read CONFIG["db_url"]; neither key is defined
# here, so the database lookup currently fails with a KeyError message.
CONFIG = {
    "files": {
        "account": {
            "csv": "/data/Raw_Loan_Account.csv",
            "parquet": "/data/Loan_Account_cleansed.parquet",
            "table": "Loan_Account_cleansed"
            },
        "balance": {
            "csv": "/data/Raw_Loan_Balance.csv",
            "parquet": "/data/Loan_Balance_cleansed.parquet",
            "table": "Loan_Balance_cleansed"
            },
        "transaction": {
            "csv": "/data/Raw_Loan_Transaction.csv",
            "parquet": "/data/Loan_Transaction_cleansed.parquet",
            "table": "Loan_Transaction_cleansed"
            }
        }
}
# Path of the account CSV, used by the ad-hoc read that follows.
csv_path = CONFIG["files"]["account"]["csv"]

In [None]:
# Ad-hoc read of the account CSV: semicolon-delimited, first row is the
# header, column types inferred by Spark rather than taken from a schema.
df = spark.read.csv(csv_path, header=True, inferSchema=True, sep=";")

# Show the data
# df.show()

In [35]:

# Schema definitions
# Account schema as (column name, Spark type, nullable) rows. The two amount
# columns passed to convert_european_decimals are read as strings and
# converted during cleaning.
account_schema = StructType([
    StructField(field_name, field_type, is_nullable)
    for field_name, field_type, is_nullable in [
        ("LoanAccountId", IntegerType(), False),
        ("SourceId", IntegerType(), False),
        ("AccountNumber", StringType(), False),
        ("IBAN", StringType(), True),
        ("BBAN", StringType(), True),
        ("AccountCurrencyId", IntegerType(), False),
        ("AccountCurrency", StringType(), False),
        ("OrganizationId", IntegerType(), False),
        ("OrganizationName", StringType(), False),
        ("ChannelID", IntegerType(), True),
        ("BrokerId", IntegerType(), False),
        ("OpenDateId", IntegerType(), False),
        ("OpenDate", DateType(), False),
        ("CancelledDateId", IntegerType(), True),
        ("CancelledDate", DateType(), True),
        ("ValueDate", DateType(), False),
        ("MaturityDate", DateType(), True),
        ("ProductId", IntegerType(), False),
        ("Product", StringType(), False),
        ("InvoiceDay", IntegerType(), False),
        ("CurrentInstallmentAmount", StringType(), False),
        ("CurrentInvoiceFee", StringType(), False),
        ("RepaymentRate", StringType(), True),
        ("NextInvoiceDate", DateType(), False),
        ("CalculatedMaturityDate", DateType(), True),
    ]
])

# Balance schema as (column name, Spark type, nullable) rows. The six amount
# columns are read as strings and converted to decimals during cleaning.
balance_schema = StructType([
    StructField(field_name, field_type, is_nullable)
    for field_name, field_type, is_nullable in [
        ("LoanAccountBalanceId", IntegerType(), False),
        ("SourceId", IntegerType(), False),
        ("BalanceDateId", IntegerType(), False),
        ("LoanAccountId", IntegerType(), False),
        ("ProductId", IntegerType(), False),
        ("AccountCurrencyId", IntegerType(), False),
        ("AccountStatusId", IntegerType(), False),
        ("NumOfTransactions", IntegerType(), False),
        ("NetTransactionAmount", StringType(), False),
        ("NetTransactionAmountSek", StringType(), False),
        ("AccruedInterest", StringType(), False),
        ("AccruedInterestSEK", StringType(), False),
        ("Balance", StringType(), False),
        ("BalanceSek", StringType(), False),
        ("LTV", IntegerType(), False),
        ("PrecedingId", IntegerType(), True),
    ]
])

# Transaction schema as (column name, Spark type, nullable) rows. The two
# amount columns are read as strings and converted to decimals during cleaning.
transaction_schema = StructType([
    StructField(field_name, field_type, is_nullable)
    for field_name, field_type, is_nullable in [
        ("LoanAccountTransactionId", IntegerType(), False),
        ("SourceId", IntegerType(), False),
        ("TransactionDateId", IntegerType(), False),
        ("ValueDateId", IntegerType(), False),
        ("EntryDateID", IntegerType(), False),
        ("LoanAccountId", IntegerType(), False),
        ("TransactionTypeId", IntegerType(), False),
        ("TransactionStatus", StringType(), True),
        ("RectifyStatus", StringType(), True),
        ("TransactionCurrencyId", IntegerType(), False),
        ("TransactionAmount", StringType(), False),
        ("TransactionAmountSEK", StringType(), False),
        ("CounterpartClearingNumber", StringType(), True),
        ("CounterPartBic", StringType(), True),
        ("CounterPartIban", StringType(), True),
        ("TransactionReference", StringType(), True),
        ("ExchangeRateId", IntegerType(), False),
        ("TransactionText", StringType(), True),
        ("AccountServicerReference", StringType(), True),
        ("CounterPartId", IntegerType(), True),
        ("CounterPartAccountNumber", StringType(), True),
        ("CounterPartBankName", StringType(), True),
        ("TransactionDateTime", TimestampType(), True),
        ("IsDirectDebit", IntegerType(), True),
        ("GLAccount", StringType(), True),
        ("EventName", StringType(), True),
        ("InvoiceId", IntegerType(), True),
    ]
])

print("Configuration and schemas loaded successfully.")

Configuration and schemas loaded successfully.


In [39]:

# Utility Functions
def convert_european_decimals(df, decimal_columns, precision=12, scale=5):
    """Convert comma-decimal string columns to a Spark decimal type.

    Each listed column that exists in ``df`` is trimmed, its decimal comma is
    replaced by a dot, and the result is cast to
    ``DecimalType(precision, scale)``. Missing values (SQL NULL, empty or
    whitespace-only strings, and the literal text ``NULL`` / ``null``) become
    a decimal zero.

    Args:
        df: Input Spark DataFrame.
        decimal_columns: Names of the columns to convert; names that are not
            present in ``df`` are skipped.
        precision: Total number of digits of the target decimal type.
        scale: Number of fractional digits of the target decimal type.

    Returns:
        A new DataFrame with the listed columns replaced by decimal columns.
    """
    target_type = DecimalType(precision, scale)
    present_columns = set(df.columns)
    converted_df = df

    for col_name in decimal_columns:
        if col_name not in present_columns:
            continue

        # Trim first so that padded values such as " NULL " or "   " are
        # recognised as missing instead of turning into NULL after the cast.
        trimmed = regexp_replace(col(col_name), r"^\s+|\s+$", "")
        is_missing = (
            col(col_name).isNull()
            | (trimmed == "")
            | (trimmed == "NULL")
            | (trimmed == "null")
        )

        converted_df = converted_df.withColumn(
            col_name,
            # Both branches must share the decimal type: mixing a double
            # literal with a decimal branch makes Spark widen the whole
            # column to double.
            when(is_missing, lit(0).cast(target_type)).otherwise(
                regexp_replace(trimmed, ",", ".").cast(target_type)
            )
        )
    return converted_df

def check_table_exists(table_name):
    """Check whether ``table_name`` is listed in INFORMATION_SCHEMA.TABLES.

    The connection string is read from ``CONFIG["pyodbc_conn"]``.

    Args:
        table_name: Name of the table to look up.

    Returns:
        True if at least one table with that name exists. False if it does
        not, if no connection string is configured, or if the lookup fails
        (the error is printed rather than raised).
    """
    conn_str = CONFIG.get("pyodbc_conn")
    if not conn_str:
        # Without this guard the failure shows up as the opaque message
        # "Error checking table existence: 'pyodbc_conn'".
        print("        \nError checking table existence: "
              "no 'pyodbc_conn' connection string in CONFIG")
        return False

    conn = None
    try:
        conn = pyodbc.connect(conn_str)
        cursor = conn.cursor()
        try:
            # Bind the table name as a parameter instead of interpolating it
            # into the SQL text.
            cursor.execute(
                "SELECT COUNT(*) "
                "FROM INFORMATION_SCHEMA.TABLES "
                "WHERE TABLE_NAME = ?",
                table_name
            )
            return cursor.fetchone()[0] > 0
        finally:
            cursor.close()
    except Exception as e:
        print(f"        \nError checking table existence: {e}")
        return False
    finally:
        # Close the connection on success and on failure alike.
        if conn is not None:
            conn.close()

def check_parquet_exists(parquet_path):
    """Check whether ``parquet_path`` points at existing parquet output.

    Args:
        parquet_path: Path of a parquet directory or of a single parquet file.

    Returns:
        True when the path is a file whose name ends in ``.parquet``, or a
        directory that directly contains at least one entry ending in
        ``.parquet`` or starting with ``part-``. False when the path is
        missing, holds no such entry, or cannot be inspected (the error is
        printed rather than raised).
    """
    try:
        path = Path(parquet_path)

        # A standalone parquet file is a valid input for spark.read.parquet.
        if path.is_file():
            return path.name.endswith(".parquet")

        if path.is_dir():
            # Marker files such as _SUCCESS and hidden .crc files do not
            # match either test, so they do not count as data.
            return any(
                item.name.endswith(".parquet") or item.name.startswith("part-")
                for item in path.iterdir()
            )

        return False
    except Exception as e:
        print(f"        \nError checking parquet existence: {e}")
        return False

def load_csv_data(csv_path, schema):
    """Read a semicolon-delimited CSV (with header) using the given schema.

    Returns the DataFrame, or None when the file is missing, contains no
    rows, or cannot be read (the error is printed rather than raised).
    """
    try:
        source = Path(csv_path)
        if not source.exists():
            print(f"CSV file not found: {csv_path}")
            return None

        reader = spark.read.option("delimiter", ";")
        frame = reader.csv(csv_path, header=True, schema=schema)
        row_total = frame.count()
        print(f"        Loaded {row_total} records from {csv_path}")
        if row_total > 0:
            return frame
        return None
    except Exception as err:
        print(f"        \nError loading CSV {csv_path}: {err}")
        return None

def load_parquet_data(parquet_path):
    """Read previously written parquet data.

    Returns the DataFrame, or None when no parquet output is found at the
    path, it contains no rows, or it cannot be read (the error is printed
    rather than raised).
    """
    try:
        if check_parquet_exists(parquet_path):
            frame = spark.read.parquet(parquet_path)
            row_total = frame.count()
            print(f"        Loaded {row_total} records from parquet {parquet_path}")
            if row_total > 0:
                return frame
            return None

        print(f"Parquet file not found: {parquet_path}")
        return None
    except Exception as err:
        print(f"        \nError loading parquet {parquet_path}: {err}")
        return None

# def load_database_data(table_name):
#     """Load data from database table"""
#     load_database_none = None
#     try:
#         df = spark.read.format("jdbc") \
#             .option("url", CONFIG["db_url"]) \
#             .option("dbtable", table_name) \
#             .option(
#                 "driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver"
#                 ).load()
#         count = df.count()
#         print(f"        Loaded {count} records from database table "
#               "{table_name}")
#         return df if count > 0 else load_database_none
#     except Exception as e:
#         print(f"        \nCould not load from database table "
#               "{table_name}: {e}")
#         return load_database_none


In [40]:

# Data Processing Functions
def clean_account_data(df):
    """Clean the raw account rows.

    Converts the two amount columns to decimals, replaces the -1 sentinel in
    CancelledDateId with NULL, and keeps one row per LoanAccountId (the one
    with the highest OpenDateId).
    """
    amounts_fixed = convert_european_decimals(
        df, ["CurrentInstallmentAmount", "CurrentInvoiceFee"]
    )

    cancelled_id = col("CancelledDateId")
    sentinel_cleared = amounts_fixed.withColumn(
        "CancelledDateId",
        when(cancelled_id == -1, None).otherwise(cancelled_id)
    )

    # Deduplication: rank rows inside each account, newest OpenDateId first.
    newest_first = Window.partitionBy("LoanAccountId").orderBy(
        col("OpenDateId").desc()
    )
    ranked = sentinel_cleared.withColumn(
        "row_num", row_number().over(newest_first)
    )
    deduped_df = ranked.filter("row_num = 1").drop("row_num")

    print(f"        Cleaned account data: {deduped_df.count()} " +
          "records after deduplication")
    return deduped_df

def clean_balance_data(df):
    """Clean the raw balance rows.

    Converts the six amount columns to decimals, replaces the -1 sentinel in
    PrecedingId with NULL, and keeps one row per LoanAccountBalanceId (the
    one with the highest BalanceDateId).
    """
    amount_columns = [
        "NetTransactionAmount",
        "NetTransactionAmountSek",
        "AccruedInterest",
        "AccruedInterestSEK",
        "Balance",
        "BalanceSek",
    ]
    amounts_fixed = convert_european_decimals(df, amount_columns)

    preceding_id = col("PrecedingId")
    sentinel_cleared = amounts_fixed.withColumn(
        "PrecedingId",
        when(preceding_id == -1, None).otherwise(preceding_id)
    )

    # Deduplication: rank rows inside each balance id, newest date first.
    newest_first = Window.partitionBy("LoanAccountBalanceId").orderBy(
        col("BalanceDateId").desc()
    )
    ranked = sentinel_cleared.withColumn(
        "row_num", row_number().over(newest_first)
    )
    deduped_df = ranked.filter("row_num = 1").drop("row_num")

    print(f"        Cleaned balance data: {deduped_df.count()} "
          "records after deduplication")
    return deduped_df

def clean_transaction_data(df):
    """Clean the raw transaction rows.

    Converts the two amount columns to decimals, replaces the -1 sentinel in
    CounterPartId and InvoiceId with NULL, and keeps one row per
    LoanAccountTransactionId (the one with the highest TransactionDateId).
    """
    amounts_fixed = convert_european_decimals(
        df, ["TransactionAmount", "TransactionAmountSEK"]
    )

    counterpart_id = col("CounterPartId")
    invoice_id = col("InvoiceId")
    counterpart_cleared = amounts_fixed.withColumn(
        "CounterPartId",
        when(counterpart_id == -1, None).otherwise(counterpart_id)
    )
    sentinel_cleared = counterpart_cleared.withColumn(
        "InvoiceId",
        when(invoice_id == -1, None).otherwise(invoice_id)
    )

    # Deduplication: rank rows inside each transaction id, newest date first.
    newest_first = Window.partitionBy("LoanAccountTransactionId").orderBy(
        col("TransactionDateId").desc()
    )
    ranked = sentinel_cleared.withColumn(
        "row_num", row_number().over(newest_first)
    )
    deduped_df = ranked.filter("row_num = 1").drop("row_num")

    print(f"        Cleaned transaction data: {deduped_df.count()} "
          "records after deduplication")
    return deduped_df

def add_metadata_columns(df):
    """Append CreatedDate, UpdatedDate and UpdatedTime load-metadata columns.

    All three are derived from the current timestamp shifted from UTC to the
    Europe/Stockholm time zone: CreatedDate is the timestamp itself,
    UpdatedDate its date part and UpdatedTime its HH:mm:ss text.
    """
    stockholm_now = from_utc_timestamp(current_timestamp(), "Europe/Stockholm")
    updated_date = date_format(stockholm_now, "yyyy-MM-dd").cast(DateType())
    updated_time = date_format(stockholm_now, "HH:mm:ss")

    with_created = df.withColumn("CreatedDate", stockholm_now)
    with_updated_date = with_created.withColumn("UpdatedDate", updated_date)
    return with_updated_date.withColumn("UpdatedTime", updated_time)

In [41]:

def detect_changes(new_df, existing_df, compare_cols):
    """Return the rows of ``new_df`` that are new or differ from ``existing_df``.

    A row counts as changed when its combination of business columns does not
    occur in ``existing_df``. This covers both brand-new keys and existing
    keys whose values changed.

    Args:
        new_df: Freshly cleaned data. Assumed to hold at most one row per key
            and no NULL keys, as produced by the clean_* functions.
        existing_df: Previously stored data, or None when nothing exists yet.
        compare_cols: Columns to compare; the first entry is the key column.
            Metadata columns and columns ending in ``IdentityId`` are ignored.

    Returns:
        Tuple ``(changed_df, count)``. ``changed_df`` is None when nothing
        changed. If the comparison itself fails, all of ``new_df`` is returned
        so that no change can be missed.
    """
    try:
        if existing_df is None:
            return new_df, new_df.count()

        key_col = compare_cols[0]

        # Get counts first for debugging
        new_count = new_df.count()
        existing_count = existing_df.count()
        print(f"        New CSV records: {new_count}\n"
              f"        Existing records: {existing_count}")
        if new_count != existing_count:
            print("        Count difference detected: "
                  f"{new_count} vs {existing_count}")

        # Compare business columns only. The key is always kept; load
        # metadata and surrogate identity columns never take part.
        metadata_cols = {"CreatedDate", "UpdatedDate", "UpdatedTime"}
        business_cols = [key_col] + [
            name for name in compare_cols[1:]
            if name not in metadata_cols and not name.endswith("IdentityId")
        ]

        # Rows of the new data with no identical counterpart in the old data.
        diff_df = new_df.select(business_cols).exceptAll(
            existing_df.select(business_cols)
        )
        diff_count = diff_df.count()
        print(f"        Business data differences found: {diff_count}")

        if diff_count == 0:
            return None, 0

        # Fetch the full rows by key only. Joining on every business column
        # would drop any row holding a NULL, because NULL never equals NULL
        # in an equi-join.
        changed_keys = diff_df.select(key_col).distinct()
        changed_df = new_df.join(changed_keys, on=key_col, how="left_semi")
        return changed_df, changed_df.count()

    except Exception as e:
        print(f"        \nError detecting changes: {e}")
        # If error, process all new data to be safe
        return new_df, new_df.count()

In [42]:
# Save to destination
# def save_to_database(df, table_name):
#     """Save dataframe to database"""
#     try:
#         # Remove identity columns
#         identity_cols = [c for c in df.columns if "IdentityId" in c]
#         db_df = df
#         for col_name in identity_cols:
#             if col_name in db_df.columns:
#                 db_df = db_df.drop(col_name)

#         db_df.write \
#             .format("jdbc") \
#             .option("url", CONFIG["db_url"]) \
#             .option("dbtable", table_name) \
#             .option(
#                 "driver",, "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
#             .mode("append").save()
#         return True
#     except Exception as e:
#         print(f"        \nError saving to database table {table_name}: {e}")
#         return False

def save_to_parquet(df, parquet_path, mode="overwrite"):
    """Write ``df`` to ``parquet_path`` as parquet.

    Args:
        df: Spark DataFrame to persist.
        parquet_path: Destination path; its parent directory is created when
            missing.
        mode: Spark save mode passed to the writer (default "overwrite").

    Returns:
        True on success, False when the write fails (the error is printed
        rather than raised).
    """
    try:
        # Ensure directory exists
        Path(parquet_path).parent.mkdir(parents=True, exist_ok=True)

        df.write.mode(mode).parquet(parquet_path)
        # Both literals need the f prefix; without it the log printed the
        # literal text "{parquet_path}".
        print(f"        Successfully saved {df.count()} records to parquet "
              f"{parquet_path}")
        return True
    except Exception as e:
        print(f"        Error saving to parquet {parquet_path}: {e}")
        return False

In [43]:

# Main ETL Functions
def _add_identity_column(df, identity_col, key_col, sort_col, start_after=0):
    """Return ``df`` with a sequential ``identity_col`` as its first column.

    Values start at ``start_after + 1`` and follow ``sort_col`` with
    ``key_col`` as tie-breaker. The window is deliberately unpartitioned:
    the data holds one row per key, so partitioning by the key would number
    every row 1. Spark moves all rows to a single partition for this step.
    """
    ordering = Window.orderBy(sort_col, key_col)
    numbered = df.withColumn(
        identity_col, row_number().over(ordering) + start_after
    )
    other_cols = [c for c in numbered.columns if c != identity_col]
    return numbered.select([identity_col] + other_cols)


def _key_range(df, key_col):
    """Return a Row holding the minimum and maximum value of ``key_col``."""
    return df.selectExpr(
        f"min(`{key_col}`) AS min_key", f"max(`{key_col}`) AS max_key"
    ).collect()[0]


def run_table_etl(
        table_config, schema, clean_func, compare_cols,
        identity_col, partition_col ,sort_col
        ):
    """Run the raw-to-cleansed ETL for one table.

    Steps: load the CSV, clean it and add metadata columns, look for existing
    data (database table first, parquet otherwise), work out which rows to
    process, assign identity values and write the parquet output.

    Args:
        table_config: Dict with the keys "csv", "parquet" and "table".
        schema: Spark schema used to read the CSV.
        clean_func: Function that cleans the raw DataFrame.
        compare_cols: Columns used for change detection; the first entry is
            the business key.
        identity_col: Name of the surrogate identity column to generate.
        partition_col: Business key column, used as the tie-breaker when
            identity values are numbered. The parameter name is kept for
            backward compatibility; the numbering window is not partitioned
            by it.
        sort_col: Column that determines the order of the identity values.

    Returns:
        True when the run completes (including "no changes"), False when the
        CSV could not be loaded. A failed parquet write only prints a warning.
    """
    print(f"\n{'='*60}")
    print(f"STARTING ETL FOR {table_config['table'].upper()}")
    print(f"{'='*60}")

    csv_path = table_config["csv"]
    parquet_path = table_config["parquet"]
    table_name = table_config["table"]
    key_col = compare_cols[0]

    # 1. Load CSV data
    print("\nStep 1: Loading CSV data...")
    raw_df = load_csv_data(csv_path, schema)
    if raw_df is None:
        print(f"        Failed to load CSV data from {csv_path}")
        return False

    # 2. Clean data
    print("\nStep 2: Cleaning data...")
    cleaned_df = add_metadata_columns(clean_func(raw_df))

    # 3. Check if table exists
    print("\nStep 3: Checking table existence...")
    table_exists = check_table_exists(table_name)

    # 4. Check if parquet exists
    print("\nStep 4: Checking parquet existence...")
    parquet_exists = check_parquet_exists(parquet_path)

    # 5. Load existing data
    print("\nStep 5: Loading existing data...")
    existing_df = None
    if table_exists:
        # The database loader is currently commented out in this notebook;
        # look it up dynamically so a missing definition does not raise
        # NameError.
        db_loader = globals().get("load_database_data")
        if db_loader is None:
            print("        Database loader is not defined - "
                  "continuing without existing database data")
        else:
            existing_df = db_loader(table_name)
    elif parquet_exists:
        existing_df = load_parquet_data(parquet_path)

    # DEBUG: compare primary key ranges of new and existing data
    if existing_df is not None:
        print(f"        New data key range: {_key_range(cleaned_df, key_col)}")
        print("        Existing data key range: "
              f"{_key_range(existing_df, key_col)}")

    # An incremental load needs both the database table and its data.
    incremental = table_exists and existing_df is not None

    # 6. Detect changes or prepare for initial load
    print("\nStep 6: Detecting changes...")
    if incremental:
        changed_df, change_count = detect_changes(
            cleaned_df, existing_df, compare_cols
            )
        if change_count == 0 or changed_df is None:
            print("        No changes detected. ETL completed.")
            return True

        print(f"        Found {change_count} new/changed records")

        # Continue numbering after the highest identity already stored.
        max_id = existing_df.agg({identity_col: "max"}).collect()[0][0] or 0
        final_df = _add_identity_column(
            changed_df, identity_col, partition_col, sort_col,
            start_after=max_id
            )
    else:
        if not table_exists:
            print("        Database table missing - "
                  "processing all data for initial load")
        else:
            print("        No existing data found. Performing initial load...")
        change_count = cleaned_df.count()
        final_df = _add_identity_column(
            cleaned_df, identity_col, partition_col, sort_col
            )

    # Steps 7 (create database table) and 8 (save to database) are disabled
    # in this notebook.

    # 9. Update parquet (full dataset)
    print("\nStep 9: Updating parquet backup...")
    if change_count > 0:
        if incremental:
            # Merge: existing rows whose key was not reprocessed, plus the
            # new and changed rows.
            metadata_cols = ["CreatedDate", "UpdatedDate", "UpdatedTime"]
            business_cols = [
                name for name in compare_cols
                if name not in metadata_cols
                and not name.endswith("IdentityId")
                ]
            keep_cols = business_cols + metadata_cols

            replaced_keys = final_df.select(key_col).distinct()
            untouched = existing_df.join(
                replaced_keys, on=key_col, how="left_anti"
                ).select(keep_cols)

            # Renumber the identity column over the complete dataset.
            complete_df = _add_identity_column(
                untouched.unionByName(final_df.select(keep_cols)),
                identity_col, partition_col, sort_col
                )
        else:
            # Initial load: the processed data is the complete dataset.
            complete_df = final_df

        if not save_to_parquet(complete_df, parquet_path, "overwrite"):
            print("        Warning: Failed to update parquet backup")
    else:
        print("        No changes to save to parquet")

    print(f"        ETL completed successfully for {table_name}")
    print(f"        Processed {change_count} records.")
    return True

In [44]:

# Comparison columns definitions
# Every raw column takes part in change detection, so each list is read from
# the matching schema: same names, same order.
account_compare_cols = account_schema.fieldNames()

balance_compare_cols = balance_schema.fieldNames()

transaction_compare_cols = transaction_schema.fieldNames()


In [45]:

def run_full_etl_process():
    """Run the ETL for the account, balance and transaction tables.

    Each table runs inside its own try/except, so a failure in one table is
    printed and does not stop the others.

    Returns:
        True when every table was processed successfully, False otherwise.
    """
    print("\n" + "="*60)
    print("STARTING COMPREHENSIVE ETL PROCESS")
    print("="*60)

    # One row per table: label for messages, CONFIG["files"] key, schema,
    # cleaning function, comparison columns, identity column, business key
    # column and sort column.
    jobs = [
        ("Account", "account", account_schema, clean_account_data,
         account_compare_cols, "LoanAccountIdentityId",
         "LoanAccountId", "OpenDate"),
        ("Balance", "balance", balance_schema, clean_balance_data,
         balance_compare_cols, "LoanAccountBalanceIdentityId",
         "LoanAccountBalanceId", "BalanceDateId"),
        ("Transaction", "transaction", transaction_schema,
         clean_transaction_data, transaction_compare_cols,
         "LoanAccountTransactionIdentityId",
         "LoanAccountTransactionId", "TransactionDateId"),
    ]

    success_count = 0
    for (label, config_key, schema, clean_func, compare_cols,
         identity_col, key_col, sort_col) in jobs:
        try:
            # The config lookup stays inside the try so that a missing entry
            # is reported per table like any other failure.
            if run_table_etl(
                CONFIG["files"][config_key],
                schema,
                clean_func,
                compare_cols,
                identity_col,
                key_col,
                sort_col
            ):
                success_count += 1
        except Exception as e:
            print(f"        \n{label} ETL failed: {e}")

    print("\n" + "="*60)
    print(f"Tables Processed: {success_count}/{len(jobs)}")
    print("="*60)

    return success_count == len(jobs)

# Execute ETL Process
if __name__ == "__main__":
    try:
        success = run_full_etl_process()
        print(f"\nETL PROCESS "
        f"{'COMPLETED SUCCESSFULLY' if success else 'COMPLETED WITH ERRORS'}"
        "\n")
    except Exception as e:
        print(f"        \nCritical error in ETL process: {e}\n")
    # The Spark session is deliberately left running: the cells after this
    # one still read the CSV and parquet files through `spark`, and stopping
    # it here makes them fail on Restart & Run All. Call spark.stop() once at
    # the very end of the notebook instead.


STARTING COMPREHENSIVE ETL PROCESS

STARTING ETL FOR LOAN_ACCOUNT_CLEANSED

Step 1: Loading CSV data...
        Loaded 500 records from /data/Raw_Loan_Account.csv

Step 2: Cleaning data...
        Cleaned account data: 500 records after deduplication

Step 3: Checking table existence...
        
Error checking table existence: 'pyodbc_conn'

Step 4: Checking parquet existence...

Step 5: Loading existing data...

Step 6: Detecting changes...
        Database table missing - processing all data for initial load

Step 9: Updating parquet backup...
        Successfully saved 500 records to parquet {parquet_path}
        ETL completed successfully for Loan_Account_cleansed
        Processed 500 records.

STARTING ETL FOR LOAN_BALANCE_CLEANSED

Step 1: Loading CSV data...
        Loaded 4657 records from /data/Raw_Loan_Balance.csv

Step 2: Cleaning data...
        Cleaned balance data: 4657 records after deduplication

Step 3: Checking table existence...
        
Error checking table exist

In [83]:
# Resolve the raw CSV and cleansed parquet path of each dataset from CONFIG.
account_csv_path, account_parquet_path = (
    CONFIG["files"]["account"][kind] for kind in ("csv", "parquet")
)

balance_csv_path, balance_parquet_path = (
    CONFIG["files"]["balance"][kind] for kind in ("csv", "parquet")
)

transaction_csv_path, transaction_parquet_path = (
    CONFIG["files"]["transaction"][kind] for kind in ("csv", "parquet")
)

In [84]:
# Raw CSVs: semicolon-delimited with header, column types inferred by Spark.
df_account_csv, df_balance_csv, df_transaction_csv = (
    spark.read.csv(path, header=True, inferSchema=True, sep=";")
    for path in (account_csv_path, balance_csv_path, transaction_csv_path)
)

# Cleansed parquet output written by the ETL.
df_account_parq, df_balance_parq, df_transaction_parq = (
    spark.read.parquet(path)
    for path in (
        account_parquet_path, balance_parquet_path, transaction_parquet_path
    )
)

In [77]:
# Print the schema inferred from the raw account CSV, then the schema of the
# cleansed account parquet, for a side-by-side comparison.
df_account_csv.printSchema()
df_account_parq.printSchema()

root
 |-- LoanAccountId: integer (nullable = true)
 |-- SourceId: integer (nullable = true)
 |-- AccountNumber: string (nullable = true)
 |-- IBAN: string (nullable = true)
 |-- BBAN: string (nullable = true)
 |-- AccountCurrencyId: integer (nullable = true)
 |-- AccountCurrency: string (nullable = true)
 |-- OrganizationId: integer (nullable = true)
 |-- OrganizationName: string (nullable = true)
 |-- ChannelID: string (nullable = true)
 |-- BrokerId: integer (nullable = true)
 |-- OpenDateId: integer (nullable = true)
 |-- OpenDate: date (nullable = true)
 |-- CancelledDateId: integer (nullable = true)
 |-- CancelledDate: string (nullable = true)
 |-- ValueDate: date (nullable = true)
 |-- MaturityDate: string (nullable = true)
 |-- ProductId: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- InvoiceDay: integer (nullable = true)
 |-- CurrentInstallmentAmount: string (nullable = true)
 |-- CurrentInvoiceFee: string (nullable = true)
 |-- RepaymentRate: string (nul

In [88]:
# Print the schema inferred from the raw balance CSV, then the schema of the
# cleansed balance parquet.
df_balance_csv.printSchema()
df_balance_parq.printSchema()

root
 |-- LoanAccountBalanceId: integer (nullable = true)
 |-- SourceId: integer (nullable = true)
 |-- BalanceDateId: integer (nullable = true)
 |-- LoanAccountId: integer (nullable = true)
 |-- ProductId: integer (nullable = true)
 |-- AccountCurrencyId: integer (nullable = true)
 |-- AccountStatusId: integer (nullable = true)
 |-- NumOfTransactions: integer (nullable = true)
 |-- NetTransactionAmount: integer (nullable = true)
 |-- NetTransactionAmountSek: integer (nullable = true)
 |-- AccruedInterest: string (nullable = true)
 |-- AccruedInterestSEK: string (nullable = true)
 |-- Balance: string (nullable = true)
 |-- BalanceSek: string (nullable = true)
 |-- LTV: integer (nullable = true)
 |-- PrecedingId: string (nullable = true)

root
 |-- LoanAccountBalanceIdentityId: integer (nullable = true)
 |-- LoanAccountBalanceId: integer (nullable = true)
 |-- SourceId: integer (nullable = true)
 |-- BalanceDateId: integer (nullable = true)
 |-- LoanAccountId: integer (nullable = true)


In [89]:
# Print the schema inferred from the raw transaction CSV, then the schema of
# the cleansed transaction parquet.
df_transaction_csv.printSchema()
df_transaction_parq.printSchema()

root
 |-- LoanAccountTransactionId: integer (nullable = true)
 |-- SourceId: integer (nullable = true)
 |-- TransactionDateId: integer (nullable = true)
 |-- ValueDateId: integer (nullable = true)
 |-- EntryDateID: integer (nullable = true)
 |-- LoanAccountId: integer (nullable = true)
 |-- TransactionTypeId: integer (nullable = true)
 |-- TransactionStatus: string (nullable = true)
 |-- RectifyStatus: string (nullable = true)
 |-- TransactionCurrencyId: integer (nullable = true)
 |-- TransactionAmount: string (nullable = true)
 |-- TransactionAmountSEK: string (nullable = true)
 |-- CounterpartClearingNumber: string (nullable = true)
 |-- CounterPartBic: string (nullable = true)
 |-- CounterPartIban: string (nullable = true)
 |-- TransactionReference: string (nullable = true)
 |-- ExchangeRateId: integer (nullable = true)
 |-- TransactionText: string (nullable = true)
 |-- AccountServicerReference: string (nullable = true)
 |-- CounterPartId: string (nullable = true)
 |-- CounterPartA

In [90]:
# Show the first rows of the raw account CSV and of the cleansed account
# parquet.
df_account_csv.show()
df_account_parq.show()

+-------------+--------+-------------+----+----+-----------------+---------------+--------------+----------------+---------+--------+----------+----------+---------------+-------------+----------+------------+---------+--------------------+----------+------------------------+-----------------+-------------+---------------+----------------------+
|LoanAccountId|SourceId|AccountNumber|IBAN|BBAN|AccountCurrencyId|AccountCurrency|OrganizationId|OrganizationName|ChannelID|BrokerId|OpenDateId|  OpenDate|CancelledDateId|CancelledDate| ValueDate|MaturityDate|ProductId|             Product|InvoiceDay|CurrentInstallmentAmount|CurrentInvoiceFee|RepaymentRate|NextInvoiceDate|CalculatedMaturityDate|
+-------------+--------+-------------+----+----+-----------------+---------------+--------------+----------------+---------+--------+----------+----------+---------------+-------------+----------+------------+---------+--------------------+----------+------------------------+-----------------+----------

In [91]:
# Show the first rows of the raw balance CSV and of the cleansed balance
# parquet.
df_balance_csv.show()
df_balance_parq.show()

+--------------------+--------+-------------+-------------+---------+-----------------+---------------+-----------------+--------------------+-----------------------+---------------+------------------+--------+-----------+---+-----------+
|LoanAccountBalanceId|SourceId|BalanceDateId|LoanAccountId|ProductId|AccountCurrencyId|AccountStatusId|NumOfTransactions|NetTransactionAmount|NetTransactionAmountSek|AccruedInterest|AccruedInterestSEK| Balance| BalanceSek|LTV|PrecedingId|
+--------------------+--------+-------------+-------------+---------+-----------------+---------------+-----------------+--------------------+-----------------------+---------------+------------------+--------+-----------+---+-----------+
|                9937|       8|     20230604|            2|      293|               49|            331|                0|                   0|                      0|              0|                 0|27591,69|311348,7687|  0|       NULL|
|               92431|       8|     20230605

In [92]:
# Show the first rows of the raw transaction CSV and of the cleansed
# transaction parquet.
df_transaction_csv.show()
df_transaction_parq.show()

+------------------------+--------+-----------------+-----------+-----------+-------------+-----------------+-----------------+-------------+---------------------+-----------------+--------------------+-------------------------+--------------+---------------+--------------------+--------------+---------------+------------------------+-------------+------------------------+-------------------+-------------------+-------------+---------+---------+---------+
|LoanAccountTransactionId|SourceId|TransactionDateId|ValueDateId|EntryDateID|LoanAccountId|TransactionTypeId|TransactionStatus|RectifyStatus|TransactionCurrencyId|TransactionAmount|TransactionAmountSEK|CounterpartClearingNumber|CounterPartBic|CounterPartIban|TransactionReference|ExchangeRateId|TransactionText|AccountServicerReference|CounterPartId|CounterPartAccountNumber|CounterPartBankName|TransactionDateTime|IsDirectDebit|GLAccount|EventName|InvoiceId|
+------------------------+--------+-----------------+-----------+-----------+---