In [1]:
from pyspark.sql import SparkSession
import os
from pyspark.sql.types import StructType, StructField, StringType, FloatType, DecimalType,IntegerType
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Create (or reuse) the Spark session. The Snowflake JDBC driver and the
# Spark-Snowflake connector are pulled in through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("SnowflakeLoad_LoanProduct")
    .config(
        "spark.jars.packages",
        "net.snowflake:snowflake-jdbc:3.13.22,net.snowflake:spark-snowflake_2.12:2.11.0-spark_3.3",
    )
    .getOrCreate()
)



# Read the source CSV into a DataFrame. The first row supplies the column names;
# no schema is given or inferred, so every column is loaded as a string.
# For a quick experiment, chain .sample(withReplacement=False, fraction=0.0001, seed=42).
csv_path = "Transformed_2014_18_v2.csv"
df = spark.read.csv(csv_path, header=True)


#### Source-to-target column names for the fact table

# Keys are the CSV column names; values are the names used in the fact table.
column_mapping = {
    "loan_amnt": "LOAN_AMOUNT",
    "funded_amnt_inv": "FUNDED_AMOUNT_INVESTOR",
    "int_rate": "INTEREST_RATE",
    "installment": "INSTALLMENT",
    "loan_status": "LOAN_STATUS",
    "dti": "DTI",
    "revol_bal": "REVOLVING_BALANCE",
    "revol_util": "REVOLVING_UTILIZATION",
    "tot_cur_bal": "TOTAL_CURRENT_BALANCE",
    "total_bal_il": "TOTAL_BALANCE_INSTALLMENT",
    "max_bal_bc": "MAXIMUM_BALANCE",
    "delinq_amnt": "DELINQUENT_AMOUNT",
    "issue_d": "ISSUE_DATE",
    "id": "LOANPRODUCT_BK",
}

# The CSV columns to keep are exactly the mapping's keys, in declaration order.
fact_columns = list(column_mapping)
 
# Keep only the CSV columns listed in fact_columns, then preview five rows.
FACT_DF = df.select(fact_columns)
FACT_DF.show(5)

# Snowflake connection options
def build_sf_options(env=None):
    """Build the option dict handed to the Spark-Snowflake connector.

    Credentials are read from the environment instead of being written in the
    notebook, so they never end up in version control or shared copies.

    Environment variables:
        SNOWFLAKE_USER, SNOWFLAKE_PASSWORD  - credentials (no default).
        SNOWFLAKE_URL, SNOWFLAKE_DATABASE, SNOWFLAKE_SCHEMA,
        SNOWFLAKE_WAREHOUSE                 - optional overrides; the defaults
                                              are the values this notebook has
                                              always used.

    Args:
        env: mapping to read the settings from; defaults to ``os.environ``.

    Returns:
        dict of connector options (sfURL, sfUser, sfPassword, sfDatabase,
        sfSchema, sfWarehouse, autopushdown, usestagingtable).
    """
    import warnings

    env = os.environ if env is None else env
    user = env.get("SNOWFLAKE_USER", "")
    password = env.get("SNOWFLAKE_PASSWORD", "")
    if not user or not password:
        # Warn here, where the cause is obvious, rather than leaving the reader
        # to decode an authentication failure on the first Snowflake read.
        warnings.warn(
            "SNOWFLAKE_USER / SNOWFLAKE_PASSWORD are not set; "
            "Snowflake reads and writes will fail to authenticate.",
            stacklevel=2,
        )

    return {
        "sfURL": env.get("SNOWFLAKE_URL", "https://WOA97553.east-us-2.azure.snowflakecomputing.com"),
        "sfUser": user,
        "sfPassword": password,
        "sfDatabase": env.get("SNOWFLAKE_DATABASE", "Loan_DB"),
        "sfSchema": env.get("SNOWFLAKE_SCHEMA", "Loan_Schema"),
        "sfWarehouse": env.get("SNOWFLAKE_WAREHOUSE", "loan_Warehouse"),
        "autopushdown": "on",
        "usestagingtable": "on",
    }


# NOTE(review): a password was previously hard-coded in this cell; treat it as
# compromised and rotate it.
sf_options = build_sf_options()


# Apply column_mapping in a single pass: every current column is swapped for its
# mapped name, and columns without a mapping keep the name they already have.
FACT_DF = FACT_DF.toDF(*[column_mapping.get(name, name) for name in FACT_DF.columns])

# Confirm the renamed schema and look at a few rows.
print("Transformed Data for FACT_DF:")
FACT_DF.printSchema()
FACT_DF.show(5)

+---------+---------------+--------+-----------+-----------+-----+---------+----------+-----------+------------+----------+-----------+----------+------+
|loan_amnt|funded_amnt_inv|int_rate|installment|loan_status|  dti|revol_bal|revol_util|tot_cur_bal|total_bal_il|max_bal_bc|delinq_amnt|   issue_d|    id|
+---------+---------------+--------+-----------+-----------+-----+---------+----------+-----------+------------+----------+-----------+----------+------+
|   8000.0|         8000.0|   6.49%|     245.16|Charged Off|10.84|   3012.0|     35.4%|    74131.0|     17290.0|    1569.0|        0.0|2016-01-01| 56121|
|   8800.0|         8800.0|  18.99%|     322.53|Charged Off|18.24|  14741.0|     81.9%|   173057.0|    158316.0|    4878.0|        0.0|2016-05-01| 65104|
|  16000.0|        16000.0|   6.89%|     493.23| Fully Paid|23.43|  49975.0|     73.4%|   262520.0|        NULL|      NULL|        0.0|2015-06-01| 65419|
|  10000.0|        10000.0|   7.89%|     312.86| Fully Paid|11.24|  10063.0|

# Load Dim_LOANPRODUCT from snowflake into spark data frame

In [2]:
from pyspark.sql import SparkSession


# Read the DIM_LOANPRODUCT table from Snowflake into a Spark DataFrame.
df_dim_Loanproduct = (
    spark.read
    .format("snowflake")
    .options(**sf_options, dbtable="DIM_LOANPRODUCT")
    .load()
)

# Preview the first four rows.
df_dim_Loanproduct.show(4)

# Optional: Save the DataFrame as a CSV or another format if needed
# df_snowflake.write.option("header", "true").csv("output_path")


+-------------------+----------+-------------+---------+------------------+-------------------+-------------+----------------+--------------------+--------------+
|LOAN_PRODUCT_KEY_PK|LOAN_GRADE|LOAN_SUBGRADE|LOAN_TERM|      LOAN_PURPOSE|INITIAL_LIST_STATUS|OUT_PRINCIPAL|APPLICATION_TYPE|DEBT_SETTLEMENT_FLAG|LOANPRODUCT_BK|
+-------------------+----------+-------------+---------+------------------+-------------------+-------------+----------------+--------------------+--------------+
|                  1|         A|           A2|       36|  home_improvement|                  w|          0.0|      Individual|                   N|         56121|
|                  2|         D|           D4|       36|debt_consolidation|                  w|          0.0|      Individual|                   N|         65104|
|                  3|         A|           A3|       36|  home_improvement|                  w|          0.0|      Individual|                   N|         65419|
|                  4| 

# Load Dim_Borrower from snowflake into spark data frame

In [3]:
from pyspark.sql import SparkSession


# Read the DIM_BORROWER table from Snowflake into a Spark DataFrame.
df_dim_Borrower = (
    spark.read
    .format("snowflake")
    .options(**sf_options, dbtable="DIM_BORROWER")
    .load()
)

# Preview the first four rows.
df_dim_Borrower.show(4)

# Optional: Save the DataFrame as a CSV or another format if needed
# df_snowflake.write.option("header", "true").csv("output_path")


+------------------+-------------+----------------+-----------------+-------------+-------------------------+-------------------+--------+-------------+------------+-------------+--------------+------------+-------------+-------------+----------------+-------------------+----------------+---------------------------+-------------------------+----------------+-------------+--------------+
|BORROWER_KEY_PK_SK|ANNUAL_INCOME|EMPLOYMENT_TITLE|EMPLOYMENT_LENGTH|HOMEOWNERSHIP|EARLIEST_CREDIT_LINE_DATE|VERIFICATION_STATUS|ZIP_CODE|ADDRESS_STATE|       STATE|DELINQ_2YEARS|AVG_FICO_RANGE|OPEN_ACCOUNT|PUBLIC_RECORD|TOTAL_ACCOUNT|LAST_PAYMENT_DAY|LAST_PAYMENT_AMOUNT|NEXT_PAYMENT_DAY|MTHS_SINCE_LAST_MAJOR_DEROG|ACTIVE_LOANS_LAST_24MONTH|MORTGAGE_ACCOUNT|HARDSHIP_FLAG|LOANPRODUCT_BK|
+------------------+-------------+----------------+-----------------+-------------+-------------------------+-------------------+--------+-------------+------------+-------------+--------------+------------+-------------

# Load DIM_DATE from Snowflake into a Spark DataFrame

In [4]:
from pyspark.sql import SparkSession


# Read the DIM_DATE table from Snowflake into a Spark DataFrame.
df_dim_date = (
    spark.read
    .format("snowflake")
    .options(**sf_options, dbtable="DIM_DATE")
    .load()
)

# Preview the first four rows.
df_dim_date.show(4)

# Optional: Save the DataFrame as a CSV or another format if needed
# df_snowflake.write.option("header", "true").csv("output_path")


+--------+----------+---+-----+----+
| DATE_SK|      DATE|DAY|MONTH|YEAR|
+--------+----------+---+-----+----+
|20000101|2000-01-01|  1|    1|2000|
|20000102|2000-01-02|  2|    1|2000|
|20000103|2000-01-03|  3|    1|2000|
|20000104|2000-01-04|  4|    1|2000|
+--------+----------+---+-----+----+
only showing top 4 rows



In [14]:
# Print the column names and types of the date dimension read from Snowflake.
df_dim_date.printSchema()

root
 |-- DATE_SK: decimal(38,0) (nullable = false)
 |-- DATE: date (nullable = false)
 |-- DAY: decimal(38,0) (nullable = true)
 |-- MONTH: decimal(38,0) (nullable = true)
 |-- YEAR: decimal(38,0) (nullable = true)



# join FACT_DF with dim_Borrower and DIM_LOANPRODUCT to get df_FACT_TABLE_enrich

In [None]:
# join FACT_DF with dim_Borrower and DIM_LOANPRODUCT to get df_FACT_TABLE_enrich

In [6]:
# Columns kept after joining FACT_DF to the borrower dimension.
# LOANPRODUCT_BK is passed as a Column taken from FACT_DF (not as a bare name)
# because both sides of that join carry a column with this name.
fact_Borrower_columns = [
    'LOAN_AMOUNT',
    'FUNDED_AMOUNT_INVESTOR',
    'INTEREST_RATE',
    'INSTALLMENT',
    'LOAN_STATUS',
    'DTI',
    'REVOLVING_BALANCE',
    'REVOLVING_UTILIZATION',
    'TOTAL_CURRENT_BALANCE',
    'TOTAL_BALANCE_INSTALLMENT',
    'MAXIMUM_BALANCE',
    'DELINQUENT_AMOUNT',
    'ISSUE_DATE',
    FACT_DF['LOANPRODUCT_BK'],
    'BORROWER_KEY_PK_SK',
]

In [7]:
# First join: attach BORROWER_KEY_PK_SK to each fact row by matching the
# business key LOANPRODUCT_BK between FACT_DF and the borrower dimension.
# A left join keeps fact rows that have no matching borrower (key stays NULL).
fact_with_borrower = (
    FACT_DF
    .join(
        df_dim_Borrower,
        FACT_DF["LOANPRODUCT_BK"] == df_dim_Borrower["LOANPRODUCT_BK"],
        "left",
    )
    .select(*fact_Borrower_columns)
)

fact_with_borrower.printSchema()

root
 |-- LOAN_AMOUNT: string (nullable = true)
 |-- FUNDED_AMOUNT_INVESTOR: string (nullable = true)
 |-- INTEREST_RATE: string (nullable = true)
 |-- INSTALLMENT: string (nullable = true)
 |-- LOAN_STATUS: string (nullable = true)
 |-- DTI: string (nullable = true)
 |-- REVOLVING_BALANCE: string (nullable = true)
 |-- REVOLVING_UTILIZATION: string (nullable = true)
 |-- TOTAL_CURRENT_BALANCE: string (nullable = true)
 |-- TOTAL_BALANCE_INSTALLMENT: string (nullable = true)
 |-- MAXIMUM_BALANCE: string (nullable = true)
 |-- DELINQUENT_AMOUNT: string (nullable = true)
 |-- ISSUE_DATE: string (nullable = true)
 |-- LOANPRODUCT_BK: string (nullable = true)
 |-- BORROWER_KEY_PK_SK: decimal(38,0) (nullable = true)



In [8]:
# Columns kept after joining the borrower-enriched rows to the loan-product
# dimension. LOANPRODUCT_BK is a Column taken from fact_with_borrower because
# the dimension being joined has a column with the same name.
fact_Borrower_LoanPRODUCT_columns = [
    'LOAN_AMOUNT',
    'FUNDED_AMOUNT_INVESTOR',
    'INTEREST_RATE',
    'INSTALLMENT',
    'LOAN_STATUS',
    'DTI',
    'REVOLVING_BALANCE',
    'REVOLVING_UTILIZATION',
    'TOTAL_CURRENT_BALANCE',
    'TOTAL_BALANCE_INSTALLMENT',
    'MAXIMUM_BALANCE',
    'DELINQUENT_AMOUNT',
    'ISSUE_DATE',
    fact_with_borrower['LOANPRODUCT_BK'],
    'BORROWER_KEY_PK_SK',
    'LOAN_PRODUCT_KEY_PK',
]

In [9]:
# Second join: attach LOAN_PRODUCT_KEY_PK by matching LOANPRODUCT_BK against the
# loan-product dimension. Left join, so unmatched fact rows are kept.
df_FACT_TABLE_enrich = (
    fact_with_borrower
    .join(
        df_dim_Loanproduct,
        fact_with_borrower["LOANPRODUCT_BK"] == df_dim_Loanproduct["LOANPRODUCT_BK"],
        "left",
    )
    .select(*fact_Borrower_LoanPRODUCT_columns)
)

# Preview two rows and the resulting schema.
df_FACT_TABLE_enrich.show(2)
df_FACT_TABLE_enrich.printSchema()

+-----------+----------------------+-------------+-----------+-----------+-----+-----------------+---------------------+---------------------+-------------------------+---------------+-----------------+----------+--------------+------------------+-------------------+
|LOAN_AMOUNT|FUNDED_AMOUNT_INVESTOR|INTEREST_RATE|INSTALLMENT|LOAN_STATUS|  DTI|REVOLVING_BALANCE|REVOLVING_UTILIZATION|TOTAL_CURRENT_BALANCE|TOTAL_BALANCE_INSTALLMENT|MAXIMUM_BALANCE|DELINQUENT_AMOUNT|ISSUE_DATE|LOANPRODUCT_BK|BORROWER_KEY_PK_SK|LOAN_PRODUCT_KEY_PK|
+-----------+----------------------+-------------+-----------+-----------+-----+-----------------+---------------------+---------------------+-------------------------+---------------+-----------------+----------+--------------+------------------+-------------------+
|    24000.0|               24000.0|       14.98%|     570.71|Charged Off| 6.81|          12897.0|                38.8%|             160760.0|                     NULL|           NULL|            

In [10]:
# Assign the fact table's surrogate key LOAN_APP_PK_SK as a 1-based row number.
#
# The rows are ordered by the business key, with the two dimension keys as
# tie-breakers, so the same input gets the same key on every run. Ordering by a
# constant left the numbering to whatever physical row order Spark produced.
#
# NOTE(review): a window with no partitionBy moves all rows into one partition.
# NOTE(review): numbering restarts at 1 on every run, while the load step writes
# with mode("append"); loading twice would duplicate keys. Confirm the target is
# empty before loading, or offset by the table's current maximum key.
windowSpec = Window.orderBy("LOANPRODUCT_BK", "BORROWER_KEY_PK_SK", "LOAN_PRODUCT_KEY_PK")

df_FACT_TABLE_enrich = df_FACT_TABLE_enrich.withColumn(
    "LOAN_APP_PK_SK",
    F.row_number().over(windowSpec),
)

# Final column order of the fact table, surrogate key first.
final_columns_of_FACT_TABLE = [
    'LOAN_APP_PK_SK',
    'LOAN_AMOUNT',
    'FUNDED_AMOUNT_INVESTOR',
    'INTEREST_RATE',
    'INSTALLMENT',
    'LOAN_STATUS',
    'DTI',
    'REVOLVING_BALANCE',
    'REVOLVING_UTILIZATION',
    'TOTAL_CURRENT_BALANCE',
    'TOTAL_BALANCE_INSTALLMENT',
    'MAXIMUM_BALANCE',
    'DELINQUENT_AMOUNT',
    'ISSUE_DATE',
    df_FACT_TABLE_enrich['LOANPRODUCT_BK'],
    'BORROWER_KEY_PK_SK',
    'LOAN_PRODUCT_KEY_PK',
]

# Select the required columns
df_FACT_TABLE_enrich = df_FACT_TABLE_enrich.select(final_columns_of_FACT_TABLE)

In [11]:
# Print the current schema of df_FACT_TABLE_enrich.
df_FACT_TABLE_enrich.printSchema()

root
 |-- LOAN_APP_PK_SK: integer (nullable = false)
 |-- LOAN_AMOUNT: string (nullable = true)
 |-- FUNDED_AMOUNT_INVESTOR: string (nullable = true)
 |-- INTEREST_RATE: string (nullable = true)
 |-- INSTALLMENT: string (nullable = true)
 |-- LOAN_STATUS: string (nullable = true)
 |-- DTI: string (nullable = true)
 |-- REVOLVING_BALANCE: string (nullable = true)
 |-- REVOLVING_UTILIZATION: string (nullable = true)
 |-- TOTAL_CURRENT_BALANCE: string (nullable = true)
 |-- TOTAL_BALANCE_INSTALLMENT: string (nullable = true)
 |-- MAXIMUM_BALANCE: string (nullable = true)
 |-- DELINQUENT_AMOUNT: string (nullable = true)
 |-- ISSUE_DATE: string (nullable = true)
 |-- LOANPRODUCT_BK: string (nullable = true)
 |-- BORROWER_KEY_PK_SK: decimal(38,0) (nullable = true)
 |-- LOAN_PRODUCT_KEY_PK: decimal(38,0) (nullable = true)



In [12]:
# Preview the first two rows of df_FACT_TABLE_enrich.
df_FACT_TABLE_enrich.show(2)

+--------------+-----------+----------------------+-------------+-----------+-----------+-----+-----------------+---------------------+---------------------+-------------------------+---------------+-----------------+----------+--------------+------------------+-------------------+
|LOAN_APP_PK_SK|LOAN_AMOUNT|FUNDED_AMOUNT_INVESTOR|INTEREST_RATE|INSTALLMENT|LOAN_STATUS|  DTI|REVOLVING_BALANCE|REVOLVING_UTILIZATION|TOTAL_CURRENT_BALANCE|TOTAL_BALANCE_INSTALLMENT|MAXIMUM_BALANCE|DELINQUENT_AMOUNT|ISSUE_DATE|LOANPRODUCT_BK|BORROWER_KEY_PK_SK|LOAN_PRODUCT_KEY_PK|
+--------------+-----------+----------------------+-------------+-----------+-----------+-----+-----------------+---------------------+---------------------+-------------------------+---------------+-----------------+----------+--------------+------------------+-------------------+
|             1|    24000.0|               24000.0|       14.98%|     570.71|Charged Off| 6.81|          12897.0|                38.8%|             160

In [13]:
# Fact-table columns plus DATE_SK, used for the select after the date join.
# LOANPRODUCT_BK is given as a Column bound to df_FACT_TABLE_enrich, not a name.
final_columns_of_FACT_TABLE_with_DATE = [
    'LOAN_APP_PK_SK',
    'LOAN_AMOUNT',
    'FUNDED_AMOUNT_INVESTOR',
    'INTEREST_RATE',
    'INSTALLMENT',
    'LOAN_STATUS',
    'DTI',
    'REVOLVING_BALANCE',
    'REVOLVING_UTILIZATION',
    'TOTAL_CURRENT_BALANCE',
    'TOTAL_BALANCE_INSTALLMENT',
    'MAXIMUM_BALANCE',
    'DELINQUENT_AMOUNT',
    'ISSUE_DATE',
    df_FACT_TABLE_enrich['LOANPRODUCT_BK'],
    'BORROWER_KEY_PK_SK',
    'LOAN_PRODUCT_KEY_PK',
    'DATE_SK',
]

In [15]:
# code to change ISSUE_DATE type to DATE
from pyspark.sql.functions import to_date, col
from pyspark.sql.types import DateType

# Parse the string ISSUE_DATE column into a real date using the 'yyyy-MM-dd'
# pattern, so it can be compared with the date dimension's DATE column.
# Change the pattern here if the source uses another layout (e.g. "MMM-yyyy").
df_FACT_TABLE_enrich = df_FACT_TABLE_enrich.withColumn(
    "ISSUE_DATE", F.to_date(F.col("ISSUE_DATE"), "yyyy-MM-dd")
)

In [16]:
# Third join: look up DATE_SK by matching ISSUE_DATE against the date
# dimension's DATE column. Left join, so rows without a matching date are kept.
final_FACT_TABLE_enrich = (
    df_FACT_TABLE_enrich
    .join(
        df_dim_date,
        df_FACT_TABLE_enrich["ISSUE_DATE"] == df_dim_date["DATE"],
        "left",
    )
    .select(*final_columns_of_FACT_TABLE_with_DATE)
)

# Preview two rows and the resulting schema.
final_FACT_TABLE_enrich.show(2)
final_FACT_TABLE_enrich.printSchema()

+--------------+-----------+----------------------+-------------+-----------+-----------+-----+-----------------+---------------------+---------------------+-------------------------+---------------+-----------------+----------+--------------+------------------+-------------------+--------+
|LOAN_APP_PK_SK|LOAN_AMOUNT|FUNDED_AMOUNT_INVESTOR|INTEREST_RATE|INSTALLMENT|LOAN_STATUS|  DTI|REVOLVING_BALANCE|REVOLVING_UTILIZATION|TOTAL_CURRENT_BALANCE|TOTAL_BALANCE_INSTALLMENT|MAXIMUM_BALANCE|DELINQUENT_AMOUNT|ISSUE_DATE|LOANPRODUCT_BK|BORROWER_KEY_PK_SK|LOAN_PRODUCT_KEY_PK| DATE_SK|
+--------------+-----------+----------------------+-------------+-----------+-----------+-----+-----------------+---------------------+---------------------+-------------------------+---------------+-----------------+----------+--------------+------------------+-------------------+--------+
|             1|    24000.0|               24000.0|       14.98%|     570.71|Charged Off| 6.81|          12897.0|           

In [18]:
# Print the schema of the final fact DataFrame before it is written to Snowflake.
final_FACT_TABLE_enrich.printSchema()

root
 |-- LOAN_APP_PK_SK: integer (nullable = false)
 |-- LOAN_AMOUNT: string (nullable = true)
 |-- FUNDED_AMOUNT_INVESTOR: string (nullable = true)
 |-- INTEREST_RATE: string (nullable = true)
 |-- INSTALLMENT: string (nullable = true)
 |-- LOAN_STATUS: string (nullable = true)
 |-- DTI: string (nullable = true)
 |-- REVOLVING_BALANCE: string (nullable = true)
 |-- REVOLVING_UTILIZATION: string (nullable = true)
 |-- TOTAL_CURRENT_BALANCE: string (nullable = true)
 |-- TOTAL_BALANCE_INSTALLMENT: string (nullable = true)
 |-- MAXIMUM_BALANCE: string (nullable = true)
 |-- DELINQUENT_AMOUNT: string (nullable = true)
 |-- ISSUE_DATE: date (nullable = true)
 |-- LOANPRODUCT_BK: string (nullable = true)
 |-- BORROWER_KEY_PK_SK: decimal(38,0) (nullable = true)
 |-- LOAN_PRODUCT_KEY_PK: decimal(38,0) (nullable = true)
 |-- DATE_SK: decimal(38,0) (nullable = true)



# Load into FACT_LOANAPPLICATION

In [22]:

# Smoke-test the Snowflake connection with a trivial query, then append the
# enriched fact rows to FACT_LOANAPPLICATION.
# Any failure is printed and then re-raised: a load that did not happen must
# stop the notebook instead of looking like a normal cell completion.
try:
    print("Testing Snowflake connection...")
    test_df = (
        spark.read
        .format("net.snowflake.spark.snowflake")
        .options(**sf_options)
        .option("query", "SELECT 1 AS test_value")
        .load()
    )
    test_df.show()
    print("Connection test successful!")

    print("Writing data to Snowflake...")
    # mode("append") adds to whatever is already in the table.
    (
        final_FACT_TABLE_enrich.write
        .format("snowflake")
        .options(**sf_options)
        .option("dbtable", "FACT_LOANAPPLICATION")
        .mode("append")
        .save()
    )

    print("Data load complete!")
except Exception as e:
    print("Error occurred:", str(e))
    raise

# Optional verification: read the table back and display it.
# snowflake_df = (
#     spark.read
#     .format("net.snowflake.spark.snowflake")
#     .options(**sf_options)
#     .option("query", "SELECT * FROM Loan_DB.Loan_Schema.FACT_LOANAPPLICATION")
#     .load()
# )
# snowflake_df.show()

# Stop the session once the notebook is finished:
# spark.stop()

Testing Snowflake connection...
+----------+
|TEST_VALUE|
+----------+
|         1|
+----------+

Connection test successful!
Writing data to Snowflake...
Data load complete!
