In [1]:
from pyspark.sql import SparkSession
import os
from pyspark.sql.types import StructType, StructField, StringType, FloatType, DecimalType,IntegerType
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Create (or reuse) the Spark session, asking Spark to fetch the Snowflake
# JDBC driver and the Spark-Snowflake connector via spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("SnowflakeLoad_LoanProduct")
    .config(
        "spark.jars.packages",
        "net.snowflake:snowflake-jdbc:3.13.22,net.snowflake:spark-snowflake_2.12:2.11.0-spark_3.3",
    )
    .getOrCreate()
)



# Read the transformed loan CSV; the first line of the file is the header row.
# No schema is supplied or inferred here, so every column is read as a string.
# For a quick dry run, append: .sample(withReplacement=False, fraction=0.0001, seed=42)
csv_path = "Transformed_2014_18_v2.csv"
df = spark.read.csv(csv_path, header=True)


#### Shape the CSV data for the second-borrower dimension

# Source CSV column name -> column name used after the rename step below.
column_mapping = {
    "application_type": "application_type",
    "annual_inc_joint": "annual_income_joint",
    "dti_joint": "debt_to_income_ratio_joint",
    "sec_app_fico_range_avg": "sec_app_fico_range_avg",
    "sec_app_earliest_cr_line": "sec_app_earliest_cr_line",
    "sec_app_inq_last_6mths": "sec_app_inq_last_6mths",
    "sec_app_mort_acc": "sec_app_mort_acc",
    "sec_app_open_acc": "sec_app_open_acc",
    "sec_app_revol_util": "sec_app_revol_util",
    "id": "LOANPRODUCT_BK",
}

# The columns to extract are exactly the mapping's source names, in the same
# order, so derive the list from the mapping rather than repeating it.
columns_of_SecBorrower = list(column_mapping)

# Keep only the rows whose application_type is "Joint App".
is_joint_application = df["application_type"] == "Joint App"
dim_SecBorrower = df.where(is_joint_application).select(columns_of_SecBorrower)
dim_SecBorrower.show(5)

# Snowflake connection options
def load_snowflake_options(env=None):
    """Build the option dict handed to the Spark Snowflake connector.

    The account URL, database, schema, warehouse and connector flags keep
    their original literal values. The user name and password are read from
    the ``SNOWFLAKE_USER`` and ``SNOWFLAKE_PASSWORD`` environment variables so
    that no credential is stored in the notebook.

    Args:
        env: Mapping to read the credentials from. Defaults to ``os.environ``.

    Returns:
        dict: Connector options. ``sfUser`` / ``sfPassword`` are present only
        when the corresponding variable is set to a non-empty value; a
        warning is printed for each one that is missing.
    """
    if env is None:
        env = os.environ

    options = {
        "sfURL": "https://WOA97553.east-us-2.azure.snowflakecomputing.com",
        "sfDatabase": 'Loan_DB',
        "sfSchema": 'Loan_Schema',
        "sfWarehouse": 'loan_Warehouse',
        "autopushdown": "on",
        "usestagingtable": "on",
    }

    credential_sources = (
        ("sfUser", "SNOWFLAKE_USER"),
        ("sfPassword", "SNOWFLAKE_PASSWORD"),
    )
    for option_key, env_name in credential_sources:
        value = env.get(env_name)
        if value:
            options[option_key] = value
        else:
            # Leave the key out instead of sending an empty credential.
            print(
                f"WARNING: environment variable {env_name} is not set; "
                f"'{option_key}' is missing from the Snowflake options."
            )
    return options


# NOTE(review): a user name and password used to be hardcoded in this cell.
# Treat that password as exposed and rotate it.
sf_options = load_snowflake_options()

from pyspark.sql import functions as F
from pyspark.sql.window import Window



# Apply column_mapping: each source column that is present gets its target name.
renamed_borrower = dim_SecBorrower
for old_name, new_name in column_mapping.items():
    if old_name not in renamed_borrower.columns:
        continue
    renamed_borrower = renamed_borrower.withColumnRenamed(old_name, new_name)
dim_SecBorrower = renamed_borrower

# Print the schema and a few rows to confirm the new column names.
print("Transformed Data for dim_SecBorrower:")

dim_SecBorrower.printSchema()
dim_SecBorrower.show(5)

SyntaxError: expression expected after dictionary key and ':' (655834417.py, line 42)

# Load DIM_BORROWER from Snowflake into a Spark DataFrame

In [4]:
from pyspark.sql import SparkSession


# Read the DIM_BORROWER table from Snowflake with the shared connection options.
df_dim_Borrower = (
    spark.read
    .format("snowflake")
    .options(**sf_options)
    .option("dbtable", "DIM_BORROWER")
    .load()
)

# Preview a few rows to confirm the read returned data.
df_dim_Borrower.show(4)

# Optional: keep a local copy of the table, e.g.
# df_dim_Borrower.write.option("header", "true").csv("output_path")


+------------------+-------------+----------------+-----------------+-------------+-------------------------+-------------------+--------+-------------+------------+-------------+--------------+------------+-------------+-------------+----------------+-------------------+----------------+---------------------------+-------------------------+----------------+-------------+--------------+
|BORROWER_KEY_PK_SK|ANNUAL_INCOME|EMPLOYMENT_TITLE|EMPLOYMENT_LENGTH|HOMEOWNERSHIP|EARLIEST_CREDIT_LINE_DATE|VERIFICATION_STATUS|ZIP_CODE|ADDRESS_STATE|       STATE|DELINQ_2YEARS|AVG_FICO_RANGE|OPEN_ACCOUNT|PUBLIC_RECORD|TOTAL_ACCOUNT|LAST_PAYMENT_DAY|LAST_PAYMENT_AMOUNT|NEXT_PAYMENT_DAY|MTHS_SINCE_LAST_MAJOR_DEROG|ACTIVE_LOANS_LAST_24MONTH|MORTGAGE_ACCOUNT|HARDSHIP_FLAG|LOANPRODUCT_BK|
+------------------+-------------+----------------+-----------------+-------------+-------------------------+-------------------+--------+-------------+------------+-------------+--------------+------------+-------------

# Join dim_SecBorrower with df_dim_Borrower to get df_dim_secBorrower_enrich

In [5]:
# Left-join the joint-application rows to DIM_BORROWER on LOANPRODUCT_BK, so
# rows of dim_SecBorrower without a matching borrower are still kept.
# The condition is an expression over both frames, so the joined result holds
# a LOANPRODUCT_BK column from each side.
loan_key_matches = dim_SecBorrower["LOANPRODUCT_BK"] == df_dim_Borrower["LOANPRODUCT_BK"]

df_dim_secBorrower_enrich = dim_SecBorrower.join(
    df_dim_Borrower,
    on=loan_key_matches,
    how="left",
)

df_dim_secBorrower_enrich.show(2)

+----------------+-------------------+--------------------------+----------------------+------------------------+----------------------+----------------+----------------+------------------+--------------+------------------+-------------+--------------------+-----------------+-------------+-------------------------+-------------------+--------+-------------+--------+-------------+--------------+------------+-------------+-------------+----------------+-------------------+----------------+---------------------------+-------------------------+----------------+-------------+--------------+
|application_type|annual_income_joint|debt_to_income_ratio_joint|sec_app_fico_range_avg|sec_app_earliest_cr_line|sec_app_inq_last_6mths|sec_app_mort_acc|sec_app_open_acc|sec_app_revol_util|LOANPRODUCT_BK|BORROWER_KEY_PK_SK|ANNUAL_INCOME|    EMPLOYMENT_TITLE|EMPLOYMENT_LENGTH|HOMEOWNERSHIP|EARLIEST_CREDIT_LINE_DATE|VERIFICATION_STATUS|ZIP_CODE|ADDRESS_STATE|   STATE|DELINQ_2YEARS|AVG_FICO_RANGE|OPEN_ACCOU

# Select final columns

In [6]:
# Columns kept from the joined frame: the renamed CSV fields plus the borrower
# surrogate key that came from DIM_BORROWER.
new_columns_of_second_Borrower = [
    "application_type",
    "annual_income_joint",
    "debt_to_income_ratio_joint",
    "sec_app_fico_range_avg",
    "sec_app_earliest_cr_line",
    "sec_app_inq_last_6mths",
    "sec_app_mort_acc",
    "sec_app_open_acc",
    "sec_app_revol_util",
    # Referenced through dim_SecBorrower rather than by name, because the
    # joined frame has a LOANPRODUCT_BK column from each side of the join.
    dim_SecBorrower["LOANPRODUCT_BK"],
    "BORROWER_KEY_PK_SK",
]

final_dim_secBorrower = df_dim_secBorrower_enrich.select(*new_columns_of_second_Borrower)

final_dim_secBorrower.show(2)
final_dim_secBorrower.printSchema()

+----------------+-------------------+--------------------------+----------------------+------------------------+----------------------+----------------+----------------+------------------+--------------+------------------+
|application_type|annual_income_joint|debt_to_income_ratio_joint|sec_app_fico_range_avg|sec_app_earliest_cr_line|sec_app_inq_last_6mths|sec_app_mort_acc|sec_app_open_acc|sec_app_revol_util|LOANPRODUCT_BK|BORROWER_KEY_PK_SK|
+----------------+-------------------+--------------------------+----------------------+------------------------+----------------------+----------------+----------------+------------------+--------------+------------------+
|       Joint App|           115000.0|                     24.95|                  NULL|                    NULL|                  NULL|            NULL|            NULL|              NULL|      32519551|           1549831|
|       Joint App|           168472.0|                      4.55|                  NULL|                

In [7]:
# Assign the surrogate key SEC_BORROWER_KEY_PK_SK with row_number(), starting at 1.
#
# The window is ordered by real columns instead of a constant: ordering by
# F.lit(1) lets Spark number the rows in whatever order they happen to arrive,
# so the same input could receive different keys on every run.
# NOTE(review): rows sharing both LOANPRODUCT_BK and BORROWER_KEY_PK_SK would
# still tie; confirm that pair is unique in this frame.
# NOTE(review): the window has no partitionBy, so all rows are numbered in a
# single partition. Numbering also restarts at 1 on every run, while the load
# cell writes with mode("append"); re-running the notebook would therefore
# append rows with keys that already exist.
windowSpec = Window.orderBy(F.col("LOANPRODUCT_BK"), F.col("BORROWER_KEY_PK_SK"))

final_dim_secBorrower = final_dim_secBorrower.withColumn(
    "SEC_BORROWER_KEY_PK_SK", F.row_number().over(windowSpec)
)

# Final column order for the load, with the surrogate key first.
# final_dim_secBorrower has exactly one LOANPRODUCT_BK column at this point,
# so the plain name is unambiguous and no reference back to dim_SecBorrower
# is needed.
final_columns_of_second_Borrower = [
    "SEC_BORROWER_KEY_PK_SK",
    "application_type",
    "annual_income_joint",
    "debt_to_income_ratio_joint",
    "sec_app_fico_range_avg",
    "sec_app_earliest_cr_line",
    "sec_app_inq_last_6mths",
    "sec_app_mort_acc",
    "sec_app_open_acc",
    "sec_app_revol_util",
    "LOANPRODUCT_BK",
    "BORROWER_KEY_PK_SK",
]

# Select the required columns
final_dim_secBorrower = final_dim_secBorrower.select(final_columns_of_second_Borrower)


In [8]:
# Print the final column names and types as a last check before the load cell below.
final_dim_secBorrower.printSchema()

root
 |-- SEC_BORROWER_KEY_PK_SK: integer (nullable = false)
 |-- application_type: string (nullable = true)
 |-- annual_income_joint: string (nullable = true)
 |-- debt_to_income_ratio_joint: string (nullable = true)
 |-- sec_app_fico_range_avg: string (nullable = true)
 |-- sec_app_earliest_cr_line: string (nullable = true)
 |-- sec_app_inq_last_6mths: string (nullable = true)
 |-- sec_app_mort_acc: string (nullable = true)
 |-- sec_app_open_acc: string (nullable = true)
 |-- sec_app_revol_util: string (nullable = true)
 |-- LOANPRODUCT_BK: string (nullable = true)
 |-- BORROWER_KEY_PK_SK: decimal(38,0) (nullable = true)



# Load into DIM_SECONDBORROWER

In [9]:

# Load final_dim_secBorrower into the Snowflake table DIM_SECONDBORROWER.
#
# 1. Run a trivial query first to confirm the connection options work.
# 2. Append the DataFrame to DIM_SECONDBORROWER.
#
# NOTE(review): mode("append") adds the rows on every execution, so running
# this cell twice loads the data twice.
try:
    print("Testing Snowflake connection...")
    test_df = (
        spark.read
        .format("net.snowflake.spark.snowflake")
        .options(**sf_options)
        .option("query", "SELECT 1 AS test_value")
        .load()
    )
    test_df.show()
    print("Connection test successful!")

    print("Writing data to Snowflake...")
    (
        final_dim_secBorrower.write
        .format("snowflake")
        .options(**sf_options)
        .option("dbtable", "DIM_SECONDBORROWER")
        .mode("append")
        .save()
    )

    print("Data load complete!")

    # Optional read-back check (disabled). This used to sit inside the try
    # block as a string literal; kept here as a comment for reference:
    #
    # print("Reading data back from Snowflake to verify...")
    # snowflake_df = spark.read \
    #     .format("net.snowflake.spark.snowflake") \
    #     .options(**sf_options) \
    #     .option("query", "SELECT * FROM Loan_DB.Loan_Schema.DIM_SECONDBORROWER") \
    #     .load()
    #
    # snowflake_df.show()
except Exception as e:
    print("Error occurred:", str(e))
    # Re-raise so a failed connection test or write marks the cell as failed
    # instead of letting the notebook carry on as if the load had succeeded.
    raise

#finally:
    #spark.stop()

Testing Snowflake connection...
+----------+
|TEST_VALUE|
+----------+
|         1|
+----------+

Connection test successful!
Writing data to Snowflake...
Data load complete!
