In [1]:
from pyspark.sql import SparkSession
import os
from pyspark.sql.types import StructType, StructField, StringType, FloatType, DecimalType,IntegerType
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Create (or reuse) the Spark session. The Snowflake JDBC driver and the
# Spark-Snowflake connector are requested through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("SnowflakeLoad_LoanProduct")
    .config(
        "spark.jars.packages",
        "net.snowflake:snowflake-jdbc:3.13.22,net.snowflake:spark-snowflake_2.12:2.11.0-spark_3.3",
    )
    .getOrCreate()
)

# Column names and types, presumably mirroring the DIM_LOANPRODUCT target
# table (TODO confirm). Every field is declared nullable.
# NOTE(review): this schema is not passed to any reader or writer in the
# visible notebook, and it has no LOANPRODUCT_BK field - confirm it is needed.
custom_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("LOAN_PRODUCT_KEY_PK", DecimalType(38, 0)),
        ("LOAN_GRADE", StringType()),
        ("LOAN_SUBGRADE", StringType()),
        ("LOAN_TERM", StringType()),
        ("LOAN_PURPOSE", StringType()),
        ("INITIAL_LIST_STATUS", StringType()),
        ("OUT_PRINCIPAL", FloatType()),  # float
        ("APPLICATION_TYPE", StringType()),
        ("DEBT_SETTLEMENT_FLAG", StringType()),
    ]
])

# Read the transformed loan extract, treating the first line as the header.
# No schema is supplied, so the CSV columns are read as strings.
csv_path = "Transformed_2014_18_v2.csv"
df = spark.read.csv(csv_path, header=True)
# For a quick trial run, append: .sample(withReplacement=False, fraction=0.0001, seed=42)


#### Select the source columns that feed Dim_loan_product

# Source CSV column -> destination column name.
column_mapping = {
    "grade": "loan_grade",
    "sub_grade": "loan_subgrade",
    "term": "loan_term",
    "purpose": "loan_purpose",
    "initial_list_status": "initial_list_status",
    "out_prncp": "out_principal",
    "application_type": "application_type",
    "debt_settlement_flag": "debt_settlement_flag",
    "id": "LOANPRODUCT_BK",
}

# The source columns to keep are exactly the mapping keys, in mapping order.
columns_of_loanProduct = list(column_mapping)

dim_loanproduct = df.select(*columns_of_loanProduct)
dim_loanproduct.show(5)

# Snowflake connection options.
# Settings are read from environment variables so that no secret is stored in
# the notebook. Non-secret settings fall back to the values this notebook used
# before; the password has no fallback and must be supplied through
# SNOWFLAKE_PASSWORD.
# NOTE(review): the password that used to be hardcoded here should be rotated.
sf_options = {
    "sfURL": os.environ.get("SNOWFLAKE_URL", "https://WOA97553.east-us-2.azure.snowflakecomputing.com"),
    "sfUser": os.environ.get("SNOWFLAKE_USER", "hussien1"),
    "sfPassword": os.environ.get("SNOWFLAKE_PASSWORD", ""),
    "sfDatabase": os.environ.get("SNOWFLAKE_DATABASE", "Loan_DB"),
    "sfSchema": os.environ.get("SNOWFLAKE_SCHEMA", "Loan_Schema"),
    "sfWarehouse": os.environ.get("SNOWFLAKE_WAREHOUSE", "loan_Warehouse"),
    "autopushdown": "on",
    "usestagingtable": "on",
}

# Warn early instead of failing later with an opaque authentication error.
if not sf_options["sfPassword"]:
    print("WARNING: SNOWFLAKE_PASSWORD is not set; Snowflake reads and writes will fail to authenticate.")

+-----+---------+----+------------------+-------------------+---------+----------------+--------------------+------+
|grade|sub_grade|term|           purpose|initial_list_status|out_prncp|application_type|debt_settlement_flag|    id|
+-----+---------+----+------------------+-------------------+---------+----------------+--------------------+------+
|    A|       A2|  36|  home_improvement|                  w|      0.0|      Individual|                   N| 56121|
|    D|       D4|  36|debt_consolidation|                  w|      0.0|      Individual|                   N| 65104|
|    A|       A3|  36|  home_improvement|                  w|      0.0|      Individual|                   N| 65419|
|    A|       A5|  36|       credit_card|                  f|      0.0|      Individual|                   N|364880|
|    C|       C5|  36|debt_consolidation|                  f|      0.0|      Individual|                   N|366792|
+-----+---------+----+------------------+-------------------+---

In [2]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Column mapping: source CSV column -> DIM_LOANPRODUCT destination column.
column_mapping = {
    "grade": "loan_grade",
    "sub_grade": "loan_subgrade",
    "term": "loan_term",
    "purpose": "loan_purpose",
    "initial_list_status": "initial_list_status",
    "out_prncp": "out_principal",
    "application_type": "application_type",
    "debt_settlement_flag": "debt_settlement_flag",
    "id": "LOANPRODUCT_BK",
}

# Rename the source columns. Columns that are no longer present (for example
# because this cell was already run once) are skipped.
for source_col, dest_col in column_mapping.items():
    if source_col in dim_loanproduct.columns:
        dim_loanproduct = dim_loanproduct.withColumnRenamed(source_col, dest_col)

# Window used to number the rows from 1.
# Ordering by a constant leaves the row order unspecified, so the generated keys
# could change between runs (and between the show() below and a later write).
# Ordering by the business key makes the numbering reproducible. The key is
# compared numerically first; the raw string is a tie-breaker for values that
# do not cast to a number.
# NOTE(review): there is still no partitionBy, so Spark moves all rows into a
# single partition to compute the row numbers.
windowSpec = Window.orderBy(
    F.col("LOANPRODUCT_BK").cast("long"),
    F.col("LOANPRODUCT_BK"),
)

# Add the LOAN_PRODUCT_KEY_PK surrogate key with row numbers starting from 1.
dim_loanproduct = dim_loanproduct.withColumn("LOAN_PRODUCT_KEY_PK", F.row_number().over(windowSpec))

# Final column order (names as they are after the rename above).
columns_of_loanProduct = [
    "LOAN_PRODUCT_KEY_PK", "loan_grade", "loan_subgrade", "loan_term", "loan_purpose",
    "initial_list_status", "out_principal", "application_type", "debt_settlement_flag",
    "LOANPRODUCT_BK",
]

# Select the required columns.
dim_loanproduct = dim_loanproduct.select(columns_of_loanProduct)

# Show the DataFrame to verify the changes.
print("Transformed Data for DIM_LOANPRODUCT:")
dim_loanproduct.show()


Transformed Data for DIM_LOANPRODUCT:
+-------------------+----------+-------------+---------+------------------+-------------------+-------------+----------------+--------------------+--------------+
|LOAN_PRODUCT_KEY_PK|loan_grade|loan_subgrade|loan_term|      loan_purpose|initial_list_status|out_principal|application_type|debt_settlement_flag|LOANPRODUCT_BK|
+-------------------+----------+-------------+---------+------------------+-------------------+-------------+----------------+--------------------+--------------+
|                  1|         A|           A2|       36|  home_improvement|                  w|          0.0|      Individual|                   N|         56121|
|                  2|         D|           D4|       36|debt_consolidation|                  w|          0.0|      Individual|                   N|         65104|
|                  3|         A|           A3|       36|  home_improvement|                  w|          0.0|      Individual|                   N|

In [3]:
# Print the column names, types and nullability of the transformed DataFrame.
dim_loanproduct.printSchema()

root
 |-- LOAN_PRODUCT_KEY_PK: integer (nullable = false)
 |-- loan_grade: string (nullable = true)
 |-- loan_subgrade: string (nullable = true)
 |-- loan_term: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- initial_list_status: string (nullable = true)
 |-- out_principal: string (nullable = true)
 |-- application_type: string (nullable = true)
 |-- debt_settlement_flag: string (nullable = true)
 |-- LOANPRODUCT_BK: string (nullable = true)



In [4]:
# Preview the first three rows of the transformed DataFrame.
dim_loanproduct.show(3)

+-------------------+----------+-------------+---------+------------------+-------------------+-------------+----------------+--------------------+--------------+
|LOAN_PRODUCT_KEY_PK|loan_grade|loan_subgrade|loan_term|      loan_purpose|initial_list_status|out_principal|application_type|debt_settlement_flag|LOANPRODUCT_BK|
+-------------------+----------+-------------+---------+------------------+-------------------+-------------+----------------+--------------------+--------------+
|                  1|         A|           A2|       36|  home_improvement|                  w|          0.0|      Individual|                   N|         56121|
|                  2|         D|           D4|       36|debt_consolidation|                  w|          0.0|      Individual|                   N|         65104|
|                  3|         A|           A3|       36|  home_improvement|                  w|          0.0|      Individual|                   N|         65419|
+-------------------+-

In [None]:
# Draw a small random sample of the transformed rows, seeded so that repeated
# runs use the same random seed, and preview three of them.
# NOTE(review): the Snowflake write in the visible notebook uses
# dim_loanproduct, not this sample.
sample_fraction = 0.0001
sample_seed = 42
sampled_df = dim_loanproduct.sample(fraction=sample_fraction, seed=sample_seed)
sampled_df.show(3)

In [7]:
# Print the column names, types and nullability of the sampled DataFrame.
sampled_df.printSchema()

root
 |-- LOAN_PRODUCT_KEY_PK: integer (nullable = false)
 |-- loan_grade: string (nullable = true)
 |-- loan_subgrade: string (nullable = true)
 |-- loan_term: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- initial_list_status: string (nullable = true)
 |-- out_principal: string (nullable = true)
 |-- application_type: string (nullable = true)
 |-- debt_settlement_flag: string (nullable = true)



In [9]:

# Fully-qualified data source name of the Spark-Snowflake connector. It is used
# for every read and write below so the calls stay consistent.
SNOWFLAKE_SOURCE = "net.snowflake.spark.snowflake"

try:
    # 1) Smoke-test the connection with a trivial query.
    print("Testing Snowflake connection...")
    test_df = (
        spark.read
        .format(SNOWFLAKE_SOURCE)
        .options(**sf_options)
        .option("query", "SELECT 1 AS test_value")
        .load()
    )
    test_df.show()
    print("Connection test successful!")

    # 2) Append the transformed rows to DIM_LOANPRODUCT.
    # NOTE(review): mode("append") adds the rows again on every run, and the
    # surrogate key restarts at 1 each time, so re-running this cell appears to
    # create duplicate keys in the table - confirm this is intended.
    print("Writing data to Snowflake...")
    (
        dim_loanproduct.write
        .format(SNOWFLAKE_SOURCE)
        .options(**sf_options)
        .option("dbtable", "DIM_LOANPRODUCT")
        .mode("append")
        .save()
    )
    print("Data load complete!")

    # 3) Read the table back to verify the load.
    print("Reading data back from Snowflake to verify...")
    snowflake_df = (
        spark.read
        .format(SNOWFLAKE_SOURCE)
        .options(**sf_options)
        .option("query", "SELECT * FROM Loan_DB.Loan_Schema.DIM_LOANPRODUCT")
        .load()
    )
    snowflake_df.show()

except Exception as e:
    # Report the failure, then re-raise so the cell is marked as failed and the
    # full traceback is available instead of the error being silently swallowed.
    print("Error occurred:", str(e))
    raise
 
#finally:
    #spark.stop()

Testing Snowflake connection...
+----------+
|TEST_VALUE|
+----------+
|         1|
+----------+

Connection test successful!
Writing data to Snowflake...
Data load complete!
Reading data back from Snowflake to verify...
+-------------------+----------+-------------+---------+------------------+-------------------+-------------+----------------+--------------------+
|LOAN_PRODUCT_KEY_PK|LOAN_GRADE|LOAN_SUBGRADE|LOAN_TERM|      LOAN_PURPOSE|INITIAL_LIST_STATUS|OUT_PRINCIPAL|APPLICATION_TYPE|DEBT_SETTLEMENT_FLAG|
+-------------------+----------+-------------+---------+------------------+-------------------+-------------+----------------+--------------------+
|                  1|         A|           A2|       36|  home_improvement|                  w|          0.0|      Individual|                   N|
|                  2|         D|           D4|       36|debt_consolidation|                  w|          0.0|      Individual|                   N|
|                  3|         A|       

# Load data into DIM_LOANPRODUCT in a single cell

In [1]:
from pyspark.sql import SparkSession
import os
from pyspark.sql.types import StructType, StructField, StringType, FloatType, DecimalType,IntegerType
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# ---------------------------------------------------------------------------
# End-to-end load of DIM_LOANPRODUCT: read the CSV, rename the columns, add a
# surrogate key, append to Snowflake, and read the table back.
# ---------------------------------------------------------------------------

# Create (or reuse) the Spark session with the Snowflake JDBC driver and the
# Spark-Snowflake connector on the classpath.
spark = (
    SparkSession.builder
    .appName("SnowflakeLoad_LoanProduct")
    .config(
        "spark.jars.packages",
        "net.snowflake:snowflake-jdbc:3.13.22,net.snowflake:spark-snowflake_2.12:2.11.0-spark_3.3",
    )
    .getOrCreate()
)

# Column names and types, presumably mirroring the target table (TODO confirm).
# NOTE(review): this schema is not passed to any reader or writer in this cell,
# and it has no LOANPRODUCT_BK field - confirm whether it is still needed.
custom_schema = StructType([
    StructField("LOAN_PRODUCT_KEY_PK", DecimalType(38, 0), True),
    StructField("LOAN_GRADE", StringType(), True),
    StructField("LOAN_SUBGRADE", StringType(), True),
    StructField("LOAN_TERM", StringType(), True),
    StructField("LOAN_PURPOSE", StringType(), True),
    StructField("INITIAL_LIST_STATUS", StringType(), True),
    StructField("OUT_PRINCIPAL", FloatType(), True),   # float
    StructField("APPLICATION_TYPE", StringType(), True),
    StructField("DEBT_SETTLEMENT_FLAG", StringType(), True)
])

# Read the transformed loan extract, treating the first line as the header.
# No schema is supplied, so the CSV columns are read as strings.
csv_path = "Transformed_2014_18_v2.csv"
df = spark.read.option("header", True).csv(csv_path)  #.sample(withReplacement=False, fraction=0.0001, seed=42)

# Column mapping: source CSV column -> DIM_LOANPRODUCT destination column.
# Defined once here and used for both the select and the rename below.
column_mapping = {
    "grade": "loan_grade",
    "sub_grade": "loan_subgrade",
    "term": "loan_term",
    "purpose": "loan_purpose",
    "initial_list_status": "initial_list_status",
    "out_prncp": "out_principal",
    "application_type": "application_type",
    "debt_settlement_flag": "debt_settlement_flag",
    "id": "LOANPRODUCT_BK",
}

# Keep only the source columns that feed the dimension, then preview them.
columns_of_loanProduct = [
    "grade", "sub_grade", "term", "purpose", "initial_list_status", "out_prncp",
    "application_type", "debt_settlement_flag", "id"
]
dim_loanproduct = df.select(columns_of_loanProduct)
dim_loanproduct.show(5)

# Snowflake connection options.
# Settings are read from environment variables so that no secret is stored in
# the notebook. Non-secret settings fall back to the values this notebook used
# before; the password has no fallback and must be supplied through
# SNOWFLAKE_PASSWORD.
# NOTE(review): the password that used to be hardcoded here should be rotated.
sf_options = {
    "sfURL": os.environ.get("SNOWFLAKE_URL", "https://WOA97553.east-us-2.azure.snowflakecomputing.com"),
    "sfUser": os.environ.get("SNOWFLAKE_USER", "hussien1"),
    "sfPassword": os.environ.get("SNOWFLAKE_PASSWORD", ""),
    "sfDatabase": os.environ.get("SNOWFLAKE_DATABASE", "Loan_DB"),
    "sfSchema": os.environ.get("SNOWFLAKE_SCHEMA", "Loan_Schema"),
    "sfWarehouse": os.environ.get("SNOWFLAKE_WAREHOUSE", "loan_Warehouse"),
    "autopushdown": "on",
    "usestagingtable": "on",
}
if not sf_options["sfPassword"]:
    print("WARNING: SNOWFLAKE_PASSWORD is not set; Snowflake reads and writes will fail to authenticate.")

# Rename the source columns to their destination names.
for source_col, dest_col in column_mapping.items():
    if source_col in dim_loanproduct.columns:
        dim_loanproduct = dim_loanproduct.withColumnRenamed(source_col, dest_col)

# Window used to number the rows from 1.
# Ordering by a constant leaves the row order unspecified, so the generated keys
# could differ between the preview and the write, and between runs. Ordering by
# the business key (numerically, with the raw string as tie-breaker) makes the
# numbering reproducible.
# NOTE(review): there is still no partitionBy, so Spark moves all rows into a
# single partition to compute the row numbers.
windowSpec = Window.orderBy(
    F.col("LOANPRODUCT_BK").cast("long"),
    F.col("LOANPRODUCT_BK"),
)

# Add the LOAN_PRODUCT_KEY_PK surrogate key with row numbers starting from 1.
dim_loanproduct = dim_loanproduct.withColumn("LOAN_PRODUCT_KEY_PK", F.row_number().over(windowSpec))

# Final column order (names as they are after the rename above).
columns_of_loanProduct = [
    "LOAN_PRODUCT_KEY_PK", "loan_grade", "loan_subgrade", "loan_term", "loan_purpose",
    "initial_list_status", "out_principal", "application_type", "debt_settlement_flag",
    "LOANPRODUCT_BK",
]
dim_loanproduct = dim_loanproduct.select(columns_of_loanProduct)

# Show the DataFrame to verify the changes. Previously the banner was printed
# with nothing after it.
print("Transformed Data for DIM_LOANPRODUCT:")
dim_loanproduct.show(5)

# Fully-qualified data source name of the Spark-Snowflake connector. It is used
# for every read and write below so the calls stay consistent.
SNOWFLAKE_SOURCE = "net.snowflake.spark.snowflake"

try:
    # 1) Smoke-test the connection with a trivial query.
    print("Testing Snowflake connection...")
    test_df = (
        spark.read
        .format(SNOWFLAKE_SOURCE)
        .options(**sf_options)
        .option("query", "SELECT 1 AS test_value")
        .load()
    )
    test_df.show()
    print("Connection test successful!")

    # 2) Append the transformed rows to DIM_LOANPRODUCT.
    # NOTE(review): mode("append") adds the rows again on every run, and the
    # surrogate key restarts at 1 each time, so re-running this cell appears to
    # create duplicate keys in the table - confirm this is intended.
    print("Writing data to Snowflake...")
    (
        dim_loanproduct.write
        .format(SNOWFLAKE_SOURCE)
        .options(**sf_options)
        .option("dbtable", "DIM_LOANPRODUCT")
        .mode("append")
        .save()
    )
    print("Data load complete!")

    # 3) Read the table back to verify the load. The query is built from the
    # configured database and schema so it follows any environment override.
    print("Reading data back from Snowflake to verify...")
    verify_query = f"SELECT * FROM {sf_options['sfDatabase']}.{sf_options['sfSchema']}.DIM_LOANPRODUCT"
    snowflake_df = (
        spark.read
        .format(SNOWFLAKE_SOURCE)
        .options(**sf_options)
        .option("query", verify_query)
        .load()
    )
    snowflake_df.show()

except Exception as e:
    # Report the failure, then re-raise so the cell is marked as failed and the
    # full traceback is available instead of the error being silently swallowed.
    print("Error occurred:", str(e))
    raise
 
#finally:
    #spark.stop()


+-----+---------+----+------------------+-------------------+---------+----------------+--------------------+------+
|grade|sub_grade|term|           purpose|initial_list_status|out_prncp|application_type|debt_settlement_flag|    id|
+-----+---------+----+------------------+-------------------+---------+----------------+--------------------+------+
|    A|       A2|  36|  home_improvement|                  w|      0.0|      Individual|                   N| 56121|
|    D|       D4|  36|debt_consolidation|                  w|      0.0|      Individual|                   N| 65104|
|    A|       A3|  36|  home_improvement|                  w|      0.0|      Individual|                   N| 65419|
|    A|       A5|  36|       credit_card|                  f|      0.0|      Individual|                   N|364880|
|    C|       C5|  36|debt_consolidation|                  f|      0.0|      Individual|                   N|366792|
+-----+---------+----+------------------+-------------------+---