In [15]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("CreditCardApprovalPrediction") \
    .getOrCreate()

In [16]:

# Define file paths
path_to_application = 'data/raw/application_record.csv'
path_to_credit = 'data/raw/credit_record.csv'

# Ingest data into Spark DataFrames
applicant_data_spark = spark.read.option("header", "true").csv(path_to_application, inferSchema=True)
credit_data_spark = spark.read.option("header", "true").csv(path_to_credit, inferSchema=True)

# Show first few rows for verification
applicant_data_spark.show(5)
credit_data_spark.show(5)

                                                                                

+-------+-----------+------------+---------------+------------+----------------+--------------------+--------------------+--------------------+-----------------+----------+-------------+----------+---------------+----------+----------+---------------+---------------+
|     ID|CODE_GENDER|FLAG_OWN_CAR|FLAG_OWN_REALTY|CNT_CHILDREN|AMT_INCOME_TOTAL|    NAME_INCOME_TYPE| NAME_EDUCATION_TYPE|  NAME_FAMILY_STATUS|NAME_HOUSING_TYPE|DAYS_BIRTH|DAYS_EMPLOYED|FLAG_MOBIL|FLAG_WORK_PHONE|FLAG_PHONE|FLAG_EMAIL|OCCUPATION_TYPE|CNT_FAM_MEMBERS|
+-------+-----------+------------+---------------+------------+----------------+--------------------+--------------------+--------------------+-----------------+----------+-------------+----------+---------------+----------+----------+---------------+---------------+
|5008804|          M|           Y|              Y|           0|        427500.0|             Working|    Higher education|      Civil marriage| Rented apartment|    -12005|        -4542|         1

In [17]:
from pyspark.sql.functions import col, when, count, lit, min, max, abs

# Drop duplicates
applicant_data_spark = applicant_data_spark.dropDuplicates(['ID'])
merged_spark_df = applicant_data_spark.join(credit_data_spark, on="ID", how="left")


# Mode calculation for 'STATUS'
from pyspark.sql import functions as F

# Group by 'ID' and calculate mode (most frequent value) for 'STATUS'
status_mode_df = merged_spark_df.groupBy('ID').agg(
    F.expr("first(STATUS)").alias("mode_status")  # Simplified mode calculation in Spark
)

# Filter out 'X' values
status_mode_df = status_mode_df.filter(status_mode_df['mode_status'] != 'X')

# Merge applicant data with status mode
merged_spark_df = merged_spark_df.join(status_mode_df, on="ID", how="inner")

# Apply the label encoding logic to 'mode_status'
merged_spark_df = merged_spark_df.withColumn(
    "label", when(merged_spark_df["mode_status"].isin("0", "1", "2", "3", "4", "5"), 0)
    .when(merged_spark_df["mode_status"] == "C", 1)
    .otherwise(lit(None))
)

# Show the DataFrame after transformations
merged_spark_df.show(5)


[Stage 175:>                                                        (0 + 1) / 1]

+-------+-----------+------------+---------------+------------+----------------+----------------+--------------------+------------------+-----------------+----------+-------------+----------+---------------+----------+----------+---------------+---------------+--------------+------+-----------+-----+
|     ID|CODE_GENDER|FLAG_OWN_CAR|FLAG_OWN_REALTY|CNT_CHILDREN|AMT_INCOME_TOTAL|NAME_INCOME_TYPE| NAME_EDUCATION_TYPE|NAME_FAMILY_STATUS|NAME_HOUSING_TYPE|DAYS_BIRTH|DAYS_EMPLOYED|FLAG_MOBIL|FLAG_WORK_PHONE|FLAG_PHONE|FLAG_EMAIL|OCCUPATION_TYPE|CNT_FAM_MEMBERS|MONTHS_BALANCE|STATUS|mode_status|label|
+-------+-----------+------------+---------------+------------+----------------+----------------+--------------------+------------------+-----------------+----------+-------------+----------+---------------+----------+----------+---------------+---------------+--------------+------+-----------+-----+
|5008806|          M|           Y|              Y|           0|        112500.0|         Worki

                                                                                

In [18]:
# Sample 5 rows randomly from the test_data_loaded DataFrame
_ , sample_data = merged_spark_df.randomSplit([0.9, 0.1], seed=42)

# Show the sample data
sample_data.show()

[Stage 184:>                                                        (0 + 1) / 1]

+-------+-----------+------------+---------------+------------+----------------+--------------------+--------------------+--------------------+-----------------+----------+-------------+----------+---------------+----------+----------+---------------+---------------+--------------+------+-----------+-----+
|     ID|CODE_GENDER|FLAG_OWN_CAR|FLAG_OWN_REALTY|CNT_CHILDREN|AMT_INCOME_TOTAL|    NAME_INCOME_TYPE| NAME_EDUCATION_TYPE|  NAME_FAMILY_STATUS|NAME_HOUSING_TYPE|DAYS_BIRTH|DAYS_EMPLOYED|FLAG_MOBIL|FLAG_WORK_PHONE|FLAG_PHONE|FLAG_EMAIL|OCCUPATION_TYPE|CNT_FAM_MEMBERS|MONTHS_BALANCE|STATUS|mode_status|label|
+-------+-----------+------------+---------------+------------+----------------+--------------------+--------------------+--------------------+-----------------+----------+-------------+----------+---------------+----------+----------+---------------+---------------+--------------+------+-----------+-----+
|5008806|          M|           Y|              Y|           0|        11250

                                                                                

In [19]:
sample_data.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- CODE_GENDER: string (nullable = true)
 |-- FLAG_OWN_CAR: string (nullable = true)
 |-- FLAG_OWN_REALTY: string (nullable = true)
 |-- CNT_CHILDREN: integer (nullable = true)
 |-- AMT_INCOME_TOTAL: double (nullable = true)
 |-- NAME_INCOME_TYPE: string (nullable = true)
 |-- NAME_EDUCATION_TYPE: string (nullable = true)
 |-- NAME_FAMILY_STATUS: string (nullable = true)
 |-- NAME_HOUSING_TYPE: string (nullable = true)
 |-- DAYS_BIRTH: integer (nullable = true)
 |-- DAYS_EMPLOYED: integer (nullable = true)
 |-- FLAG_MOBIL: integer (nullable = true)
 |-- FLAG_WORK_PHONE: integer (nullable = true)
 |-- FLAG_PHONE: integer (nullable = true)
 |-- FLAG_EMAIL: integer (nullable = true)
 |-- OCCUPATION_TYPE: string (nullable = true)
 |-- CNT_FAM_MEMBERS: double (nullable = true)
 |-- MONTHS_BALANCE: integer (nullable = true)
 |-- STATUS: string (nullable = true)
 |-- mode_status: string (nullable = true)
 |-- label: integer (nullable = true)



In [20]:
sample_raw_directory = "sample/raw"
sample_data.write.parquet(sample_raw_directory)
print("saved!!")

[Stage 193:>                                                        (0 + 4) / 4]

saved!!


                                                                                

In [7]:
from process import DataProcessor  

In [8]:
sample_processed_directory = "sample/processed"

In [9]:
data_processor = DataProcessor()
data_processor.process(input_df=sample_data,output_dir=sample_processed_directory)
print("Data processing complete!")



Data processing complete!


                                                                                

In [10]:
test_data_loaded = spark.read.parquet(sample_processed_directory)
test_data_loaded.show(5)

+-------+-----------+------------+---------------+------------+----------------+--------------------+--------------------+--------------------+-----------------+----------+---------------+----------+----------+---------------+---------------+--------------+------+-----------+-----+------------------+-----------------+---------------------+---------------------+---------------+--------------------+--------------------+
|     ID|CODE_GENDER|FLAG_OWN_CAR|FLAG_OWN_REALTY|CNT_CHILDREN|AMT_INCOME_TOTAL|    NAME_INCOME_TYPE| NAME_EDUCATION_TYPE|  NAME_FAMILY_STATUS|NAME_HOUSING_TYPE|FLAG_MOBIL|FLAG_WORK_PHONE|FLAG_PHONE|FLAG_EMAIL|OCCUPATION_TYPE|CNT_FAM_MEMBERS|MONTHS_BALANCE|STATUS|mode_status|label|               AGE|   YEARS_EMPLOYED|INCOME_PER_FAM_MEMBER|CREDIT_HISTORY_LENGTH|RECENT_ACTIVITY|            features|     scaled_features|
+-------+-----------+------------+---------------+------------+----------------+--------------------+--------------------+--------------------+-------------

In [11]:
model_path = "rf_model"  

In [12]:
from pyspark.ml.classification import RandomForestClassificationModel
rf_model = RandomForestClassificationModel.load(model_path)
print("RandomForest model loaded successfully!")

[Stage 159:>                                                        (0 + 1) / 1]

RandomForest model loaded successfully!


                                                                                

In [14]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Perform prediction on test data
rf_predictions = rf_model.transform(test_data_loaded)

rf_predictions.select("label").show()


+-----+
|label|
+-----+
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    1|
|    0|
|    1|
|    1|
+-----+
only showing top 20 rows

