In [1]:
from utils import (
    create_spark_session, load_config, custom_read_parquet,
    count_nulls, custom_count_distinct,
    custom_to_timestamp, custom_max, custom_min,
    custom_group_by, plot_column_distribution, plot_aggregated_by_time, 
    categorize_loan_amount
)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Start the Spark session and load the project configuration.
spark = create_spark_session()
config = load_config()

# The rest of the notebook works on the first month listed in the config.
configured_months = config["months"]
month = configured_months[0]

24/08/27 19:51:22 WARN Utils: Your hostname, sajjad-Legion-5-15ACH6 resolves to a loopback address: 127.0.1.1; using 10.218.52.77 instead (on interface eno1)
24/08/27 19:51:22 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/27 19:51:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# assign: exploring the `loan_assign` dataset

In [3]:
# Read the "loan_assign" data for the selected month and preview it.
dataset_name = "loan_assign"
assign = custom_read_parquet(spark, config, dataset_name, month)
assign.show()

+--------------------+--------+-------------------+--------------------+---------------+-----------+-----------------+
|              bib_id|date_key|            fake_id|            nid_hash|        loan_id|loan_amount|   date_timestamp|
+--------------------+--------+-------------------+--------------------+---------------+-----------+-----------------+
|A81565F025E839BAA...|20240428|f-9801-000143526223|D522B41B28781AE6A...|404283785789014|    10000.0|20240428 12:57:06|
|A81565F025E839BAA...|20240428|f-9801-000143526223|D522B41B28781AE6A...|404283786107933|    10000.0|20240428 14:45:06|
|A81565F025E839BAA...|20240428|f-9801-000143526223|D522B41B28781AE6A...|404283785277888|    10000.0|20240428 09:51:05|
|A81565F025E839BAA...|20240428|f-9801-000143526223|D522B41B28781AE6A...|404283785354149|    10000.0|20240428 10:25:15|
|A81565F025E839BAA...|20240428|f-9801-000143526223|D522B41B28781AE6A...|404283785552861|    10000.0|20240428 11:40:13|
|A81565F025E839BAA...|20240428|f-9801-0001435262

In [4]:
# Print the column names, types and nullability of the loaded frame.
assign.printSchema()

root
 |-- bib_id: string (nullable = true)
 |-- date_key: string (nullable = true)
 |-- fake_id: string (nullable = true)
 |-- nid_hash: string (nullable = true)
 |-- loan_id: string (nullable = true)
 |-- loan_amount: double (nullable = true)
 |-- date_timestamp: string (nullable = true)



In [5]:
# Print one line per column with the value count_nulls returns for it.
for col_name in assign.columns:
    null_count = count_nulls(df=assign, col_name=col_name)
    print(f"number of null value in column {col_name} : {null_count}")

number of null value in column bib_id : 0
number of null value in column date_key : 0
number of null value in column fake_id : 354
number of null value in column nid_hash : 0
number of null value in column loan_id : 0
number of null value in column loan_amount : 0
number of null value in column date_timestamp : 0


In [6]:
# Compare the total row count with the number of distinct identifiers.
total_rows = assign.count()
distinct_bib_ids = custom_count_distinct(df=assign, col_name='bib_id')
distinct_nid_hashes = custom_count_distinct(df=assign, col_name='nid_hash')

# Passed as separate arguments so print's default separator is kept.
print(
    f"number of samples in DataFrame 'assign': {total_rows}",
    f"\nnumber of distinct values in column 'bib_id': {distinct_bib_ids}",
    f"\nnumber of distinct values in column 'nid_hash': {distinct_nid_hashes}"
)



number of samples in DataFrame 'assign': 994924 
number of distinct values in column 'bib_id': 71608 
number of distinct values in column 'nid_hash': 71523


                                                                                

In [7]:
# For each nid_hash, count the distinct bib_id values (descending order) and preview.
bib_ids_per_nid = custom_group_by(
    df=assign,
    group_by_col="nid_hash",
    agg_col="bib_id",
    agg_func="count_distinct",
    alias_name="count_distinct_bib_id",
    sort_order='desc',
)
bib_ids_per_nid.show()



+--------------------+---------------------+
|            nid_hash|count_distinct_bib_id|
+--------------------+---------------------+
|79BC87C9F3AF92B07...|                    2|
|2F15656935C6A1884...|                    2|
|B176CA25111FA446B...|                    2|
|6C6049E166E266DE6...|                    2|
|02988F433771D75F4...|                    2|
|19CE179E29F1EF5D5...|                    2|
|BDA123486587CEAD0...|                    2|
|F308B92D0FEA62656...|                    2|
|A20A97F1ADA5668B5...|                    2|
|EE82644DBBCFE29B0...|                    2|
|506D80BB9A9C2951D...|                    2|
|5D6D15145F90A4542...|                    2|
|86D6FA78AE91CE967...|                    2|
|AC6FC498A2A900627...|                    2|
|58B0B10A5F4735E17...|                    2|
|4DEB46C1ACAF449CF...|                    2|
|5D18DF123B41A4439...|                    2|
|CDF4DF1C67B2084BF...|                    2|
|C8AF079CDCC7FC052...|                    2|
|152DB4F62

                                                                                

In [8]:
# Run the "date_timestamp" column (a string in the loaded schema) through
# custom_to_timestamp, then report the largest and smallest value.
col_name = "date_timestamp"
# NOTE(review): this rebinds `assign`, so re-running the cell feeds the already
# processed frame back into custom_to_timestamp — confirm the helper tolerates that.
assign = custom_to_timestamp(assign, col_name)
print(
    f"max_{col_name} : {custom_max(assign, col_name)}",
    # Label fixed from "min{col_name}" so it matches the "max_" label above.
    f"\nmin_{col_name} : {custom_min(assign, col_name)}",
)


                                                                                

max_date_timestamp : 2024-05-20 23:59:58 
mindate_timestamp : 2024-04-20 00:00:05


In [9]:
# Report the largest and smallest value of the "loan_amount" column.
col_name = "loan_amount"
print(
    f"max_{col_name} : {custom_max(assign, col_name)}",
    # Label fixed from "min{col_name}" so it matches the "max_" label above.
    f"\nmin_{col_name} : {custom_min(assign, col_name)}",
)

max_loan_amount : 179167.0 
minloan_amount : 5000.0


In [10]:
# Sum loan_amount per nid_hash (descending order) and preview the result.
aggregated_nid = custom_group_by(
    df=assign,
    group_by_col="nid_hash",
    agg_col="loan_amount",
    agg_func="sum",
    alias_name="SumAggregated_loan_amount_by_nid",
    sort_order='desc',
)
aggregated_nid.show()

[Stage 56:===>                                                    (1 + 15) / 16]

+--------------------+--------------------------------+
|            nid_hash|SumAggregated_loan_amount_by_nid|
+--------------------+--------------------------------+
|C0C600B914418408E...|                       9600000.0|
|A8B419013F4F3126A...|                       7410000.0|
|5A4E68534605CFAAC...|                       7295042.0|
|77AD72BE45556D48E...|                       6653842.0|
|17A6DAF2E6B68F7B7...|                       5660000.0|
|E13504DE230D651B1...|                       5650000.0|
|20F9EA15ACF5DA8F7...|                       5515000.0|
|792676B4A579C0914...|                       5460000.0|
|C064331039138D2ED...|                       5070000.0|
|3AC5EA35124CE6275...|                       4905000.0|
|F807CD00B7C54F13C...|                       4900000.0|
|77C5358360FB9FBA9...|                       4881667.0|
|474959DA2DC2C0CDD...|                       4857357.0|
|1C65A47C9315C7960...|                       4804998.0|
|934300F83D266F9FA...|                       480

                                                                                

In [11]:
# Histogram of the per-nid_hash loan totals computed in the previous cell.
sum_col = "SumAggregated_loan_amount_by_nid"
plot_column_distribution(
    col_df=aggregated_nid.select(sum_col),
    plot_type='hist',
    month=month,
    x=sum_col,
)

                                                                                

Figure saved to: output/HistPlot_of_SumAggregated_loan_amount_by_nid__month_38.png


In [12]:
# Count plot of the individual loan_amount values.
loan_amount_df = assign.select("loan_amount")
plot_column_distribution(
    col_df=loan_amount_df,
    plot_type='count',
    month=month,
    x="loan_amount",
)

Figure saved to: output/CountPlot_of_loan_amount__month_38.png


In [13]:
# Plot loan_amount aggregated over time using the date_timestamp column.
plot_aggregated_by_time(
    df=assign,
    timestamp_column="date_timestamp",
    agg_col="loan_amount",
    month=month,
)

Figure saved to: output/LinePlot_of_SumAggregated_loan_amount_byHour__month_38.png
Figure saved to: output/LinePlot_of_SumAggregated_loan_amount_byDayOfWeek__month_38.png
Figure saved to: output/LinePlot_of_SumAggregated_loan_amount_byDayOfMonth__month_38.png


In [14]:
# Categorize loan amounts, then plot how often each category occurs.
# NOTE(review): "loan_category" is presumably the column added by
# categorize_loan_amount — confirm against the helper.
assign = categorize_loan_amount(df=assign, col_name="loan_amount")
loan_category_df = assign.select("loan_category")
plot_column_distribution(
    col_df=loan_category_df,
    plot_type='count',
    month=month,
    x="loan_category",
)


Figure saved to: output/CountPlot_of_loan_category__month_38.png
