In [1]:
from utils import (
     create_spark_session, load_config, custom_read_parquet, custom_to_timestamp
)

from plot_tools import (
    plot_distribution, 
    plot_aggregated_by_time, 
    categorize,
    plot_analysis_of_loan_recovery,
    plot_tops,
    categorize_time_difference
)

In [2]:
# Create the Spark session and load the project configuration through the
# helpers imported from utils.
spark = create_spark_session()
config = load_config()

# Pick the month under analysis and the entry that follows it in
# config["months"]; both are looked up by position.
month_index = 0
month, next_month = (
    config["months"][month_index],
    config["months"][month_index + 1],
)

24/09/01 14:15:26 WARN Utils: Your hostname, sajjad-Legion-5-15ACH6 resolves to a loopback address: 127.0.1.1; using 10.218.52.77 instead (on interface eno1)
24/09/01 14:15:26 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/01 14:15:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# assign

In [3]:
# Read the "loan_assign" dataset for the selected month and pass its
# "date_timestamp" column through custom_to_timestamp (the schema printed
# below reports it as a timestamp), then preview the rows.
assign = custom_to_timestamp(
    df=custom_read_parquet(
        spark=spark,
        config=config,
        key="loan_assign",
        month=month,
    ),
    col_name="date_timestamp",
)
assign.show()

                                                                                

+--------------------+--------+-------------------+--------------------+---------------+-----------+-------------------+
|              bib_id|date_key|            fake_id|            nid_hash|        loan_id|loan_amount|     date_timestamp|
+--------------------+--------+-------------------+--------------------+---------------+-----------+-------------------+
|A81565F025E839BAA...|20240428|f-9801-000143526223|D522B41B28781AE6A...|404283785789014|    10000.0|2024-04-28 12:57:06|
|A81565F025E839BAA...|20240428|f-9801-000143526223|D522B41B28781AE6A...|404283786107933|    10000.0|2024-04-28 14:45:06|
|A81565F025E839BAA...|20240428|f-9801-000143526223|D522B41B28781AE6A...|404283785277888|    10000.0|2024-04-28 09:51:05|
|A81565F025E839BAA...|20240428|f-9801-000143526223|D522B41B28781AE6A...|404283785354149|    10000.0|2024-04-28 10:25:15|
|A81565F025E839BAA...|20240428|f-9801-000143526223|D522B41B28781AE6A...|404283785552861|    10000.0|2024-04-28 11:40:13|
|A81565F025E839BAA...|20240428|f

In [4]:
# Print the column names, types and nullability of the assign DataFrame.
assign.printSchema()

root
 |-- bib_id: string (nullable = true)
 |-- date_key: string (nullable = true)
 |-- fake_id: string (nullable = true)
 |-- nid_hash: string (nullable = true)
 |-- loan_id: string (nullable = true)
 |-- loan_amount: double (nullable = true)
 |-- date_timestamp: timestamp (nullable = true)



In [5]:
# Ask categorize() to derive a "loan_category" column from "loan_amount"
# using the bin edges and labels below (six labels for the six intervals
# delimited by the seven edges), then draw a count plot of that column.
assign = categorize(
    df=assign,
    col_name="loan_amount",
    bins=[float("-inf"), 5000, 10000, 20000, 50000, 100000, float("inf")],
    labels=["VeryLow", "5T-10T", "10T-20T", "20T-50T", "50T-100T", "Extreme"],
    new_column_name="loan_category",
)

plot_distribution(
    df=assign,
    plot_type="count",
    month=month,
    x="loan_category",
)


                                                                                

Figure saved to: output/CountPlot_of_loan_category__month_38.png


In [7]:
# Import Spark's sum under an alias so it does not shadow Python's built-in
# sum() for every later cell that runs in this kernel.
from pyspark.sql.functions import col, sum as spark_sum

# Sum loan_amount per nid_hash, then plot a histogram of those per-nid sums.
aggregated_nid = assign.groupBy("nid_hash").agg(
    spark_sum(col("loan_amount")).alias("SumAggregated_loan_amount_by_nid")
)
plot_distribution(
    df=aggregated_nid.select("SumAggregated_loan_amount_by_nid"),
    plot_type="hist",
    month=month,
    x="SumAggregated_loan_amount_by_nid",
)


                                                                                

Figure saved to: output/HistPlot_of_SumAggregated_loan_amount_by_nid__month_38.png


In [8]:
# Plot loan_amount aggregated over the date_timestamp column; the saved
# figures listed below cover hour, day of week and day of month.
plot_aggregated_by_time(
    df=assign,
    timestamp_column="date_timestamp",
    agg_col="loan_amount",
    month=month,
)

                                                                                

Figure saved to: output/LinePlot_of_SumAggregated_loan_amount_by_Hour__month_38.png
Figure saved to: output/LinePlot_of_SumAggregated_loan_amount_by_DayOfWeek__month_38.png


                                                                                

Figure saved to: output/LinePlot_of_SumAggregated_loan_amount_by_DayOfMonth__month_38.png


# recovery

In [9]:
# Read the "loan_recovery" dataset for the selected month, pass its
# "date_timestamp" column through custom_to_timestamp, and preview the rows.
recovery = custom_to_timestamp(
    df=custom_read_parquet(
        spark=spark,
        config=config,
        key="loan_recovery",
        month=month,
    ),
    col_name="date_timestamp",
)
recovery.show()

+--------------------+--------+-------------------+--------------------+---------------+-----------+-------------+-------------------+
|              bib_id|date_key|            fake_id|            nid_hash|        loan_id|loan_amount|hsdp_recovery|     date_timestamp|
+--------------------+--------+-------------------+--------------------+---------------+-----------+-------------+-------------------+
|587476A6837ABCAE2...|20240514|f-9801-000104060322|52673E25F6A3EE999...|405063809636961|    10000.0|       2909.0|2024-05-14 19:03:06|
|587476A6837ABCAE2...|20240505|f-9801-000104060322|52673E25F6A3EE999...|404263780566810|    20000.0|      18228.0|2024-05-05 11:52:54|
|587476A6837ABCAE2...|20240514|f-9801-000104060322|52673E25F6A3EE999...|405063810034865|    10000.0|      10000.0|2024-05-14 19:03:07|
|587476A6837ABCAE2...|20240516|f-9801-000104060322|52673E25F6A3EE999...|405123828317223|    10000.0|      10000.0|2024-05-16 13:31:13|
|587476A6837ABCAE2...|20240516|f-9801-000104060322|5267

In [10]:
# Print the column names, types and nullability of the recovery DataFrame.
recovery.printSchema()

root
 |-- bib_id: string (nullable = true)
 |-- date_key: string (nullable = true)
 |-- fake_id: string (nullable = true)
 |-- nid_hash: string (nullable = true)
 |-- loan_id: string (nullable = true)
 |-- loan_amount: double (nullable = true)
 |-- hsdp_recovery: double (nullable = true)
 |-- date_timestamp: timestamp (nullable = true)



In [11]:
# Compare the assigned loans with the recovered ones; the helper saves a
# single comparison figure (see the path printed below).
plot_analysis_of_loan_recovery(
    assign=assign,
    recovery=recovery,
)

                                                                                

Figure saved to: output/Comparison of Recovered and Assigned Loan Amounts --- 82.00% of loan_amounts are recovered.png


# package

In [12]:
# Read the "package" dataset for the selected month.
package = custom_read_parquet(
    spark=spark,
    config=config,
    key="package",
    month=month,
)

# Pass both date columns through custom_to_timestamp, activation first and
# deactivation second, then preview the rows.
for date_column in ("activation_date", "deactivation_date"):
    package = custom_to_timestamp(df=package, col_name=date_column)
package.show()

+--------------------+--------+--------------------+--------------------+-------------+------------+--------------------+-------------------+-------------------+
|              bib_id|date_key|             fake_id|            nid_hash|offering_code|offer_amount|       offering_name|    activation_date|  deactivation_date|
+--------------------+--------+--------------------+--------------------+-------------+------------+--------------------+-------------------+-------------------+
|0D2379F0B99698833...|20240515| f-9801-000078374670|5A9967E423C684E52...|    PO1974XOI|    175000.0|Weekly 2 5GB Inte...|2024-05-15 17:32:21|2024-05-22 23:59:59|
|0D2379F0B99698833...|20240519| f-9801-000078374670|5A9967E423C684E52...|    PO1974XOI|    175000.0|Weekly 2 5GB Inte...|2024-05-19 11:15:19|2024-05-26 23:59:59|
|0D2379F0B99698833...|20240430| f-9801-000078374670|5A9967E423C684E52...|    PO1974XOI|    175000.0|Weekly 2 5GB Inte...|2024-04-30 08:28:30|2024-05-07 23:59:59|
|0D2379F0B99698833...|202405

In [13]:
# Print the column names, types and nullability of the package DataFrame.
package.printSchema()

root
 |-- bib_id: string (nullable = true)
 |-- date_key: string (nullable = true)
 |-- fake_id: string (nullable = true)
 |-- nid_hash: string (nullable = true)
 |-- offering_code: string (nullable = true)
 |-- offer_amount: double (nullable = true)
 |-- offering_name: string (nullable = true)
 |-- activation_date: timestamp (nullable = true)
 |-- deactivation_date: timestamp (nullable = true)



In [14]:
# Run categorize_time_difference on the package data; the preview below
# shows the added time_difference_days and duration_category columns.
diff = categorize_time_difference(
    df=package,
)
diff.show()

+--------------------+--------+--------------------+--------------------+-------------+------------+--------------------+-------------------+-------------------+--------------------+-----------------+
|              bib_id|date_key|             fake_id|            nid_hash|offering_code|offer_amount|       offering_name|    activation_date|  deactivation_date|time_difference_days|duration_category|
+--------------------+--------+--------------------+--------------------+-------------+------------+--------------------+-------------------+-------------------+--------------------+-----------------+
|0D2379F0B99698833...|20240515| f-9801-000078374670|5A9967E423C684E52...|    PO1974XOI|    175000.0|Weekly 2 5GB Inte...|2024-05-15 17:32:21|2024-05-22 23:59:59|                   7|           Weekly|
|0D2379F0B99698833...|20240519| f-9801-000078374670|5A9967E423C684E52...|    PO1974XOI|    175000.0|Weekly 2 5GB Inte...|2024-05-19 11:15:19|2024-05-26 23:59:59|                   7|           Wee

In [15]:
# Plot the most frequent duration categories (index=10, matching the
# "top_10" figure name below). The DataFrame is passed as df=... so this
# call matches every other helper call in the notebook.
plot_tops(
    df=diff,
    col_name="duration_category",
    index=10,
    month=month,
)

                                                                                

Figure saved to: output/top_10_popular_duration_categorys__month_38.png


In [16]:
# Plot the most frequent offering names (index=25, matching the "top_25"
# figure name below).
plot_tops(
    df=package,
    col_name="offering_name",
    index=25,
    month=month,
)

Figure saved to: output/top_25_popular_offering_names__month_38.png


# recharge

In [17]:
# Read the "recharge" dataset for the selected month, pass its "recharge_dt"
# column through custom_to_timestamp, and preview the rows.
recharge = custom_to_timestamp(
    df=custom_read_parquet(
        spark=spark,
        config=config,
        key="recharge",
        month=month,
    ),
    col_name="recharge_dt",
)
recharge.show()

+--------------------+--------+--------------------+--------------------+------------------+-------------------+--------------+--------------------------+-------------------------+
|              bib_id|date_key|             fake_id|            nid_hash|recharge_value_amt|        recharge_dt|origin_host_nm|account_balance_before_amt|account_balance_after_amt|
+--------------------+--------+--------------------+--------------------+------------------+-------------------+--------------+--------------------------+-------------------------+
|4242ED39C4472A9EF...|20240501| f-9801-000081960733|FE1D37955B3872882...|          100000.0|2024-05-01 11:04:05|    MFSEREFILL|                    1086.0|                 101086.0|
|4242ED39C4472A9EF...|20240518| f-9801-000081960733|FE1D37955B3872882...|          100000.0|2024-05-18 16:48:03|    MFSEREFILL|                       4.0|                 100004.0|
|427F2D7D2FE46665E...|20240510|f-9801-3620004212...|7F350C4824AFC30C2...|          100000.0|202

In [18]:
# Print the column names, types and nullability of the recharge DataFrame.
recharge.printSchema()

root
 |-- bib_id: string (nullable = true)
 |-- date_key: string (nullable = true)
 |-- fake_id: string (nullable = true)
 |-- nid_hash: string (nullable = true)
 |-- recharge_value_amt: double (nullable = true)
 |-- recharge_dt: timestamp (nullable = true)
 |-- origin_host_nm: string (nullable = true)
 |-- account_balance_before_amt: double (nullable = true)
 |-- account_balance_after_amt: double (nullable = true)



In [19]:
# Ask categorize() to derive a "recharge_category" column from
# "recharge_value_amt" using the bin edges and labels below (seven labels
# for the seven intervals delimited by the eight edges).
recharge = categorize(
    df=recharge,
    col_name="recharge_value_amt",
    bins=[
        float("-inf"),
        10000,
        20000,
        50000,
        100000,
        200000,
        500000,
        float("inf"),
    ],
    labels=["1T", "2T", "5T", "10T", "20T", "50T", "More"],
    new_column_name="recharge_category",
)

# Count plot over the new category column only.
plot_distribution(
    df=recharge.select("recharge_category"),
    plot_type="count",
    month=month,
    x="recharge_category",
)

Figure saved to: output/CountPlot_of_recharge_category__month_38.png


In [20]:
# Plot recharge_value_amt aggregated over the recharge_dt column; the saved
# figures listed below cover hour, day of week and day of month.
plot_aggregated_by_time(
    df=recharge,
    timestamp_column="recharge_dt",
    agg_col="recharge_value_amt",
    month=month,
)

                                                                                

Figure saved to: output/LinePlot_of_SumAggregated_recharge_value_amt_by_Hour__month_38.png


                                                                                

Figure saved to: output/LinePlot_of_SumAggregated_recharge_value_amt_by_DayOfWeek__month_38.png


                                                                                

Figure saved to: output/LinePlot_of_SumAggregated_recharge_value_amt_by_DayOfMonth__month_38.png


# user

In [21]:
# Read the "user" dataset for the selected month and preview the rows.
user = custom_read_parquet(
    spark=spark,
    config=config,
    key="user",
    month=month,
)
user.show()

+--------------------+--------------------+--------------------+---------------+--------+-------------------+---------------+--------------+---------------+---------------+-----+
|              bib_id|             fake_id|            nid_hash|contract_type_v|gender_v|registration_date_d|date_of_birth_d|ability_status|account_balance|base_station_cd|sitei|
+--------------------+--------------------+--------------------+---------------+--------+-------------------+---------------+--------------+---------------+---------------+-----+
|7A90F5DFA460EBDDC...|f-9801-2960001699...|E0B7D16302BF2B4B6...|              P|       F|           20090714|       19610628|        Active|  120431.349997|      LH1292XB1|H1292|
|7A9BA18851A16FD85...|                NULL|32B355B1462D9033C...|              N|       F|           20211229|       19780801|          Hard|            0.0|        H7206XC|H7206|
|7A9BF2DF116AAB41D...| f-9801-000121961192|CC6518A6B366B6DDD...|              P|       F|           20191

In [22]:
# Count users per gender inside Spark and bring only the per-gender totals
# back to the driver, instead of collecting the entire gender_v column into
# pandas first. Rows with a null gender are dropped so the result matches
# pandas value_counts(), which skips missing values by default; the totals
# are sorted in descending order like value_counts() does.
(
    user.dropna(subset=["gender_v"])
    .groupBy("gender_v")
    .count()
    .orderBy("count", ascending=False)
    .toPandas()
    .set_index("gender_v")["count"]
)

gender_v
M           602268
F           370249
N            25989
Name: count, dtype: int64

# labeling

In [23]:
from label_preparation import generate_churn_label
from data_preparation import generate_dataset

In [24]:
# Build the churn labels from the selected month and the following one,
# then hand them to generate_dataset to assemble the dataset previewed below.
churn_label_df = generate_churn_label(
    spark,
    config,
    month,
    next_month,
)
df = generate_dataset(
    spark,
    config,
    month,
    churn_label_df,
)
df.show()



+--------------------+-----+--------------------+-------------------+----------------------+---------------------+------------+---------------------+--------------------+-------------+------------------+-----------------+---------------+--------+--------------+---------------+
|              bib_id|label|AveragePackageAmount|MedianPackageAmount|AveragePackageDuration|MedianPackageDuration|CountPackage|AverageRechargeAmount|MedianRechargeAmount|CountRecharge|               age|registration_year|contract_type_v|gender_v|ability_status|account_balance|
+--------------------+-----+--------------------+-------------------+----------------------+---------------------+------------+---------------------+--------------------+-------------+------------------+-----------------+---------------+--------+--------------+---------------+
|0000CFC78AA4CD390...|    0|            213750.0|           213750.0|                  30.0|                 30.0|           2|             100000.0|            10000

                                                                                