In [193]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date,col,isnan,count,when,isnull,max,date_diff,days,current_date,desc,sum
from pyspark.sql.types import DateType

In [157]:
session = SparkSession.builder.appName("spark_session").config("spark.sql.legacy.timeParserPolicy", "LEGACY").getOrCreate()

In [158]:
df = session.read.csv("C:/Users/acer/Downloads/esewa_airlines.csv",header=True,inferSchema=True)

In [159]:
df.show()

+---+----------+------------------+---------------------+------------+
|_c0|account_id|last_modified_date|travelled_flight_name|total_amount|
+---+----------+------------------+---------------------+------------+
|  0|1.55947E19|        10/11/2023|           BUDDHA AIR|      6400.0|
|  1|2.69308E18|        10/11/2023|           BUDDHA AIR|      5200.0|
|  2|7.49454E18|        10/11/2023|       SHREE AIRLINES|      8800.0|
|  3|1.49332E19|        10/11/2023|           BUDDHA AIR|      9001.0|
|  4|1.68207E19|        10/11/2023|        YETI AIRLINES|     23200.0|
|  5| 4.7877E18|        10/11/2023|       SHREE AIRLINES|      8800.0|
|  6|2.24826E18|        10/11/2023|        YETI AIRLINES|     14300.0|
|  7|1.24577E19|        10/11/2023|           BUDDHA AIR|      6400.0|
|  8|1.36404E19|        10/11/2023|           BUDDHA AIR|      4400.0|
|  9|7.28483E18|        10/11/2023|           BUDDHA AIR|     10900.0|
| 10|1.30363E19|        10/11/2023|       SHREE AIRLINES|     33500.0|
| 11|2

In [160]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- account_id: double (nullable = true)
 |-- last_modified_date: string (nullable = true)
 |-- travelled_flight_name: string (nullable = true)
 |-- total_amount: double (nullable = true)



In [161]:
df.select("account_id","last_modified_date","total_amount").show()

+----------+------------------+------------+
|account_id|last_modified_date|total_amount|
+----------+------------------+------------+
|1.55947E19|        10/11/2023|      6400.0|
|2.69308E18|        10/11/2023|      5200.0|
|7.49454E18|        10/11/2023|      8800.0|
|1.49332E19|        10/11/2023|      9001.0|
|1.68207E19|        10/11/2023|     23200.0|
| 4.7877E18|        10/11/2023|      8800.0|
|2.24826E18|        10/11/2023|     14300.0|
|1.24577E19|        10/11/2023|      6400.0|
|1.36404E19|        10/11/2023|      4400.0|
|7.28483E18|        10/11/2023|     10900.0|
|1.30363E19|        10/11/2023|     33500.0|
|2.19566E18|        10/11/2023|     28800.0|
|6.16025E18|        10/11/2023|      6200.0|
|1.76357E19|        10/11/2023|     14701.0|
|8.30972E18|        10/11/2023|      7900.0|
|5.99028E18|        10/11/2023|      5700.0|
| 1.7915E18|        10/11/2023|      5200.0|
|  2.282E18|        10/11/2023|      9001.0|
|5.16977E18|        10/11/2023|     14400.0|
|2.19595E1

In [162]:
df = df.withColumn("last_modified_date",to_date(col("last_modified_date"),"MM/dd/yyyy"))

In [163]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- account_id: double (nullable = true)
 |-- last_modified_date: date (nullable = true)
 |-- travelled_flight_name: string (nullable = true)
 |-- total_amount: double (nullable = true)



In [164]:
def datatype(df=df):
    list = []
    for c in df.columns:
        if df.select(c).schema[0].dataType != DateType():
            list.append(c)
    return list

In [165]:
datatype()

['_c0', 'account_id', 'travelled_flight_name', 'total_amount']

In [166]:
def isnull():
    return df.select([count(when(col(c).isNull(),c)).alias(c)for c in datatype()]).show()

In [167]:
isnull()

+---+----------+---------------------+------------+
|_c0|account_id|travelled_flight_name|total_amount|
+---+----------+---------------------+------------+
|  0|         0|                    0|           0|
+---+----------+---------------------+------------+



In [168]:
df = df.withColumnRenamed("last_modified_date","login_date")

In [169]:
df.show()

+---+----------+----------+---------------------+------------+
|_c0|account_id|login_date|travelled_flight_name|total_amount|
+---+----------+----------+---------------------+------------+
|  0|1.55947E19|2023-10-11|           BUDDHA AIR|      6400.0|
|  1|2.69308E18|2023-10-11|           BUDDHA AIR|      5200.0|
|  2|7.49454E18|2023-10-11|       SHREE AIRLINES|      8800.0|
|  3|1.49332E19|2023-10-11|           BUDDHA AIR|      9001.0|
|  4|1.68207E19|2023-10-11|        YETI AIRLINES|     23200.0|
|  5| 4.7877E18|2023-10-11|       SHREE AIRLINES|      8800.0|
|  6|2.24826E18|2023-10-11|        YETI AIRLINES|     14300.0|
|  7|1.24577E19|2023-10-11|           BUDDHA AIR|      6400.0|
|  8|1.36404E19|2023-10-11|           BUDDHA AIR|      4400.0|
|  9|7.28483E18|2023-10-11|           BUDDHA AIR|     10900.0|
| 10|1.30363E19|2023-10-11|       SHREE AIRLINES|     33500.0|
| 11|2.19566E18|2023-10-11|        YETI AIRLINES|     28800.0|
| 12|6.16025E18|2023-10-11|           BUDDHA AIR|      

In [170]:
df_recency = df.alias("df_recency")

In [171]:
df_recency.show(10)

+---+----------+----------+---------------------+------------+
|_c0|account_id|login_date|travelled_flight_name|total_amount|
+---+----------+----------+---------------------+------------+
|  0|1.55947E19|2023-10-11|           BUDDHA AIR|      6400.0|
|  1|2.69308E18|2023-10-11|           BUDDHA AIR|      5200.0|
|  2|7.49454E18|2023-10-11|       SHREE AIRLINES|      8800.0|
|  3|1.49332E19|2023-10-11|           BUDDHA AIR|      9001.0|
|  4|1.68207E19|2023-10-11|        YETI AIRLINES|     23200.0|
|  5| 4.7877E18|2023-10-11|       SHREE AIRLINES|      8800.0|
|  6|2.24826E18|2023-10-11|        YETI AIRLINES|     14300.0|
|  7|1.24577E19|2023-10-11|           BUDDHA AIR|      6400.0|
|  8|1.36404E19|2023-10-11|           BUDDHA AIR|      4400.0|
|  9|7.28483E18|2023-10-11|           BUDDHA AIR|     10900.0|
+---+----------+----------+---------------------+------------+
only showing top 10 rows



In [172]:
df_recency.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- account_id: double (nullable = true)
 |-- login_date: date (nullable = true)
 |-- travelled_flight_name: string (nullable = true)
 |-- total_amount: double (nullable = true)



In [173]:
df_recency= df_recency.groupBy("account_id").agg(max("login_date").alias("last_login_date"))

In [174]:
df_recency.show(10)

+----------+---------------+
|account_id|last_login_date|
+----------+---------------+
|1.54303E19|     2024-03-02|
|2.92302E18|     2024-04-23|
|1.89273E18|     2024-04-26|
|5.55953E18|     2024-04-20|
|1.52739E19|     2024-04-24|
|5.06732E18|     2024-04-14|
|9.44376E18|     2024-04-20|
|3.11258E18|     2024-04-17|
| 1.6252E19|     2024-05-02|
|1.53173E19|     2024-05-05|
+----------+---------------+
only showing top 10 rows



In [175]:
from datetime import datetime

In [176]:
df_recency = df_recency.withColumn("Recency", date_diff(current_date(), df_recency["last_login_date"]))

In [177]:
df_recency.show()

+----------+---------------+-------+
|account_id|last_login_date|Recency|
+----------+---------------+-------+
|1.54303E19|     2024-03-02|     70|
|2.92302E18|     2024-04-23|     18|
|1.89273E18|     2024-04-26|     15|
|5.55953E18|     2024-04-20|     21|
|1.52739E19|     2024-04-24|     17|
|5.06732E18|     2024-04-14|     27|
|9.44376E18|     2024-04-20|     21|
|3.11258E18|     2024-04-17|     24|
| 1.6252E19|     2024-05-02|      9|
|1.53173E19|     2024-05-05|      6|
|5.13044E18|     2023-10-09|    215|
|1.51924E19|     2024-03-16|     56|
|8.80347E18|     2023-10-08|    216|
|5.17166E18|     2024-05-05|      6|
|1.66495E19|     2023-12-05|    158|
|1.76714E19|     2023-11-07|    186|
|3.01135E18|     2023-12-21|    142|
|7.01765E18|     2024-02-27|     74|
|2.41429E18|     2024-04-18|     23|
|1.01818E19|     2023-11-22|    171|
+----------+---------------+-------+
only showing top 20 rows



In [189]:
df_recency.orderBy(col("Recency")).show()

+----------+---------------+-------+
|account_id|last_login_date|Recency|
+----------+---------------+-------+
|6.10411E18|     2024-05-06|      5|
|4.90113E18|     2024-05-06|      5|
|1.04022E19|     2024-05-06|      5|
|1.19958E19|     2024-05-06|      5|
|2.60586E18|     2024-05-06|      5|
|1.50357E19|     2024-05-06|      5|
|1.18968E19|     2024-05-06|      5|
|2.29342E17|     2024-05-06|      5|
|1.22646E19|     2024-05-06|      5|
|1.46792E19|     2024-05-06|      5|
| 2.2156E18|     2024-05-06|      5|
|1.57426E19|     2024-05-06|      5|
|9.09688E18|     2024-05-06|      5|
|1.52314E19|     2024-05-06|      5|
|1.01469E19|     2024-05-06|      5|
|9.24407E17|     2024-05-06|      5|
|1.01602E19|     2024-05-06|      5|
|1.78123E19|     2024-05-06|      5|
|5.23229E18|     2024-05-06|      5|
|1.14267E19|     2024-05-06|      5|
+----------+---------------+-------+
only showing top 20 rows



In [178]:
df_frequency = df.alias("df_frequency")

In [179]:
df_frequency.show()

+---+----------+----------+---------------------+------------+
|_c0|account_id|login_date|travelled_flight_name|total_amount|
+---+----------+----------+---------------------+------------+
|  0|1.55947E19|2023-10-11|           BUDDHA AIR|      6400.0|
|  1|2.69308E18|2023-10-11|           BUDDHA AIR|      5200.0|
|  2|7.49454E18|2023-10-11|       SHREE AIRLINES|      8800.0|
|  3|1.49332E19|2023-10-11|           BUDDHA AIR|      9001.0|
|  4|1.68207E19|2023-10-11|        YETI AIRLINES|     23200.0|
|  5| 4.7877E18|2023-10-11|       SHREE AIRLINES|      8800.0|
|  6|2.24826E18|2023-10-11|        YETI AIRLINES|     14300.0|
|  7|1.24577E19|2023-10-11|           BUDDHA AIR|      6400.0|
|  8|1.36404E19|2023-10-11|           BUDDHA AIR|      4400.0|
|  9|7.28483E18|2023-10-11|           BUDDHA AIR|     10900.0|
| 10|1.30363E19|2023-10-11|       SHREE AIRLINES|     33500.0|
| 11|2.19566E18|2023-10-11|        YETI AIRLINES|     28800.0|
| 12|6.16025E18|2023-10-11|           BUDDHA AIR|      

In [180]:
df_frequency = df_frequency.groupBy("account_id").agg(count("login_date").alias("frequency"))

In [181]:
df_frequency.show()

+----------+---------+
|account_id|frequency|
+----------+---------+
|1.54303E19|        2|
|2.92302E18|       16|
|1.89273E18|        6|
|5.55953E18|       22|
|1.52739E19|        9|
|5.06732E18|        2|
|9.44376E18|       17|
|3.11258E18|        5|
| 1.6252E19|       26|
|1.53173E19|       72|
|5.13044E18|        2|
|1.51924E19|       10|
|8.80347E18|        2|
|5.17166E18|      252|
|1.66495E19|        6|
|1.76714E19|        1|
|3.01135E18|        6|
|7.01765E18|        8|
|2.41429E18|        5|
|1.01818E19|       14|
+----------+---------+
only showing top 20 rows



In [188]:
df_frequency.orderBy(col("frequency").desc()).show()

+----------+---------+
|account_id|frequency|
+----------+---------+
|5.72365E17|     1595|
|4.25625E18|      918|
|8.21472E18|      906|
|9.42867E18|      895|
|1.44714E19|      753|
|1.61058E19|      620|
|1.20827E19|      595|
|1.24736E18|      560|
|1.70215E19|      508|
|7.93818E18|      490|
|1.53148E19|      468|
| 2.3006E18|      450|
|1.26214E19|      446|
|8.27837E18|      434|
|1.28232E19|      380|
|8.67017E18|      373|
|7.93545E18|      373|
|1.77906E19|      354|
| 5.9793E18|      349|
|6.67876E18|      345|
+----------+---------+
only showing top 20 rows



In [190]:
df_Monetary = df.alias("df_Monetary")

In [192]:
df_Monetary.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- account_id: double (nullable = true)
 |-- login_date: date (nullable = true)
 |-- travelled_flight_name: string (nullable = true)
 |-- total_amount: double (nullable = true)



In [194]:
df_Monetary = df_Monetary.groupBy("account_id").agg(sum("total_amount").alias("Total_Spent"))

In [196]:
df_Monetary.orderBy(col("Total_Spent").desc()).show()

+----------+--------------------+
|account_id|         Total_Spent|
+----------+--------------------+
|5.72365E17|1.1034386669999998E7|
|1.44714E19|          6392218.06|
|9.42867E18|          6303989.71|
|1.20827E19|          5837493.53|
|8.21472E18|   5707496.600000001|
|4.25625E18|   5550332.649999999|
|1.53148E19|   4537335.680000001|
|7.93818E18|          3933330.61|
|1.26214E19|          3923563.17|
|1.28232E19|          3514419.55|
|1.61058E19|          3512043.57|
| 2.3006E18|          3458611.49|
|1.24736E18|  3380198.7699999996|
|2.02965E18|          2985918.78|
|1.11928E19|          2957247.77|
|1.38456E19|          2922585.06|
|1.07714E19|          2861976.66|
|1.66276E19|          2842912.95|
|1.82705E19|           2837596.7|
|7.15769E18|  2810892.9699999997|
+----------+--------------------+
only showing top 20 rows



In [197]:
final_df = df_recency.join(df_frequency,df_recency["account_id"] == df_frequency["account_id"],"inner").select(df_recency["account_id"],df_recency["Recency"],df_frequency["frequency"])

In [198]:
final_df.show(10)

+----------+-------+---------+
|account_id|Recency|frequency|
+----------+-------+---------+
|1.54303E19|     70|        2|
|2.92302E18|     18|       16|
|1.89273E18|     15|        6|
|5.55953E18|     21|       22|
|1.52739E19|     17|        9|
|5.06732E18|     27|        2|
|9.44376E18|     21|       17|
|3.11258E18|     24|        5|
| 1.6252E19|      9|       26|
|1.53173E19|      6|       72|
+----------+-------+---------+
only showing top 10 rows



In [199]:
final_df = final_df.join(df_Monetary,final_df["account_id"] == df_Monetary["account_id"],"inner").select(final_df["account_id"],df_Monetary["Total_Spent"],final_df["frequency"],final_df["Recency"])

In [200]:
final_df.show(10)

+----------+-----------+---------+-------+
|account_id|Total_Spent|frequency|Recency|
+----------+-----------+---------+-------+
|1.54303E19|     8799.2|        2|     70|
|2.92302E18|   103590.0|       16|     18|
|1.89273E18|    39702.0|        6|     15|
|5.55953E18|   151774.0|       22|     21|
|1.52739E19|    76801.8|        9|     17|
|5.06732E18|     8800.0|        2|     27|
|9.44376E18|   133624.0|       17|     21|
|3.11258E18|    41261.0|        5|     24|
| 1.6252E19|  261403.85|       26|      9|
|1.53173E19|   549494.7|       72|      6|
+----------+-----------+---------+-------+
only showing top 10 rows



In [201]:
final_df.orderBy(col("Total_Spent").desc()).show()

+----------+--------------------+---------+-------+
|account_id|         Total_Spent|frequency|Recency|
+----------+--------------------+---------+-------+
|5.72365E17|1.1034386669999998E7|     1595|      5|
|1.44714E19|          6392218.06|      753|      5|
|9.42867E18|          6303989.71|      895|      5|
|1.20827E19|          5837493.53|      595|     23|
|8.21472E18|   5707496.600000001|      906|      5|
|4.25625E18|   5550332.649999999|      918|      5|
|1.53148E19|   4537335.680000001|      468|      5|
|7.93818E18|          3933330.61|      490|      8|
|1.26214E19|          3923563.17|      446|      5|
|1.28232E19|          3514419.55|      380|      8|
|1.61058E19|          3512043.57|      620|      5|
| 2.3006E18|          3458611.49|      450|      8|
|1.24736E18|  3380198.7699999996|      560|      5|
|2.02965E18|          2985918.78|      302|      5|
|1.11928E19|          2957247.77|      278|      5|
|1.38456E19|          2922585.06|      254|     11|
|1.07714E19|