In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import year, month, countDistinct, lag, col, sum, when
from pyspark.sql.window import Window

# Build the session for the application named "myApp"; getOrCreate() returns
# the already-active session when one exists instead of starting another.
spark = (
    SparkSession.builder
    .appName("myApp")
    .getOrCreate()
)

In [None]:
# Load the "pret" CSV dataset: the first line supplies the column names and
# Spark infers each column's type from the data.
# NOTE(review): the later steps call year()/month() on date_de_deboursement,
# so it presumably infers as a date/timestamp (or an ISO string) — confirm.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv("pret")


In [None]:

# Keep only the rows that introduce a client to an agent: inside each
# (code_agent, code_client) partition, ordered by date_de_deboursement, the
# earliest row has no predecessor, so lag() yields NULL for it.
windowSpec = (
    Window
    .partitionBy("code_agent", "code_client")
    .orderBy("date_de_deboursement")
)
previous_client = lag("code_client").over(windowSpec)
first_row_flag = when(col("prev_code_client").isNull(), 1).otherwise(0)
df_new_clients = (
    df
    .withColumn("prev_code_client", previous_client)
    .withColumn("is_new_client", first_row_flag)
    .filter(col("is_new_client") == 1)
    # The two helper columns only served the filter above.
    .drop("prev_code_client", "is_new_client")
)

# Count the distinct first-time clients per agent and per (year, month), both
# extracted from date_de_deboursement.
disbursement_date = col("date_de_deboursement")
num_clients_df = (
    df_new_clients
    .groupBy(
        "code_agent",
        year(disbursement_date).alias("year"),
        month(disbursement_date).alias("month"),
    )
    .agg(countDistinct(col("code_client")).alias("num_new_clients"))
)

# Attach, for every agent, the new-client count of the preceding row when rows
# are ordered by (year, month). Months without any new client have no row in
# num_clients_df, so "preceding" means the previous month present in the data,
# not necessarily the previous calendar month. The first row per agent has no
# predecessor and gets NULL.
windowSpec = (
    Window
    .partitionBy("code_agent")
    .orderBy("year", "month")
)
lagged_df = num_clients_df.withColumn(
    "prev_num_new_clients",
    lag("num_new_clients", 1).over(windowSpec),
)

# Share of each month's new clients in the agent's client count to date.
# The frame below is the one Spark already applies by default to an ordered
# window, written out so it is clear that num_all_clients is a running total
# of num_new_clients up to the current (year, month), not a per-agent grand
# total.
running_total_window = windowSpec.rangeBetween(
    Window.unboundedPreceding, Window.currentRow
)
# `sum` here is pyspark.sql.functions.sum (imported above), not the builtin.
with_totals_df = lagged_df.withColumn(
    "num_all_clients",
    sum("num_new_clients").over(running_total_window),
)
final_df = with_totals_df.withColumn(
    "pct_new_clients",
    col("num_new_clients") / col("num_all_clients") * 100,
)

# Print up to 1000 rows of the result.
final_df.show(n=1000)