In [0]:
%python
# Set up the connection to Azure Data Lake Storage Gen2 and load the curated data.
# All connection details are supplied through notebook widgets.

storage_account_name = dbutils.widgets.get("storage_account_name")
# NOTE(review): the account key is passed in as a plain widget value; consider
# reading it from a secret scope (dbutils.secrets.get) instead -- confirm with the job owner.
storage_access_key = dbutils.widgets.get("storage_access_key")
curated_container = dbutils.widgets.get("curated_container")
curated_data_path = dbutils.widgets.get("curated_data_path")
analytics_container = dbutils.widgets.get("analytics_container")

# Register the account key for the storage account named by the widget.
spark.conf.set(f"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net", storage_access_key)

# Build the source path from the same storage account the key was registered for.
# (Previously the account name was hard-coded, so the read ignored the widget value.)
file_path = f"abfss://{curated_container}@{storage_account_name}.dfs.core.windows.net/{curated_data_path}"

# Read the curated CSV from Azure Data Lake Storage Gen2 into a Spark DataFrame.
# No schema is supplied or inferred here, so every column is loaded as a string.
df = spark.read.format("csv").option("header", "true").load(file_path)

In [0]:
# df.printSchema()

root
 |-- customerID: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- SeniorCitizen: string (nullable = true)
 |-- Partner: string (nullable = true)
 |-- Dependents: string (nullable = true)
 |-- tenure: string (nullable = true)
 |-- PhoneService: string (nullable = true)
 |-- MultipleLines: string (nullable = true)
 |-- InternetService: string (nullable = true)
 |-- OnlineSecurity: string (nullable = true)
 |-- OnlineBackup: string (nullable = true)
 |-- DeviceProtection: string (nullable = true)
 |-- TechSupport: string (nullable = true)
 |-- StreamingTV: string (nullable = true)
 |-- StreamingMovies: string (nullable = true)
 |-- Contract: string (nullable = true)
 |-- PaperlessBilling: string (nullable = true)
 |-- PaymentMethod: string (nullable = true)
 |-- MonthlyCharges: string (nullable = true)
 |-- TotalCharges: string (nullable = true)
 |-- Churn: string (nullable = true)
 |-- ValueSegment: string (nullable = true)
 |-- TenureBucket: string (nullable = true)

In [0]:
# Value-segment statistics: subscriber count, churned subscriber count, churn rate (%),
# average total charges, average monthly charges, and senior-citizen count per ValueSegment.
from pyspark.sql.functions import col, lower, when, round, avg

# NOTE(review): "Churn" and "SeniorCitizen" are string columns compared against True below;
# this presumably relies on Spark's implicit string-to-boolean cast -- verify against the data.
# NOTE(review): "Subcriber Count" is misspelled but is kept as-is because it is an output
# column name that downstream consumers may depend on.

value_segment_count = df.groupBy("ValueSegment").count().withColumnRenamed("count", "Subcriber Count")
value_segment_churned_count = (
    df.filter(col("Churn") == True)
    .groupBy("ValueSegment")
    .count()
    .withColumnRenamed("count", "Churned Subscriber Count")
)

# Left join + fill: a segment with no churned subscribers is absent from the filtered
# counts, and an inner join would silently drop that segment from the report.
joined = value_segment_count.join(value_segment_churned_count, "ValueSegment", how="left").fillna(
    0, subset=["Churned Subscriber Count"]
)

# Churn rate as a percentage, rounded to 2 decimals; 0 when nobody in the segment churned.
joined = joined.withColumn("Churn Rate", when(col("Churned Subscriber Count") == 0, 0).otherwise(round(col("Churned Subscriber Count") / col("Subcriber Count") * 100, 2)))

average_total_charges = df.groupBy("ValueSegment").agg(round(avg(col("TotalCharges")), 2).alias("Average Total Charges"))
average_monthly_charges = df.groupBy("ValueSegment").agg(round(avg(col("MonthlyCharges")), 2).alias("Average Monthly Charges"))
senior_count = df.filter(col("SeniorCitizen") == True).groupBy("ValueSegment").count().withColumnRenamed("count", "Senior Citizen Count")

# The averages are computed over all rows, so every segment is present and an inner join
# is sufficient. The senior count comes from a filtered frame, so it is left-joined and
# missing values are filled with 0 to keep segments that have no senior citizens.
value_segment_agg_metrics = (
    joined.join(average_total_charges, "ValueSegment")
    .join(average_monthly_charges, "ValueSegment")
    .join(senior_count, "ValueSegment", how="left")
    .fillna(0, subset=["Senior Citizen Count"])
)
# print("*******************Churn Rate by Value Segment*******************")
# display(joined)
# display(average_total_charges)
# display(average_monthly_charges)
# display(senior_count)

In [0]:
# Tenure-bucket statistics: subscriber count, churned subscriber count, churn rate (%),
# average total charges, average monthly charges, and senior-citizen count per TenureBucket.
# Uses col / when / round / avg imported from pyspark.sql.functions in an earlier cell.

tenure_bucket_count = df.groupBy("TenureBucket").count().withColumnRenamed("count", "Subcriber Count")
tenure_bucket_churned_count = (
    df.filter(col("Churn") == True)
    .groupBy("TenureBucket")
    .count()
    .withColumnRenamed("count", "Churned Subscriber Count")
)

# Left join + fill: a bucket with no churned subscribers is absent from the filtered
# counts, and an inner join would silently drop that bucket from the report.
joined = tenure_bucket_count.join(tenure_bucket_churned_count, "TenureBucket", how="left").fillna(
    0, subset=["Churned Subscriber Count"]
)

# Churn rate as a percentage, rounded to 2 decimals; 0 when nobody in the bucket churned.
joined = joined.withColumn("Churn Rate", when(col("Churned Subscriber Count") == 0, 0).otherwise(round(col("Churned Subscriber Count") / col("Subcriber Count") * 100, 2)))

average_total_charges = df.groupBy("TenureBucket").agg(round(avg(col("TotalCharges")), 2).alias("Average Total Charges"))
average_monthly_charges = df.groupBy("TenureBucket").agg(round(avg(col("MonthlyCharges")), 2).alias("Average Monthly Charges"))
senior_count = df.filter(col("SeniorCitizen") == True).groupBy("TenureBucket").count().withColumnRenamed("count", "Senior Citizen Count")

# The averages cover all rows, so an inner join keeps every bucket. The senior count comes
# from a filtered frame, so it is left-joined and missing values are filled with 0 to keep
# buckets that have no senior citizens. The result is sorted by bucket label.
tenure_buckets_metrics = (
    joined.join(average_total_charges, "TenureBucket")
    .join(average_monthly_charges, "TenureBucket")
    .join(senior_count, "TenureBucket", how="left")
    .fillna(0, subset=["Senior Citizen Count"])
    .orderBy("TenureBucket")
)
# print("*******************Churn Rate by Tenure Bucket*******************") 
# display(joined)

In [0]:
# Persist both aggregate DataFrames as CSV (with a header row) to the analytics
# container in Azure Data Lake Storage Gen2, overwriting any previous output.
value_segment_output_path = f"abfss://{analytics_container}@{storage_account_name}.dfs.core.windows.net/Telco/telco_value_segments_agg_metrics.csv"
tenure_bucket_output_path = f"abfss://{analytics_container}@{storage_account_name}.dfs.core.windows.net/Telco/telco_tenure_buckets_agg_metrics.csv"

(
    value_segment_agg_metrics.write
    .format("csv")
    .mode("overwrite")
    .save(value_segment_output_path, header=True)
)

(
    tenure_buckets_metrics.write
    .format("csv")
    .mode("overwrite")
    .save(tenure_bucket_output_path, header=True)
)