In [0]:
%python
# Configure access to Azure Data Lake Storage Gen2 and load the staging CSV.

# All environment-specific values are supplied through Databricks widgets.
storage_name = dbutils.widgets.get("storage_account_name")
# NOTE(review): the account key is passed as a plain widget value; consider reading it
# from a secret scope (dbutils.secrets.get) so it is not exposed as a notebook parameter.
storage_access_key = dbutils.widgets.get("access_key")
staging_container = dbutils.widgets.get("staging_container")
curated_container = dbutils.widgets.get("curated_container")
curated_data_path = dbutils.widgets.get("curated_data_path")
staging_data_path = dbutils.widgets.get("staging_data_path")


# Register the account key for the storage account named by the widget.
spark.conf.set(f"fs.azure.account.key.{storage_name}.dfs.core.windows.net", storage_access_key)

# Build the path from the same account the key was registered for; a hard-coded
# account name here would only work when the widget happened to match it.
file_path = f"abfss://{staging_container}@{storage_name}.dfs.core.windows.net/{staging_data_path}"

# Read the staging CSV into a Spark DataFrame. Only the header option is set, so no
# schema is inferred here; column types are assigned by the casts in later cells.
df = spark.read.format("csv").option("header", "true").load(file_path)

In [0]:
# Remove rows that contain a null in any column, then remove exact duplicate rows.
df = df.dropna().dropDuplicates()

In [0]:
# Convert SeniorCitizen to a boolean column by going through an integer cast first
# (presumably the raw values are 0/1 flags; verify against the staging data).
senior_citizen_flag = df["SeniorCitizen"].cast("int").cast("boolean")
df = df.withColumn("SeniorCitizen", senior_citizen_flag)

In [0]:
# Convert the yes/no-only columns to boolean true/false values.
from pyspark.sql.functions import col, lower, when

# Case-insensitive mapping from the raw text to its boolean value.
replacement_values = {'yes': True, 'no': False}
string_cols_to_bool = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']

for col_name in string_cols_to_bool:
    normalized = lower(col(col_name))
    # Map each known value explicitly. There is deliberately no otherwise() branch:
    # a value that is neither 'yes' nor 'no' becomes null instead of being silently
    # recorded as False.
    bool_expr = None
    for raw_value, bool_value in replacement_values.items():
        matches = normalized == raw_value
        bool_expr = when(matches, bool_value) if bool_expr is None else bool_expr.when(matches, bool_value)
    df = df.withColumn(col_name, bool_expr)

In [0]:
# Cast the numeric columns from their raw CSV text to numeric types.
numeric_casts = {"TotalCharges": "float", "MonthlyCharges": "float", "tenure": "int"}

for numeric_col, target_type in numeric_casts.items():
    df = df.withColumn(numeric_col, col(numeric_col).cast(target_type))

# With ANSI mode off, a value that cannot be parsed as a number (for example a
# whitespace-only string) becomes null on cast. The earlier dropna() ran before these
# casts, so it could not catch such rows; drop them here so missing numerics do not
# flow into the derived segment columns.
df = df.dropna(subset=list(numeric_casts))

In [0]:
# Derive ValueSegment from the charge columns. A customer qualifies for a tier when
# EITHER the total-charges threshold OR the monthly-charges threshold is met; the
# tiers are checked from highest to lowest, and everything else is 'Low Value'.
total_charges = col('TotalCharges')
monthly_charges = col('MonthlyCharges')

is_high_value = (total_charges >= 7000) | (monthly_charges >= 90)
is_medium_value = (total_charges >= 3000) | (monthly_charges >= 40)

value_segment = (
    when(is_high_value, 'High Value')
    .when(is_medium_value, 'Medium Value')
    .otherwise('Low Value')
)
df = df.withColumn('ValueSegment', value_segment)

In [0]:

# Assign each row to a 12-month tenure bucket based on the integer tenure column.
df = df.withColumn(
    'TenureBucket',
    when(col('tenure') <= 11, '0-11 Months')
    .when(col('tenure') <= 23, '12-23 Months')
    .when(col('tenure') <= 35, '24-35 Months')
    .when(col('tenure') <= 47, '36-47 Months')
    .when(col('tenure') <= 59, '48-59 Months')
    # Explicit lower bound instead of otherwise(): a null tenure now yields a null
    # bucket rather than being labelled '60+ Months'.
    .when(col('tenure') >= 60, '60+ Months')
)

# Display the updated DataFrame
display(df)

customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,ValueSegment,TenureBucket
7590-VHVEG,Female,False,True,False,1,False,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,True,Electronic check,29.85,29.85,False,Low Value,0-11 Months
5575-GNVDE,Male,False,False,False,34,True,No,DSL,Yes,No,Yes,No,No,No,One year,False,Mailed check,56.95,1889.5,False,Medium Value,24-35 Months
3668-QPYBK,Male,False,False,False,2,True,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,True,Mailed check,53.85,108.15,True,Medium Value,0-11 Months
7795-CFOCW,Male,False,False,False,45,False,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,False,Bank transfer (automatic),42.3,1840.75,False,Medium Value,36-47 Months
9237-HQITU,Female,False,False,False,2,True,No,Fiber optic,No,No,No,No,No,No,Month-to-month,True,Electronic check,70.7,151.65,True,Medium Value,0-11 Months
9305-CDSKC,Female,False,False,False,8,True,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,True,Electronic check,99.65,820.5,True,High Value,0-11 Months
1452-KIOVK,Male,False,False,True,22,True,Yes,Fiber optic,No,Yes,No,No,Yes,No,Month-to-month,True,Credit card (automatic),89.1,1949.4,False,Medium Value,12-23 Months
6713-OKOMC,Female,False,False,False,10,False,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,False,Mailed check,29.75,301.9,False,Low Value,0-11 Months
7892-POOKP,Female,False,True,False,28,True,Yes,Fiber optic,No,No,Yes,Yes,Yes,Yes,Month-to-month,True,Electronic check,104.8,3046.05,True,High Value,24-35 Months
6388-TABGU,Male,False,False,True,62,True,No,DSL,Yes,Yes,No,No,No,No,One year,False,Bank transfer (automatic),56.15,3487.95,False,Medium Value,60+ Months


In [0]:
# Write the transformed DataFrame as CSV (with header) to the curated container,
# replacing any existing output at that path.

# Use the storage account named by the widget (the one the access key was registered
# for) rather than a hard-coded account name.
curated_output_path = f"abfss://{curated_container}@{storage_name}.dfs.core.windows.net/{curated_data_path}"
df.write.format("csv").option("header", "true").mode("overwrite").save(curated_output_path)