# In [None]:
import re

# Data lake locations used by this notebook: the raw CSV input (bronze) and
# the output location for the cleaned data (silver).
bronze_path, silver_path = (
    "/mnt/bronze/clinical_trials.csv",
    "/mnt/silver/clinical_trials",
)



# Load the raw bronze CSV into a DataFrame. The reader is configured with a
# header row, schema inference, multi-line records and '"' as the escape
# character. If the load fails, the error is printed and the notebook exits.
try:
    bronze_df = spark.read.csv(
        bronze_path,
        header=True,
        inferSchema=True,
        multiLine=True,
        escape='"',
    )
except Exception as e:
    print(f"Error reading CSV file: {e}")
    # NOTE(review): dbutils.notebook.exit ends the run with this string as the
    # notebook's exit value -- confirm the calling job/notebook treats that
    # value as a failure; if it does not, re-raising would be the safer signal.
    dbutils.notebook.exit("Failed to read bronze data")
else:
    print("Successfully read CSV from bronze layer.")



def clean_col_name(name):
    """
    Normalizes a column name (intended for Delta Lake compatibility).

    Steps, in order:
    1. Lowercase the name.
    2. Turn every character that is not an ASCII letter, a digit or an
       underscore into an underscore.
    3. Squeeze each run of underscores down to a single underscore.

    Underscores at the start or end of the result are kept, so a name such
    as "Dose (mg)" becomes "dose_mg_".
    """
    lowered = name.lower()
    underscored = re.sub(r'[^a-zA-Z0-9_]', '_', lowered)
    return re.sub(r'_+', '_', underscored)

# Apply the cleaning function to all columns in a single step.
# toDF assigns the new names by position, so the resulting column names are
# the same as renaming the columns one at a time, but without chaining a
# separate rename per column. Using a comprehension also avoids leaving a
# module-level `col` variable behind (a name commonly used for
# pyspark.sql.functions.col).
# NOTE(review): headers that clean to the same name (e.g. "A B" and "a_b")
# still end up as duplicate column names; nothing here de-duplicates them.
cleaned_df = bronze_df.toDF(
    *[clean_col_name(column_name) for column_name in bronze_df.columns]
)

print("Cleaned column names. Displaying sample:")
display(cleaned_df.limit(5))



# Persist the cleaned DataFrame at the silver path in Delta format, using
# write mode "overwrite". If the write fails, the error is printed and the
# notebook exits.
try:
    (
        cleaned_df.write
        .format("delta")
        .mode("overwrite")
        .save(silver_path)
    )
except Exception as e:
    print(f"Error writing to silver layer: {e}")
    # NOTE(review): dbutils.notebook.exit ends the run with this string as the
    # notebook's exit value -- confirm the calling job/notebook treats that
    # value as a failure; if it does not, re-raising would be the safer signal.
    dbutils.notebook.exit("Failed to write silver data")
else:
    print(f"Successfully wrote data to Delta table at: {silver_path}")



# Sanity check: load the Delta data back from the silver path, then report its
# row count and show a five-row sample. A failure here is only printed; this
# block does not stop the notebook.
try:
    silver_df = (
        spark.read
        .format("delta")
        .load(silver_path)
    )
    print(
        "Successfully read from silver Delta table. Row count:",
        silver_df.count(),
    )
    display(silver_df.limit(5))
except Exception as e:
    print(f"Error verifying silver table: {e}")

