# **Silver Layer Transformations**

#### **Packages**

In [27]:
# Shared PySpark imports used by the silver-layer transformation cells below
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import (
    col, lit, trim, when, substring, year, current_timestamp, current_date,
    regexp_replace, split, size, length, lower, upper, isnan, isnull,
    datediff, to_date, array_contains, concat_ws, round, coalesce
)
from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, BooleanType

StatementMeta(, c49d8603-0c98-49e6-bb92-e22e62fd3fff, 29, Finished, Available, Finished)

In [28]:
# Ensure Spark handles column names case-insensitively for this session
# (sets the "spark.sql.caseSensitive" option to "false").
spark.conf.set("spark.sql.caseSensitive", "false")

StatementMeta(, c49d8603-0c98-49e6-bb92-e22e62fd3fff, 30, Finished, Available, Finished)

#### **User Identity**

In [29]:
# 0. User Identity Table Transformations
def user_identity_transformations(spark: SparkSession, df: DataFrame) -> DataFrame:
    """Return ``df`` with ``trim`` applied to every column.

    Column names and column order are preserved.

    NOTE(review): ``trim`` is applied regardless of column type, so any
    non-string column would presumably come back as a string - confirm that
    the bronze ``user_identity`` table only holds string columns.

    Args:
        spark: Active Spark session (unused; kept so every transformation
            function shares the same signature).
        df: Bronze ``user_identity`` DataFrame.

    Returns:
        A DataFrame with the same columns, each one trimmed.
    """
    # A single projection replaces the per-column withColumn loop, which adds
    # one projection to the query plan for every column.
    return df.select([trim(col(col_name)).alias(col_name) for col_name in df.columns])


StatementMeta(, c49d8603-0c98-49e6-bb92-e22e62fd3fff, 31, Finished, Available, Finished)

#### **LinkedIn Users**

In [30]:
# 1. LinkedIn Users Table Transformations
def linkedin_users_transformations(spark: SparkSession, valid_df: DataFrame) -> DataFrame:
    """Clean the LinkedIn users table and add derived profile columns.

    Steps, in order:
      * trim every string-typed column;
      * parse ``join_date`` and ``Src_Transaction_Date`` with ``dd-MM-yyyy``;
      * standardize ``Gender`` to Male / Female / Other;
      * derive ``Age_Range_Start`` / ``Age_Range_End`` from ``Age`` (a single
        number or a range written with an en dash or a hyphen) and ``Age_Band``;
      * derive ``Skills_Array`` / ``Skills_Count`` from the comma-separated
        ``Skills`` column;
      * derive ``Education_Level_Standardized``, ``Connection_Tier``,
        ``Follower_Tier`` and ``Follower_Connection_Ratio``.

    NOTE(review): the tier/band columns end in ``otherwise(...)``, so rows
    whose source value is null presumably receive the last label
    ("Experienced", "Influencer", "Popular") - confirm this is intended.

    Args:
        spark: Active Spark session (unused; kept for a uniform signature).
        valid_df: Bronze ``linkedin_users`` DataFrame.

    Returns:
        The transformed DataFrame.
    """
    # Clean and standardize string columns
    for col_name in valid_df.columns:
        if valid_df.schema[col_name].dataType == StringType():
            valid_df = valid_df.withColumn(col_name, trim(col(col_name)))

    # Parse join_date only after trimming, like every other date column in
    # this notebook, so the date parser never sees surrounding whitespace.
    valid_df = valid_df.withColumn("join_date", to_date(col("join_date"), "dd-MM-yyyy"))

    # Standardize Gender column
    valid_df = valid_df.withColumn(
        "Gender",
        when(lower(col("Gender")).isin(["m", "male", "man"]), "Male")
        .when(lower(col("Gender")).isin(["f", "female", "woman"]), "Female")
        .otherwise("Other")
    )

    # Parse Age column to extract age range start and end
    valid_df = valid_df.withColumn(
        "Age_Range_Start",
        when(col("Age").contains("–"),
             regexp_replace(col("Age"), r"(\d+)–.*", "$1").cast(IntegerType()))
        .when(col("Age").contains("-"),
             regexp_replace(col("Age"), r"(\d+)-.*", "$1").cast(IntegerType()))
        .otherwise(col("Age").cast(IntegerType()))
    ).withColumn(
        "Age_Range_End",
        when(col("Age").contains("–"),
             regexp_replace(col("Age"), r".*–(\d+)", "$1").cast(IntegerType()))
        .when(col("Age").contains("-"),
             regexp_replace(col("Age"), r".*-(\d+)", "$1").cast(IntegerType()))
        .otherwise(col("Age").cast(IntegerType()))
    )

    # Create age band classification (based on the start of the age range)
    valid_df = valid_df.withColumn(
        "Age_Band",
        when(col("Age_Range_Start") < 23, "Young Professional")
        .when(col("Age_Range_Start") < 28, "Early Career")
        .when(col("Age_Range_Start") < 35, "Mid Career")
        .when(col("Age_Range_Start") < 42, "Senior Professional")
        .otherwise("Experienced")
    )

    # Process Skills column - split on commas, swallowing the whitespace around
    # each comma so array elements carry no stray leading/trailing spaces.
    valid_df = valid_df.withColumn(
        "Skills_Array",
        split(col("Skills"), r"\s*,\s*")
    ).withColumn(
        "Skills_Count",
        size(col("Skills_Array"))
    )

    # Standardize Education Level (unmatched values are kept as-is)
    valid_df = valid_df.withColumn(
        "Education_Level_Standardized",
        when(lower(col("EducationLevel")).contains("bachelor"), "Bachelor's")
        .when(lower(col("EducationLevel")).contains("master"), "Master's")
        .when(lower(col("EducationLevel")).contains("diploma"), "Diploma")
        .when(lower(col("EducationLevel")).contains("phd"), "PhD")
        .otherwise(col("EducationLevel"))
    )

    # Create connection tier classification
    valid_df = valid_df.withColumn(
        "Connection_Tier",
        when(col("Connections") < 100, "Starter")
        .when(col("Connections") < 500, "Growing")
        .when(col("Connections") < 900, "Established")
        .otherwise("Influencer")
    )

    # Create follower tier classification
    valid_df = valid_df.withColumn(
        "Follower_Tier",
        when(col("Followers") < 1500, "Emerging")
        .when(col("Followers") < 3000, "Rising")
        .otherwise("Popular")
    )

    # Calculate follower to connection ratio (0 when there are no connections)
    valid_df = valid_df.withColumn(
        "Follower_Connection_Ratio",
        when(col("Connections") > 0,
             round(col("Followers") / col("Connections"), 2))
        .otherwise(0)
    )

    # Convert Src_Transaction_Date to date type
    valid_df = valid_df.withColumn("Src_Transaction_Date", to_date(col("Src_Transaction_Date"), "dd-MM-yyyy"))
    return valid_df


StatementMeta(, c49d8603-0c98-49e6-bb92-e22e62fd3fff, 32, Finished, Available, Finished)

#### **Posts**

In [31]:
# 2. Posts Table Transformations
def posts_transformations(spark: SparkSession, valid_df: DataFrame) -> DataFrame:
    """Clean the posts table and add engagement / classification columns.

    String columns are trimmed, ``DatePosted`` and ``Src_Transaction_Date`` are
    parsed with ``dd-MM-yyyy``, and these columns are added:
    ``Post_Type_Standardized``, ``Engagement_Rate``, ``Post_Length_Category``,
    ``Is_Viral``, ``High_Engagement`` and ``Attachment_Type_Standardized``.

    Args:
        spark: Active Spark session (unused; kept for a uniform signature).
        valid_df: Bronze ``post`` DataFrame.

    Returns:
        The transformed DataFrame.
    """
    # Strip surrounding whitespace from every string-typed column
    text_columns = [
        field.name for field in valid_df.schema.fields if field.dataType == StringType()
    ]
    for name in text_columns:
        valid_df = valid_df.withColumn(name, trim(col(name)))

    post_type = lower(col("PostType"))
    attachment = lower(col("AttachmentType"))
    post_length = col("PostLength")
    interactions = col("Reactions") + col("Comments") + col("Shares")

    # NOTE(review): "blog" maps to "Career Update" while the other branches map
    # a keyword to its own label - confirm this mapping is intended.
    post_type_label = (
        when(post_type.contains("blog"), "Career Update")
        .when(post_type.contains("skill share"), "Skill Share")
        .when(post_type.contains("achievement"), "Achievement")
        .otherwise(col("PostType"))
    )

    # Interactions per 100 impressions; 0 when there are no impressions
    engagement_rate = when(
        col("Impressions") > 0,
        round(interactions / col("Impressions") * 100, 2),
    ).otherwise(0)

    length_label = (
        when(post_length <= 30, "Short")
        .when(post_length <= 65, "Medium")
        .when(post_length <= 100, "Long")
        .otherwise("Very Long")
    )

    viral_flag = when(
        (col("Reactions") > 100) | (col("Shares") > 50), True
    ).otherwise(False)

    # Resolved after Engagement_Rate has been added in the chain below
    high_engagement_flag = when(col("Engagement_Rate") > 5, True).otherwise(False)

    attachment_label = (
        when(attachment.contains("text"), "Text")
        .when(attachment.contains("image"), "Image")
        .when(attachment.contains("video"), "Video")
        .when(attachment.contains("link"), "Link")
        .otherwise("Other")
    )

    return (
        valid_df
        .withColumn("DatePosted", to_date(col("DatePosted"), "dd-MM-yyyy"))
        .withColumn("Post_Type_Standardized", post_type_label)
        .withColumn("Engagement_Rate", engagement_rate)
        .withColumn("Post_Length_Category", length_label)
        .withColumn("Is_Viral", viral_flag)
        .withColumn("High_Engagement", high_engagement_flag)
        .withColumn("Attachment_Type_Standardized", attachment_label)
        .withColumn("Src_Transaction_Date", to_date(col("Src_Transaction_Date"), "dd-MM-yyyy"))
    )

StatementMeta(, c49d8603-0c98-49e6-bb92-e22e62fd3fff, 33, Finished, Available, Finished)

#### **Connection Growth**

In [32]:
# 3. Connection Growth Table Transformations
def connection_growth_transformations(spark: SparkSession, valid_df: DataFrame) -> DataFrame:
    """Clean the connection growth table and add growth / activity columns.

    String columns are trimmed, ``Date`` and ``Src_Transaction_Date`` are parsed
    with ``dd-MM-yyyy``, and these columns are added:
    ``Invite_Acceptance_Rate``, ``Growth_Category``, ``Activity_Level``,
    ``Engagement_Score`` and ``Network_Expanding``.

    Args:
        spark: Active Spark session (unused; kept for a uniform signature).
        valid_df: Bronze ``connection_growth`` DataFrame.

    Returns:
        The transformed DataFrame.
    """
    # Strip surrounding whitespace from every string-typed column
    text_columns = [
        field.name for field in valid_df.schema.fields if field.dataType == StringType()
    ]
    for name in text_columns:
        valid_df = valid_df.withColumn(name, trim(col(name)))

    invites_sent = col("InvitesSent")
    messages_sent = col("MessagesSent")
    growth_rate = col("ConnectionGrowthRate")

    # Accepted invites as a percentage of invites sent; 0 when none were sent
    acceptance_rate = when(
        invites_sent > 0,
        round(col("InvitesAccepted") / invites_sent * 100, 2),
    ).otherwise(0)

    growth_label = (
        when(growth_rate < 0, "Declining")
        .when(growth_rate == 0, "Stagnant")
        .when(growth_rate < 15, "Slow Growth")
        .when(growth_rate < 40, "Moderate Growth")
        .otherwise("High Growth")
    )

    activity_label = (
        when((messages_sent > 10) | (invites_sent > 20), "High")
        .when((messages_sent > 5) | (invites_sent > 10), "Medium")
        .otherwise("Low")
    )

    # Profile views count for a tenth of a point each
    engagement_score = (
        messages_sent + invites_sent + col("GroupsJoined") + (col("ProfileViews") / 10)
    )

    expanding_flag = when(col("NewConnections") > 0, True).otherwise(False)

    return (
        valid_df
        .withColumn("Date", to_date(col("Date"), "dd-MM-yyyy"))
        .withColumn("Invite_Acceptance_Rate", acceptance_rate)
        .withColumn("Growth_Category", growth_label)
        .withColumn("Activity_Level", activity_label)
        .withColumn("Engagement_Score", engagement_score)
        .withColumn("Network_Expanding", expanding_flag)
        .withColumn("Src_Transaction_Date", to_date(col("Src_Transaction_Date"), "dd-MM-yyyy"))
    )


StatementMeta(, c49d8603-0c98-49e6-bb92-e22e62fd3fff, 34, Finished, Available, Finished)

#### **Post Performance**

In [33]:
# 4. Post Performance Table Transformations
def post_performance_transformations(spark: SparkSession, valid_df: DataFrame) -> DataFrame:
    """Clean the post performance table and add rate / tier columns.

    String columns are trimmed, ``Src_Transaction_Date`` is parsed with
    ``dd-MM-yyyy``, and these columns are added:
    ``Calculated_Engagement_Rate``, ``CTR_Percentage``, ``Performance_Tier``,
    ``Content_Type_Standardized``, ``Is_Viral_Content`` and ``Reach_Rate``.
    All rate columns are percentages of ``Impressions`` rounded to 2 decimals
    and are 0 when ``Impressions`` is not greater than 0.

    Args:
        spark: Active Spark session (unused; kept for a uniform signature).
        valid_df: Bronze ``post_performance`` DataFrame.

    Returns:
        The transformed DataFrame.
    """
    # Clean string columns
    for col_name in valid_df.columns:
        if valid_df.schema[col_name].dataType == StringType():
            valid_df = valid_df.withColumn(col_name, trim(col(col_name)))

    # Recalculate engagement rate for consistency
    valid_df = valid_df.withColumn(
        "Calculated_Engagement_Rate",
        when(col("Impressions") > 0,
             round((col("Reactions") + col("Comments") + col("Shares")) / col("Impressions") * 100, 2))
        .otherwise(0)
    )

    # Calculate click-through rate percentage
    valid_df = valid_df.withColumn(
        "CTR_Percentage",
        when(col("Impressions") > 0,
             round(col("Clicks") / col("Impressions") * 100, 2))
        .otherwise(0)
    )

    # Create performance tier
    valid_df = valid_df.withColumn(
        "Performance_Tier",
        when(col("Calculated_Engagement_Rate") > 10, "Excellent")
        .when(col("Calculated_Engagement_Rate") > 5, "Good")
        .when(col("Calculated_Engagement_Rate") > 2, "Average")
        .otherwise("Poor")
    )

    # Standardize ContentType. The explicit fallback keeps unmatched values
    # from silently becoming null and mirrors Attachment_Type_Standardized in
    # posts_transformations, which uses the same four labels plus "Other".
    valid_df = valid_df.withColumn(
        "Content_Type_Standardized",
        when(lower(col("ContentType")).contains("text"), "Text")
        .when(lower(col("ContentType")).contains("image"), "Image")
        .when(lower(col("ContentType")).contains("video"), "Video")
        .when(lower(col("ContentType")).contains("link"), "Link")
        .otherwise("Other")
    )

    # Create viral content flag
    valid_df = valid_df.withColumn(
        "Is_Viral_Content",
        when((col("Reactions") > 400) | (col("Shares") > 40) | (col("Reach") > 3500), True)
        .otherwise(False)
    )

    # Calculate reach rate
    valid_df = valid_df.withColumn(
        "Reach_Rate",
        when(col("Impressions") > 0,
             round(col("Reach") / col("Impressions") * 100, 2))
        .otherwise(0)
    )

    # Convert Src_Transaction_Date to date type
    valid_df = valid_df.withColumn("Src_Transaction_Date", to_date(col("Src_Transaction_Date"), "dd-MM-yyyy"))

    return valid_df


StatementMeta(, c49d8603-0c98-49e6-bb92-e22e62fd3fff, 35, Finished, Available, Finished)

#### **Job Applications**

In [34]:
# 5. Job Applications Table Transformations
def job_applications_transformations(spark: SparkSession, valid_df: DataFrame) -> DataFrame:
    """Clean the job applications table and add status / experience columns.

    String columns are trimmed, ``ApplicationDate`` and ``Src_Transaction_Date``
    are parsed with ``dd-MM-yyyy``, and these columns are added:
    ``Status_Standardized``, ``Is_Successful``, ``Is_In_Progress``,
    ``Used_Referral``, ``Experience_Years``, ``Experience_Category`` and
    ``Days_Since_Application`` (relative to the current date at run time).

    NOTE(review): ``Experience_Category`` ends in ``otherwise("Expert")``, so
    rows without a parseable experience value presumably land in "Expert" -
    confirm this is intended.

    Args:
        spark: Active Spark session (unused; kept for a uniform signature).
        valid_df: Bronze ``job_applications`` DataFrame.

    Returns:
        The transformed DataFrame.
    """
    # Clean string columns
    for col_name in valid_df.columns:
        if valid_df.schema[col_name].dataType == StringType():
            valid_df = valid_df.withColumn(col_name, trim(col(col_name)))

    # Convert ApplicationDate to date type
    valid_df = valid_df.withColumn("ApplicationDate", to_date(col("ApplicationDate"), "dd-MM-yyyy"))

    # Standardize Status (unmatched values are kept as-is)
    valid_df = valid_df.withColumn(
        "Status_Standardized",
        when(lower(col("Status")).contains("submit"), "Submitted")
        .when(lower(col("Status")).contains("interview"), "Interview")
        .when(lower(col("Status")).contains("select"), "Selected")
        .when(lower(col("Status")).contains("reject"), "Rejected")
        .otherwise(col("Status"))
    )

    # Create application outcome flags
    valid_df = valid_df.withColumn(
        "Is_Successful",
        when(lower(col("Status_Standardized")) == "selected", True)
        .otherwise(False)
    ).withColumn(
        "Is_In_Progress",
        when(lower(col("Status_Standardized")).isin(["submitted", "interview"]), True)
        .otherwise(False)
    )

    # Create referral flag
    valid_df = valid_df.withColumn(
        "Used_Referral",
        when(lower(col("ReferralUsed")).isin(["yes", "true", "1"]), True)
        .otherwise(False)
    )

    # Parse experience required: keep only the FIRST run of digits.
    # Stripping every non-digit would glue separate numbers together
    # (e.g. "3-5 years" -> 35). When the value has no digits the pattern does
    # not match, the text is left unchanged and the integer cast yields null.
    valid_df = valid_df.withColumn(
        "Experience_Years",
        regexp_replace(col("ExperienceRequired"), r"(?s)^\D*(\d+).*$", "$1").cast(IntegerType())
    )

    # Create experience category
    valid_df = valid_df.withColumn(
        "Experience_Category",
        when(col("Experience_Years") == 0, "Entry Level")
        .when(col("Experience_Years") < 3, "Junior")
        .when(col("Experience_Years") < 6, "Mid Level")
        .when(col("Experience_Years") < 9, "Senior")
        .otherwise("Expert")
    )

    # Calculate days since application
    valid_df = valid_df.withColumn(
        "Days_Since_Application",
        datediff(current_date(), col("ApplicationDate"))
    )

    # Convert Src_Transaction_Date to date type
    valid_df = valid_df.withColumn("Src_Transaction_Date", to_date(col("Src_Transaction_Date"), "dd-MM-yyyy"))

    return valid_df

StatementMeta(, c49d8603-0c98-49e6-bb92-e22e62fd3fff, 36, Finished, Available, Finished)

#### **User Activity**

In [35]:
# 6. User Activity Table Transformations
def user_activity_transformations(spark: SparkSession, valid_df: DataFrame) -> DataFrame:
    """Clean the user activity table and add activity score / flag columns.

    String columns are trimmed, ``Date`` and ``Src_Transaction_Date`` are parsed
    with ``dd-MM-yyyy``, and these columns are added:
    ``Total_Activity_Score``, ``Activity_Level``, ``Message_Response_Rate``,
    ``Is_Job_Seeker`` and ``Is_Social_Networker``.

    Args:
        spark: Active Spark session (unused; kept for a uniform signature).
        valid_df: Bronze ``user_activity`` DataFrame.

    Returns:
        The transformed DataFrame.
    """
    # Strip surrounding whitespace from every string-typed column
    text_columns = [
        field.name for field in valid_df.schema.fields if field.dataType == StringType()
    ]
    for name in text_columns:
        valid_df = valid_df.withColumn(name, trim(col(name)))

    # Total score is the plain sum of these counters, added left to right
    counter_columns = [
        "Logins", "MessagesSent", "MessagesReceived", "JobsViewed",
        "ProfilesViewed", "PostsCreated", "GroupsJoined", "NotificationsViewed",
    ]
    total_score = col(counter_columns[0])
    for counter in counter_columns[1:]:
        total_score = total_score + col(counter)

    # Resolved after Total_Activity_Score has been added in the chain below
    score = col("Total_Activity_Score")
    activity_label = (
        when(score > 50, "Very High")
        .when(score > 35, "High")
        .when(score > 20, "Medium")
        .when(score > 10, "Low")
        .otherwise("Very Low")
    )

    # Messages received per message sent; 0 when nothing was sent
    response_rate = when(
        col("MessagesSent") > 0,
        round(col("MessagesReceived") / col("MessagesSent"), 2),
    ).otherwise(0)

    job_seeker_flag = when(col("JobsViewed") > 7, True).otherwise(False)

    networker_flag = when(
        (col("ProfilesViewed") > 7) | (col("MessagesSent") > 7), True
    ).otherwise(False)

    return (
        valid_df
        .withColumn("Date", to_date(col("Date"), "dd-MM-yyyy"))
        .withColumn("Total_Activity_Score", total_score)
        .withColumn("Activity_Level", activity_label)
        .withColumn("Message_Response_Rate", response_rate)
        .withColumn("Is_Job_Seeker", job_seeker_flag)
        .withColumn("Is_Social_Networker", networker_flag)
        .withColumn("Src_Transaction_Date", to_date(col("Src_Transaction_Date"), "dd-MM-yyyy"))
    )


StatementMeta(, c49d8603-0c98-49e6-bb92-e22e62fd3fff, 37, Finished, Available, Finished)

#### **Company Affiliation**

In [36]:
# 7. Company Affiliation Table Transformations
def company_affiliation_transformations(spark: SparkSession, valid_df: DataFrame) -> DataFrame:
    """Clean the company affiliation table and add tenure / salary columns.

    String columns are trimmed, ``StartDate``, ``EndDate`` and
    ``Src_Transaction_Date`` are parsed with ``dd-MM-yyyy``, and these columns
    are added: ``Is_Current_Employment``, ``Employment_Duration_Days``,
    ``Employment_Duration_Years``, ``Tenure_Category``,
    ``Employment_Type_Standardized``, ``Salary_Range_Lower`` and
    ``Salary_Category``.

    NOTE(review): ``Tenure_Category`` has no branch for durations of 10 years
    or more, so those rows get null - confirm the intended label.
    NOTE(review): ``Salary_Category`` ends in ``otherwise("Executive Level")``,
    so rows without a parseable salary presumably land there - confirm.

    Args:
        spark: Active Spark session (unused; kept for a uniform signature).
        valid_df: Bronze ``company_affiliation`` DataFrame.

    Returns:
        The transformed DataFrame.
    """
    # Clean string columns
    for col_name in valid_df.columns:
        if valid_df.schema[col_name].dataType == StringType():
            valid_df = valid_df.withColumn(col_name, trim(col(col_name)))

    # Convert date columns to date type
    valid_df = valid_df.withColumn("StartDate", to_date(col("StartDate"), "dd-MM-yyyy"))
    valid_df = valid_df.withColumn("EndDate", to_date(col("EndDate"), "dd-MM-yyyy"))

    # Create current employment flag (no end date means still employed)
    valid_df = valid_df.withColumn(
        "Is_Current_Employment",
        when(col("EndDate").isNull(), True)
        .otherwise(False)
    )

    # Calculate employment duration; open-ended rows are measured up to today
    valid_df = valid_df.withColumn(
        "Employment_Duration_Days",
        when(col("EndDate").isNull(),
             datediff(current_date(), col("StartDate")))
        .otherwise(datediff(col("EndDate"), col("StartDate")))
    )

    # Convert duration to years
    valid_df = valid_df.withColumn(
        "Employment_Duration_Years",
        round(col("Employment_Duration_Days") / 365.25, 1)
    )

    # Create tenure category ("Long" previously carried a trailing space,
    # which would break equality filters and group-bys on the label)
    valid_df = valid_df.withColumn(
        "Tenure_Category",
        when(col("Employment_Duration_Years") < 3, "Short")
        .when(col("Employment_Duration_Years") < 6, "Medium")
        .when(col("Employment_Duration_Years") < 10, "Long")
    )

    # Standardize Employment Type (unmatched values are kept as-is)
    valid_df = valid_df.withColumn(
        "Employment_Type_Standardized",
        when(lower(col("EmploymentType")).contains("full"), "Full-time")
        .when(lower(col("EmploymentType")).contains("contract"), "Contract")
        .when(lower(col("EmploymentType")).contains("intern"), "Internship")
        .otherwise(col("EmploymentType"))
    )

    # Parse salary range: take only the part before the first "-" and keep its
    # digits. Stripping non-digits from the whole range would concatenate the
    # lower and upper bounds into one meaningless (and usually overflowing)
    # number.
    lower_bound_text = split(col("SalaryRange"), "-").getItem(0)
    valid_df = valid_df.withColumn(
        "Salary_Range_Lower",
        when(col("SalaryRange").contains("-"),
             regexp_replace(lower_bound_text, r"[^\d]", "").cast(IntegerType()))
        .otherwise(None)
    )

    # Create salary category. The third threshold was 100000, which can never
    # match after the "< 700000" branch; 1000000 keeps the bounds ascending.
    valid_df = valid_df.withColumn(
        "Salary_Category",
        when(col("Salary_Range_Lower") < 400000, "Entry Level")
        .when(col("Salary_Range_Lower") < 700000, "Mid Level")
        .when(col("Salary_Range_Lower") < 1000000, "Senior Level")
        .otherwise("Executive Level")
    )

    # Convert Src_Transaction_Date to date type
    valid_df = valid_df.withColumn("Src_Transaction_Date", to_date(col("Src_Transaction_Date"), "dd-MM-yyyy"))

    return valid_df

StatementMeta(, c49d8603-0c98-49e6-bb92-e22e62fd3fff, 38, Finished, Available, Finished)

#### **Reprocessing Layer**

In [37]:
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, IntegerType

# Configuration metadata for silver layer.
# Each tuple is laid out as:
#   (table_name, requires_reprocessing, business_key,
#    comparison_key, comparison, order)
conf_data = [
    ("user_identity", False, "UserID", None, None, 1),
    ("linkedin_users", True, "UserID", "UserID", "user_identity", 2),
    ("post", True, "PostID", "UserID", "linkedin_users", 3),
    ("connection_growth", True, "RecordID", "UserID", "linkedin_users", 4),
    ("post_performance", True, "PostID", "PostID", "post", 5),
    ("job_applications", True, "ApplicationID", "UserID", "linkedin_users", 6),
    ("user_activity", True, "RecordID", "UserID", "linkedin_users", 7),
    ("company_affiliation", True, "AffiliationID", "UserID", "linkedin_users", 8),
]

# Every field is nullable; names and types follow the tuple layout above
schema = StructType(
    [
        StructField(field_name, field_type, True)
        for field_name, field_type in (
            ("table_name", StringType()),
            ("requires_reprocessing", BooleanType()),
            ("business_key", StringType()),
            ("comparison_key", StringType()),
            ("comparison", StringType()),
            ("order", IntegerType()),
        )
    ]
)

conf_silver_df = spark.createDataFrame(conf_data, schema)


StatementMeta(, c49d8603-0c98-49e6-bb92-e22e62fd3fff, 39, Finished, Available, Finished)

In [38]:
# Lookup from configured table name to the function that transforms it
transformation_mapping = dict(
    user_identity=user_identity_transformations,
    linkedin_users=linkedin_users_transformations,
    post=posts_transformations,
    connection_growth=connection_growth_transformations,
    post_performance=post_performance_transformations,
    job_applications=job_applications_transformations,
    user_activity=user_activity_transformations,
    company_affiliation=company_affiliation_transformations,
)

from delta.tables import DeltaTable
from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window

def save_all_transformed_tables(spark: SparkSession):
    """Transform every configured bronze table and load it into the silver layer.

    Reads the module-level ``conf_data`` and ``transformation_mapping``. Tables
    are processed in ascending ``order`` (the last field of each config tuple):

    1. The bronze Delta table is loaded and its transformation function, if
       one is mapped, is applied.
    2. If a comparison table is configured and could be loaded from silver,
       rows whose ``comparison_key`` has no match there go to the invalid set
       and only matching rows stay valid.
    3. For tables flagged ``requires_reprocessing``: rows with a null business
       key (or a null ``Src_Transaction_Date`` when that column exists) go to
       the invalid set; the rest is reduced to one row per business key (the
       one with the latest ``Src_Transaction_Date`` when that column exists)
       and merged into the silver Delta table, or written as a new table when
       none exists yet. Other tables overwrite their silver table as-is.
    4. A non-empty invalid set overwrites the table's ``reproc`` location.

    Args:
        spark: Active Spark session used for all reads and writes.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        ValueError: If a table requires reprocessing but its business key is
            not a column of the transformed DataFrame.
    """
    silver_base_path = "abfss://LinkedIN@onelake.dfs.fabric.microsoft.com/LinkedIN.Lakehouse/Tables/silver/"
    invalid_base_path = "abfss://LinkedIN@onelake.dfs.fabric.microsoft.com/LinkedIN.Lakehouse/Tables/reproc/"
    bronze_base_path = "abfss://LinkedIN@onelake.dfs.fabric.microsoft.com/LinkedIN.Lakehouse/Tables/BRONZE/"

    # Honour the declared processing order instead of relying on list position
    ordered_conf = sorted(conf_data, key=lambda conf_row: conf_row[5])

    # Step 1: Preload comparison tables (only once per table/key pair, so two
    # tables comparing against the same parent on different keys cannot
    # receive each other's key column)
    comparison_tables = {}
    for _, _, _, comparison_key, comparison_table, _ in ordered_conf:
        ref_id = (comparison_table, comparison_key)
        if comparison_table and ref_id not in comparison_tables:
            comparison_path = f"{silver_base_path}{comparison_table}"
            try:
                ref_df = spark.read.format("delta").load(comparison_path).select(comparison_key).distinct()
                comparison_tables[ref_id] = ref_df
            except Exception as e:
                # Deliberate best effort: when the parent table cannot be
                # read, the comparison check is skipped for its dependants.
                print(f"⚠️ Skipping comparison load for {comparison_table}: {str(e)}")

    # Step 2: Loop through all tables based on conf
    for row in ordered_conf:
        table_name, requires_reprocessing, business_key, comparison_key, comparison_table, _ = row

        raw_path = f"{bronze_base_path}{table_name}"
        silver_path = f"{silver_base_path}{table_name}"
        invalid_path = f"{invalid_base_path}{table_name}"

        print(f"\n🔄 Processing table: {table_name}")

        raw_df = spark.read.format("delta").load(raw_path)
        transform_func = transformation_mapping.get(table_name)
        transformed_df = transform_func(spark, raw_df) if transform_func else raw_df

        # Initialize valid & invalid
        valid_df = transformed_df
        invalid_df = spark.createDataFrame([], transformed_df.schema)

        # Step 3: Comparison check using comparison_key
        ref_df = comparison_tables.get((comparison_table, comparison_key)) if comparison_table else None
        if ref_df is not None:
            missing_keys_df = transformed_df.join(ref_df, on=comparison_key, how="left_anti")
            valid_df = transformed_df.join(ref_df, on=comparison_key, how="inner")
            invalid_df = invalid_df.unionByName(missing_keys_df)

        # Step 4: Reprocessing logic
        if requires_reprocessing:
            # The session resolves column names case-insensitively, so the
            # presence checks are case-insensitive as well.
            known_columns = {name.lower() for name in transformed_df.columns}
            if not business_key or business_key.lower() not in known_columns:
                # Dedup and merge both need the key; fail with a clear message
                # instead of an analysis error further down.
                raise ValueError(
                    f"Business key '{business_key}' not found in table '{table_name}'"
                )
            has_txn_date = "src_transaction_date" in known_columns

            if has_txn_date:
                valid_records = valid_df.filter(col(business_key).isNotNull() & col("Src_Transaction_Date").isNotNull())
                null_records = valid_df.filter(col(business_key).isNull() | col("Src_Transaction_Date").isNull())
            else:
                valid_records = valid_df.filter(col(business_key).isNotNull())
                null_records = valid_df.filter(col(business_key).isNull())

            # Add to invalids
            invalid_df = invalid_df.unionByName(null_records)

            # Deduplicate: keep the most recent row per business key when a
            # transaction date exists, otherwise any one row per key
            if has_txn_date:
                window_spec = Window.partitionBy(business_key).orderBy(col("Src_Transaction_Date").desc())
                deduped_df = valid_records.withColumn("row_num", row_number().over(window_spec)) \
                                          .filter("row_num = 1") \
                                          .drop("row_num")
            else:
                deduped_df = valid_records.dropDuplicates([business_key])

            # Merge to silver (upsert on the business key)
            if DeltaTable.isDeltaTable(spark, silver_path):
                delta_table = DeltaTable.forPath(spark, silver_path)
                delta_table.alias("target").merge(
                    deduped_df.alias("source"),
                    f"target.{business_key} = source.{business_key}"
                ).whenMatchedUpdateAll() \
                 .whenNotMatchedInsertAll() \
                 .execute()
            else:
                deduped_df.write.format("delta").mode("overwrite").save(silver_path)

            print(f"✅ Valid data written to silver: {table_name}")
        else:
            # For static tables like user_identity
            valid_df.write.format("delta").mode("overwrite").save(silver_path)
            print(f"✅ Static table written to silver: {table_name}")

        # Step 5: Save invalid records. head(1) fetches at most one row and
        # avoids converting the DataFrame to an RDD just to test emptiness.
        if invalid_df.head(1):
            invalid_df.write.format("delta").mode("overwrite").save(invalid_path)
            print(f"⚠️ Invalid data written to reproc: {table_name}")
        else:
            print(f"✅ No invalid data present for table: {table_name}")


StatementMeta(, c49d8603-0c98-49e6-bb92-e22e62fd3fff, 40, Finished, Available, Finished)

In [40]:
# Run the bronze -> silver load for every table listed in conf_data
save_all_transformed_tables(spark)


StatementMeta(, c49d8603-0c98-49e6-bb92-e22e62fd3fff, 42, Finished, Available, Finished)


🔄 Processing table: user_identity
✅ Static table written to silver: user_identity
✅ No invalid data present for table: user_identity

🔄 Processing table: linkedin_users
✅ Valid data written to silver: linkedin_users
⚠️ Invalid data written to reproc: linkedin_users

🔄 Processing table: post
✅ Valid data written to silver: post
✅ No invalid data present for table: post

🔄 Processing table: connection_growth
✅ Valid data written to silver: connection_growth
✅ No invalid data present for table: connection_growth

🔄 Processing table: post_performance
✅ Valid data written to silver: post_performance
✅ No invalid data present for table: post_performance

🔄 Processing table: job_applications
✅ Valid data written to silver: job_applications
✅ No invalid data present for table: job_applications

🔄 Processing table: user_activity
✅ Valid data written to silver: user_activity
✅ No invalid data present for table: user_activity

🔄 Processing table: company_affiliation
✅ Valid data written to silve