In [0]:
from enum import Enum
from pyspark.sql.types import (
    ByteType,
    DoubleType,
    StringType,
    StructField,
    StructType,
    TimestampType,
    BooleanType,
    FloatType
)

# Values of `multi_client_name` that identify internal/test clients; rows whose
# client name is in this list are filtered out by filter_and_transform_transactions.
TEST_CLIENTS = [
    'test multi',
    'davidh test2 multi',
    'ice demo multi',
    'monitoring client pod2 multi',
]

# Columns that fixing_dtypes always coerces to BooleanType, regardless of the
# type the schema declares for them.
BOOLEAN_STRING_COLUMN = [
    'is_currency_converted',
    'is_eea',
    'is_external_mpi',
    'is_partial_amount',
    'is_prepaid',
    'is_sale_3d',
    'is_void',
    'liability_shift',
    'manage_3d_decision',
    'mc_scheme_token_used',
    'partial_approval_is_void',
    'rebill',
    'is_3d',
]

In [0]:
import time
from typing import Any, Dict, Optional
from pyspark.sql import functions as F, DataFrame
from pyspark.sql.functions import col, when, isnan
from pyspark.sql.functions import col, when, regexp_extract, lower, trim, lit

In [0]:
def create_conversions_columns(df: DataFrame) -> DataFrame:
    """
    Appends derived conversion/status flag columns to a transactions DataFrame.

    Every flag built here from literals is the string "true" or "false", and is
    NULL when none of the branches of its rule matches. All new columns are
    computed in a single projection on top of the input columns.

    Columns read from the input: transaction_type, transaction_result_id,
    is_sale_3d, manage_3d_decision, 3d_flow_status, authentication_flow,
    challenge_preference, status, acs_url.

    Args:
        df (DataFrame): Input DataFrame with raw transaction data.

    Returns:
        DataFrame: All input columns in their original order, followed by
        is_sale_3d_auth_3d, manage_3d_decision_auth_3d, the six
        ``*_status`` flags, is_successful_challenge, is_successful_exemption,
        is_successful_frictionless, is_successful_authentication, is_approved
        and is_declined.
    """
    existing_cols = df.columns
    new_cols = {}

    # Input columns and literals that several rules share.
    txn_type = F.col("transaction_type")
    result_id = F.col("transaction_result_id")
    flow_status_3d = F.col("3d_flow_status")
    auth_flow = F.col("authentication_flow")
    status = F.col("status")
    true_flag = F.lit("true")
    false_flag = F.lit("false")

    # --- 1. Conditional copies ---
    # Keep the source value only on auth3d rows; NULL everywhere else.
    new_cols["is_sale_3d_auth_3d"] = F.when(txn_type == "auth3d", F.col("is_sale_3d"))
    new_cols["manage_3d_decision_auth_3d"] = F.when(
        txn_type == "auth3d", F.col("manage_3d_decision")
    )

    # --- 2. Transaction result status flags ---
    # Output column name -> transaction_type value it applies to.
    status_map = {
        "init_status": "initauth3d",
        "auth_3d_status": "auth3d",
        "sale_status": "sale",
        "auth_status": "auth",
        "settle_status": "settle",
        "verify_auth_3d_status": "verify_auth_3d",
    }

    for new_col, status_txn_type in status_map.items():
        # "true" when the result id is "1006", "false" for any other result id,
        # NULL when the row is of a different transaction type.
        new_cols[new_col] = F.when(
            txn_type == status_txn_type,
            F.when(result_id == "1006", true_flag).otherwise(false_flag),
        )

    # --- 3. Challenge success ---
    new_cols["is_successful_challenge"] = F.when(
        flow_status_3d == "3d_success", true_flag
    ).when(flow_status_3d.isin("3d_failure", "3d_wasnt_completed"), false_flag)

    # --- 4. Exemption logic ---
    new_cols["is_successful_exemption"] = F.when(
        auth_flow == "exemption", true_flag
    ).when(F.col("challenge_preference") == "y_requested_by_acquirer", false_flag)

    # --- 5. Frictionless logic ---
    frictionless_ok = (auth_flow == "frictionless") & (status == "40")
    new_cols["is_successful_frictionless"] = F.when(frictionless_ok, true_flag).when(
        auth_flow == "frictionless", false_flag
    )

    # --- 6. Successful authentication ---
    # The failure condition is grouped explicitly; `&` binds tighter than `|`
    # in Python, so this is the same grouping the unparenthesised form had.
    new_cols["is_successful_authentication"] = F.when(
        (flow_status_3d == "3d_success") | frictionless_ok,
        true_flag,
    ).when(
        (F.col("acs_url").isNotNull() & (auth_flow != "exemption"))
        | ((auth_flow == "frictionless") & (status != "40")),
        false_flag,
    )

    # --- 7. Approval logic ---
    # auth_status / sale_status are aliases created by this very projection, so
    # they are referenced through their expressions instead of F.col(<alias>).
    # Resolving an alias defined in the same select relies on lateral column
    # alias support, which older Spark versions do not have.
    auth_status = new_cols["auth_status"]
    sale_status = new_cols["sale_status"]

    new_cols["is_approved"] = F.when(
        (auth_status == "true") | (sale_status == "true"),
        true_flag,
    ).when(
        (auth_status == "false") | (sale_status == "false"),
        false_flag,
    )

    # --- 8. Decline logic ---
    new_cols["is_declined"] = F.when(
        txn_type.isin("sale", "auth") & (result_id == "1008"),
        true_flag,
    ).when(
        auth_status.isNotNull() | sale_status.isNotNull(),
        false_flag,
    )

    # --- Final projection ---
    # Keep all existing columns and add the new ones after them.
    final_cols = [F.col(col_name) for col_name in existing_cols] + [
        expr.alias(new_col) for new_col, expr in new_cols.items()
    ]

    return df.select(*final_cols)

In [0]:
def fixing_dtypes(df: DataFrame, schema: StructType) -> DataFrame:
    """
    Fixes the data types of the columns in the DataFrame based on the provided schema.

    For every field of ``schema`` (in schema order) exactly one rule applies:

    * Fields in the force-null set are replaced by a NULL of the schema type.
    * BooleanType fields and fields listed in BOOLEAN_STRING_COLUMN are
      lower-cased, trimmed and mapped to True/False; any unrecognised value
      becomes NULL.
    * StringType fields are lower-cased and trimmed, all-numeric values keep
      only their leading run of digits, and null-like tokens and "deprecated"
      become NULL.
    * Any other field is cast to the schema type; for Float/Double fields NULL
      is first replaced by NaN.

    Columns of ``df`` that are not in ``schema`` are appended unchanged.

    Args:
        df (DataFrame): Input DataFrame with raw data. It must contain every
            schema field that is not in the force-null set.
        schema (StructType): Schema defining the expected data types of the columns.

    Returns:
        DataFrame: Schema columns with corrected data types, followed by the
        passthrough columns.
    """
    # Loop-invariant lookup values, built once.
    valid_true = ["true", "1", "yes", "1.0"]
    valid_false = ["false", "0", "no", "0.0"]
    null_tokens = ["<na>", "na", "nan", "none", "", " ", "\x00"]

    # These columns are always emitted as NULL, whatever the input holds.
    columns_to_force_null = {
        "user_agent_3d",
        "authentication_request",
        "authentication_response",
        "authorization_req_duration",
    }

    new_cols = []

    for struct_field in schema.fields:
        field = struct_field.name
        field_type = struct_field.dataType

        if field in columns_to_force_null:
            new_cols.append(lit(None).cast(field_type).alias(field))
            continue

        expr = col(field)

        if isinstance(field_type, BooleanType) or field in BOOLEAN_STRING_COLUMN:
            # Normalize first
            expr_norm = trim(lower(expr))

            # Only allow values that are valid booleans
            expr = (
                when(expr_norm.isin(*valid_true), lit(True))
                .when(expr_norm.isin(*valid_false), lit(False))
                .otherwise(lit(None))
            )
            expr = expr.cast(BooleanType())

        elif isinstance(field_type, StringType):
            # All-numeric strings keep only their first run of digits, so
            # "123.0" becomes "123".
            # NOTE(review): this also truncates real decimals ("12.5" -> "12");
            # confirm that is intended for every string column.
            expr = when(
                expr.rlike(r"^\d+\.?\d*$"), regexp_extract(expr, r"(\d+)", 1)
            ).otherwise(expr)
            expr = trim(lower(expr))
            expr = when(expr.isin(null_tokens), None).otherwise(expr)
            expr = when(expr == "deprecated", None).otherwise(expr)

        else:
            if isinstance(field_type, (FloatType, DoubleType)):
                # Floating-point columns carry NaN instead of NULL.
                expr = when(expr.isNull(), lit(float("nan"))).otherwise(expr)
            expr = expr.cast(field_type)

        new_cols.append(expr.alias(field))

    # Also select any additional columns in the DataFrame that are not part of the schema
    schema_names = set(schema.fieldNames())
    passthrough_cols = [col(c) for c in df.columns if c not in schema_names]
    return df.select(*new_cols, *passthrough_cols)

def filter_and_transform_transactions(df, schema=None):
    """
    Filters and transforms the transactions DataFrame.
    Removes test clients, creates new transaction status columns and, when a
    schema is supplied, fixes the data types.

    Args:
        df (DataFrame): Input DataFrame with raw transaction data. Must contain
            a ``multi_client_name`` column.
        schema (StructType, optional): Schema defining the expected data types of
            the columns. When omitted (None) the dtype-fixing step is skipped.

    Returns:
        DataFrame: Transformed DataFrame with filtered and processed transactions.
    """
    # NOTE(review): under SQL three-valued logic `~isin(...)` is NULL for rows
    # whose multi_client_name is NULL, so those rows are dropped as well;
    # confirm that is intended.
    df = df.filter(~col("multi_client_name").isin(TEST_CLIENTS))
    df = create_conversions_columns(df)

    # fixing_dtypes reads schema.fields, so it cannot run without a schema.
    if schema is not None:
        df = fixing_dtypes(df, schema)
    return df