# In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from pyspark.sql import functions as F

class DataQuality:
    def __init__(self, spark: SparkSession):
        """
        Store the Spark session on the instance.

        Parameters:
            spark (SparkSession): Session kept as ``self.spark``.
        """
        self.spark = spark

    def standardize_column_names(self, df: DataFrame) -> DataFrame:
        """
        Standardizes column names by replacing spaces and hyphens with underscores
        and converting all names to lowercase.

        Parameters:
            df (DataFrame): Input DataFrame

        Returns:
            DataFrame: DataFrame with standardized column names

        Raises:
            ValueError: If DataFrame is invalid or has no columns
            Exception: For unexpected transformation errors
        """
        try:
            # Validate DataFrame
            if df is None or not isinstance(df, DataFrame):
                raise ValueError("Invalid DataFrame provided.")

            if not df.columns:
                raise ValueError("DataFrame has no columns to standardize.")

            # Standardize names
            new_cols = [col.replace(" ", "_").replace("-", "_").lower() for col in df.columns]
            for old_col, new_col in zip(df.columns, new_cols):
                df = df.withColumnRenamed(old_col, new_col)

            return df

        except ValueError as ve:
            # Re-raise validation errors for clarity
            raise ve
        except Exception as e:
            # Wrap and raise any other unexpected errors
            raise Exception(f"Unexpected error while standardizing column names: {e}") from e
    
    def filter_not_null(self, df: DataFrame, cols: list) -> DataFrame:
        """
        Filter rows where none of the given columns are null.

        Args:
            df (DataFrame): Input Spark DataFrame.
            cols (list): List of column names to check for null values.

        Returns:
            DataFrame: Filtered DataFrame where all given columns are non-null.
        """
        if not isinstance(cols, list):
            raise TypeError(f"'cols' must be a list of column names, got {type(cols).__name__}")
    
        if not all(isinstance(c, str) for c in cols):
            raise TypeError(f"All column names must be strings. Invalid entries: {[c for c in cols if not isinstance(c, str)]}")\

        condition = F.lit(True)
        for col in cols:
            condition = condition & F.col(col).isNotNull()
        return df.filter(condition)

    def filter_valid_contact(self, df: DataFrame, col: str) -> DataFrame:
        """
        Keep only the rows whose value in `col` is a valid US phone number.

        A value is accepted when it consists of an optional "+1" country code
        (optionally followed by one whitespace character), a three-digit area
        code either bare or wrapped in a balanced pair of parentheses, then
        three digits and four digits. Each group may be followed by a single
        whitespace character, "." or "-". Rows where `col` is null never match
        and are therefore dropped.

        Args:
            df (DataFrame): Input Spark DataFrame.
            col (str): Column containing phone numbers; must be a string column.

        Returns:
            DataFrame: Filtered DataFrame with only valid phone numbers.

        Raises:
            TypeError: If `df` is not a DataFrame or `col` is not a string.
            ValueError: If `col` is missing from `df` or is not a string column.
            RuntimeError: For any other unexpected error while building the filter.
        """
        try:
            # --- Input validation ---
            if not isinstance(df, DataFrame):
                raise TypeError("df must be a pyspark.sql.DataFrame")

            if not isinstance(col, str):
                raise TypeError("col must be a string")

            if col not in df.columns:
                raise ValueError(f"Column '{col}' not found in DataFrame")

            field_type = dict(df.dtypes)[col]
            if field_type != "string":
                raise ValueError(f"Column '{col}' must be of type string, found {field_type}")

            # Regex for valid US phone numbers. The area code is an alternation
            # of "(ddd)" and "ddd" so a lone "(" or ")" is rejected, e.g.
            # "(555 123-4567" and "555) 123-4567" do not match.
            valid_phone_regex = r"^(\+1\s?)?(\(\d{3}\)|\d{3})[\s.-]?\d{3}[\s.-]?\d{4}$"

            return df.filter(F.col(col).rlike(valid_phone_regex))
        except (TypeError, ValueError):
            # Validation errors are part of the contract; pass them through unchanged.
            raise
        except Exception as e:
            # Unexpected errors → wrap in RuntimeError
            raise RuntimeError(f"Unexpected error in filter_valid_contact: {str(e)}") from e

    def clean_name_column(self,df: DataFrame, col_name: str = "name") -> DataFrame:
        """
        Removes special characters and numbers from the given column in a PySpark DataFrame.

        Parameters:
        df (DataFrame): Input DataFrame
        col_name (str): Column to clean (overwritten)

        Returns:
        DataFrame: DataFrame with the cleaned column (same name)

        Raises:
        ValueError: If DataFrame or column is invalid
        """
        try:
            # Validate DataFrame
            if df is None or not isinstance(df, DataFrame):
                raise ValueError("Invalid DataFrame provided.")

            # Validate column existence
            if col_name not in df.columns:
                raise ValueError(f"Column '{col_name}' does not exist in the DataFrame.")

            # Overwrite the same column with cleaned values
            cleaned_df = df.withColumn(
                col_name,
                F.trim(F.regexp_replace(F.col(col_name), "[^A-Za-z ]", ""))
            )
            return cleaned_df

        except Exception as e:
            print(f"Error while cleaning column '{col_name}': {str(e)}")
            raise

    def filter_valid_orders(self,df: DataFrame, order_col: str = "order_date", ship_col: str = "ship_date") -> DataFrame:
        """
        Filters records where order_date is less than ship_date.

        Parameters:
        df (DataFrame): Input DataFrame
        order_col (str): Column name for order date
        ship_col (str): Column name for ship date

        Returns:
        DataFrame: Filtered DataFrame with valid records

        Raises:
        ValueError: If DataFrame or required columns are invalid
        """
        try:
            # Validate DataFrame
            if df is None or not isinstance(df, DataFrame):
                raise ValueError("Invalid DataFrame provided.")

            # Validate column existence
            missing_cols = [col for col in [order_col, ship_col] if col not in df.columns]
            if missing_cols:
                raise ValueError(f"Missing columns in DataFrame: {missing_cols}")

            # Filter where order_date < ship_date
            filtered_df = df.filter(F.col(order_col) < F.col(ship_col))
            return filtered_df

        except Exception as e:
            print(f"Error while filtering records: {str(e)}")
            raise

    def filter_positive_values(self, df: DataFrame, col_names: list) -> DataFrame:
        """
        Filters records where the given columns have only positive values.

        Parameters:
        df (DataFrame): Input DataFrame
        col_names (list): List of columns to filter

        Returns:
        DataFrame: Filtered DataFrame with only positive values for the given columns

        Raises:
        ValueError: If DataFrame is invalid, columns missing, or not numeric
        """
        try:
            # Validate DataFrame
            if df is None or not isinstance(df, DataFrame):
                raise ValueError("Invalid DataFrame provided.")

            # Validate column list
            if not isinstance(col_names, list) or not col_names:
                raise ValueError("Parameter 'col_names' must be a non-empty list.")

            # Build filter conditions
            filter_condition = None
            for col_name in col_names:
                # Validate column existence
                if col_name not in df.columns:
                    raise ValueError(f"Column '{col_name}' does not exist in the DataFrame.")

                # Validate numeric type
                col_type = dict(df.dtypes)[col_name]
                numeric_types = ("int", "bigint", "double", "float", "decimal", "long", "smallint")
                if not any(nt in col_type for nt in numeric_types):
                    raise ValueError(f"Column '{col_name}' must be numeric, found type '{col_type}'.")

                # Add positive value condition
                condition = F.col(col_name) > 0
                filter_condition = condition if filter_condition is None else (filter_condition & condition)

            # Apply combined filter
            filtered_df = df.filter(filter_condition)
            return filtered_df

        except Exception as e:
            print(f"Error while filtering positive values for columns {col_names}: {str(e)}")
            raise