In [0]:
%run ../jobs/data_quality

In [0]:
import unittest
import pytest
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, IntegerType, StringType, StructField

class TestDataQuality(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.spark = SparkSession.builder.appName("test-reader").getOrCreate()
        cls.dq = DataQuality(cls.spark)
    
    def test_standardize_column_names(self):
        input_df = self.spark.createDataFrame([("O1", 100.0)], ["Order Id", "profit-margin"])
        result_df = self.dq.standardize_column_names(input_df)
        result_df_cols = result_df.columns
        expected_columns = ["order_id", "profit_margin"]

        self.assertEqual(result_df_cols, expected_columns)
    
    # ---------- POSITIVE CASES ----------
    def test_columns_with_spaces(self):
        df = self.spark.createDataFrame([(1, "Alice")], ["Order ID", "Customer Name"])
        result = self.dq.standardize_column_names(df)
        assert result.columns == ["order_id", "customer_name"]
    
    def test_columns_with_hyphens(self):
        df = self.spark.createDataFrame([(1, "Alice")], ["order-id", "ship-date"])
        result = self.dq.standardize_column_names(df)
        assert result.columns == ["order_id", "ship_date"]

    def test_columns_with_spaces_and_hyphens(self):
        df = self.spark.createDataFrame([(1, "Alice","24/08/2025")], ["Order-ID", "Customer Name", "Ship-Date"])
        result = self.dq.standardize_column_names(df)
        assert result.columns == ["order_id", "customer_name", "ship_date"]

    def test_columns_already_clean(self):
        df = self.spark.createDataFrame([(1, "X1-212")], ["order_id", "customer_id"])
        result = self.dq.standardize_column_names(df)
        assert result.columns == ["order_id", "customer_id"]

    # ---------- EDGE CASES ----------
    def test_empty_dataframe(self):
        schema = StructType([])
        df = self.spark.createDataFrame([], schema=schema)
        result = self.dq.standardize_column_names(df)
        assert result.columns == []

    def test_multiple_consecutive_spaces_and_hyphens(self):
        df = self.spark.createDataFrame([(1, "Alice")], ["Order  ID", "Ship--Date"])
        result = self.dq.standardize_column_names(df)
        assert result.columns == ["order__id", "ship__date"]

    def test_leading_trailing_spaces_and_hyphens(self):
        df = self.spark.createDataFrame([(1, "Alice")], [" Order ", "-Customer-"])
        result = self.dq.standardize_column_names(df)
        assert result.columns == ["_order_", "_customer_"]

    def test_duplicate_columns_after_transform(self):
        df = self.spark.createDataFrame([(1, "Alice")], ["Order-ID", "Order ID"])
        with pytest.raises(Exception):  # duplicate after transform ("order_id")
            sel.dq.standardize_column_names(df)

    #==========================

    def test_filter_not_null(self):
        data = [
            (1, "A", 100),
            (2, None, 200),
            (3, "B", None),
            (4, "C", 300)
        ]
        df = spark.createDataFrame(data, ["id", "name", "salary"])

        # Filter out rows where name or salary is null
        df_filtered = self.dq.filter_not_null(df, ["name", "salary"])
        self.assertEqual(df_filtered.count(), 2)

    def test_filter_single_column_not_null(self):
        data = [(1, "Alice"), (2, None), (3, "Charlie")]
        df = self.spark.createDataFrame(data, ["id", "name"])
        result = self.dq.filter_not_null( df, ["name"])
        assert result.count() == 2
        assert [r["id"] for r in result.collect()] == [1, 3]

    def test_filter_multiple_columns_not_null(self):
        data = [(1, "Alice", 100), (2, None, 200), (3, "Bob", None)]
        df = self.spark.createDataFrame(data, ["id", "name", "salary"])
        result = self.dq.filter_not_null(df, ["name", "salary"])
        assert result.count() == 1
        assert result.collect()[0]["id"] == 1

    def test_no_nulls_present(self):
        data = [(1, "Alice", 100), (2, "Bob", 200)]
        df = self.spark.createDataFrame(data, ["id", "name", "salary"])
        result = self.dq.filter_not_null(df, ["name", "salary"])
        assert result.count() == 2

    def test_empty_dataframe(self):
        schema = "id INT, name STRING"
        df = self.spark.createDataFrame([], schema=schema)
        result = self.dq.filter_not_null(df, ["name"])
        assert result.count() == 0
    
    def test_all_rows_null_in_filtered_column(self):
        data = [(1, None), (2, None)]
        schema = StructType([
        StructField("id", IntegerType(), True),
        StructField("name", StringType(), True)
        ])
        df = self.spark.createDataFrame(data, schema=schema)
        result = self.dq.filter_not_null( df, ["name"])
        assert result.count() == 0

    def test_filter_with_no_columns_passed(self):
        data = [(1, "Alice"), (2, None)]
        df = self.spark.createDataFrame(data, ["id", "name"])
        result = self.dq.filter_not_null(df, [])
        # No filter applied → return same count
        assert result.count() == 2

    def test_some_columns_not_in_dataframe(self):
        data = [(1, "Alice", 100)]
        df = self.spark.createDataFrame(data, ["id", "name", "salary"])
        with pytest.raises(Exception):  # should fail because col doesn't exist
            self.dq.filter_not_null(df, ["missing_col"])


    # ---------- NEGATIVE CASES ----------
    def test_none_as_dataframe(self):
        with pytest.raises(AttributeError):
            self.dq.filter_not_null( None, ["col1"])

    def test_invalid_column_list_type(self):
        data = [(1, "Alice")]
        df = self.spark.createDataFrame(data, ["id", "name"])
        with pytest.raises(TypeError):
            self.dq.filter_not_null(df, "name")  # string instead of list

    def test_invalid_column_name_type(self):
        data = [(1, "Alice")]
        df = self.spark.createDataFrame(data, ["id", "name"])
        with pytest.raises(Exception):  # int is not a valid col name
            self.dq.filter_not_null( df, [123])


    def test_filter_invalid_contact(self):
        data = [
        (1, "123-456-7890"),(2, "(123) 456-7890"),(3, "+1 123 456 7890"),
        (4, "1234567890"),(5, "123-45-6789"), (6, "12-3456-7890"),(7, "phone123456")]
        
        df = self.spark.createDataFrame(data, ["id", "phone"])

        valid_contact_df = self.dq.filter_valid_contact(df, "phone")
        self.assertEqual(valid_contact_df.count(), 4)

    # --- positive test cases
    def test_valid_us_numbers(self):
        data = [("123-456-7890",), ("(123) 456-7890",), ("+1 123 456 7890",)]
        df = self.spark.createDataFrame(data, ["phone"])
        result = self.dq.filter_valid_contact(df, "phone")
        # all are valid → should return all 3
        assert result.count() == 3
        assert set(r["phone"] for r in result.collect()) == {"123-456-7890", "(123) 456-7890", "+1 123 456 7890"}

    
    def test_valid_numbers_excluded(self):
        data = [("12345",), ("abc-def-ghij",), ("999999",)]
        df = self.spark.createDataFrame(data, ["phone"])
        result = self.dq.filter_valid_contact(df, "phone")
        # none are valid → expect 0 rows
        assert result.count() == 0


    def test_mixed_valid_and_invalid(self):
        data = [("123-456-7890",), ("999999",), ("(123) 456-7890",), ("abc",)]
        df = self.spark.createDataFrame(data, ["phone"])
        result = self.dq.filter_valid_contact(df, "phone")
        assert set(result.toPandas()["phone"]) == {"123-456-7890", "(123) 456-7890"}

    #-----Edge cases------------------------
    def test_empty_dataframe_with_schema(self):
        schema = StructType([StructField("phone", StringType(), True)])
        df = self.spark.createDataFrame([], schema)
        result = self.dq.filter_valid_contact(df, "phone")
        assert result.count() == 0


    def test_all_null_values(self):
        data = [(None,), (None,)]
        schema = StructType([StructField("phone", StringType(), True)])
        df = self.spark.createDataFrame(data, schema=schema)
        result = self.dq.filter_valid_contact(df, "phone")
        # nulls are not valid → expect 0
        assert result.count() == 0


    def test_all_valid_numbers(self):
        data = [("(555) 555-5555",), ("555.555.5555",)]
        df = self.spark.createDataFrame(data, ["phone"])
        result = self.dq.filter_valid_contact(df, "phone")
        assert result.count() == 2

    #-------------negative test cases
    def test_non_existent_column(self):
        data = [("123-456-7890",)]
        df = self.spark.createDataFrame(data, ["phone"])
        with pytest.raises(Exception):  # Spark AnalysisException
            self.dq.filter_valid_contact(df, "wrong_col")


    def test_invalid_column_type_integer(self):
        data = [(1234567890,), (9876543210,)]
        df = self.spark.createDataFrame(data, ["phone"])
        with pytest.raises(Exception):  # rlike requires string type
            self.dq.filter_valid_contact(df, "phone")


    def test_invalid_col_argument_type(self):
        data = [("123-456-7890",)]
        df = spark.createDataFrame(data, ["phone"])
        with pytest.raises(TypeError):
            self.dq.filter_valid_contact(df, 123)  # col must be str

    ### Positive cases ###
    def test_clean_valid_names(self):
        data = [("John123",), ("A@nna!!",), ("R0bert$",)]
        df = self.spark.createDataFrame(data, ["name"])
        result_df = self.dq.clean_name_column(df, "name")
        result = [row["name"] for row in result_df.collect()]
        assert result == ["John", "Anna", "Rbert"]

    def test_clean_with_spaces(self):
        data = [(" Jo hn ",), (" Ma ry ",)]
        df = self.spark.createDataFrame(data, ["name"])
        result_df = self.dq.clean_name_column(df, "name")
        result = [row["name"] for row in result_df.collect()]
        assert result == ["Jo hn", "Ma ry"]


    ### Edge cases ###
    def test_empty_string(self):
        data = [("",)]
        df = self.spark.createDataFrame(data, ["name"])
        result_df = self.dq.clean_name_column(df, "name")
        result = [row["name"] for row in result_df.collect()]
        assert result == [""]

    def test_only_special_chars(self):
        data = [("!@#$%",)]
        df = self.spark.createDataFrame(data, ["name"])
        result_df = self.dq.clean_name_column(df, "name")
        result = [row["name"] for row in result_df.collect()]
        assert result == [""]

    def test_only_numbers(self):
        data = [("12345",)]
        df = self.spark.createDataFrame(data, ["name"])
        result_df = self.dq.clean_name_column(df, "name")
        result = [row["name"] for row in result_df.collect()]
        assert result == [""]

    def test_null_values(self):
        data = [(None,), ("Valid123",)]
        df = self.spark.createDataFrame(data, ["name"])
        result_df = self.dq.clean_name_column(df, "name")
        result = [row["name"] for row in result_df.collect()]
        assert result == [None, "Valid"]


    ### Negative cases ###
    def test_invalid_dataframe(self):
        with pytest.raises(ValueError):
            self.dq.clean_name_column(None, "name")

    def test_missing_column(self):
        data = [("John",)]
        df = self.spark.createDataFrame(data, ["first_name"])
        with pytest.raises(ValueError):
            self.dq.clean_name_column(df, "name")

    def test_non_dataframe_input(self):
        with pytest.raises(ValueError):
            self.dq.clean_name_column("Not a DF", "name")

    #------------------Test cases for valid orders---------------
    ### Positive cases ###
    def test_valid_orders_filtered(self):
        data = [
            (1, date(2023, 1, 1), date(2023, 1, 5)),  # valid
            (2, date(2023, 2, 10), date(2023, 2, 5)), # invalid
            (3, date(2023, 3, 1), date(2023, 3, 1)),  # equal
            (4, date(2023, 4, 1), date(2023, 4, 3))   # valid
        ]
        df = self.spark.createDataFrame(data, ["order_id", "order_date", "ship_date"])
        result_df = self.dq.filter_valid_orders(df, "order_date", "ship_date")
        result = [row["order_id"] for row in result_df.collect()]
        assert set(result) == {1, 4}


    def test_all_invalid(self):
        data = [
            (1, date(2023, 1, 10), date(2023, 1, 5)),
            (2, date(2023, 2, 5), date(2023, 2, 5))
        ]
        df = self.spark.createDataFrame(data, ["order_id", "order_date", "ship_date"])
        result_df = self.dq.filter_valid_orders(df, "order_date", "ship_date")
        assert result_df.count() == 0


    def test_all_valid(self):
        data = [
            (1, date(2023, 1, 1), date(2023, 1, 2)),
            (2, date(2023, 2, 1), date(2023, 2, 3))
        ]
        df = self.spark.createDataFrame(data, ["order_id", "order_date", "ship_date"])
        result_df = self.dq.filter_valid_orders(df, "order_date", "ship_date")
        assert result_df.count() == 2


    ### Edge cases ###
    def test_empty_dataframe(self):
        df = self.spark.createDataFrame([], "order_id INT, order_date DATE, ship_date DATE")
        result_df = self.dq.filter_valid_orders(df, "order_date", "ship_date")
        assert result_df.count() == 0


    def test_null_values(self):
        data = [
            (1, None, date(2023, 1, 5)),  # order_date null
            (2, date(2023, 2, 1), None),  # ship_date null
            (3, None, None)               # both null
        ]
        df = self.spark.createDataFrame(data, ["order_id", "order_date", "ship_date"])
        result_df = self.dq.filter_valid_orders(df, "order_date", "ship_date")
        assert result_df.count() == 0   # null comparisons always False


    ### Negative cases ###
    def test_invalid_dataframe(self):
        with pytest.raises(ValueError):
            self.dq.filter_valid_orders(None, "order_date", "ship_date")


    def test_missing_order_column(self):
        data = [(1, date(2023, 1, 1))]
        df = spark.createDataFrame(data, ["ship_date"])
        with pytest.raises(ValueError):
            self.dq.filter_valid_orders(df, "order_date", "ship_date")


    def test_missing_ship_column(self):
        data = [(1, date(2023, 1, 1))]
        df = self.spark.createDataFrame(data, ["order_date"])
        with pytest.raises(ValueError):
            self.dq.filter_valid_orders(df, "order_date", "ship_date")


    def test_non_dataframe_input(self):
        with pytest.raises(ValueError):
            self.dq.filter_valid_orders("not_a_dataframe", "order_date", "ship_date")

    # ---------------Test cases for filtering columns with positive values
    ### Positive cases ###
    def test_filter_positive_numeric(self):
        data = [(1, 100.0), (2, -50.0), (3, 0.0), (4, 25.5)]
        df = self.spark.createDataFrame(data, ["id", "price"])
        result_df = self.dq.filter_positive_values(df, ["price"])
        result = [row["id"] for row in result_df.collect()]
        assert set(result) == {1, 4}

    def test_filter_positive_integer(self):
        data = [(1, 5), (2, -3), (3, 0), (4, 10)]
        df = self.spark.createDataFrame(data, ["id", "quantity"])
        result_df = self.dq.filter_positive_values(df, ["quantity"])
        result = [row["id"] for row in result_df.collect()]
        assert set(result) == {1, 4}

    def test_all_positive(self):
        data = [(1, 10), (2, 20)]
        df = self.spark.createDataFrame(data, ["id", "score"])
        result_df = self.dq.filter_positive_values(df, ["score"])
        assert result_df.count() == 2

    def test_all_non_positive(self):
        data = [(1, -10), (2, 0)]
        df = self.spark.createDataFrame(data, ["id", "score"])
        result_df = self.dq.filter_positive_values(df, ["score"])
        assert result_df.count() == 0


    ### Edge cases ###
    def test_empty_dataframe(self):
        df = self.spark.createDataFrame([], "id INT, value INT")
        result_df = self.dq.filter_positive_values(df, ["value"])
        assert result_df.count() == 0

    def test_null_values(self):
        data = [(1, None), (2, -5), (3, 10)]
        df = self.spark.createDataFrame(data, ["id", "value"])
        result_df = self.dq.filter_positive_values(df, ["value"])
        result = [row["id"] for row in result_df.collect()]
        assert result == [3]


    ### Negative cases ###
    def test_invalid_dataframe(self):
        with pytest.raises(ValueError):
            self.dq.filter_positive_values(None, ["value"])

    def test_missing_column(self):
        data = [(1, 10)]
        df = self.spark.createDataFrame(data, ["id"])
        with pytest.raises(ValueError):
            self.dq.filter_positive_values(df, "value")

    def test_non_numeric_column(self):
        data = [(1, "abc"), (2, "xyz")]
        df = self.spark.createDataFrame(data, ["id", "name"])
        with pytest.raises(ValueError):
            self.dq.filter_positive_values(df, ["name"])

    def test_non_dataframe_input(self):
        with pytest.raises(ValueError):
            self.dq.filter_positive_values("not_a_dataframe", ["value"])



if __name__ == "__main__":
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

  self._sock = None
  self._sock = None
  self._sock = None
  self._sock = None
  self._sock = None
  self._sock = None
  self._sock = None
.........

Error while filtering positive values for columns ['value']: Invalid DataFrame provided.
Error while filtering positive values for columns value: Parameter 'col_names' must be a non-empty list.


..

Error while filtering records: Missing columns in DataFrame: ['order_date']
Error while filtering records: Missing columns in DataFrame: ['ship_date']


  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pyarrow.__version__) < LooseVersion(minimum_pyarrow_version):
  self._sock = None
  if LooseVersion(pa.__version__) >= LooseVersion("13.0.0"):
.......

Error while filtering positive values for columns ['value']: Invalid DataFrame provided.
Error while filtering positive values for columns ['name']: Column 'name' must be numeric, found type 'string'.


  self._sock = None
  self._sock = None
  self._sock = None
  self._sock = None
  self._sock = None
.
----------------------------------------------------------------------
Ran 48 tests in 15.108s

OK
