In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import pyarrow.parquet as pq
import param
from typing import Tuple
from typing import Callable

def log_time(func):
    """
    Decorator to log the execution time of a function.

    The wrapped callable is invoked with the caller's positional and keyword
    arguments unchanged and its return value is passed through. After a
    successful call, one line of the form
    ``Execution time for <name>: <seconds> seconds`` is printed. Nothing is
    printed when the wrapped callable raises; the exception propagates.

    Parameters:
    func: The callable to time.

    Returns:
    A wrapper that carries the name and docstring of ``func``.
    """
    # Imported locally so the decorator does not depend on a module-level import.
    import functools

    # functools.wraps copies __name__, __doc__, etc. onto the wrapper so
    # decorated functions stay introspectable (help(), tracebacks, test names).
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is a high-resolution clock meant for measuring
        # intervals; time.time() follows the wall clock and can jump when the
        # system time is adjusted.
        start_time = time.perf_counter()

        # Call the target function and keep its result for the caller.
        result = func(*args, **kwargs)

        elapsed = time.perf_counter() - start_time

        # Report the elapsed time along with the function name.
        print(f"Execution time for {func.__name__}: {elapsed:.4f} seconds")

        return result

    return wrapper


In [None]:
class DataConfig(param.Parameterized):
    """
    A class to configure and generate synthetic data.

    The dataset is regenerated automatically whenever one of the configuration
    parameters (random_seed, num_samples, age, height, weight, income,
    expenditure, gender) changes.

    Parameter meanings:
    random_seed: Seed used for every (re)generation of the dataset.
    num_samples: Number of rows to generate.
    age: (minimum, maximum) bounds, both inclusive, for integer ages.
    height, weight, income: (mean, standard deviation) of a normal distribution.
    expenditure: (mean, standard deviation) of a normally distributed factor
        that is multiplied by each row's income to obtain its expenditure.
    gender: The labels from which each row's gender is drawn.
    """
    random_seed: int = param.Integer(default = 111, bounds = (1, None))
    num_samples: int = param.Integer(default=1000, bounds=(1, None))
    age: Tuple[float, float] = param.NumericTuple(default=(18.0, 80.0), length=2)
    height: Tuple[float, float] = param.NumericTuple(default=(165.0, 10.0), length=2)
    weight: Tuple[float, float] = param.NumericTuple(default=(70.0, 15.0), length=2)
    income: Tuple[float, float] = param.NumericTuple(default=(50000.0, 15000.0), length=2)
    expenditure: Tuple[float, float] = param.NumericTuple(default=(0.6, 0.1), length=2)
    # A ListSelector holds a list of the selected labels, not a single string.
    gender: list = param.ListSelector(default=['Male', 'Female', 'Other'], objects=['Male', 'Female', 'Other'])
    # Holds the most recently generated dataset; replaced on every regeneration.
    _data: pd.DataFrame = param.DataFrame(default=pd.DataFrame())

    def __init__(self, **params):
        """Store the given parameter values and generate the initial dataset."""
        super().__init__(**params)
        # Generate once explicitly so that `data` is populated right after construction.
        self._generate()

    @param.depends('random_seed', 'num_samples', 'age', 'height', 'weight', 'income', 'expenditure', 'gender', watch=True)
    def _generate(self, *events):
        """
        Generates the synthetic dataset based on the current parameters.
        Triggered automatically whenever a configuration parameter changes.

        Parameters:
        events: Ignored; accepted so the method can be invoked as a watcher callback.
        """

        print("New Data Generated!!!")

        # Use a private generator seeded with random_seed instead of reseeding
        # NumPy's global generator: the draws are the same legacy-seeded
        # sequence, but unrelated code using np.random is no longer affected.
        rng = np.random.RandomState(self.random_seed)
        sample_count = self.num_samples

        # NOTE: the order of the draws below determines the generated values
        # for a given seed; do not reorder.

        # Ages: integers between age[0] and age[1]; +1 because randint's upper bound is exclusive.
        ages = rng.randint(self.age[0], self.age[1] + 1, sample_count)

        # Heights, weights and incomes: normal distribution with (mean, standard deviation).
        heights = rng.normal(self.height[0], self.height[1], sample_count)
        weights = rng.normal(self.weight[0], self.weight[1], sample_count)
        incomes = rng.normal(self.income[0], self.income[1], sample_count)

        # Expenditure is each income scaled by a normally distributed factor.
        expenditures = incomes * rng.normal(self.expenditure[0], self.expenditure[1], sample_count)

        # Genders: one label per row drawn from the configured list.
        genders = rng.choice(self.gender, sample_count)

        # Store dataset as a pandas DataFrame in self._data
        self._data = pd.DataFrame({
            'Age': ages,
            'Height': heights,
            'Weight': weights,
            'Income': incomes,
            'Expenditure': expenditures,
            'Gender': genders
        })

    @property
    def data(self) -> pd.DataFrame:
        """Returns the generated dataset as a pandas DataFrame."""
        return self._data

    def __repr__(self):
        """Return the string form of this instance's ``param`` namespace object."""
        return str(self.param)

In [None]:
class AdvancedDataAnalyzer:
    """
    Statistics, grouping, filtering, plotting and export helpers for the
    dataset held by a DataConfig.

    Every method reads the dataset from the config at call time, so results
    reflect the config's current data even after it has been regenerated.
    """

    def __init__(self, config: DataConfig):
        """
        Initialize the AdvancedDataAnalyzer with a DataConfig instance, and initialize a statistics dictionary.
        Parameters:
        config: (DataConfig): An instance of the DataConfig class.
        """
        #Store the passed config object
        self.config = config

        #Initialize an empty dictionary to store calculated statistics
        self._statistics = {}

    @property
    def data(self):
        """Returns the config's current dataset as a pandas DataFrame."""
        return self.config.data

    @property
    def _data(self):
        """Alias of `data`, kept for code that still reads the private attribute."""
        # Deliberately not a snapshot: the config replaces its DataFrame on
        # every regeneration, and a cached reference would go stale.
        return self.config.data

    def _require_column(self, column: str) -> None:
        """Raise ValueError if `column` is not a column of the current dataset."""
        if column not in self.data.columns:
            raise ValueError(f"Column '{column}' does not exist in the data.")

    #Use log_time decorator
    @log_time
    def calculate_statistics(self) -> dict:
        """
        Calculate summary statistics over the numeric columns of the dataset.

        Returns:
        The internal statistics dictionary with the keys 'mean', 'median',
        'variance' and 'correlation_matrix'. The same dictionary object is
        updated and returned on every call.
        """
        frame = self.data

        #Mean
        self._statistics['mean'] = frame.mean(numeric_only=True)

        #Median
        self._statistics['median'] = frame.median(numeric_only=True)

        #Variance
        self._statistics['variance'] = frame.var(numeric_only=True)

        #Correlation Matrix
        self._statistics['correlation_matrix'] = frame.corr(numeric_only=True)

        return self._statistics

    def group_by_column(self, column: str) -> pd.DataFrame:
        """
        Group the data by the specified column and calculate the mean for each group.

        Raises:
        ValueError: If `column` does not exist in the data.
        """
        self._require_column(column)

        # numeric_only=True keeps non-numeric columns (e.g. string labels) out
        # of the mean, matching calculate_statistics; without it recent pandas
        # versions raise when a non-numeric column is not the grouping key.
        return self.data.groupby(column).mean(numeric_only=True)

    def apply_function(self, column: str, func: Callable) -> pd.Series:
        """
        Apply a custom function to all values in the specified column and return the modified series.

        Raises:
        ValueError: If `column` does not exist in the data.
        """
        self._require_column(column)

        #Use apply() to apply the passed function (func) to the column.
        return self.data[column].apply(func)

    def lazy_filter(self, column: str, condition: Callable):
        """
        Lazily filter rows that meet a condition in the specified column using a generator.

        The column is validated when this method is called; rows are only
        evaluated as the returned generator is consumed.

        Returns:
        A generator yielding each row (as a pandas Series) for which
        `condition(row[column])` is truthy.

        Raises:
        ValueError: If `column` does not exist in the data.
        """
        # Validate here rather than inside the generator body, otherwise the
        # error would only surface on the first iteration.
        self._require_column(column)
        frame = self.data

        def _matching_rows():
            #Iterate through each row of the DataFrame using iterrows().
            for _, row in frame.iterrows():
                if condition(row[column]):
                    yield row

        return _matching_rows()

    def filter_data(self, column: str, condition: Callable) -> pd.DataFrame:
        """
        Filter the data based on a condition applied to a column and return the filtered DataFrame.

        Raises:
        ValueError: If `column` does not exist in the data.
        """
        self._require_column(column)
        frame = self.data

        #Use apply() to build the row mask, then select the matching rows.
        return frame[frame[column].apply(condition)]

    def visualize_relationship(self, column_x: str, column_y: str) -> None:
        """
        Visualize the relationship between two columns using a scatter plot.

        Raises:
        ValueError: If either column does not exist in the data.
        """
        frame = self.data
        if column_x not in frame.columns or column_y not in frame.columns:
            raise ValueError(f"Columns '{column_x}' and/or '{column_y}' do not exist in the data.")

        #Use plt.scatter() to create the scatter plot
        plt.scatter(frame[column_x], frame[column_y])

        #Set the x-label, y-label
        plt.xlabel(column_x)
        plt.ylabel(column_y)

        #Call plt.show() to display the plot.
        plt.show()

    def visualize_distribution(self, column: str) -> None:
        """
        Plot the distribution of a specified column using a histogram.

        Raises:
        ValueError: If `column` does not exist in the data.
        """
        self._require_column(column)

        #Use plt.hist() to plot the histogram.
        plt.hist(self.data[column])

        #Set the x-label, y-label
        plt.xlabel(column)
        plt.ylabel('Frequency')

        #Call plt.show() to display the plot.
        plt.show()

    #Use log_time decorator
    @log_time
    def save_data(self, format: str, file_path: str, **kwargs) -> None:
        """
        Save the DataFrame to disk in the specified format (csv, parquet, or npz).

        Parameters:
        format: One of 'csv', 'parquet' or 'npz'.
        file_path: Destination path WITHOUT extension; the extension matching
            `format` is appended.
        kwargs: Forwarded to DataFrame.to_csv / DataFrame.to_parquet. Not used
            for 'npz'.

        Raises:
        ValueError: If `format` is not one of the supported values.
        """
        frame = self.data

        if format == 'csv':
            file_path += '.csv'
            frame.to_csv(file_path, **kwargs)

        elif format == 'parquet':
            file_path += '.parquet'
            frame.to_parquet(file_path, **kwargs)

        elif format == 'npz':
            file_path += '.npz'
            # The whole frame is stored as one unnamed array, so column names
            # are not written. NOTE(review): with mixed column dtypes
            # to_numpy() yields an object array, which presumably has to be
            # loaded with allow_pickle=True - confirm with consumers.
            np.savez(file_path, frame.to_numpy())

        #If invalid format found
        else:
            raise ValueError("Invalid format. Please use 'csv', 'parquet', or 'npz'.")

        #Print a success message when saving is complete.
        print(f"Data saved successfully to {file_path}.")

In [None]:
import unittest
import pandas as pd
import numpy as np
import pathlib

class TestDataConfig(unittest.TestCase):
    """Unit tests for DataConfig: initial generation, reactive regeneration and seeding."""

    def setUp(self):
        # Initialize DataConfig with default parameters
        self.config = DataConfig()

    def test_data_generation(self):
        # The generated frame has one row per sample and all expected columns.
        data = self.config.data
        self.assertIsInstance(data, pd.DataFrame)
        self.assertEqual(len(data), self.config.num_samples)
        for column in ('Age', 'Height', 'Weight', 'Income', 'Expenditure', 'Gender'):
            self.assertIn(column, data.columns)

    def test_reactive_data_generation(self):
        # Change num_samples and check if data regenerates
        old_data = self.config.data.copy()
        self.config.num_samples = 2000
        new_data = self.config.data
        self.assertNotEqual(len(old_data), len(new_data))
        self.assertEqual(len(new_data), 2000)

    def test_data_randomness(self):
        # Check that data generation respects random_seed.
        self.config.random_seed = 123
        data1 = self.config.data.copy()

        # Assigning the same seed again would not trigger a regeneration, so
        # switch to a different seed first; that forces a real second
        # generation with seed 123 instead of comparing a frame with its copy.
        self.config.random_seed = 456
        other_data = self.config.data.copy()
        self.config.random_seed = 123
        data2 = self.config.data.copy()

        # Same seed reproduces the same data; a different seed does not.
        pd.testing.assert_frame_equal(data1, data2)
        self.assertFalse(data1.equals(other_data))


class TestAdvancedDataAnalyzer(unittest.TestCase):
    """Unit tests for AdvancedDataAnalyzer using a default DataConfig."""

    def setUp(self):
        # Initialize DataConfig and AdvancedDataAnalyzer with default parameters
        self.config = DataConfig()
        self.analyzer = AdvancedDataAnalyzer(self.config)

    def test_calculate_statistics(self):
        # All four statistics are present in the returned dictionary.
        stats = self.analyzer.calculate_statistics()
        self.assertIsInstance(stats, dict)
        for key in ('mean', 'median', 'variance', 'correlation_matrix'):
            self.assertIn(key, stats)

    def test_group_by_column(self):
        # Test if data is grouped correctly by Gender
        grouped_data = self.analyzer.group_by_column('Gender')
        self.assertIsInstance(grouped_data, pd.DataFrame)
        self.assertIn('Age', grouped_data.columns)

    def test_apply_function(self):
        # Test applying a custom function to the Income column
        increased_income = self.analyzer.apply_function('Income', lambda x: x * 1.1)
        self.assertIsInstance(increased_income, pd.Series)
        self.assertTrue(np.allclose(increased_income.values, self.config.data['Income'] * 1.1))

    def test_filter_data(self):
        # Test filtering data where Age > 50
        filtered_data = self.analyzer.filter_data('Age', lambda x: x > 50)
        self.assertTrue((filtered_data['Age'] > 50).all())
        # Compare the row count too, so the check cannot pass vacuously on an
        # empty or truncated result.
        expected_count = int((self.config.data['Age'] > 50).sum())
        self.assertEqual(len(filtered_data), expected_count)

    def test_lazy_filter(self):
        # Test lazy filtering where Age > 50
        filtered_rows = list(self.analyzer.lazy_filter('Age', lambda x: x > 50))
        self.assertTrue(all(row['Age'] > 50 for row in filtered_rows))
        # The generator must yield every matching row, not just matching ones.
        expected_count = int((self.config.data['Age'] > 50).sum())
        self.assertEqual(len(filtered_rows), expected_count)

    def test_save_data(self):
        # Test saving data in CSV format by checking file existence using pathlib
        import tempfile

        # Write into a throw-away directory: nothing in the working directory
        # is deleted or overwritten, and cleanup happens even if an assertion fails.
        with tempfile.TemporaryDirectory() as tmp_dir:
            base_path = pathlib.Path(tmp_dir) / "test_data"

            # save_data appends the '.csv' extension to the given path itself.
            self.analyzer.save_data(format='csv', file_path=str(base_path))

            # Check if the file exists
            self.assertTrue(base_path.with_suffix('.csv').exists())


# Run the tests in Jupyter Notebook
def run_tests():
    """Load both test cases into one suite and run it with verbose output."""
    combined = unittest.TestSuite()
    # Add the cases in a fixed order: DataConfig tests first, analyzer tests second.
    for case in (TestDataConfig, TestAdvancedDataAnalyzer):
        combined.addTests(unittest.TestLoader().loadTestsFromTestCase(case))

    runner = unittest.TextTestRunner(verbosity=2)
    runner.run(combined)

# Execute both test cases as soon as this cell runs; the results are printed
# by the TextTestRunner inside run_tests().
run_tests()

test_data_generation (__main__.TestDataConfig) ... ok
test_data_randomness (__main__.TestDataConfig) ... ok
test_reactive_data_generation (__main__.TestDataConfig) ... ok
test_apply_function (__main__.TestAdvancedDataAnalyzer) ... ok
test_calculate_statistics (__main__.TestAdvancedDataAnalyzer) ... ok
test_filter_data (__main__.TestAdvancedDataAnalyzer) ... ok
test_group_by_column (__main__.TestAdvancedDataAnalyzer) ... 

New Data Generated!!!
New Data Generated!!!
New Data Generated!!!
New Data Generated!!!
New Data Generated!!!
New Data Generated!!!
New Data Generated!!!
Execution time for calculate_statistics: 0.0114 seconds
New Data Generated!!!
New Data Generated!!!


ok
test_lazy_filter (__main__.TestAdvancedDataAnalyzer) ... ok
test_save_data (__main__.TestAdvancedDataAnalyzer) ... ok

----------------------------------------------------------------------
Ran 9 tests in 0.335s

OK


New Data Generated!!!
New Data Generated!!!
Data saved successfully to test_data.csv.
Execution time for save_data: 0.0108 seconds
