In [27]:
# Import required libraries
import pandas as pd
import numpy as np

In [29]:
def generate_dummy_data(num_rows=100, seed=42):
    """
    Generates dummy transaction data.

    Parameters
    ----------
    num_rows : int, default 100
        Number of transactions (rows) to generate.
    seed : int or None, default 42
        Seed for the function's private random generator. The default
        yields the same data as seeding NumPy's global generator with 42;
        pass None for data that differs on every call.

    Returns
    -------
    pandas.DataFrame
        One row per transaction with the columns ``transaction_id``
        (1..num_rows), ``customer_id`` (integers in [1000, 2000)),
        ``transaction_date`` (2023-01-01 plus 0-364 days), ``amount``
        (floats in [10, 1000)), ``product_category`` and ``location``.
    """
    # A private RandomState produces the same stream as np.random.seed(seed)
    # followed by np.random.* calls, but leaves the global RNG state alone,
    # so calling this function does not affect randomness elsewhere.
    rng = np.random.RandomState(seed)
    # The draw order below determines the generated values; keep it stable.
    data = {
        'transaction_id': np.arange(1, num_rows + 1),
        'customer_id': rng.randint(1000, 2000, num_rows),
        'transaction_date': pd.to_datetime('2023-01-01') + pd.to_timedelta(
            rng.randint(0, 365, num_rows), unit='D'),
        'amount': rng.uniform(10, 1000, num_rows),
        'product_category': rng.choice(['Electronics', 'Clothing', 'Home Goods', 'Books'], num_rows),
        'location': rng.choice(['New York', 'Los Angeles', 'Chicago', 'Houston'], num_rows)
    }
    return pd.DataFrame(data)

In [31]:
def data_cleaning(df):
    """
    Cleans the dataset by discarding unusable rows.

    A row is kept only if none of its fields is null and its ``amount`` is
    strictly positive. The result is an independent copy that keeps the
    original index labels of the surviving rows.
    """
    complete_rows = df.dropna()
    has_positive_amount = complete_rows['amount'] > 0
    # Copy so later column assignments act on a standalone frame, not a slice
    return complete_rows[has_positive_amount].copy()

In [33]:
def data_transformation(df):
    """
    Adds a column for the transaction year.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-like ``transaction_date`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with every column of ``df`` plus ``transaction_year``
        (the calendar year of ``transaction_date``). The input frame is not
        modified, so re-running the step always starts from the same data.

    Raises
    ------
    KeyError
        If ``transaction_date`` is missing.
    AttributeError
        If ``transaction_date`` is not datetime-like (no ``.dt`` accessor).
    """
    # assign() builds a new frame rather than writing into the caller's object
    return df.assign(transaction_year=df['transaction_date'].dt.year)

In [35]:
def data_aggregation(df):
    """
    Sums ``amount`` within each ``product_category``.

    Returns a two-column DataFrame (``product_category``, ``amount``) with
    one row per category and a fresh default integer index.
    """
    amounts_by_category = df.groupby('product_category')['amount']
    category_totals = amounts_by_category.sum()
    # Move the category labels out of the index and back into a column
    return category_totals.reset_index()

In [37]:
# Stage 1 - build the synthetic transaction table (100 rows)
dummy_data = generate_dummy_data(100)
print(" Step 1: Dummy Data", dummy_data.head(), sep="\n")

# Stage 2 - run the cleaning step on the generated table
cleaned_data = data_cleaning(dummy_data)
print("\n Step 2: Cleaned Data", cleaned_data.head(), sep="\n")

# Stage 3 - derive the transaction_year column
transformed_data = data_transformation(cleaned_data)
print("\n Step 3: Transformed Data", transformed_data.head(), sep="\n")

# Stage 4 - total amount per product category (printed in full, not just head)
aggregated_data = data_aggregation(transformed_data)
print("\n Step 4: Aggregated Data", aggregated_data, sep="\n")

 Step 1: Dummy Data
   transaction_id  customer_id transaction_date      amount product_category  \
0               1         1102       2023-02-13  331.101571       Home Goods   
1               2         1435       2023-06-11  130.867075      Electronics   
2               3         1860       2023-07-21  362.734860            Books   
3               4         1270       2023-09-27  907.760157      Electronics   
4               5         1106       2023-12-17  279.410927            Books   

      location  
0  Los Angeles  
1      Chicago  
2  Los Angeles  
3      Chicago  
4  Los Angeles  

 Step 2: Cleaned Data
   transaction_id  customer_id transaction_date      amount product_category  \
0               1         1102       2023-02-13  331.101571       Home Goods   
1               2         1435       2023-06-11  130.867075      Electronics   
2               3         1860       2023-07-21  362.734860            Books   
3               4         1270       2023-09-27  907.7