In [2]:
# Step 1: Define the Pipeline Stages

# Every professional analysis has these stages:

# Load data

# Clean data

# Feature engineering

# Validation

# Analysis

# Output (CSV / report)

In [3]:
# Step 2: Load Function

import pandas as pd

# Relative path of the CSV that the usage cell passes to run_pipeline.
file_path = 'Datasets/retail_sales_dataset.csv'
def load_data(file_path):
    """Read the CSV file at ``file_path`` into a pandas DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; handed unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The file contents parsed with ``read_csv`` defaults.
    """
    raw_frame = pd.read_csv(file_path)
    return raw_frame

In [4]:
# Step 3: Cleaning Function

def clean_data(dataset):
    """Return a cleaned copy of ``dataset`` with parsed dates and no missing rows.

    The 'Date' column is converted with ``pandas.to_datetime`` and every row
    containing at least one missing value is dropped. The input frame is not
    modified, so re-running the cell that calls this function is safe.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame that contains a 'Date' column.

    Returns
    -------
    pandas.DataFrame
        A new frame with 'Date' as datetimes and incomplete rows removed.

    Raises
    ------
    KeyError
        If ``dataset`` has no 'Date' column.
    Exception
        Whatever ``pandas.to_datetime`` raises for values it cannot parse.
    """
    # ``assign`` builds a new frame instead of writing into the caller's
    # DataFrame, so the raw data keeps its original 'Date' column.
    parsed = dataset.assign(Date=pd.to_datetime(dataset['Date']))
    return parsed.dropna()

In [5]:
# Step 4: Feature Engineering Function

def add_features(df):
    """Return a copy of ``df`` with 'year' and 'month' columns derived from 'Date'.

    The input frame is not modified; the new columns exist only on the
    returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose 'Date' column has a datetime dtype (the ``.dt`` accessor
        is used on it).

    Returns
    -------
    pandas.DataFrame
        A new frame with the calendar year in 'year' and the month number in
        'month', appended after the existing columns (or replacing columns of
        the same name if they already exist).

    Raises
    ------
    KeyError
        If ``df`` has no 'Date' column.
    AttributeError
        If 'Date' is not datetime-like, because ``.dt`` is then unavailable.
    """
    date_parts = df['Date'].dt
    # ``assign`` returns a new frame rather than adding columns to the
    # caller's DataFrame in place.
    return df.assign(year=date_parts.year, month=date_parts.month)

In [11]:
# Step 5: Validation Function

def validate_data(df):
    """Check basic sanity invariants of the sales data, raising on violation.

    Invariants checked, in order:

    * the minimum of 'Quantity', 'Price per Unit' and 'Total Amount' is
      strictly greater than zero;
    * 'Transaction ID' contains no duplicate values.

    The checks raise ``AssertionError`` explicitly instead of using ``assert``
    statements, so they still run when Python is started with ``-O`` and the
    error message names the failing column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four columns listed above.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If any invariant does not hold.
    KeyError
        If one of the required columns is missing.
    """
    for column in ('Quantity', 'Price per Unit', 'Total Amount'):
        smallest = df[column].min()
        # ``not (x > 0)`` rather than ``x <= 0`` so that a NaN minimum (for
        # example from an empty column) is rejected as well.
        if not (smallest > 0):
            raise AssertionError(
                f"'{column}' must be strictly positive; minimum found: {smallest}"
            )

    if not df['Transaction ID'].is_unique:
        raise AssertionError("'Transaction ID' contains duplicate values")

In [13]:
# Step 6: Analysis Function

def analyze_data(df):
    """Summarise the sales data as a small dict of headline figures.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Total Amount' and 'Product Category' columns.

    Returns
    -------
    dict
        ``'total_revenue'``: sum of the 'Total Amount' column.
        ``'top_category'``: the 'Product Category' value whose summed
        'Total Amount' is largest.
    """
    summary = {}
    summary['total_revenue'] = df['Total Amount'].sum()

    # Revenue per category, indexed by the category label.
    revenue_by_category = df.groupby('Product Category')['Total Amount'].sum()
    summary['top_category'] = revenue_by_category.idxmax()

    return summary

In [15]:
# Step 7: Pipeline Runner

def run_pipeline(file_path):
    """Run every pipeline stage on the CSV at ``file_path`` and return the summary.

    Stages, in order: ``load_data``, ``clean_data``, ``add_features``,
    ``validate_data`` and ``analyze_data``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file to process.

    Returns
    -------
    dict
        Whatever ``analyze_data`` returns for the prepared frame.
    """
    prepared = (
        load_data(file_path)
        .pipe(clean_data)
        .pipe(add_features)
    )
    # Called only for its checks; its return value is not used.
    validate_data(prepared)
    return analyze_data(prepared)

In [17]:
# Usage:

# Run the whole pipeline on the configured CSV and keep the returned summary.
results = run_pipeline(file_path)
# Bare last expression so the notebook displays the summary as the cell output.
results

{'total_revenue': 456000, 'top_category': 'Electronics'}