In [2]:
# Step 1: Recommended Project Structure

# Create this structure (VERY IMPORTANT):

In [3]:
# retail_analysis/
# │
# ├── data/
# │   ├── raw/
# │   │   └── retail_sales.csv
# │   └── processed/
# │       └── retail_sales_clean.csv
# │
# ├── src/
# │   ├── load.py
# │   ├── clean.py
# │   ├── features.py
# │   ├── validate.py
# │   ├── analyze.py
# │   └── pipeline.py
# │
# ├── output/
# │   ├── reports/
# │   └── figures/
# │
# ├── notebooks/
# │   └── exploration.ipynb
# │
# └── README.md

In [4]:
# Step 2: File Paths (Avoid Hardcoding)
from pathlib import Path

import pandas as pd

# Bad: the folder name is baked into the string passed to read_csv.
df = pd.read_csv("Datasets/retail_sales_dataset.csv")

# Good: keep the data folder in a single constant and build file paths from it.
DATA_DIR = Path("Datasets")
dataset = pd.read_csv(DATA_DIR / "retail_sales_dataset.csv")

In [9]:
# Step 3: Convert Notebook Code into Scripts

# load.py
def load_data(path):
    """Read the CSV at ``path`` with ``pd.read_csv`` and return the resulting DataFrame."""
    frame = pd.read_csv(path)
    return frame

In [11]:
# clean.py
def clean_data(df):
    """Parse the 'Date' column as datetimes and drop rows with missing values.

    Works on the frame passed in as ``df`` (not on any notebook-level
    variable) and leaves that frame untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Date' column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame with 'Date' converted and every row containing a
        missing value removed.
    """
    # assign() builds a new frame, so the caller's input is not mutated.
    parsed = df.assign(Date=pd.to_datetime(df['Date']))
    return parsed.dropna()

In [13]:
# features.py
def add_features(df):
    """Add 'year' and 'month' columns derived from the 'Date' column.

    Works on the frame passed in as ``df`` (not on any notebook-level
    variable) and leaves that frame untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose 'Date' column supports the ``.dt`` accessor
        (i.e. has already been converted to datetimes).

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra 'year' and 'month' columns appended.
    """
    dates = df['Date']
    # assign() returns a new frame, so the caller's input is not mutated.
    return df.assign(year=dates.dt.year, month=dates.dt.month)

In [15]:
# validate.py
def validate_data(dataset):
    """Check basic invariants of the sales data.

    The minimum of 'Quantity', 'Price per Unit' and 'Total Amount' must be
    greater than zero, and 'Transaction ID' must contain no duplicates.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the four columns named above.

    Raises
    ------
    AssertionError
        If any check fails. The error is raised explicitly rather than via
        ``assert`` so the checks still run under ``python -O``, which strips
        assert statements.
    """
    for column in ('Quantity', 'Price per Unit', 'Total Amount'):
        lowest = dataset[column].min()
        # `not (x > 0)` also rejects a NaN minimum (e.g. an empty column).
        if not (lowest > 0):
            raise AssertionError(
                f"'{column}' must be greater than 0; minimum found: {lowest!r}"
            )
    if not dataset['Transaction ID'].is_unique:
        raise AssertionError("'Transaction ID' contains duplicate values")

In [17]:
# analyze.py
def analyze_data(dataset):
    """Summarise the sales data.

    Returns a dict with two entries:
      - "total_revenue": the sum of the 'Total Amount' column
      - "top_category": the 'Product Category' whose summed 'Total Amount'
        is the largest
    """
    revenue = dataset['Total Amount'].sum()
    revenue_by_category = dataset.groupby('Product Category')['Total Amount'].sum()
    best_category = revenue_by_category.idxmax()
    return {"total_revenue": revenue, "top_category": best_category}

In [None]:
# pipeline.py
from load import load_data
from clean import clean_data
from features import add_features
from validate import validate_data
from analyze import analyze_data

def run_pipeline(path):
    """Run every stage on the file at ``path`` and return the analysis result.

    Stages, in order: load_data -> clean_data -> add_features ->
    validate_data -> analyze_data. The value returned is whatever
    analyze_data returns for the validated frame.
    """
    raw = load_data(path)
    cleaned = clean_data(raw)
    featured = add_features(cleaned)
    validate_data(featured)  # its return value is not used
    return analyze_data(featured)

Running the pipeline above as written fails with an import error: Python cannot find the modules because the file is inside a
folder that isn’t defined as a package or included in the import path.

In [22]:
# Step 3: Run the script from PROJECT ROOT (IMPORTANT)

# You must be here:
# retail_analysis/


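# Then run:
# python -m src.pipeline
# (For this to work, pipeline.py must import its siblings through the package,
#  e.g. `from src.load import load_data` instead of `from load import load_data`.)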


# Don’t run it from inside src/
# Don’t open pipeline.py and hit run randomly

In [None]:
# Step 4: Save Outputs Properly

from pathlib import Path

# Build the output location from path parts instead of one hardcoded string.
PROCESSED_DIR = Path("data") / "processed"

# to_csv does not create missing folders, so make sure the target exists first.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# NOTE(review): `df` is the frame read from the CSV in Step 2 and has not been
# passed through clean_data() — confirm whether the cleaned frame was intended.
df.to_csv(PROCESSED_DIR / "retail_sales_clean.csv", index=False)