In [1]:
import pandas as pd
import json


In [2]:
# Read the cleaned transactions CSV, parsing InvoiceDate into datetimes.
df_clean = pd.read_csv(
    '../data/processed/cleaned_transactions.csv',
    parse_dates=['InvoiceDate'],
)

# Print the frame's dimensions, then preview the first rows as the cell output.
print("Shape:", df_clean.shape)
df_clean.head()


Shape: (333234, 13)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalPrice,Year,Month,DayOfWeek,Hour
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,15.3,2010,12,2,8
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2010,12,2,8
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,22.0,2010,12,2,8
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2010,12,2,8
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2010,12,2,8


In [3]:
# CHECK 1: the dataset must not contain a single null cell in any column.
assert not df_clean.isna().any().any(), "Missing values found!"
print("CHECK 1 PASSED: No missing values")


CHECK 1 PASSED: No missing values


In [4]:
# CHECK 2: every quantity must be strictly positive. The comparison is `> 0`,
# so a zero quantity fails this check as well as a negative one; the failure
# message says so explicitly.
assert (df_clean['Quantity'] > 0).all(), "Non-positive quantities found (zero or negative)!"
print("CHECK 2 PASSED: All quantities are positive")


CHECK 2 PASSED: All quantities are positive


In [5]:
# CHECK 3: every unit price must be strictly greater than zero.
assert df_clean['UnitPrice'].gt(0).all(), "Invalid prices found!"
print("CHECK 3 PASSED: All prices are positive")


CHECK 3 PASSED: All prices are positive


In [6]:
# CHECK 4: CustomerID must be stored as int64 (dtype read from the frame's
# per-column dtypes table).
assert df_clean.dtypes['CustomerID'] == 'int64', "CustomerID is not integer!"
print("CHECK 4 PASSED: CustomerID is integer type")


CHECK 4 PASSED: CustomerID is integer type


In [7]:
# Report the earliest and latest InvoiceDate values present in the data.
print("Date range:")
print(df_clean['InvoiceDate'].min(), df_clean['InvoiceDate'].max(), sep=" to ")


Date range:
2010-12-01 08:26:00 to 2011-12-09 12:50:00


In [8]:
# Re-evaluate every check against the data instead of hard-coding True, so the
# written report stays truthful even if the assertion cells were skipped, run
# out of order, or disabled (e.g. Python started with -O).
# Each result is wrapped in bool() because pandas reductions return numpy
# booleans, which the json module cannot serialize.
validation_checks = {
    "no_missing_values": bool(df_clean.isnull().sum().sum() == 0),
    "all_quantities_positive": bool((df_clean['Quantity'] > 0).all()),
    "all_prices_positive": bool((df_clean['UnitPrice'] > 0).all()),
    "customer_id_is_integer": bool(df_clean['CustomerID'].dtype == 'int64'),
}

# Summary statistics plus the check results; numpy scalars are converted to
# built-in int/float so they are JSON-serializable.
validation_report = {
    "total_rows": len(df_clean),
    "total_columns": len(df_clean.columns),
    "date_range": {
        "start": str(df_clean['InvoiceDate'].min()),
        "end": str(df_clean['InvoiceDate'].max())
    },
    "unique_customers": int(df_clean['CustomerID'].nunique()),
    "unique_products": int(df_clean['StockCode'].nunique()),
    "unique_countries": int(df_clean['Country'].nunique()),
    "total_revenue": float(df_clean['TotalPrice'].sum()),
    # Mean of per-invoice totals: sum TotalPrice within each InvoiceNo first.
    "average_order_value": float(
        df_clean.groupby('InvoiceNo')['TotalPrice'].sum().mean()
    ),
    # Overall verdict is derived from the individual checks, never assumed.
    "validation_passed": all(validation_checks.values()),
    "checks": validation_checks
}

# Explicit encoding keeps the written file identical across platforms.
with open('../data/processed/validation_report.json', 'w', encoding='utf-8') as f:
    json.dump(validation_report, f, indent=4)

# Only claim success when every check actually held; otherwise name the failures.
if validation_report["validation_passed"]:
    print("All validation checks passed!")
else:
    failed_checks = [name for name, ok in validation_checks.items() if not ok]
    print("Validation FAILED:", ", ".join(failed_checks))


All validation checks passed!
