In [1]:
import pandas as pd
import json


In [2]:
# Load the cleaned transactions table from the processed-data folder.
df = pd.read_csv(filepath_or_buffer="../data/processed/cleaned_transactions.csv")

# Print the (rows, columns) shape, then show the first five rows as the cell output.
print("Shape:", df.shape)
df.head(5)


Shape: (333234, 13)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalPrice,Year,Month,DayOfWeek,Hour
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,15.3,2010,12,2,8
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2010,12,2,8
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,22.0,2010,12,2,8
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2010,12,2,8
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2010,12,2,8


In [3]:
# Summary metrics that feed the validation report.

# Table dimensions.
total_rows, total_columns = df.shape

# InvoiceDate is loaded without date parsing, so min/max here compare strings.
# NOTE(review): that matches chronological order only for the zero-padded
# "YYYY-MM-DD HH:MM:SS" format seen in the preview -- confirm it holds file-wide.
date_start = df['InvoiceDate'].min()
date_end = df['InvoiceDate'].max()

# Distinct-entity counts.
unique_customers = df['CustomerID'].nunique()
unique_products = df['StockCode'].nunique()
unique_countries = df['Country'].nunique()

# Revenue of each line item, computed once and reused below.
line_totals = df['Quantity'] * df['UnitPrice']
total_revenue = line_totals.sum()

# An order is one invoice, and an invoice spans several line-item rows, so the
# line totals are summed per InvoiceNo before averaging. Averaging the rows
# directly would give the mean line-item value, not the mean order value.
average_order_value = line_totals.groupby(df['InvoiceNo']).sum().mean()

total_rows, total_columns, unique_customers, unique_products, unique_countries


(333234, 13, 4191, 3392, 37)

In [4]:
# Data-quality checks on the cleaned transactions table. Every result is a
# built-in bool so the dict displays plainly and is safe to hand to json.
checks = {
    # True when no cell anywhere in the frame is null.
    "no_missing_values": not df.isnull().values.any(),
    "all_quantities_positive": bool((df['Quantity'] > 0).all()),
    "all_prices_positive": bool((df['UnitPrice'] > 0).all()),
    # Accept any integer dtype instead of comparing against the literal
    # "int64", which rejects other integer widths and nullable integers.
    "customer_id_is_integer": pd.api.types.is_integer_dtype(df['CustomerID']),
}

checks


{'no_missing_values': True,
 'all_quantities_positive': True,
 'all_prices_positive': True,
 'customer_id_is_integer': True}

In [10]:
# Assemble the validation summary that is later written to disk. Each value is
# converted to a plain built-in type (int / float / str / bool) on the way in.
validation_report = dict(
    total_rows=int(total_rows),
    total_columns=int(total_columns),
    date_range=dict(start=str(date_start), end=str(date_end)),
    unique_customers=int(unique_customers),
    unique_products=int(unique_products),
    unique_countries=int(unique_countries),
    total_revenue=float(total_revenue),
    average_order_value=float(average_order_value),
    # Overall verdict: passes only when every individual check passed.
    validation_passed=bool(all(checks.values())),
    # Individual check outcomes, listed in a fixed order.
    checks={
        check_name: bool(checks[check_name])
        for check_name in (
            "no_missing_values",
            "all_quantities_positive",
            "all_prices_positive",
            "customer_id_is_integer",
        )
    },
)


In [11]:
# Persist the validation report as pretty-printed JSON.
# json is already imported in the notebook's first cell, so it is not
# re-imported here. The encoding is set explicitly so the written file does not
# depend on the platform's default text encoding.
with open("../data/processed/validation_report.json", "w", encoding="utf-8") as f:
    json.dump(validation_report, f, indent=4)

print("validation_report.json saved successfully")


validation_report.json saved successfully
