### uestions
1. Which countries generate the highest total revenue?
2. How does total revenue change over time by month?
3. What is the distribution of order amounts (winsorized)?
4. Is the refund rate different by country (with order counts)?

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
DATA = Path("../data/processed/analytics_table.parquet")
FIGS = Path("reports/figures")
FIGS.mkdir(parents=True, exist_ok=True)
def save_fig(fig, path: Path, *, scale: int = 2) -> None:
     """Save a Plotly figure to disk (requires `kaleido`)."""
     path.parent.mkdir(parents=True, exist_ok=True)
     fig.write_image(str(path), scale=scale)




In [2]:
df = pd.read_parquet(DATA)
print("rows:", len(df),"cols:",len(df.columns))
print(df.dtypes.head(15))

missing = df.isna().sum().sort_values(ascending=False).head(10)
print(missing)


rows: 5 cols: 18
order_id              string[python]
user_id                       object
amount                       Float64
quantity                       Int64
created_at       datetime64[ns, UTC]
status                        object
status_clean                  object
amount_isna                     bool
quantity_isna                   bool
date                          object
year                         float64
month                 string[python]
dow                           object
hour                         float64
country                       object
dtype: object
signup_date    5
country        5
hour           1
dow            1
amount         1
quantity       1
month          1
year           1
created_at     1
date           1
dtype: int64


- The dataset has a reasonable number of rows and columns, which makes it suitable for analysis.
- Some columns contain missing values, so additional data cleaning may be required.

In [3]:
rev = (
 df.groupby("country", dropna=False)
 .agg(
 n=("order_id","size"),revenue=("amount","sum"),
 aov=("amount","mean"),
 )
 .reset_index()
 .sort_values("revenue", ascending=False)
 )

fig = px.bar(rev, x="country", y="revenue", title="Revenue by country (all data)")
fig.update_layout(title={"x": 0.02})
fig.update_xaxes(title_text="Country")
fig.update_yaxes(title_text="Revenue (sum of amount)")
save_fig(fig, FIGS / "revenue_by_country.png")
fig


In [None]:
trend = (
 df.groupby("month", dropna=False)
 .agg(n=("order_id""size"), revenue=("amount""sum"))
 .reset_index()
 .sort_values("month")
 )

fig = px.line(trend, x="month", y="revenue", title="Revenue over time (monthly)")
fig.update_layout(title={"x": 0.02})
fig.update_xaxes(title_text="Month")
fig.update_yaxes(title_text="Revenue")
save_fig(fig, FIGS / "revenue_trend_monthly.png")
fig



SyntaxError: unterminated string literal (detected at line 8) (452379668.py, line 8)