In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
# Project root. Both the input table and the figure directory hang off this one
# location, so it is defined once instead of being repeated in each constant.
# Set the BOOTCAMP2_ROOT environment variable to run the notebook from another
# checkout; without it the original local path is used unchanged.
PROJECT_ROOT = Path(
    os.environ.get(
        "BOOTCAMP2_ROOT",
        r"C:\Users\user\OneDrive - University of Prince Mugrin\سطح المكتب\Bootcamp2",
    )
)
# Processed analytics table read by the analysis cells.
DATA = PROJECT_ROOT / "data" / "processed" / "analytics_table2.parquet"
# Output directory for exported figures; created up front so saving never fails on a missing folder.
FIGS = PROJECT_ROOT / "reports" / "figures"
FIGS.mkdir(parents=True, exist_ok=True)

def save_fig(fig, path: "str | Path", *, scale: int = 2) -> None:
    """Save a Plotly figure to disk as a static image (requires `kaleido`).

    Parameters
    ----------
    fig
        Object exposing ``write_image(path, scale=...)``, e.g. a Plotly figure.
    path
        Destination file. May be a ``Path`` or a plain string; any missing
        parent directories are created first.
    scale
        Forwarded unchanged to ``fig.write_image``.
    """
    # Coerce so callers may pass a plain string; `.parent` below needs a Path.
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    fig.write_image(str(path), scale=scale)



In [2]:
# Smoke test for the export helper: a tiny y = x**2 scatter written into FIGS.
xs = list(range(5))
fig = px.scatter(x=xs, y=[x * x for x in xs])
save_fig(fig, FIGS / "random.png")

In [None]:
# Environment check: `save_fig` documents that static image export requires
# kaleido, so display the installed version to help diagnose export failures.
import kaleido
kaleido.__version__

'0.1.0.post1'

In [3]:
# Load the processed analytics table and take a first look at its size, dtypes and gaps.
df = pd.read_parquet(DATA)

n_rows, n_cols = df.shape
print("rows:", n_rows, "cols:", n_cols)
print(df.dtypes.iloc[:15])

# Ten columns with the most missing values, worst first.
na_per_column = df.isna().sum()
missing = na_per_column.sort_values(ascending=False).head(10)
print(missing)

rows: 5250 cols: 18
order_id               string[python]
user_id                string[python]
amount                        Float64
quantity                        Int64
created_at        datetime64[ns, UTC]
status                         object
status_clean                   object
amount__isna                     bool
quantity__isna                   bool
date                           object
year                          float64
month                  string[python]
dow                            object
hour                          float64
country                        object
dtype: object
quantity              534
status                513
status_clean          513
dow                   507
hour                  507
created_at            507
date                  507
year                  507
month                 507
amount__is_outlier    495
dtype: int64


# Paid by country


In [4]:
# Per-country summary: order count (n), summed amount (paid) and mean amount (aov),
# ordered from the largest total down. Rows without a country keep their own group.
country_groups = df.groupby("country", dropna=False)
rev = (
    country_groups.agg(
        n=("order_id", "size"),
        paid=("amount", "sum"),
        aov=("amount", "mean"),
    )
    .reset_index()
    .sort_values("paid", ascending=False)
)

# Build, label and export the bar chart; the trailing `fig` renders it in the notebook.
fig = (
    px.bar(rev, x="country", y="paid", title="paid by country (all data)")
    .update_layout(title={"x": 0.02})
    .update_xaxes(title_text="Country")
    .update_yaxes(title_text="paid (sum of amount)")
)
save_fig(fig, FIGS / "paid_by_country.png")
fig

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

# Trend per month


In [None]:
# Monthly totals: bucket orders by the calendar month of `created_at` and sum `amount`.
# The 'ME' (month-end) alias needs pandas >= 2.2; older releases spell it 'M'.
# NOTE(review): rows with a missing `created_at` should fall into no bucket
# (the time grouper skips NaT keys) — confirm that excluding them is intended.
monthly_trend = (
    df.groupby(pd.Grouper(key='created_at', freq='ME'))['amount']
      .sum()
      .reset_index()
)

fig = px.line(
    monthly_trend,
    x='created_at',
    y='amount',
    title='Total Order Amount Trend per Month',
    markers=True,
    labels={'created_at': 'Month', 'amount': 'Total Sales (SAR)'}
)
fig.update_layout(
    xaxis_tickformat='%b %Y',
    hovermode='x unified'
)
# `save_fig` is the notebook-level helper, not a Figure method, and FIGS already
# points at the figures directory, so only the file name is appended here.
save_fig(fig, FIGS / "monthly_amount_trend.png")

# Report the month with the largest total.
max_month = monthly_trend.loc[monthly_trend['amount'].idxmax()]
print(f"The highest amount was in {max_month['created_at'].strftime('%B %Y')} with total: {max_month['amount']:,.2f} SAR")

# Only a cell's last expression is rendered, so the figure goes last.
fig

# Revenue trend (monthly)


In [None]:
# Monthly revenue: order count (n) and summed amount (paid) per `month` label,
# in ascending month order. Rows without a month keep their own group.
month_groups = df.groupby("month", dropna=False)
trend = (
    month_groups.agg(n=("order_id", "size"), paid=("amount", "sum"))
    .reset_index()
    .sort_values("month")
)

# Build, label and export the line chart; the trailing `fig` renders it in the notebook.
fig = (
    px.line(trend, x="month", y="paid", title="paid over time (monthly)")
    .update_layout(title={"x": 0.02})
    .update_xaxes(title_text="Month")
    .update_yaxes(title_text="paid")
)
save_fig(fig, FIGS / "paid_trend_monthly.png")
fig

#  Amount distribution (winsorized)

In [None]:
# Histogram of the `amount_winsor` column in 30 bins, labelled and exported to FIGS.
fig = (
    px.histogram(
        df,
        x="amount_winsor",
        nbins=30,
        title="Order amount distribution (winsorized)",
    )
    .update_layout(title={"x": 0.02})
    .update_xaxes(title_text="Amount (winsorized)")
    .update_yaxes(title_text="Number of orders")
)
save_fig(fig, FIGS / "amount_hist_winsor.png")
fig

# Bootstrap difference in means

In [None]:
def bootstrap_diff_means(a: pd.Series, b: pd.Series, *, n_boot: int = 2000, seed: int = 0) -> dict:
    """Bootstrap a 95% percentile interval for ``mean(a) - mean(b)``.

    Both inputs are coerced to numeric (entries that cannot be parsed become
    NaN) and NaNs are dropped before anything is computed. Each of the
    ``n_boot`` rounds resamples every cleaned group with replacement at its
    own size and records the difference of the two resample means.

    Returns
    -------
    dict
        ``diff_mean``: observed difference of means on the cleaned data;
        ``ci_low`` / ``ci_high``: 2.5% and 97.5% quantiles of the bootstrap
        differences.

    Raises
    ------
    AssertionError
        If either group is empty after cleaning.
    """
    rng = np.random.default_rng(seed)
    xs = pd.to_numeric(a, errors="coerce").dropna().to_numpy()
    ys = pd.to_numeric(b, errors="coerce").dropna().to_numpy()
    assert xs.size > 0 and ys.size > 0, "Empty group after cleaning"

    # Draw order is part of the seeded result: resample `a` first, then `b`, in every round.
    boot_diffs = np.array([
        rng.choice(xs, size=xs.size, replace=True).mean()
        - rng.choice(ys, size=ys.size, replace=True).mean()
        for _ in range(n_boot)
    ])
    ci_low, ci_high = np.quantile(boot_diffs, [0.025, 0.975])

    return {
        "diff_mean": float(xs.mean() - ys.mean()),
        "ci_low": float(ci_low),
        "ci_high": float(ci_high),
    }

# Flag refunded orders as 1/0 so that a group's mean is its refund rate.
# NOTE(review): rows with a missing `status_clean` compare unequal to "refund"
# and therefore count as 0 here — confirm that is the intended treatment.
refund_flag = df["status_clean"].eq("refund").astype(int)
d = df.assign(is_refund=refund_flag)

# Refund indicators for the two countries being compared.
in_sa = d["country"].eq("SA")
in_ae = d["country"].eq("AE")
a = d.loc[in_sa, "is_refund"]
b = d.loc[in_ae, "is_refund"]

print("n_SA:", len(a), "n_AE:", len(b))
# Bootstrap interval for the difference in means (SA minus AE).
res = bootstrap_diff_means(a, b, n_boot=2000, seed=0)
print(res)