In [None]:
%%capture
# Execute the data-preparation notebook inside this kernel so the objects it
# creates are available here; the cells below check for df_train, df_test and df_val.
# %%capture keeps that notebook's printed output out of this one.
# If you're running this in SageMaker Studio:
# - Keep this notebook in the same folder as 01_Data_Preparation.ipynb
%run ./01_Data_Preparation.ipynb


In [None]:
#imports
import os
import io
import json
import time
import boto3
import numpy as np
import pandas as pd

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
)
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import sagemaker
from sagemaker import image_uris
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

# SageMaker session, the execution role, and the AWS region the session is bound to
sm_sess = sagemaker.Session()
role = sagemaker.get_execution_role()
region = sm_sess.boto_region_name

# Echo both so the reader can confirm where the jobs below will run
for _label, _value in (("Region:", region), ("Role  :", role)):
    print(_label, _value)


In [None]:
# Sanity check: the data-preparation notebook must have left these frames in the namespace
for _name in ("df_train", "df_test", "df_val"):
    assert _name in globals(), f"{_name} not found. Ensure 01_Data_Preparation.ipynb ran successfully."

# Every split has to carry the binary target column
label_col = "label_satisfied"
for _name, _df in {"df_train": df_train, "df_test": df_test, "df_val": df_val}.items():
    assert label_col in _df.columns, f"{label_col} missing from {_name}"

for _tag, _df in (("Train shape:", df_train), ("Test  shape:", df_test), ("Val   shape:", df_val)):
    print(_tag, _df.shape)
# Mean of the label column over the training split, rounded to 4 decimals
print("Label prevalence (train):", df_train[label_col].mean().round(4))


In [None]:
# Feature configuration: candidate columns, grouped by how they are handled downstream
_num_candidates = (
    "total_items",
    "total_price",
    "total_freight",
    "payment_value_sum",
    "payment_installments_max",
    "delivery_time_days",
    "estimated_time_days",
    "delivered_late",
)
_cat_candidates = ("customer_state", "payment_types")

# Keep only the candidates that the training frame actually contains
_available = set(df_train.columns)
num_features = [name for name in _num_candidates if name in _available]
cat_features = [name for name in _cat_candidates if name in _available]

print("Numeric features:", num_features)
print("Categorical features:", cat_features)

def make_model_frame(df: pd.DataFrame, num_fill_values=None) -> pd.DataFrame:
    """Return a cleaned copy of ``df`` restricted to the modelling columns.

    The result contains ``num_features + cat_features + [label_col]`` (module-level
    names), with:

    * numeric features coerced with ``pd.to_numeric(errors="coerce")`` and their
      missing values filled,
    * categorical features filled with ``"UNK"`` and cast to ``str``,
    * the label cast to ``int``.

    Parameters
    ----------
    df : pd.DataFrame
        Source frame; must contain every modelling column.
    num_fill_values : pd.Series or dict, optional
        Per-column fill values for the numeric features. When omitted, the
        medians of ``df`` itself are used (the previous behaviour). Pass the
        training-split medians when preparing validation/test data so that all
        splits are imputed with the same statistics.

    Returns
    -------
    pd.DataFrame
        A new frame; ``df`` is not modified.

    Raises
    ------
    KeyError
        If a modelling column is missing from ``df``.
    """
    cols = num_features + cat_features + [label_col]
    out = df[cols].copy()

    # Ensure numeric dtype; unparseable entries become NaN and are imputed below
    for c in num_features:
        out[c] = pd.to_numeric(out[c], errors="coerce")

    if num_features:
        if num_fill_values is None:
            num_fill_values = out[num_features].median(numeric_only=True)
        out[num_features] = out[num_features].fillna(num_fill_values)

    for c in cat_features:
        out[c] = out[c].fillna("UNK").astype(str)

    out[label_col] = out[label_col].astype(int)
    return out


# Learn the imputation medians from the training split only and reuse them for
# test/val, so held-out rows are not imputed with their own split's statistics.
train_medians = (
    df_train[num_features]
    .apply(pd.to_numeric, errors="coerce")
    .median(numeric_only=True)
)

train_df = make_model_frame(df_train, train_medians)
test_df  = make_model_frame(df_test, train_medians)
val_df   = make_model_frame(df_val, train_medians)

train_df.head()


In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Separate each modelling frame into a feature table and a label vector
X_train, y_train = train_df.drop(columns=[label_col]), train_df[label_col].values
X_test, y_test = test_df.drop(columns=[label_col]), test_df[label_col].values

# Benchmark A: always predict the label that is most frequent in the training split
dummy = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)
pred_dummy = dummy.predict(X_test)

def classification_metrics(y_true, y_pred, y_score=None):
    """Compute binary-classification metrics as a plain dict of floats.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    y_score : array-like, optional
        Scores/probabilities for the positive class. When given, ``roc_auc`` is
        added to the result if it can be computed.

    Returns
    -------
    dict
        Always contains ``accuracy``, ``precision``, ``recall`` and ``f1``
        (precision/recall/f1 use ``zero_division=0``). ``roc_auc`` is present
        only when ``y_score`` was supplied and the score could be computed.
    """
    import warnings

    out = {
        "accuracy": float(accuracy_score(y_true, y_pred)),
        "precision": float(precision_score(y_true, y_pred, zero_division=0)),
        "recall": float(recall_score(y_true, y_pred, zero_division=0)),
        "f1": float(f1_score(y_true, y_pred, zero_division=0)),
    }
    if y_score is not None:
        try:
            out["roc_auc"] = float(roc_auc_score(y_true, y_score))
        except ValueError as err:
            # Best effort: keep the other metrics, but say why roc_auc is missing
            # instead of dropping it silently.
            warnings.warn(
                f"roc_auc could not be computed and is omitted: {err}",
                RuntimeWarning,
                stacklevel=2,
            )
    return out

# Score Benchmark A on the test split (label predictions only, no scores passed)
metrics_dummy = classification_metrics(y_true=y_test, y_pred=pred_dummy)
print("Benchmark A — DummyClassifier:", metrics_dummy)


In [None]:
# Benchmark B: logistic regression restricted to a few raw features
# (delivered_late, delivery_time_days, total_price — whichever are present)
_tiny_candidates = ("delivered_late", "delivery_time_days", "total_price")
tiny_feats = [name for name in _tiny_candidates if name in X_train.columns]
assert len(tiny_feats) >= 1, "No tiny benchmark features found; adjust tiny_feats list."

# Pass the selected columns through unchanged and drop everything else
pre = ColumnTransformer(
    transformers=[("num", "passthrough", tiny_feats)],
    remainder="drop",
)

bench_lr = Pipeline(
    steps=[
        ("pre", pre),
        ("clf", LogisticRegression(max_iter=200, n_jobs=None)),
    ]
)
bench_lr.fit(X_train, y_train)

pred_lr = bench_lr.predict(X_test)
# Positive-class probabilities, when the final estimator exposes them
if hasattr(bench_lr.named_steps["clf"], "predict_proba"):
    proba_lr = bench_lr.predict_proba(X_test)[:, 1]
else:
    proba_lr = None

metrics_lr = classification_metrics(y_test, pred_lr, y_score=proba_lr)
print("Benchmark B — Tiny LogisticRegression:", metrics_lr)


In [None]:
# Full preprocessing for model training: numeric columns pass through unchanged,
# categorical columns are one-hot encoded into a dense matrix
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
preprocess = ColumnTransformer(
    transformers=[
        ("num", "passthrough", num_features),
        ("cat", ohe, cat_features),
    ],
    remainder="drop",
)

X_val = val_df.drop(columns=[label_col])
y_val = val_df[label_col].values

# Fit on train only; validation and test reuse the fitted transformer
X_train_mat = preprocess.fit_transform(X_train)
X_val_mat = preprocess.transform(X_val)
X_test_mat = preprocess.transform(X_test)

# Helper to create XGBoost CSV
def to_xgb_csv(X_mat, y_vec) -> pd.DataFrame:
    """Build a label-first table: column 0 holds ``y_vec``, the remaining columns hold ``X_mat``.

    ``y_vec`` is reshaped to a single column; ``X_mat`` is expected to be
    two-dimensional with one row per label.
    """
    label_column = np.asarray(y_vec).reshape(-1, 1)
    # The label column is 2-D, so the join is always along the column axis
    stacked = np.concatenate((label_column, X_mat), axis=1)
    return pd.DataFrame(stacked)

# Label-first tables for each split (label in column 0, features after it)
train_xgb, val_xgb, test_xgb = (
    to_xgb_csv(_features, _labels)
    for _features, _labels in (
        (X_train_mat, y_train),
        (X_val_mat, y_val),
        (X_test_mat, y_test),
    )
)

train_xgb.shape, val_xgb.shape, test_xgb.shape


In [None]:
import awswrangler as wr

# Use DATALAKE_BUCKET when it exists as a non-empty string in this namespace;
# otherwise fall back to the session's default bucket
_datalake = globals().get("DATALAKE_BUCKET")
if isinstance(_datalake, str) and len(_datalake) > 0:
    bucket = _datalake
else:
    bucket = sm_sess.default_bucket()

base_prefix = f"s3://{bucket}/modeling/xgb-baseline/"
train_prefix, val_prefix, test_prefix = (
    base_prefix + _part for _part in ("train/", "val/", "test/")
)

# Writes one or more headerless CSV files under each prefix, overwriting existing data
for _frame, _target in (
    (train_xgb, train_prefix),
    (val_xgb, val_prefix),
    (test_xgb, test_prefix),
):
    wr.s3.to_csv(_frame, path=_target, index=False, header=False, dataset=True, mode="overwrite")

print("Uploaded dataset prefixes:")
for _tag, _target in (("  train:", train_prefix), ("  val  :", val_prefix), ("  test :", test_prefix)):
    print(_tag, _target)



In [None]:
# S3 locations used for training, validation and batch transform
s3_train, s3_val, s3_test = train_prefix, val_prefix, test_prefix

output_path = f"s3://{bucket}/modeling/output"
transform_output = base_prefix + "transform-output/"

# Container image for the XGBoost framework (version 1.5-1) in this region
xgb_image = image_uris.retrieve(framework="xgboost", region=region, version="1.5-1")

# Estimator: a single ml.m5.xlarge instance, artifacts written under output_path
xgb = sagemaker.estimator.Estimator(
    image_uri=xgb_image,
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    output_path=output_path,
    sagemaker_session=sm_sess,
)

# Binary classifier that outputs probabilities, trained for 50 boosting rounds
_xgb_hyperparameters = {
    "max_depth": 5,
    "eta": 0.2,
    "gamma": 4,
    "min_child_weight": 6,
    "subsample": 0.8,
    "objective": "binary:logistic",
    "num_round": 50,
}
xgb.set_hyperparameters(**_xgb_hyperparameters)

# Batch-transform handle that writes its results under transform_output.
# NOTE(review): this is created before xgb.fit() has been called in this notebook —
# confirm the transformer is bound to a trained model by the time transform() runs.
transformer = xgb.transformer(
    instance_count=1,
    instance_type="ml.m5.xlarge",
    output_path=transform_output,
)


In [None]:
def _csv_channel(s3_uri):
    """Wrap an S3 prefix as a CSV training channel (trailing slash normalised)."""
    return TrainingInput(s3_data=os.path.dirname(s3_uri) + "/", content_type="text/csv")


# Training and validation channels, both read as CSV
train_input = _csv_channel(s3_train)
val_input = _csv_channel(s3_val)

# --- COST SAFETY CHECK ---
import boto3
import time
from urllib.parse import urlparse
from sagemaker.model import Model
s3_client = boto3.client('s3')

def check_s3_prefix_has_contents(bucket_name, prefix):
    """Return True when at least one object key in ``bucket_name`` starts with ``prefix``.

    Only existence matters, so the listing is capped at a single key instead of
    fetching a full page of results.

    Note: ``prefix`` is matched as a plain string prefix, so ``"a/b"`` also
    matches keys under ``"a/b-other/"``.
    """
    resp = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=prefix, MaxKeys=1)
    return resp.get('KeyCount', 0) > 0

# Parse the output path defined in previous cells
p = urlparse(output_path)
out_bucket = p.netloc
out_key_prefix = p.path.lstrip('/')


def _find_latest_model_artifact(bucket_name, key_prefix):
    """Return the s3:// URI of the newest '/output/model.tar.gz' under key_prefix, or None.

    Walks every page of the listing, so artifacts beyond the first 1000 keys
    are considered as well.
    """
    newest = None
    paginator = s3_client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket_name, Prefix=key_prefix):
        for item in page.get('Contents', []):
            if not item['Key'].endswith('/output/model.tar.gz'):
                continue
            if newest is None or item['LastModified'] > newest['LastModified']:
                newest = item
    if newest is None:
        return None
    return f's3://{bucket_name}/{newest["Key"]}'


model_uri = None
if check_s3_prefix_has_contents(out_bucket, out_key_prefix):
    print(f'Found existing training artifacts in {output_path}. Skipping Training to save cost.')
    model_uri = _find_latest_model_artifact(out_bucket, out_key_prefix)
    if model_uri is None:
        print('   Output dir exists but no model found. Retraining...')
else:
    print('No existing training artifacts found. Starting Training...')

if model_uri:
    print(f'   Using latest model artifact: {model_uri}')
    # Wrap the stored artifact in a Model so the following cells can use it
    xgb_model = Model(
        image_uri=xgb_image,
        model_data=model_uri,
        role=role,
        sagemaker_session=sm_sess
    )
    # Swap xgb (Estimator) to xgb_model (Model) for transformer usage
    xgb = xgb_model
else:
    xgb.fit({'train': train_input, 'validation': val_input}, logs=False)

# (Re)build the transformer now that a trained model is available. A transformer
# created from the estimator before training is not tied to any trained model,
# and it is unaffected by the Estimator -> Model swap above.
transformer = xgb.transformer(
    instance_count=1,
    instance_type="ml.m5.xlarge",
    output_path=transform_output,
)


In [None]:
# For transform, we provide features only (column 0 of test_xgb is the label)
test_features_only = test_xgb.drop(columns=[0])

# Write as a dataset under its own prefix next to the other modelling data.
# base_prefix is already a full s3:// URI and is defined in this notebook, so the
# path no longer depends on a `prefix` variable that this notebook never sets.
test_features_prefix = base_prefix + "test-features-only/"

wr.s3.to_csv(
    test_features_only,
    path=test_features_prefix,
    index=False,
    header=False,
    dataset=True,
    mode="overwrite",
)

print("Transform input prefix (features):", test_features_prefix)
print("Transform output path            :", transform_output)

# --- COST SAFETY CHECK ---
# Any object under the transform output prefix counts as "already done".
# NOTE(review): cached output is not checked against the current test data —
# clear the prefix if the test split or the model has changed.
t_parse = urlparse(transform_output)
t_bucket = t_parse.netloc
t_prefix = t_parse.path.lstrip('/')

if check_s3_prefix_has_contents(t_bucket, t_prefix):
    print(f'Found existing transform output in {transform_output}. Skipping Transform.')
else:
    print('No existing transform output. Starting Batch Transform...')
    transformer.transform(
        data=test_features_prefix,
        content_type='text/csv',
        split_type='Line',
    )
    transformer.wait()
    print('Batch transform complete.')


In [None]:

import re

# List objects under the location the transformer writes to (transform_output)
# to find the output file. Bucket and key prefix are derived from that URI so
# the lookup always matches the transform job's output_path.
from urllib.parse import urlparse

s3 = boto3.client("s3")
_transform_loc = urlparse(transform_output)
out_bucket_name = _transform_loc.netloc
out_prefix = _transform_loc.path.lstrip("/")

keys = []
for page in s3.get_paginator("list_objects_v2").paginate(Bucket=out_bucket_name, Prefix=out_prefix):
    keys.extend(item["Key"] for item in page.get("Contents", []))
print("Output objects:", keys)

# Use the first object whose key ends with ".out"
candidate = next((k for k in keys if k.endswith(".out")), None)
if candidate is None:
    raise RuntimeError("Could not find batch transform output .out file. Check S3 output prefix listing above.")

print("Using output file:", candidate)

obj = s3.get_object(Bucket=out_bucket_name, Key=candidate)
raw = obj["Body"].read().decode("utf-8").strip().splitlines()

# Each line is a probability (first comma-separated field); blank lines are skipped
y_score = np.array([float(line.strip().split(",")[0]) for line in raw if line.strip()])

# Guard against stale or partial output that no longer lines up with the test labels
if len(y_score) != len(y_test):
    raise ValueError(
        f"Batch transform output has {len(y_score)} rows but y_test has {len(y_test)}; "
        f"the output under {transform_output} may be stale or split across several files."
    )

y_pred = (y_score >= 0.5).astype(int)

metrics_xgb = classification_metrics(y_test, y_pred, y_score=y_score)

print("SageMaker XGBoost metrics:", metrics_xgb)
print()
print("Classification report:")
print(classification_report(y_test, y_pred, digits=4))


In [None]:
# Side-by-side comparison: one row per model, label first, then its metrics
_labelled_metrics = (
    ("Benchmark A: Dummy (most_frequent)", metrics_dummy),
    (f"Benchmark B: Tiny LR ({', '.join(tiny_feats)})", metrics_lr),
    ("SageMaker: XGBoost (batch transform)", metrics_xgb),
)
compare = pd.DataFrame([{"model": _label, **_metrics} for _label, _metrics in _labelled_metrics])

# Reorder columns: model name first, then whichever metrics are present
_metric_order = ("accuracy", "precision", "recall", "f1", "roc_auc")
cols = ["model", *(m for m in _metric_order if m in compare.columns)]
compare = compare[cols]
compare
