# Cancer Prediction with AWS SageMaker

This notebook demonstrates a complete, self-contained workflow for cancer prediction using AWS SageMaker:
1. Data preparation and upload to S3
2. Creating a training script
3. Training on SageMaker using scikit-learn
4. Deployment to SageMaker endpoint for Lambda and API Gateway integration

In [1]:
import boto3
import sagemaker
import pandas as pd
import numpy as np
import json
import io
from sagemaker import get_execution_role
from sagemaker.sklearn.estimator import SKLearn
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

  from pandas.core.computation.check import NUMEXPR_INSTALLED


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [2]:

print("Setting up SageMaker environment...")

# Pin the SageMaker session to one explicit region rather than relying on the
# kernel's default AWS configuration.
region = 'eu-north-1'  # Set to your region
sagemaker_session = sagemaker.Session(
    boto_session=boto3.Session(region_name=region)
)

# Execution role ARN as resolved by the SageMaker SDK (printed below).
role = get_execution_role()

print(f"Using region: {region}\nSageMaker Role ARN: {role}")

Setting up SageMaker environment...
Using region: eu-north-1
SageMaker Role ARN: arn:aws:iam::876820568174:role/service-role/AmazonSageMaker-ExecutionRole-20250903T123022


## 1. Prepare and Upload Data

First, we'll load the dataset from a CSV file stored in S3, split it into training and test sets, and standardize the features:

In [3]:
import io
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import boto3
import pandas as pd

# Load the raw CSV from S3 into a DataFrame.
s3 = boto3.client("s3")
bucket_name = "cancer-prediction-data-arash"
s3_key = "data/breast_cancer.csv"  # object path inside the bucket

resp = s3.get_object(Bucket=bucket_name, Key=s3_key)
df = pd.read_csv(io.BytesIO(resp["Body"].read()))

# Determine features and target: use the 'target' column when present,
# otherwise treat the last column as the label and everything before it as
# features.
if "target" in df.columns:
    X = df.drop("target", axis=1)
    y = df["target"]
else:
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

# Hold out 20% for testing; stratify so both splits keep the class balance,
# and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize features. The scaler is fitted on the training split only and
# then applied to the test split, so no test statistics leak into training.
# NOTE(review): `scaler` is not saved anywhere in this cell; anything that
# later calls the model must supply features transformed by this same scaler.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training data shape: {X_train_scaled.shape}")
print(f"Test data shape: {X_test_scaled.shape}")
# Fixed: the f-string was unterminated (SyntaxError); restored to match the
# recorded cell output.
print(f"Feature names: {len(X.columns)} features")

Training data shape: (455, 30)
Test data shape: (114, 30)
Feature names: 30 features


## 3. Train on SageMaker

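Now we'll train a Random Forest model and evaluate it on the held-out test set. Note that in this notebook the model is fit locally in the notebook kernel; SageMaker is used only for the deployment step below.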

In [4]:
print("\nTraining Random Forest model locally...")

# Build and fit the classifier in one expression (fit() returns the estimator).
# The seed is fixed for reproducibility; n_jobs=-1 uses all available cores.
model = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=2,
    min_samples_split=5,
).fit(X_train_scaled, y_train)

# Score the held-out split.
# NOTE(review): precision/recall use scikit-learn's default positive label;
# confirm which class the CSV encodes as the positive one.
y_pred = model.predict(X_test_scaled)
accuracy, precision, recall = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)

print(
    "Model Performance:\n"
    f"  Accuracy:  {accuracy:.4f}\n"
    f"  Precision: {precision:.4f}\n"
    f"  Recall:    {recall:.4f}"
)



Training Random Forest model locally...
Model Performance:
  Accuracy:  0.9649
  Precision: 0.9589
  Recall:    0.9859


### Model Deployment
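Now let's save and deploy our trained model to a SageMaker endpoint. Because the model was fit on standardized features and only the model (not the scaler) is packaged, the endpoint expects inputs that have already been standardized with the scaler fitted above: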

In [None]:
# install/upgrade if you see the pandas/numexpr warning (optional)
# !pip install -U numexpr

import os, json, time, tarfile
import joblib
import boto3
import sagemaker
from sagemaker.sklearn.model import SKLearnModel

# CONFIG - adjust if needed
bucket = 'cancer-prediction-data-arash'
prefix = 'cancer-prediction'
# Unix timestamp used to make the artifact key and endpoint name unique per run.
timestamp = int(time.time())
endpoint_name = f"cancer-prediction-skl-{timestamp}"

s3 = boto3.client('s3')
sm = sagemaker.Session()
role = sagemaker.get_execution_role()    # already used earlier in notebook

# 1) Serialize the trained estimator (`model` comes from the training cell).
local_model_file = "model.joblib"
joblib.dump(model, filename=local_model_file)
print("Saved local model:", local_model_file)

# 2) Wrap the joblib file in a gzip tarball; it is stored at the archive root
#    under the name "model.joblib", which is what the inference script loads.
tar_path = "model.tar.gz"
with tarfile.open(tar_path, mode="w:gz") as archive:
    archive.add(local_model_file, arcname="model.joblib")
print("Packaged model into:", tar_path)

# 3) Push the tarball to S3 and remember its URI for SKLearnModel.
s3_key = "/".join([prefix, "models", f"model_{timestamp}.tar.gz"])
s3.upload_file(Filename=tar_path, Bucket=bucket, Key=s3_key)
model_s3_uri = "s3://" + bucket + "/" + s3_key
print("Uploaded model artifact to:", model_s3_uri)

# 4) Create a minimal inference script (written locally by the notebook).
#    The script defines the four hooks used for serving: model_fn (load),
#    input_fn (deserialize), predict_fn (score), output_fn (serialize).
inference_script = """
import os, joblib, json, numpy as np

def model_fn(model_dir):
    # Load the estimator that was packaged as model.joblib in model.tar.gz.
    path = os.path.join(model_dir, "model.joblib")
    return joblib.load(path)

def input_fn(request_body, content_type):
    # Accepts {"instances": [[...], ...]}, {"features": [...]} or a raw JSON list.
    # Always returns a 2-D float array of shape (n_samples, n_features).
    if content_type != "application/json":
        raise ValueError("Unsupported content type: " + str(content_type))
    payload = json.loads(request_body)
    if isinstance(payload, dict):
        instances = payload.get("instances")
        if instances is None:
            instances = payload.get("features")
        if instances is None:
            raise ValueError('JSON object must contain "instances" or "features"')
    else:
        # A bare list has no .get(); use it directly as the instances.
        instances = payload
    arr = np.array(instances, dtype=float)
    if arr.ndim == 1:
        # A single flat feature vector is treated as one sample.
        arr = arr.reshape(1, -1)
    return arr

def predict_fn(input_data, model):
    # Return the predicted class and, when available, P(class at index 1).
    preds = model.predict(input_data).tolist()
    if hasattr(model, "predict_proba"):
        probs = model.predict_proba(input_data)[:, 1].tolist()
    else:
        # No probability support: fall back to the hard predictions.
        probs = list(preds)
    return {"probabilities": probs, "predictions": preds}

def output_fn(prediction, content_type):
    # Serialize the prediction dict as JSON; any other requested type is rejected.
    if content_type == "application/json":
        return json.dumps(prediction)
    raise ValueError("Unsupported content type: " + str(content_type))
"""

inference_path = "inference.py"
with open(inference_path, "w") as f:
    f.write(inference_script)
print("Wrote inference script to:", inference_path)

# 5) Describe the model to SageMaker: the S3 artifact plus the local inference
#    script that will be used as the serving entry point.
# NOTE(review): the serving image's scikit-learn version presumably has to be
# compatible with the version that pickled model.joblib in this kernel —
# confirm that "0.23-1" matches before deploying.
sk_model = SKLearnModel(
    entry_point=inference_path,
    model_data=model_s3_uri,
    framework_version="0.23-1",   # or another supported sklearn image version
    role=role,
    sagemaker_session=sm,
)

# 6) Stand up a single-instance real-time endpoint under the timestamped name.
print("Deploying endpoint:", endpoint_name)
predictor = sk_model.deploy(
    endpoint_name=endpoint_name,
    instance_type="ml.m5.large",
    initial_instance_count=1,
)
print("Deployed endpoint:", endpoint_name)

# 7) Quick test invoke using the boto3 runtime client.
# Build the client from the same session that deployed the endpoint (`sm`) so
# the request goes to the same account/region the endpoint was created in.
runtime = sm.boto_session.client("sagemaker-runtime")

# Use one row of the already-standardized test matrix; the model was fit on
# standardized features, so raw feature values would not be valid input.
sample = X_test_scaled[0].tolist()   # or X_train_scaled[0]
payload = {"instances": [sample]}

resp = runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType="application/json",
    # The inference script's output_fn only serializes for application/json,
    # so request it explicitly instead of relying on a default.
    Accept="application/json",
    Body=json.dumps(payload)
)
body = resp["Body"].read().decode()
print("Raw response:", body)
print("Parsed:", json.loads(body))

# Optional: cleanup local temporary files
# os.remove(local_model_file); os.remove(tar_path); os.remove(inference_path)

Saved local model: model.joblib
Packaged model into: model.tar.gz
Uploaded model artifact to: s3://cancer-prediction-data-arash/cancer-prediction/models/model_1756893471.tar.gz
Wrote inference script to: inference.py
Deploying endpoint: cancer-prediction-skl-1756893471
