In [1]:
import pandas as pd
import numpy as np
import sagemaker

# --- Setup S3 ---
# Two buckets are involved: `data_bucket` holds the source CSV that is read
# below, while `default_bucket` (the SageMaker session's default bucket) is the
# destination for the generated monthly files uploaded further down.
sagemaker_session = sagemaker.Session()
default_bucket = sagemaker_session.default_bucket()
data_bucket = "iu-fraud-project-s3"
data_file = "creditcard.csv"

# Load the original data once; each simulated month is derived from a copy of
# this frame, so the CSV is not re-read from S3 per month.
# NOTE(review): pandas reads s3:// URLs through an optional filesystem package
# (presumably s3fs) -- confirm it is installed in this kernel.
s3_path = f"s3://{data_bucket}/{data_file}"
print(f"Loading base data from: {s3_path}")
df_base = pd.read_csv(s3_path)

# --- Define Your Drift Strategy ---
def apply_drift(df, month):
    """Return a copy of ``df`` with the drift scheduled for ``month`` applied.

    The input frame is never modified. The schedule is:

    * ``month <= 3``: baseline, the copy is returned unchanged.
    * ``4 <= month <= 6``: data drift, a constant offset of 0.1, 0.2 or 0.3
      (months 4, 5 and 6 respectively) is added to every value of ``V1``.
    * ``month >= 7``: concept drift, 10% of the non-fraud rows whose
      ``Amount`` lies above the 95th percentile are relabelled ``Class = 1``.

    Args:
        df: DataFrame containing the columns used by the branch taken
            (``V1`` for data drift; ``Amount`` and ``Class`` for concept
            drift).
        month: Simulation month number. It also seeds the row sampling used
            for concept drift, so a given month always flips the same rows.

    Returns:
        A new DataFrame with the drift for ``month`` applied.
    """
    df_drift = df.copy()

    # Months 1-3: No Drift (Baseline)
    if month <= 3:
        return df_drift

    # Months 4-6: Data Drift (V1 feature shifts slightly)
    if 4 <= month <= 6:
        # Divide by 10 instead of multiplying by 0.1: (6 - 3) * 0.1 evaluates
        # to 0.30000000000000004, whereas 3 / 10 is exactly the float 0.3, so
        # both the log line and the applied offset match the intended steps.
        drift_factor = (month - 3) / 10  # 0.1, 0.2, 0.3
        print(f"   -> Applying Data Drift to V1 (Factor: {drift_factor})")
        df_drift['V1'] = df_drift['V1'] + drift_factor

    # Months 7-12: Concept Drift (Fraud patterns change)
    elif month >= 7:
        print("   -> Applying Concept Drift (High Value = Fraud)")
        high_value_threshold = df_drift['Amount'].quantile(0.95)
        # Candidates: high-value transactions that are currently NOT fraud.
        condition = (df_drift['Amount'] > high_value_threshold) & (df_drift['Class'] == 0)
        # Flip 10% of the candidates to fraud to simulate a new attack vector.
        flip_indices = df_drift[condition].sample(frac=0.1, random_state=month).index
        df_drift.loc[flip_indices, 'Class'] = 1

    return df_drift

# --- The Simulation Loop ---
# Every month is uploaded under the same 'simulated_data' prefix in the
# session's default bucket, which keeps the generated files apart from the raw
# training data. The prefix does not vary per month, so it is defined once.
s3_prefix = "fraud-project/simulated_data"

print("\n--- Starting 12-Month Simulation Generation ---")
for month in range(1, 13):
    print(f"Generating data for Month {month}...")

    # 1. Create the specific month's data
    df_month = apply_drift(df_base, month)

    # 2. Save it locally
    file_name = f"month_{month}.csv"
    df_month.to_csv(file_name, index=False)

    # 3. Upload to S3 (simulating the arrival of new data).
    # upload_data returns the S3 URI of the uploaded file, so the location
    # reported below is the one the upload actually produced rather than a
    # separately assembled string.
    s3_uri = sagemaker_session.upload_data(
        path=file_name, bucket=default_bucket, key_prefix=s3_prefix
    )

    print(f"   Saved to: {s3_uri}")

print("\nDone! 12 months of data are ready in S3.")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
Loading base data from: s3://iu-fraud-project-s3/creditcard.csv

--- Starting 12-Month Simulation Generation ---
Generating data for Month 1...
   Saved to: s3://sagemaker-us-east-2-307946652610/fraud-project/simulated_data/month_1.csv
Generating data for Month 2...
   Saved to: s3://sagemaker-us-east-2-307946652610/fraud-project/simulated_data/month_2.csv
Generating data for Month 3...
   Saved to: s3://sagemaker-us-east-2-307946652610/fraud-project/simulated_data/month_3.csv
Generating data for Month 4...
   -> Applying Data Drift to V1 (Factor: 0.1)
   Saved to: s3://sagemaker-us-east-2-307946652610/fraud-project/simulated_data/month_4.csv
Generating data for Month 5...
   -> Applying Data Drift to V1 (Factor: 0.2)
   Saved to: s3://sagemaker-us-east-2-307946652610/fraud-pro

In [2]:
%%writefile inference.py
import joblib
import os
import json
import numpy as np

# 1. model_fn: Loads the model from disk
def model_fn(model_dir):
    """
    Deserialize the model stored as ``model.joblib`` inside ``model_dir``.

    Args:
        model_dir: Directory that contains the ``model.joblib`` artifact.

    Returns:
        The object returned by ``joblib.load`` for that file.
    """
    print("Loading model from: {}".format(model_dir))
    # NOTE: joblib.load unpickles the file, so the artifact must be trusted.
    return joblib.load(os.path.join(model_dir, "model.joblib"))

# 2. input_fn: Deserializes the input data
def input_fn(request_body, request_content_type):
    """
    Parse an incoming request into a 2D feature array. Only text/csv is supported.

    The body must be a single CSV line such as "0.1, 1.2, -0.5, ...". It may
    be given as ``str`` or as UTF-8 encoded ``bytes``/``bytearray``. Content
    type parameters and letter case are ignored, so
    "text/csv; charset=utf-8" is accepted as well as "text/csv".

    Args:
        request_body: The raw request payload.
        request_content_type: The request's content type header value.

    Returns:
        A NumPy array of shape (1, n_features).

    Raises:
        ValueError: If the content type is not text/csv, or if a field of the
            body cannot be converted to float.
    """
    # Compare only the media type; anything after ';' is a parameter.
    media_type = str(request_content_type or "").split(";", 1)[0].strip().lower()
    if media_type != 'text/csv':
        # You can add logic for 'application/json' if needed
        raise ValueError(f"Unsupported content type: {request_content_type}")

    # A bytes payload cannot be split on a str separator, so decode it first.
    if isinstance(request_body, (bytes, bytearray)):
        request_body = request_body.decode("utf-8")

    data = [float(x) for x in request_body.strip().split(',')]
    # Scikit-learn expects a 2D array: [[f1, f2, f3...]]
    return np.array([data])

# 3. predict_fn: Makes the prediction
def predict_fn(input_data, model):
    """
    Score the input with the model and return the fraud probability.

    ``predict_proba`` is used instead of ``predict`` so the caller receives a
    score (e.g. 0.85) rather than a hard 0/1 label. Only the first row of
    ``input_data`` is reported.

    Args:
        input_data: 2D feature array passed to ``model.predict_proba``.
        model: Object exposing ``predict_proba``.

    Returns:
        The class-1 probability of the first row, as a built-in ``float``.
    """
    prediction = model.predict_proba(input_data)

    # prediction is laid out as [[prob_0, prob_1]]; we want prob_1. Coerce to
    # a built-in float: NumPy scalars other than float64 (e.g. float32) are
    # not JSON serializable.
    return float(prediction[0][1])

# 4. output_fn: Serializes the prediction result
def output_fn(prediction, content_type):
    """
    Serialize the prediction as a JSON object for the client.

    Args:
        prediction: The fraud probability. NumPy scalars and arrays are
            converted to built-in Python values before encoding, because
            ``json.dumps`` rejects most of them (e.g. ``numpy.float32``).
        content_type: The response content type requested by the client.
            It is not inspected; the response is always JSON.

    Returns:
        A JSON string of the form {"fraud_probability": <prediction>}.
    """
    if isinstance(prediction, np.generic):
        prediction = prediction.item()
    elif isinstance(prediction, np.ndarray):
        prediction = prediction.tolist()

    # Return the probability as a simple JSON object
    response = {'fraud_probability': prediction}
    return json.dumps(response)

Overwriting inference.py
