### Set up environment

In [None]:
import sys

# Put ./src/ first on the module search path.
# Presumably this is where the `package` module imported below lives — confirm.
sys.path.insert(0, "./src/")

In [None]:
!pip install -U seaborn aws_requests_auth imblearn

In [None]:
import boto3
from package import config
import uuid

# Instance types for training jobs and for hosted endpoints.
instance_type_train = "ml.m5.4xlarge"
instance_type_inference = "ml.g4dn.2xlarge"

# Short random suffix used in job and endpoint names.
unique_hash = str(uuid.uuid4())[:6]

# Download the zipped dataset from the region-specific solutions bucket.
s3 = boto3.resource("s3", region_name=config.AWS_REGION)
dataset_bucket = f"{config.SOLUTIONS_S3_BUCKET}-{config.AWS_REGION}"
dataset_key = f"{config.SOLUTION_NAME}/data/creditcardfraud.zip"
s3.Object(dataset_bucket, dataset_key).download_file("creditcardfraud.zip")

In [None]:
from zipfile import ZipFile

# Unpack the downloaded archive into the current working directory.
with ZipFile("creditcardfraud.zip", mode="r") as archive:
    archive.extractall()

In [None]:
import numpy as np
import pandas as pd

# Load the extracted transactions table (comma-separated).
data = pd.read_csv("creditcard.csv", sep=",")

In [None]:
# List every column, then show summary statistics for a subset of them.
print(data.columns)
data[["Time", "V1", "V2", "V27", "V28", "Amount", "Class"]].describe()

In [None]:
# Count rows per class. Look the counts up by label (0 = non-fraud, 1 = fraud)
# instead of unpacking by position, so a missing class yields 0 rather than
# a ValueError.
class_counts = data.groupby("Class").size()
nonfrauds = class_counts.get(0, 0)
frauds = class_counts.get(1, 0)
print("Number of frauds: ", frauds)
print("Number of non-frauds: ", nonfrauds)
print("Percentage of fraudulent data:", 100.0 * frauds / (frauds + nonfrauds))

In [None]:
# Every column except the last is a model input; the last column is the label.
feature_columns, label_column = data.columns[:-1], data.columns[-1]

# Both arrays are cast to float32.
features = data[feature_columns].to_numpy().astype("float32")
labels = data[label_column].to_numpy().astype("float32")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing. Stratify on the label so the (very
# small) fraud rate is the same in both splits; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.1, random_state=42, stratify=labels
)

In [None]:
# Mean label of each split (the fraud rate, given 0/1 labels) and the split shapes.
np.mean(y_test), np.mean(y_train), X_train.shape, X_test.shape

In [None]:
import os
import sagemaker
from package import config

# SageMaker session, plus the S3 bucket and key prefix used below for
# training data and model artifacts.
session = sagemaker.Session()
bucket = config.MODEL_DATA_S3_BUCKET
prefix = "fraud-classifier"

In [None]:
from sagemaker import RandomCutForest

training_job_name = f"{config.SOLUTION_PREFIX}-{unique_hash}-rcf"
print(
    f"You can go to SageMaker -> Training -> Training jobs -> a job name started with {training_job_name} to monitor training status and details."
)

# Input records and output artifacts share one S3 root.
rcf_s3_root = f"s3://{bucket}/{prefix}"

# Random Cut Forest estimator: 50 trees, 512 samples per tree.
rcf = RandomCutForest(
    role=config.SAGEMAKER_IAM_ROLE,
    instance_count=1,
    instance_type=instance_type_train,
    data_location=f"{rcf_s3_root}/",
    output_path=f"{rcf_s3_root}/output",
    base_job_name=training_job_name,
    num_samples_per_tree=512,
    num_trees=50,
)

In [None]:
# Wrap the training features in a record set and launch the training job.
# Only features are passed here; no labels.
rcf.fit(rcf.record_set(X_train))

In [None]:
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer


# The endpoint name and the model name are the same string.
endpoint_name = f"{config.SOLUTION_PREFIX}-{unique_hash}-rcf"
print(
    f"You can go to SageMaker -> Inference -> Endpoints --> an endpoint with name {endpoint_name} to monitor the deployment status."
)

# Host the trained RCF model; requests are sent as CSV and responses parsed as JSON.
rcf_predictor = rcf.deploy(
    model_name=f"{config.SOLUTION_PREFIX}-{unique_hash}-rcf",
    endpoint_name=endpoint_name,
    initial_instance_count=1,
    instance_type=instance_type_inference,  # use a smaller instance for endpoint deployment
    serializer=CSVSerializer(),
    deserializer=JSONDeserializer(),
)

In [None]:
def predict_rcf(current_predictor, data, rows=500):
    """Score ``data`` with an RCF predictor, sending it in several requests.

    Args:
        current_predictor: Object whose ``predict(chunk)`` returns a mapping
            with a ``"scores"`` list of ``{"score": value}`` entries.
        data: 2-D array of samples; split along its first axis.
        rows: Approximate upper bound on the number of rows per request.

    Returns:
        1-D ``numpy`` array with one score per row of ``data``, in input order.
    """
    # Same chunk count as int(n / rows + 1): always at least one chunk.
    n_chunks = int(data.shape[0] / float(rows) + 1)
    per_chunk_scores = [
        np.array([entry["score"] for entry in current_predictor.predict(chunk)["scores"]])
        for chunk in np.array_split(data, n_chunks)
    ]
    return np.concatenate(per_chunk_scores)

In [None]:
# Anomaly score for every test sample, obtained from the RCF endpoint.
scores = predict_rcf(rcf_predictor, X_test)

In [None]:
# Partition the test-set anomaly scores by ground-truth label.
is_fraud = y_test == 1
is_not_fraud = y_test == 0
positive_samples_scores = scores[is_fraud]
negative_samples_scores = scores[is_not_fraud]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(color_codes=True)

plt.rcParams["figure.figsize"] = [7.00, 3.50]
plt.rcParams["figure.autolayout"] = True

# Side-by-side histograms: fraud scores on the left, non-fraud on the right.
f, axes = plt.subplots(1, 2)
plot1 = sns.histplot(positive_samples_scores, label="fraud", bins=20, color="red", ax=axes[0])
plot2 = sns.histplot(negative_samples_scores, label="not-fraud", bins=20, color="blue", ax=axes[1])

# Give both panels the same x-range and label so they are directly comparable.
for score_ax in axes:
    score_ax.set_xlim(0.5, 1.5)
    score_ax.set_xlabel("Anomaly Score")
for score_ax in axes:
    score_ax.legend()
plt.show()

In [None]:
# Histogram of all test-set anomaly scores (50 bins, raw counts).
n, bins, patches = plt.hist(scores, 50, density=False, facecolor="g", alpha=0.75)

plt.xlabel("Anomaly Score")
plt.ylabel("Count")
plt.title("Histogram of Scores for Test Examples")
plt.xlim(0.8, 1.4)
plt.grid(True)
# Red vertical line at a score of 1.0.
plt.axvline(x=1.0, color="r")
plt.show()

In [None]:
# Predict 1 for every sample whose anomaly score exceeds 1.0, else 0.
y_preds_rcf = np.where(scores > 1.0, 1, 0)

In [None]:
from sklearn.metrics import balanced_accuracy_score, cohen_kappa_score, f1_score, roc_auc_score

# Label-based metrics for the thresholded RCF predictions. ROC_AUC is left
# as a "-" placeholder for this model.
rcf_metrics = {
    "Balanced accuracy": balanced_accuracy_score(y_test, y_preds_rcf),
    "Cohen's Kappa": cohen_kappa_score(y_test, y_preds_rcf),
    "F1": f1_score(y_test, y_preds_rcf),
    "ROC_AUC": "-",
}
result_rcf = pd.DataFrame(
    list(rcf_metrics.values()), index=list(rcf_metrics.keys()), columns=["RCF"]
)

print(result_rcf)

## Train an XGBoost model


In [None]:
import io
import sklearn
from sklearn.datasets import dump_svmlight_file

# Write the training split in svmlight format to an in-memory buffer, then
# rewind it so a later read starts at the first byte.
buf = io.BytesIO()
dump_svmlight_file(X_train, y_train, buf)
buf.seek(0);

In [None]:
key = "fraud-dataset"
subdir = "base"

# S3 keys are always "/"-separated, so build the key explicitly rather than
# with os.path.join (which follows the local OS convention). The same key is
# reused for the printed URI so the two cannot drift apart.
train_object_key = f"{prefix}/train/{subdir}/{key}"
boto3.resource("s3", region_name=config.AWS_REGION).Bucket(bucket).Object(
    train_object_key
).upload_fileobj(buf)

s3_train_data = f"s3://{bucket}/{train_object_key}"
print("Uploaded training data location: {}".format(s3_train_data))

output_location = "s3://{}/{}/output".format(bucket, prefix)
print("Training artifacts are uploaded to: {}".format(output_location))

In [None]:
# Look up the built-in XGBoost image URI ("latest" tag) for the default boto3 region.
# NOTE(review): other cells use config.AWS_REGION — confirm both resolve to the same region.
container = sagemaker.image_uris.retrieve("xgboost", boto3.Session().region_name, "latest")
display(container)

In [None]:
from math import sqrt

# Weight the positive class by sqrt(#negatives / #positives) in the training split.
n_negative = np.count_nonzero(y_train == 0)
n_positive = np.count_nonzero(y_train)
scale_pos_weight = sqrt(n_negative / n_positive)

# Fixed hyperparameters for the baseline XGBoost model.
hyperparams = dict(
    max_depth=5,
    subsample=0.8,
    num_round=100,
    eta=0.9,
    gamma=10,
    min_child_weight=16,
    silent=0,
    objective="binary:logistic",
    eval_metric="auc",
    scale_pos_weight=scale_pos_weight,
)

In [None]:
training_job_name = f"{config.SOLUTION_PREFIX}-{unique_hash}-xgb"
print(
    f"You can go to SageMaker -> Training -> Training jobs -> a job name started with {training_job_name} to monitor training status and details."
)

# Generic estimator around the XGBoost container, configured with the
# baseline hyperparameters; artifacts go to `output_location`.
clf = sagemaker.estimator.Estimator(
    container,
    role=config.SAGEMAKER_IAM_ROLE,
    hyperparameters=hyperparams,
    instance_count=1,
    instance_type=instance_type_train,
    output_path=output_location,
    sagemaker_session=session,
    base_job_name=training_job_name,
)

In [None]:
# Train on the uploaded svmlight file, passed as the "train" channel.
clf.fit({"train": s3_train_data})

In [None]:
from sagemaker.serializers import CSVSerializer

endpoint_name = f"{config.SOLUTION_PREFIX}-{unique_hash}-xgb"
print(
    f"You can go to SageMaker -> Inference -> Endpoints --> an endpoint with name {endpoint_name} to monitor the deployment status."
)

# Host the trained XGBoost model; requests are serialized as CSV.
predictor = clf.deploy(
    initial_instance_count=1,
    endpoint_name=endpoint_name,
    instance_type=instance_type_inference,  # use a smaller instance for endpoint deployment
    serializer=CSVSerializer(),
)

In [None]:

def predict(current_predictor, data, rows=500):
    """Get predictions for ``data`` from an endpoint, in several requests.

    Args:
        current_predictor: Object whose ``predict(chunk)`` returns UTF-8
            encoded bytes containing one numeric prediction per input row.
        data: 2-D array of samples; split along its first axis.
        rows: Approximate upper bound on the number of rows per request.

    Returns:
        1-D float ``numpy`` array of predictions, in input order.
    """
    # Same chunk count as int(n / rows + 1): always at least one chunk.
    n_chunks = int(data.shape[0] / float(rows) + 1)
    tokens = []
    for chunk in np.array_split(data, n_chunks):
        payload = current_predictor.predict(chunk).decode("utf-8")
        # Values may be separated by commas and/or whitespace (including
        # newlines); normalize to whitespace and split once. Collecting
        # tokens in a list also avoids re-copying one ever-growing string.
        tokens.extend(payload.replace(",", " ").split())
    return np.array([float(token) for token in tokens], dtype=float)

In [None]:
# Raw endpoint outputs for the test set (thresholded at 0.5 in the next cell).
raw_preds = predict(predictor, X_test)

In [None]:
from sklearn.metrics import balanced_accuracy_score, cohen_kappa_score, f1_score, roc_auc_score


# Hard labels from a 0.5 cutoff; ROC AUC is computed on the raw outputs.
y_preds = np.where(raw_preds > 0.5, 1, 0)
xgb_metrics = {
    "Balanced accuracy": balanced_accuracy_score(y_test, y_preds),
    "Cohen's Kappa": cohen_kappa_score(y_test, y_preds),
    "F1": f1_score(y_test, y_preds),
    "ROC_AUC": roc_auc_score(y_test, raw_preds),
}
result_xgboost = pd.DataFrame(
    list(xgb_metrics.values()), index=list(xgb_metrics.keys()), columns=["XGB"]
)

In [None]:
# Place the RCF and XGB metric columns side by side (joined on the metric-name index).
result_rcf_xgboost = result_rcf.join(result_xgboost)
print(result_rcf_xgboost)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix


def plot_confusion_matrix(y_true, y_predicted):
    """Show the confusion matrix of ``y_predicted`` against ``y_true`` as a heatmap.

    Cell colors come from the row-normalized matrix (each true class sums
    to 1); the numbers written in the cells are the raw counts.

    Args:
        y_true: Ground-truth labels.
        y_predicted: Predicted labels.
    """
    counts = confusion_matrix(y_true, y_predicted)
    # Divide each row by its total to get per-true-class proportions.
    row_totals = counts.sum(axis=1)[:, np.newaxis]
    proportions = counts.astype("float") / row_totals

    class_names = ["non-fraud", "fraud"]
    ax = sns.heatmap(proportions, annot=counts, fmt="d")
    ax.set(xticklabels=class_names, yticklabels=class_names)
    ax.set_ylim([0, 2])
    plt.title("Confusion Matrix")
    plt.ylabel("Real Classes")
    plt.xlabel("Predicted Classes")
    plt.show()

In [None]:
# Confusion matrix for the baseline XGBoost predictions on the test set.
plot_confusion_matrix(y_test, y_preds)

In [None]:
from sklearn.metrics import classification_report

# Classification report for the baseline XGBoost predictions.
print(classification_report(y_test, y_preds, target_names=["non-fraud", "fraud"]))

## Train an XGBoost model with SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# Resample the training split only (the test split is left untouched);
# the fixed seed keeps the resampling reproducible.
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

In [None]:
from collections import Counter

# Number of samples per label after resampling, sorted by label.
print(sorted(Counter(y_smote).items()))

In [None]:
smote_buf = io.BytesIO()

# Dump the SMOTE data into a buffer
sklearn.datasets.dump_svmlight_file(X_smote, y_smote, smote_buf)
smote_buf.seek(0)

# Upload from the buffer to S3
key = "fraud-dataset-smote"
subdir = "smote"
# S3 keys are always "/"-separated, so build the key explicitly rather than
# with os.path.join; the same key is reused for the printed URI.
smote_object_key = f"{prefix}/train/{subdir}/{key}"
boto3.resource("s3", region_name=config.AWS_REGION).Bucket(bucket).Object(
    smote_object_key
).upload_fileobj(smote_buf)

s3_smote_train_data = f"s3://{bucket}/{smote_object_key}"
print("Uploaded training data location: {}".format(s3_smote_train_data))

smote_output_location = "s3://{}/{}/smote-output".format(bucket, prefix)
print("Training artifacts are uploaded to: {}".format(smote_output_location))

In [None]:

# The SMOTE data is already resampled, so train without `scale_pos_weight`.
# Build a filtered copy instead of popping from `hyperparams`, so the shared
# dict is not changed and this cell gives the same result when re-run.
smote_hyperparams = {
    name: value for name, value in hyperparams.items() if name != "scale_pos_weight"
}

training_job_name = f"{config.SOLUTION_PREFIX}-{unique_hash}-xgb-smote"
print(
    f"You can go to SageMaker -> Training -> Training jobs -> a job name started with {training_job_name} to monitor training status and details."
)

smote_xgb = sagemaker.estimator.Estimator(
    container,
    role=config.SAGEMAKER_IAM_ROLE,
    hyperparameters=smote_hyperparams,
    instance_count=1,
    instance_type=instance_type_train,
    output_path=smote_output_location,
    sagemaker_session=session,
    base_job_name=training_job_name,
)

In [None]:
# Train on the uploaded SMOTE-resampled file, passed as the "train" channel.
smote_xgb.fit({"train": s3_smote_train_data})

In [None]:
endpoint_name = f"{config.SOLUTION_PREFIX}-{unique_hash}-xgb-smote"
print(
    f"You can go to SageMaker -> Inference -> Endpoints --> an endpoint with name {endpoint_name} to monitor the deployment status."
)

# Host the SMOTE-trained model; requests are serialized as CSV.
smote_predictor = smote_xgb.deploy(
    initial_instance_count=1,
    endpoint_name=endpoint_name,
    instance_type=instance_type_inference,  # use a smaller instance for endpoint deployment
    serializer=CSVSerializer(),
)

In [None]:
# Raw endpoint outputs for the test set.
smote_raw_preds = predict(smote_predictor, X_test)
# Generate predicted labels using a cutoff threshold of 0.5.
above_cutoff = smote_raw_preds > 0.5
smote_preds = np.where(above_cutoff, 1, 0)

In [None]:
# Same four metrics as for the other models, for the SMOTE-trained XGBoost model.
smote_metrics = {
    "Balanced accuracy": balanced_accuracy_score(y_test, smote_preds),
    "Cohen's Kappa": cohen_kappa_score(y_test, smote_preds),
    "F1": f1_score(y_test, smote_preds),
    "ROC_AUC": roc_auc_score(y_test, smote_raw_preds),
}
result_xgboost_smote = pd.DataFrame(
    list(smote_metrics.values()),
    index=list(smote_metrics.keys()),
    columns=["XGB_SMOTE"],
)

# Append the new column to the running comparison table.
result_rcf_xgboost_all = result_rcf_xgboost.join(result_xgboost_smote)
print(result_rcf_xgboost_all)

In [None]:
# Confusion matrix for the SMOTE-trained model on the test set.
plot_confusion_matrix(y_test, smote_preds)

In [None]:
# Classification report for the SMOTE-trained model.
print(classification_report(y_test, smote_preds, target_names=["non-fraud", "fraud"]))

In [None]:
# Sweep the decision cutoff from 0.1 to 0.9 and report both metrics at each value.
for thres in np.linspace(0.1, 0.9, num=9):
    smote_thres_preds = np.where(smote_raw_preds > thres, 1, 0)
    balanced_acc = balanced_accuracy_score(y_test, smote_thres_preds)
    kappa = cohen_kappa_score(y_test, smote_thres_preds)
    print("Threshold: {:.1f}".format(thres))
    print("Balanced accuracy = {:.3f}".format(balanced_acc))
    print("Cohen's Kappa = {:.3f}\n".format(kappa))

## Train XGBoost model with hyper-parameter optimization


In [None]:
# Split the training data 60/40 into HPO training and validation subsets,
# stratified on the label; the test split is not involved.
X_train_hpo, X_valid_hpo, y_train_hpo, y_valid_hpo = train_test_split(
    X_train, y_train, test_size=0.4, random_state=42, stratify=y_train
)

In [None]:
# Compare the HPO training subset size with the full training split.
X_train_hpo.shape, X_train.shape

In [None]:
# Build [label | features] matrices with the label in the first column.
# The training matrix must come from the HPO training subset only:
# X_valid_hpo is carved out of X_train, so using the full X_train / y_train
# here would put every validation row into the training file and inflate
# the validation metric.
X_train_combine = np.concatenate((np.reshape(y_train_hpo, (-1, 1)), X_train_hpo), axis=1)
X_valid_combine = np.concatenate((np.reshape(y_valid_hpo, (-1, 1)), X_valid_hpo), axis=1)

In [None]:
# Create the output directory from Python rather than through a shell call
# (same effect as `mkdir -p`, but not dependent on the shell), then write
# both matrices as CSV.
os.makedirs("xgboost_hpo_input", exist_ok=True)
np.savetxt("xgboost_hpo_input/X_train_hpo.csv", X_train_combine, delimiter=",")
np.savetxt("xgboost_hpo_input/X_valid_hpo.csv", X_valid_combine, delimiter=",")

In [None]:
from sagemaker.s3 import S3Uploader
from sagemaker.inputs import TrainingInput

# From here on `prefix` points at the HPO-specific S3 location.
prefix = "fraud-classifier-hpo"

s3_train_data = S3Uploader.upload(
    "xgboost_hpo_input/X_train_hpo.csv", "s3://{}/{}/{}".format(bucket, prefix, "train")
)
print("Uploaded training data location: {}".format(s3_train_data))

s3_validation_data = S3Uploader.upload(
    "xgboost_hpo_input/X_valid_hpo.csv", "s3://{}/{}/{}".format(bucket, prefix, "validation")
)
# This upload is the validation file, so label it as such in the log line.
print("Uploaded validation data location: {}".format(s3_validation_data))

output_location = "s3://{}/{}/output".format(bucket, prefix)
print("Training artifacts are uploaded to: {}".format(output_location))

In [None]:
# CSV input channels pointing at the uploaded train/ and validation/ folders.
hpo_s3_root = f"s3://{bucket}/{prefix}"
s3_input_train = TrainingInput(s3_data=f"{hpo_s3_root}/train/", content_type="csv")
s3_input_validation = TrainingInput(s3_data=f"{hpo_s3_root}/validation/", content_type="csv")

In [None]:
from sagemaker.tuner import (
    IntegerParameter,
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
)


# Positive-class weight: sqrt(#negatives / #positives) over the full training split.
scale_pos_weight = sqrt(
    np.count_nonzero(y_train == 0) / np.count_nonzero(y_train)
)

# Base estimator whose remaining hyperparameters are searched by the tuner.
xgb = sagemaker.estimator.Estimator(
    container,
    config.SAGEMAKER_IAM_ROLE,
    instance_count=1,
    instance_type=instance_type_train,
    output_path=output_location,
    sagemaker_session=session,
)

# Hyperparameters held fixed across all tuning trials.
xgb.set_hyperparameters(
    eval_metric="auc",
    objective="binary:logistic",
    num_round=1000,
    early_stopping_rounds=10,
    silent=0,
    scale_pos_weight=scale_pos_weight,
)

In [None]:
# Define the hyper-parameters search ranges.
# NOTE(review): the lower bound for "eta" is 0 — confirm a zero learning rate is intended.
hyperparameter_ranges = {
    "eta": ContinuousParameter(0, 0.5),
    "min_child_weight": ContinuousParameter(1, 10),
    "gamma": ContinuousParameter(2, 7),
    "max_depth": IntegerParameter(1, 10),
    "subsample": ContinuousParameter(0.6, 1),
}

Define objective metric name and objective type.

In [None]:
# The tuner maximizes AUC on the validation channel.
objective_metric_name = "validation:auc"
objective_type = "Maximize"

In [None]:
tuning_job_name = f"{config.SOLUTION_PREFIX}-{unique_hash}-tuning"
print(
    f"You can go to SageMaker -> Training -> Hyperparameter tuning jobs -> a job name started with {tuning_job_name} to monitor HPO tuning status and details.\n"
    f"Note. You are unable to successfully run the following cells until the tuning job completes. This step may take around 15 min."
)

# Run at most 30 training jobs in total, 3 at a time, over the declared ranges.
tuner = HyperparameterTuner(
    xgb,
    objective_metric_name,
    hyperparameter_ranges,
    max_jobs=30,
    max_parallel_jobs=3,
    objective_type=objective_type,
    base_tuning_job_name=tuning_job_name,
)

# Start tuning with the CSV train and validation channels.
tuner.fit({"train": s3_input_train, "validation": s3_input_validation})

In [None]:
# Query the SageMaker API for the current status of the latest tuning job.
boto3.client("sagemaker").describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner.latest_tuning_job.job_name
)["HyperParameterTuningJobStatus"]

In [None]:
# SageMaker client from the default boto3 session.
sm_client = boto3.Session().client("sagemaker")

# Replace the base name set earlier with the name of the job actually launched.
tuning_job_name = tuner.latest_tuning_job.name
tuning_job_name

In [None]:
# Fetch the full description of the tuning job.
tuning_job_result = sm_client.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name
)

status = tuning_job_result["HyperParameterTuningJobStatus"]
if status != "Completed":
    print("Reminder: the tuning job has not been completed.")

job_count = tuning_job_result["TrainingJobStatusCounters"]["Completed"]
print("%d training jobs have completed" % job_count)

tuning_objective = tuning_job_result["HyperParameterTuningJobConfig"][
    "HyperParameterTuningJobObjective"
]
# True exactly when the objective type is "Maximize". Comparing with `!=`
# would yield the opposite of what the variable name says.
is_maximize = tuning_objective["Type"] == "Maximize"
objective_name = tuning_objective["MetricName"]

In [None]:
import pandas as pd

tuner_analytics = sagemaker.HyperparameterTuningJobAnalytics(tuning_job_name)

full_df = tuner_analytics.dataframe()

# Start from the full frame so `df` is always defined, even when no training
# job has been recorded yet.
df = full_df
if len(full_df) > 0:
    # Keep only jobs that reported a finite objective value.
    df = full_df[full_df["FinalObjectiveValue"] > -float("inf")]

if len(df) > 0:
    df = df.sort_values("FinalObjectiveValue", ascending=False)
    print("Number of training jobs with valid objective: %d" % len(df))
    print({"lowest": min(df["FinalObjectiveValue"]), "highest": max(df["FinalObjectiveValue"])})
    # None means "no limit"; the old -1 spelling is rejected by current pandas.
    pd.set_option("display.max_colwidth", None)  # Don't truncate TrainingJobName
else:
    print("No training jobs have reported valid results yet.")

df

### Deploy endpoint of the best tuning job

In [None]:
from sagemaker.serializers import CSVSerializer

endpoint_name = f"{config.SOLUTION_PREFIX}-{unique_hash}-xgb-tuning"
print(
    f"You can go to SageMaker -> Inference -> Endpoints --> an endpoint with name {endpoint_name} to monitor the deployment status."
)

# Deploy the best model found by the tuner. Use the inference instance type,
# as every other endpoint in this notebook does, not the training one.
predictor_hpo = tuner.deploy(
    initial_instance_count=1,
    instance_type=instance_type_inference,
    serializer=CSVSerializer(),
    endpoint_name=endpoint_name,
)

In [None]:
# Raw endpoint outputs for the test set, then hard labels from a 0.5 cutoff.
raw_preds_hpo = predict(predictor_hpo, X_test)
preds_hpo = np.where(raw_preds_hpo > 0.5, 1, 0)

In [None]:
# Same four metrics as for the other models, for the tuned XGBoost model.
hpo_metrics = {
    "Balanced accuracy": balanced_accuracy_score(y_test, preds_hpo),
    "Cohen's Kappa": cohen_kappa_score(y_test, preds_hpo),
    "F1": f1_score(y_test, preds_hpo),
    "ROC_AUC": roc_auc_score(y_test, raw_preds_hpo),
}
result_xgboost_hpo = pd.DataFrame(
    list(hpo_metrics.values()),
    index=list(hpo_metrics.keys()),
    columns=["XGB_HPO"],
)

## Compare the performance of all models

The table below places the metrics of every model trained in this notebook side by side.

In [None]:
# Add the tuned-model column to the comparison table and print all four models together.
result_xgboost_all_hpo = result_rcf_xgboost_all.join(result_xgboost_hpo)
print(result_xgboost_all_hpo)