In [35]:
import boto3
import numpy as np
import os
import sagemaker
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer

In [45]:
# SageMaker session; later cells read its region and hand it to the Estimator.
sess = sagemaker.Session()
# Execution role returned by SageMaker; passed to the Estimator as `role`.
sagemaker_iam_role = sagemaker.get_execution_role()

In [None]:
# S3 bucket/prefix that receives the Athena result files.
bucket_name = "sagemaker-us-east-1-470086202700"
prefix = 'fraud_train'
output_location = f's3://{bucket_name}/{prefix}'

# Define your query: pull every row of the feature group's table.
query_string = """
SELECT * FROM fraud_feature_group_1704739102
"""

athena_client = boto3.client('athena')

# Execute the query. The call returns immediately; completion is polled
# in a later cell using the execution ID captured below.
query_context = {'Database': 'sagemaker_featurestore'}  # The default database name
result_config = {'OutputLocation': output_location}
response = athena_client.start_query_execution(
    QueryString=query_string,
    QueryExecutionContext=query_context,
    ResultConfiguration=result_config,
)

# Get the query execution ID
query_execution_id = response['QueryExecutionId']

In [5]:
def get_query_results(query_execution_id, client=None):
    """Report the current state of an Athena query execution.

    Args:
        query_execution_id: ID returned by ``start_query_execution``.
        client: Optional Athena client to query. Defaults to the notebook's
            module-level ``athena_client``.

    Returns:
        200 if the query succeeded, 201 if it failed or was cancelled,
        -1 while it is still queued or running.
    """
    client = athena_client if client is None else client
    execution = client.get_query_execution(QueryExecutionId=query_execution_id)['QueryExecution']
    status = execution['Status']
    state = status['State']

    if state == 'SUCCEEDED':
        # Take the location from the response rather than the notebook-level
        # `output_location`, which a later cell rebinds to the training
        # artifact path.
        results_location = execution.get('ResultConfiguration', {}).get('OutputLocation')
        print("Query succeeded, results are in:", results_location)
        return 200

    if state in ('FAILED', 'CANCELLED'):
        # Surface Athena's explanation when it provides one.
        reason = status.get('StateChangeReason')
        suffix = f" Reason: {reason}" if reason else ""
        print(f"Query {state.lower()}.{suffix}")
        return 201

    print("Query in progress...")
    return -1

In [6]:
from time import sleep

# Poll every 5 seconds until get_query_results reports a terminal state
# (it returns a negative code while the query is still in progress).
query_result_code = get_query_results(query_execution_id)
while query_result_code < 0:
    sleep(5)
    query_result_code = get_query_results(query_execution_id)

# 200 is the only success code; anything else means FAILED/CANCELLED.
# Stop here instead of letting the later S3 download fail on a missing file.
if query_result_code != 200:
    raise RuntimeError(
        f"Athena query {query_execution_id} did not succeed; see the status printed above."
    )

Query in progress...
Query succeeded, results are in: s3://sagemaker-us-east-1-470086202700/fraud_train


In [8]:
# The result objects are looked up by query execution ID under the query's output prefix.
bucket_name = "sagemaker-us-east-1-470086202700"
prefix = "fraud_train"
csv_key = f'{prefix}/{query_execution_id}.csv'
metadata_key = f'{csv_key}.metadata'

# Specify the local file paths where you want to download the files
local_csv_path = 'query_results.csv'
local_metadata_path = 'query_metadata.txt'

# Download the files: the result CSV first, then its metadata.
s3 = boto3.client('s3')
for s3_key, local_path in (
    (csv_key, local_csv_path),
    (metadata_key, local_metadata_path),
):
    s3.download_file(bucket_name, s3_key, local_path)

In [12]:
import pandas as pd

# Load the downloaded query result into a DataFrame.
data = pd.read_csv(local_csv_path)
# Show how many rows were loaded.
data.shape[0]

284807

In [13]:
# Preview the first rows of the loaded query result.
data.head()

Unnamed: 0,index,time,v1,v2,v3,v4,v5,v6,v7,v8,...,v25,v26,v27,v28,amount,class,event_time,write_time,api_invocation_time,is_deleted
0,261646,160121.0,-0.177158,1.343548,-0.754797,1.090594,1.416297,-0.754299,1.206201,-0.062678,...,-0.021679,-0.397139,0.437727,0.279197,1.0,0,2024-01-08T18:14:05Z,2024-01-08 19:29:26.953,2024-01-08 19:24:26.000,False
1,261766,160171.0,2.033563,-0.093322,-1.17592,0.212529,0.137302,-0.607642,0.084526,-0.161272,...,-0.287737,0.202759,-0.069516,-0.073209,1.98,0,2024-01-08T18:14:05Z,2024-01-08 19:29:26.953,2024-01-08 19:24:28.000,False
2,261824,160196.0,2.072731,0.200732,-1.677515,0.419828,0.451055,-0.886,0.225951,-0.27171,...,-0.242892,0.167666,-0.0589,-0.028837,1.98,0,2024-01-08T18:14:05Z,2024-01-08 19:29:26.953,2024-01-08 19:24:28.000,False
3,261829,160198.0,0.208904,0.958279,-0.831829,-0.768545,1.367007,-0.292268,0.898246,-0.012544,...,-0.306645,0.176332,0.217394,0.073151,1.29,0,2024-01-08T18:14:05Z,2024-01-08 19:29:26.953,2024-01-08 19:24:28.000,False
4,261940,160248.0,-1.976049,-2.945491,1.861101,-0.893697,1.297515,-0.954466,-1.351949,0.193612,...,-0.569773,0.390653,-0.15212,-0.001908,139.5,0,2024-01-08T18:14:05Z,2024-01-08 19:29:26.953,2024-01-08 19:24:29.000,False


In [14]:
# Drop the timestamp/bookkeeping columns and the record `index`.
# errors='ignore' keeps this cell re-runnable: `data` is rebound here, so a
# second run would otherwise raise KeyError on the already-removed columns.
data = data.drop(
    columns=['event_time', 'write_time', 'api_invocation_time', 'is_deleted', 'index'],
    errors='ignore',
)

In [15]:
# Check which columns remain after the drop.
data.head()

Unnamed: 0,time,v1,v2,v3,v4,v5,v6,v7,v8,v9,...,v21,v22,v23,v24,v25,v26,v27,v28,amount,class
0,160121.0,-0.177158,1.343548,-0.754797,1.090594,1.416297,-0.754299,1.206201,-0.062678,-0.376904,...,-0.00015,0.185884,-0.259461,0.50013,-0.021679,-0.397139,0.437727,0.279197,1.0,0
1,160171.0,2.033563,-0.093322,-1.17592,0.212529,0.137302,-0.607642,0.084526,-0.161272,0.248896,...,-0.249078,-0.594433,0.289451,-0.385763,-0.287737,0.202759,-0.069516,-0.073209,1.98,0
2,160196.0,2.072731,0.200732,-1.677515,0.419828,0.451055,-0.886,0.225951,-0.27171,0.403835,...,-0.355675,-0.891227,0.336672,0.583988,-0.242892,0.167666,-0.0589,-0.028837,1.98,0
3,160198.0,0.208904,0.958279,-0.831829,-0.768545,1.367007,-0.292268,0.898246,-0.012544,0.109625,...,-0.413991,-1.067054,-0.022942,-0.435902,-0.306645,0.176332,0.217394,0.073151,1.29,0
4,160248.0,-1.976049,-2.945491,1.861101,-0.893697,1.297515,-0.954466,-1.351949,0.193612,-0.160682,...,-0.101432,-0.534028,0.724851,0.518215,-0.569773,0.390653,-0.15212,-0.001908,139.5,0


In [19]:
## Moving class to first column
model_data = data
model_data.head()
model_data = pd.concat([model_data['class'], model_data.drop(['class'], axis=1)], axis=1)
model_data.head()

Unnamed: 0,class,time,v1,v2,v3,v4,v5,v6,v7,v8,...,v20,v21,v22,v23,v24,v25,v26,v27,v28,amount
0,0,160121.0,-0.177158,1.343548,-0.754797,1.090594,1.416297,-0.754299,1.206201,-0.062678,...,-0.004314,-0.00015,0.185884,-0.259461,0.50013,-0.021679,-0.397139,0.437727,0.279197,1.0
1,0,160171.0,2.033563,-0.093322,-1.17592,0.212529,0.137302,-0.607642,0.084526,-0.161272,...,-0.185765,-0.249078,-0.594433,0.289451,-0.385763,-0.287737,0.202759,-0.069516,-0.073209,1.98
2,0,160196.0,2.072731,0.200732,-1.677515,0.419828,0.451055,-0.886,0.225951,-0.27171,...,-0.119373,-0.355675,-0.891227,0.336672,0.583988,-0.242892,0.167666,-0.0589,-0.028837,1.98
3,0,160198.0,0.208904,0.958279,-0.831829,-0.768545,1.367007,-0.292268,0.898246,-0.012544,...,0.014659,-0.413991,-1.067054,-0.022942,-0.435902,-0.306645,0.176332,0.217394,0.073151,1.29
4,0,160248.0,-1.976049,-2.945491,1.861101,-0.893697,1.297515,-0.954466,-1.351949,0.193612,...,0.452565,-0.101432,-0.534028,0.724851,0.518215,-0.569773,0.390653,-0.15212,-0.001908,139.5


In [22]:
# Shuffle once with a fixed seed, then cut 70% / 20% / 10% into
# train / validation / test.
# Positional slicing replaces np.split, which triggers a pandas deprecation
# warning when applied to a DataFrame; the slice boundaries are unchanged.
shuffled_data = model_data.sample(frac=1, random_state=1229)
train_end = int(0.7 * len(shuffled_data))
validation_end = int(0.9 * len(shuffled_data))

train_data = shuffled_data.iloc[:train_end]
validation_data = shuffled_data.iloc[train_end:validation_end]
test_data = shuffled_data.iloc[validation_end:]

  return bound(*args, **kwds)


In [26]:
prefix = 'xgboost-fraud'

# Write the splits as headerless, index-free CSV files.
train_data.to_csv('train.csv', header=False, index=False)
validation_data.to_csv('validation.csv', header=False, index=False)

# S3 keys always use '/', so build them explicitly instead of with
# os.path.join, whose separator depends on the operating system.
# One bucket handle serves both uploads.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket_name)
s3_bucket.Object(f'{prefix}/train/train.csv').upload_file('train.csv')
s3_bucket.Object(f'{prefix}/validation/validation.csv').upload_file('validation.csv')

In [28]:
# Full S3 URIs of the uploaded training and validation files.
s3_train_data = 's3://{}/{}/train/train.csv'.format(bucket_name, prefix)
s3_validation_data = 's3://{}/{}/validation/validation.csv'.format(bucket_name, prefix)

print('Uploaded training data location: {}'.format(s3_train_data))
# Label fixed: this line previously also said "training data".
print('Uploaded validation data location: {}'.format(s3_validation_data))

# Where the training job will write its artifacts.
# Note: this rebinds `output_location`, which earlier held the Athena result prefix.
output_location = 's3://{}/{}/output'.format(bucket_name, prefix)
print('Training artifacts will be uploaded to: {}'.format(output_location))

Uploaded training data location: s3://sagemaker-us-east-1-470086202700/xgboost-fraud/train/train.csv
Uploaded training data location: s3://sagemaker-us-east-1-470086202700/xgboost-fraud/validation/validation.csv
Training artifacts will be uploaded to: s3://sagemaker-us-east-1-470086202700/xgboost-fraud/output


In [42]:
# Look up the XGBoost 1.7-1 image URI for the session's region.
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=sess.boto_region_name,
    version="1.7-1",
)
container

'683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.7-1'

In [43]:
# Point each channel at its S3 prefix. Both prefixes end with '/' so that a
# channel only matches objects inside its own folder; a bare ".../train"
# prefix would also match sibling keys such as ".../train_old/...".
s3_input_train = TrainingInput(
    s3_data="s3://{}/{}/train/".format(bucket_name, prefix),
    content_type="csv",
)
s3_input_validation = TrainingInput(
    s3_data="s3://{}/{}/validation/".format(bucket_name, prefix),
    content_type="csv",
)

In [48]:
# Estimator for the retrieved XGBoost image: one ml.m4.xlarge instance,
# with artifacts written to output_location.
xgb = sagemaker.estimator.Estimator(
    image_uri=container,
    role=sagemaker_iam_role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=output_location,
    sagemaker_session=sess,
)

In [49]:
# Hyperparameters for the XGBoost training job (binary logistic objective, 100 rounds).
# NOTE(review): `silent` looks like the legacy logging flag that newer XGBoost
# versions replace with `verbosity` — confirm it is still honoured by the 1.7-1 image.
xgb_hyperparameters = {
    "max_depth": 5,
    "eta": 0.2,
    "gamma": 4,
    "min_child_weight": 6,
    "subsample": 0.8,
    "silent": 0,
    "objective": 'binary:logistic',
    "num_round": 100,
}
xgb.set_hyperparameters(**xgb_hyperparameters)

In [None]:
# Start the training job with the train and validation channels.
training_channels = {"train": s3_input_train, "validation": s3_input_validation}
xgb.fit(inputs=training_channels)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-01-08-20-20-24-130


2024-01-08 20:20:24 Starting - Starting the training job......
2024-01-08 20:21:10 Starting - Preparing the instances for training.........
2024-01-08 20:22:46 Downloading - Downloading input data.