In [1]:
import boto3
import time
from botocore.exceptions import ClientError

In [4]:
# SageMaker client shared by the helper functions defined in this notebook
sagemaker_client = boto3.client(service_name="sagemaker")

def check_ingestion_status(feature_group_name, poll_interval=30, timeout=None):
    """Wait for a feature group to leave the 'Creating' state and print its status.

    Calls `describe_feature_group` on the notebook-level `sagemaker_client`
    and re-polls for as long as `FeatureGroupStatus` is 'Creating'. The final
    status is printed; if it is 'CreateFailed', the failure reason from the
    response is printed as well.

    Note: the value printed as "Ingestion Status" is the `FeatureGroupStatus`
    field of the describe response.

    Args:
        feature_group_name: Name of the feature group to describe.
        poll_interval: Seconds to sleep between polls while the status is
            'Creating'. Defaults to 30, the previously hard-coded value.
        timeout: Optional maximum number of seconds to keep polling. With the
            default of None the function waits indefinitely, as it did before
            this parameter existed.

    Returns:
        None. Results are reported via `print`.

    Raises:
        Nothing for `ClientError`: it is caught and printed instead.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = None if timeout is None else time.monotonic() + timeout

    try:
        response = sagemaker_client.describe_feature_group(FeatureGroupName=feature_group_name)
        ingestion_status = response['FeatureGroupStatus']

        while ingestion_status == 'Creating':
            if deadline is not None and time.monotonic() >= deadline:
                # Give up waiting, but still report the last status seen below.
                print(f"Timed out after {timeout} seconds waiting for feature group creation.")
                break
            time.sleep(poll_interval)
            response = sagemaker_client.describe_feature_group(FeatureGroupName=feature_group_name)
            ingestion_status = response['FeatureGroupStatus']

        print(f"Ingestion Status: {ingestion_status}")
        if ingestion_status == 'CreateFailed':
            print(f"Failure Reason: {response.get('FailureReason', 'Unknown')}")
    except ClientError as e:
        print(f"An error occurred: {e}")

# Report the status of the fraud feature group
check_ingestion_status("fraud-feature-group")


Ingestion Status: Created


In [32]:
import boto3

# Athena client; reused by later cells to check on the query
athena_client = boto3.client(service_name='athena')

# SQL to run: select every row and column of the feature group table
query_string = """
SELECT * FROM fraud_feature_group_1704739102
"""

# S3 location that Athena is told to write the query results to
output_location = 's3://sagemaker-us-east-1-470086202700/fraud_train'

# Submit the query; its state is checked separately using the execution ID
response = athena_client.start_query_execution(
    QueryString=query_string,
    QueryExecutionContext=dict(Database='sagemaker_featurestore'),  # The default database name
    ResultConfiguration=dict(OutputLocation=output_location),
)

# Keep the execution ID so later cells can look up status and results
query_execution_id = response['QueryExecutionId']

In [34]:
def get_query_results(query_execution_id):
    """Report the current state of an Athena query and return a status code.

    Makes a single `get_query_execution` call on the notebook-level
    `athena_client` (no polling), prints a one-line summary, and maps the
    query state to a numeric code.

    Args:
        query_execution_id: Execution ID of the query to look up.

    Returns:
        200 if the state is 'SUCCEEDED', 201 if it is 'FAILED' or
        'CANCELLED', and -1 for any other state (reported as in progress).
    """
    query_status = athena_client.get_query_execution(QueryExecutionId=query_execution_id)
    query_execution_status = query_status['QueryExecution']['Status']['State']

    if query_execution_status == 'SUCCEEDED':
        # `output_location` is the notebook-level S3 location given to Athena.
        print("Query succeeded, results are in:", output_location)
        return 200
    elif query_execution_status in ['FAILED', 'CANCELLED']:
        print(f"Query {query_execution_status.lower()}.")
        return 201
    else:
        print("Query in progress...")
        return -1

# One-off status check for the query started above (does not wait or retry)
get_query_results(query_execution_id)

Query succeeded, results are in: s3://sagemaker-us-east-1-470086202700/fraud_train


In [35]:
import boto3
s3 = boto3.client(service_name='s3')

bucket_name = "sagemaker-us-east-1-470086202700"
prefix = "fraud_train"

# Object keys are built from the prefix and the query execution ID
csv_key = '{}/{}.csv'.format(prefix, query_execution_id)
metadata_key = csv_key + '.metadata'

# Local destinations for the two downloaded objects
local_csv_path = 'query_results.csv'
local_metadata_path = 'query_metadata.txt'

# Fetch the CSV first, then its metadata file
for s3_key, local_path in ((csv_key, local_csv_path), (metadata_key, local_metadata_path)):
    s3.download_file(bucket_name, s3_key, local_path)


In [36]:
import pandas as pd
# Load the downloaded query results into a DataFrame
df = pd.read_csv(filepath_or_buffer=local_csv_path)
# Show the number of rows that were loaded
df.shape[0]

284807

In [37]:
# List the column labels of the loaded query results
df.columns

Index(['index', 'time', 'v1', 'v2', 'v3', 'v4', 'v5', 'v6', 'v7', 'v8', 'v9',
       'v10', 'v11', 'v12', 'v13', 'v14', 'v15', 'v16', 'v17', 'v18', 'v19',
       'v20', 'v21', 'v22', 'v23', 'v24', 'v25', 'v26', 'v27', 'v28', 'amount',
       'class', 'event_time', 'write_time', 'api_invocation_time',
       'is_deleted'],
      dtype='object')