In [26]:
# %load https://raw.githubusercontent.com/PacktPublishing/Amazon-SageMaker-Cookbook/master/Chapter07/scripts/generator.py
import random
import pandas as pd
from time import time, sleep


def log(message):
    """Print ``message`` with a ``[+] `` prefix, followed by an empty line."""
    # The newline embedded in the text plus print()'s own newline
    # produce the blank separator line after each message.
    line = f"[+] {message}\n"
    print(line)

    
def generate_random_score(low=60, high=100):
    """Return a random integer score in the inclusive range [low, high].

    Args:
        low: Smallest score that can be returned. Defaults to 60.
        high: Largest score that can be returned. Defaults to 100.

    Returns:
        int: A value N such that low <= N <= high.
    """
    # random.randint includes both endpoints.
    return random.randint(low, high)


def generate_list_of_random_scores(total_records=1000):
    """Build a list of ``total_records`` independently drawn random scores.

    Args:
        total_records: Number of scores to generate. Defaults to 1000.

    Returns:
        list: One value from generate_random_score() per requested record.
    """
    scores = []
    for _ in range(total_records):
        scores.append(generate_random_score())
    return scores


def generate_event_time_records(num_records):
    """Return a float64 Series with the current Unix time repeated.

    The clock is read once and rounded to whole seconds, so every one of
    the ``num_records`` elements carries the same value.
    """
    now_seconds = int(round(time()))
    repeated = [now_seconds for _ in range(num_records)]
    return pd.Series(repeated, dtype="float64")


def main():
    """Generate the synthetic scores dataset and return it as a DataFrame.

    Builds 1000 rows (the default size of generate_list_of_random_scores)
    with five random score columns and a ``sex`` column, derives the
    ``approved`` label, shuffles the rows, adds the ``index`` and
    ``event_time`` columns, and finally encodes ``approved`` and ``sex``
    as integers. The frame is printed before and after the encoding step.

    Returns:
        pandas.DataFrame: Columns, in order: approved, sex, math, science,
        technology, random1, random2, index, event_time.
    """
    log("Generating column values for math, science, technology")
    math = generate_list_of_random_scores()
    science = generate_list_of_random_scores()
    technology = generate_list_of_random_scores()

    log("Generating column values for random1 and random2")
    random1 = generate_list_of_random_scores()
    random2 = generate_list_of_random_scores()
    # 800 + 200 entries: must match the 1000 records generated above.
    sex = ["male"] * 800 + ["female"] * 200

    all_df = pd.DataFrame(dict(sex=sex,
                               math=math,
                               science=science,
                               technology=technology,
                               random1=random1,
                               random2=random2))

    log("Computing values for the approved column")
    # Vectorized column arithmetic instead of a row-wise apply(axis=1);
    # the boolean result is inserted directly as the first column.
    total_score = all_df["math"] + all_df["science"] + all_df["technology"]
    all_df.insert(0, "approved", total_score > 240)
    # .loc slicing is label-based and inclusive: rows 0..599 (all of which
    # are among the first 800 "male" rows) are approved regardless of score.
    # NOTE(review): presumably this deliberately skews approvals toward one
    # group for a later bias analysis; confirm intent.
    all_df.loc[0:599, 'approved'] = True

    log("Shuffling DataFrame rows")
    all_df = all_df.sample(frac=1).reset_index(drop=True)

    log("Generating the index and event_time column values")
    # 1-based record identifier assigned after shuffling.
    all_df['index'] = range(1, len(all_df) + 1)
    all_df['event_time'] = generate_event_time_records(len(all_df))

    print(all_df)

    log("Converting approved and sex columns")
    # True/False -> 1/0 and "male"/other -> 1/0, as int64 columns.
    all_df['approved'] = all_df['approved'].astype("int64")
    all_df['sex'] = (all_df['sex'] == "male").astype("int64")

    print(all_df)

    return all_df


# Build the synthetic dataset once at cell level; later cells read `all_df`
# (feature definition loading and ingestion).
log("Running the main() function")
all_df = main()

[+] Running the main() function

[+] Generating column values for math, science, technology

[+] Generating column values for random1 and random2

[+] Computing values for the approved column

[+] Shuffling DataFrame rows

[+] Generating the index and event_time column values

     approved     sex  math  science  technology  random1  random2  index  \
0        True    male    97       97          98       93       82      1   
1        True    male    85       68          62       92       65      2   
2        True    male    99      100          80       71       60      3   
3        True    male    91       79          84       60       70      4   
4        True    male    73       86          66       70       98      5   
..        ...     ...   ...      ...         ...      ...      ...    ...   
995      True  female    99       62          92       71       75    996   
996      True  female    85       74          91       69       63    997   
997      True    male    72  

In [27]:
# Display the generated DataFrame as the cell's output.
all_df

Unnamed: 0,approved,sex,math,science,technology,random1,random2,index,event_time
0,1,1,97,97,98,93,82,1,1.623605e+09
1,1,1,85,68,62,92,65,2,1.623605e+09
2,1,1,99,100,80,71,60,3,1.623605e+09
3,1,1,91,79,84,60,70,4,1.623605e+09
4,1,1,73,86,66,70,98,5,1.623605e+09
...,...,...,...,...,...,...,...,...,...
995,1,0,99,62,92,71,75,996,1.623605e+09
996,1,0,85,74,91,69,63,997,1.623605e+09
997,1,1,72,99,86,61,65,998,1.623605e+09
998,1,1,79,89,79,98,80,999,1.623605e+09


In [28]:
import boto3
import sagemaker
from sagemaker.session import Session

In [29]:
# Resolve the region from the default boto3 session, then create an explicit
# session pinned to that region.
region = boto3.Session().region_name
session = boto3.Session(region_name=region)

# SageMaker API client from the pinned session.
client = session.client(
    service_name='sagemaker', 
    region_name=region
)

# Feature Store runtime client (used below for get_record).
runtime = session.client(
    service_name='sagemaker-featurestore-runtime', 
    region_name=region
)

# SageMaker SDK session wired to the two clients above; passed to FeatureGroup.
feature_store_session = Session(
    boto_session=session,
    sagemaker_client=client,
    sagemaker_featurestore_runtime_client=runtime
)

In [30]:
# S3 bucket and key prefix used to build the feature group's s3_uri
# (see the feature_group.create cell).
s3_bucket_name = "sagemaker-cookbook-bucket"
prefix = "chapter07"

# Role returned here is passed as role_arn when the feature group is created.
from sagemaker import get_execution_role
role = get_execution_role()

In [31]:
# S3 client in the same region as the session above.
# NOTE(review): not referenced by any other cell visible here — presumably
# used elsewhere; confirm or remove.
s3_client = boto3.client('s3', region_name=region)

In [32]:
# Name of the feature group created below; also persisted with %store at the end.
feature_group_name = 'cookbook-feature-group'

In [33]:
from sagemaker.feature_store.feature_group import FeatureGroup

# Local handle for the feature group, bound to the custom session defined
# earlier; the group itself is created by the later feature_group.create(...) cell.
feature_group = FeatureGroup(
    name=feature_group_name, 
    sagemaker_session=feature_store_session
)

In [34]:
# Best-effort cleanup of a feature group left over from a previous run.
try:
    feature_group.delete()
except client.exceptions.ResourceNotFound:
    # Nothing to clean up: the group was never created or is already gone.
    print("Feature group does not exist")
except Exception as error:
    # Stay best-effort, but report the real cause (permissions, throttling, ...)
    # instead of mislabelling it as "does not exist". Unlike a bare `except:`,
    # this does not swallow KeyboardInterrupt/SystemExit.
    print(f"Could not delete feature group: {error!r}")
else:
    # Give the deletion time to complete before the group is re-created.
    sleep(30)

In [35]:
%%time

# Derive the feature definitions (names and types) from the DataFrame's columns.
feature_group.load_feature_definitions(data_frame=all_df)
sleep(1)

CPU times: user 0 ns, sys: 2.13 ms, total: 2.13 ms
Wall time: 1 s


In [36]:
# Create the feature group, keyed by "index" with "event_time" as the event
# time feature, with the online store enabled.
feature_group.create(
    s3_uri=f"s3://{s3_bucket_name}/{prefix}/input",
    record_identifier_name="index",
    event_time_feature_name="event_time",
    role_arn=role,
    enable_online_store=True
)

# Poll the creation status instead of sleeping for a fixed 60 seconds: a fixed
# sleep is too short when creation is slow and wasteful when it is fast.
# Bounded to 60 polls of 5 seconds so the cell cannot hang forever.
status = feature_group.describe().get("FeatureGroupStatus")
attempts = 0
while status == "Creating" and attempts < 60:
    sleep(5)
    attempts += 1
    status = feature_group.describe().get("FeatureGroupStatus")

# Fail fast here rather than with a less obvious error at ingestion time.
if status != "Created":
    raise RuntimeError(
        f"Feature group {feature_group.name} is not ready (status: {status})"
    )

In [37]:
# Show the full description of the feature group (ARN, feature definitions,
# store configuration, ...).
feature_group.describe()

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:581320662326:feature-group/cookbook-feature-group',
 'FeatureGroupName': 'cookbook-feature-group',
 'RecordIdentifierFeatureName': 'index',
 'EventTimeFeatureName': 'event_time',
 'FeatureDefinitions': [{'FeatureName': 'approved', 'FeatureType': 'Integral'},
  {'FeatureName': 'sex', 'FeatureType': 'Integral'},
  {'FeatureName': 'math', 'FeatureType': 'Integral'},
  {'FeatureName': 'science', 'FeatureType': 'Integral'},
  {'FeatureName': 'technology', 'FeatureType': 'Integral'},
  {'FeatureName': 'random1', 'FeatureType': 'Integral'},
  {'FeatureName': 'random2', 'FeatureType': 'Integral'},
  {'FeatureName': 'index', 'FeatureType': 'Integral'},
  {'FeatureName': 'event_time', 'FeatureType': 'Fractional'}],
 'CreationTime': datetime.datetime(2021, 6, 13, 17, 15, 42, 30000, tzinfo=tzlocal()),
 'OnlineStoreConfig': {'EnableOnlineStore': True},
 'OfflineStoreConfig': {'S3StorageConfig': {'S3Uri': 's3://sagemaker-cookbook-bucket/chapter07/inpu

In [38]:
# Check only the creation status; 'Created' is the value seen once the group is ready.
feature_group.describe().get("FeatureGroupStatus")

'Created'

In [39]:
# List the feature groups visible to the SageMaker client.
client.list_feature_groups()

{'FeatureGroupSummaries': [{'FeatureGroupName': 'feature-group-16-17-50-17',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:581320662326:feature-group/feature-group-16-17-50-17',
   'CreationTime': datetime.datetime(2021, 1, 16, 17, 50, 42, 246000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created',
   'OfflineStoreStatus': {'Status': 'Active'}},
  {'FeatureGroupName': 'feature-group-10-20-17-14',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:581320662326:feature-group/feature-group-10-20-17-14',
   'CreationTime': datetime.datetime(2021, 1, 10, 20, 26, 30, 564000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created',
   'OfflineStoreStatus': {'Status': 'Active'}},
  {'FeatureGroupName': 'cookbook-feature-group-07',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:581320662326:feature-group/cookbook-feature-group-07',
   'CreationTime': datetime.datetime(2021, 1, 17, 0, 29, 3, 52000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created',
   'OfflineStoreStatus': {'Status

In [40]:
# Inspect column dtypes before ingestion (compare with the feature definitions above).
all_df.dtypes

approved        int64
sex             int64
math            int64
science         int64
technology      int64
random1         int64
random2         int64
index           int64
event_time    float64
dtype: object

In [41]:
%%time

# Ingest every row of the DataFrame into the feature group using 3 workers;
# wait=True is passed so the call blocks until ingestion has finished.
feature_group.ingest(
    data_frame=all_df, max_workers=3, wait=True
)

CPU times: user 20.4 ms, sys: 1.84 ms, total: 22.3 ms
Wall time: 5.04 s


IngestionManagerPandas(feature_group_name='cookbook-feature-group', sagemaker_fs_runtime_client_config=<botocore.config.Config object at 0x7f2aff912410>, max_workers=3, max_processes=1, _async_result=<multiprocess.pool.MapResult object at 0x7f2aff40f410>, _processing_pool=<pool ProcessPool(ncpus=1)>, _failed_indices=[])

In [42]:
# Sanity check: fetch one ingested record through the Feature Store runtime
# client. The identifier is the "index" feature, passed as a string.
runtime.get_record(
    FeatureGroupName=feature_group.name, 
    RecordIdentifierValueAsString="300"
)

{'ResponseMetadata': {'RequestId': '3f780b7f-ec25-4bb3-8381-d61a46f4c3b8',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '3f780b7f-ec25-4bb3-8381-d61a46f4c3b8',
   'content-type': 'application/json',
   'content-length': '442',
   'date': 'Sun, 13 Jun 2021 17:16:48 GMT'},
  'RetryAttempts': 0},
 'Record': [{'FeatureName': 'approved', 'ValueAsString': '0'},
  {'FeatureName': 'sex', 'ValueAsString': '0'},
  {'FeatureName': 'math', 'ValueAsString': '65'},
  {'FeatureName': 'science', 'ValueAsString': '61'},
  {'FeatureName': 'technology', 'ValueAsString': '86'},
  {'FeatureName': 'random1', 'ValueAsString': '91'},
  {'FeatureName': 'random2', 'ValueAsString': '68'},
  {'FeatureName': 'index', 'ValueAsString': '300'},
  {'FeatureName': 'event_time', 'ValueAsString': '1623604510.0'}]}

In [43]:
# Persist these names with IPython's %store so other notebooks can restore
# them with `%store -r`.
%store feature_group_name
%store s3_bucket_name
%store prefix

Stored 'feature_group_name' (str)
Stored 's3_bucket_name' (str)
Stored 'prefix' (str)
