In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils import resample
from sagemaker.feature_store.feature_group import FeatureGroup
from time import gmtime, strftime, sleep
import logging
import time
import sys
import subprocess
import sagemaker
import boto3
import os
from sagemaker.feature_store.inputs import TableFormatEnum
from custom_preprocess import Preprocessor

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [3]:
logger = logging.getLogger('__name__')
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler())

In [4]:
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
region = sagemaker_session.boto_region_name
default_bucket = sagemaker_session.default_bucket()
logger.info(f'Default S3 bucket = {default_bucket}')
prefix = 'sagemaker-feature-store'

Default S3 bucket = sagemaker-us-east-1-995541965594


## Load dataset from S3 bucket

In [5]:
# Initialize an S3 client
s3 = boto3.client('s3')

# Bucket and file details
bucket_name = 'predicting-ctr'
file_key = 'predicting-ctr.csv'

# Read the CSV file from S3 into a DataFrame
obj = s3.get_object(Bucket=bucket_name, Key=file_key)
df = pd.read_csv(obj['Body'])

In [6]:
df.head()

Unnamed: 0,session_id,DateTime,user_id,product,campaign_id,webpage_id,product_category_1,product_category_2,user_group_id,gender,age_level,user_depth,city_development_index,var_1,is_click
0,140690,2017-07-02 00:00,858557,C,359520,13787,4,,10.0,Female,4.0,3.0,3.0,0,0
1,333291,2017-07-02 00:00,243253,C,105960,11085,5,,8.0,Female,2.0,2.0,,0,0
2,129781,2017-07-02 00:00,243253,C,359520,13787,4,,8.0,Female,2.0,2.0,,0,0
3,464848,2017-07-02 00:00,1097446,I,359520,13787,3,,3.0,Male,3.0,3.0,2.0,1,0
4,90569,2017-07-02 00:01,663656,C,405490,60305,3,,2.0,Male,2.0,3.0,2.0,1,0


## Preprocess and feature engineering

In [9]:
preprocessor = Preprocessor(df)

columns_to_encode = ['product','gender','age_level','time_of_day','product_category_1','webpage_id','campaign_id']
encoding_type = 'label'
train_test_split = True
feature_engineer = True

train, validation, test, encoder_dictionary = preprocessor(columns_to_encode, encoding_type, train_test_split, feature_engineer)

  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  le = LabelEncoder()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if self.train_test_split == True:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.train[f'{col}_encoding'] = le.fit_transform(self.train[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

In [10]:
train.head()

Unnamed: 0,session_id,DateTime,user_id,product,campaign_id,webpage_id,product_category_1,product_category_2,user_group_id,gender,...,most_recent_clicked_webpage,gender_encoding,product_encoding,age_level_encoding,time_of_day_encoding,product_category_1_encoding,webpage_id_encoding,campaign_id_encoding,most_recent_clicked_product_encoding,most_recent_clicked_webpage_encoding
0,140690,2017-07-02 00:00:00,858557,C,359520,13787,4,146115.0,10.0,Female,...,13787.0,0,2,4,2,3,3,4,2,3.0
1,333291,2017-07-02 00:00:00,243253,C,105960,11085,5,146115.0,8.0,Female,...,11085.0,0,2,2,2,4,2,2,2,2.0
2,129781,2017-07-02 00:00:00,243253,C,359520,13787,4,146115.0,8.0,Female,...,13787.0,0,2,2,2,3,3,4,2,3.0
3,464848,2017-07-02 00:00:00,1097446,I,359520,13787,3,146115.0,3.0,Male,...,13787.0,1,8,3,2,2,3,4,8,3.0
4,90569,2017-07-02 00:01:00,663656,C,405490,60305,3,146115.0,2.0,Male,...,53587.0,1,2,2,2,2,8,8,8,7.0


## Generate event_time feature

This is the feature indicating the time of insertion of records into feature store. This is a mandatory column. 

In [11]:
from datetime import datetime, timezone, date
def generate_event_timestamp():
    # naive datetime representing local time
    naive_dt = datetime.now()
    # take timezone into account
    aware_dt = naive_dt.astimezone()
    # time in UTC
    utc_dt = aware_dt.astimezone(timezone.utc)
    # transform to ISO-8601 format
    event_time = utc_dt.isoformat(timespec='milliseconds')
    event_time = event_time.replace('+00:00', 'Z')
    return event_time

train['event_time'] = generate_event_timestamp()
validation['event_time'] = generate_event_timestamp()
test['event_time'] = generate_event_timestamp()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['event_time'] = generate_event_timestamp()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation['event_time'] = generate_event_timestamp()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['event_time'] = generate_event_timestamp()


In [13]:
train.head()

Unnamed: 0,session_id,DateTime,user_id,product,campaign_id,webpage_id,product_category_1,product_category_2,user_group_id,gender,...,gender_encoding,product_encoding,age_level_encoding,time_of_day_encoding,product_category_1_encoding,webpage_id_encoding,campaign_id_encoding,most_recent_clicked_product_encoding,most_recent_clicked_webpage_encoding,event_time
0,140690,2017-07-02 00:00:00,858557,C,359520,13787,4,146115.0,10.0,Female,...,0,2,4,2,3,3,4,2,3.0,2024-04-28T13:37:08.466Z
1,333291,2017-07-02 00:00:00,243253,C,105960,11085,5,146115.0,8.0,Female,...,0,2,2,2,4,2,2,2,2.0,2024-04-28T13:37:08.466Z
2,129781,2017-07-02 00:00:00,243253,C,359520,13787,4,146115.0,8.0,Female,...,0,2,2,2,3,3,4,2,3.0,2024-04-28T13:37:08.466Z
3,464848,2017-07-02 00:00:00,1097446,I,359520,13787,3,146115.0,3.0,Male,...,1,8,3,2,2,3,4,8,3.0,2024-04-28T13:37:08.466Z
4,90569,2017-07-02 00:01:00,663656,C,405490,60305,3,146115.0,2.0,Male,...,1,2,2,2,2,8,8,8,7.0,2024-04-28T13:37:08.466Z


### Convert event_time feature data type into string format

In [14]:
def convert_columns_to_string_format(temp_df):
    temp_df['user_id'] = temp_df['user_id'].astype('string')
    temp_df['session_id'] = temp_df['session_id'].astype('string')
    temp_df['event_time'] = temp_df['event_time'].astype('string')
    temp_df['DateTime'] = temp_df['DateTime'].astype('string')
    return temp_df

train = convert_columns_to_string_format(train)
validation = convert_columns_to_string_format(validation)
test = convert_columns_to_string_format(test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['user_id'] = temp_df['user_id'].astype('string')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['session_id'] = temp_df['session_id'].astype('string')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['event_time'] = temp_df['event_time'].astype('string')
A value is trying to b

## Transactions feature store

In [16]:
current_timestamp = strftime('%m-%d-%H-%M', gmtime())

In [17]:
fs_prefix = 'feature-store-predicting-ctr-' 
transactions_feature_group_name = f'{fs_prefix}transactions-{current_timestamp}'
%store transactions_feature_group_name

Stored 'transactions_feature_group_name' (str)


In [18]:
transactions_feature_group = FeatureGroup(name=transactions_feature_group_name, sagemaker_session=sagemaker_session)

In [19]:
transactions_feature_group.load_feature_definitions(data_frame=train)

[FeatureDefinition(feature_name='session_id', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='DateTime', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='user_id', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='product', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='campaign_id', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='webpage_id', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='product_category_1', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='product_category_2', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_

In [20]:
table_format_param = 'ICEBERG' 
table_format = TableFormatEnum.ICEBERG

In [21]:
def wait_for_feature_group_creation_complete(feature_group):
    status = feature_group.describe().get('FeatureGroupStatus')
    print(f'Initial status: {status}')
    while status == 'Creating':
        logger.info(f'Waiting for feature group: {feature_group.name} to be created ...')
        time.sleep(5)
        status = feature_group.describe().get('FeatureGroupStatus')
    if status != 'Created':
        raise SystemExit(f'Failed to create feature group {feature_group.name}: {status}')
    logger.info(f'FeatureGroup {feature_group.name} was successfully created.')

In [22]:
transactions_feature_group.create(s3_uri=f's3://{default_bucket}/{prefix}', 
                               record_identifier_name='session_id', 
                               event_time_feature_name='event_time', 
                               role_arn=role, 
                               enable_online_store=False, 
                               table_format=table_format 
                              )

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:995541965594:feature-group/feature-store-predicting-ctr-transactions-04-28-17-06',
 'ResponseMetadata': {'RequestId': '9605ec6e-1794-46b2-80b0-72aebfbdd638',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '9605ec6e-1794-46b2-80b0-72aebfbdd638',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '130',
   'date': 'Sun, 28 Apr 2024 17:06:46 GMT'},
  'RetryAttempts': 0}}

In [23]:
wait_for_feature_group_creation_complete(transactions_feature_group)

Waiting for feature group: feature-store-predicting-ctr-transactions-04-28-17-06 to be created ...
Waiting for feature group: feature-store-predicting-ctr-transactions-04-28-17-06 to be created ...


Initial status: Creating


Waiting for feature group: feature-store-predicting-ctr-transactions-04-28-17-06 to be created ...
Waiting for feature group: feature-store-predicting-ctr-transactions-04-28-17-06 to be created ...
Waiting for feature group: feature-store-predicting-ctr-transactions-04-28-17-06 to be created ...
Waiting for feature group: feature-store-predicting-ctr-transactions-04-28-17-06 to be created ...
FeatureGroup feature-store-predicting-ctr-transactions-04-28-17-06 was successfully created.
FeatureGroup feature-store-predicting-ctr-transactions-04-28-17-06 was successfully created.


### Ingesting records into feature store

In [24]:
%%time

logger.info(f'Ingesting data into feature group: {transactions_feature_group.name} ...')
transactions_feature_group.ingest(data_frame=train, max_processes=16, wait=True)
logger.info(f'{len(df)} transaction records ingested into feature group: {transactions_feature_group.name}')

Ingesting data into feature group: feature-store-predicting-ctr-transactions-04-28-17-06 ...
Ingesting data into feature group: feature-store-predicting-ctr-transactions-04-28-17-06 ...
463291 transaction records ingested into feature group: feature-store-predicting-ctr-transactions-04-28-17-06
463291 transaction records ingested into feature group: feature-store-predicting-ctr-transactions-04-28-17-06


CPU times: user 8.37 s, sys: 429 ms, total: 8.79 s
Wall time: 9min 6s


## Customers feature store

Stores the customer level features such as gender, most recent clicked product, etc.,

This feature store will help us with serving the features such as most recent clicked product at the time of making predictions at endpoint. Calculating features such as most recent clicked product for n number of users would take a long time and we can't compute them on the fly. Hence we will pre-calculate these features for a particular user and store in the feature store.

While making predictions at the endpoint, we can get these features for a particular user from the feature store very quickly and this would in turn help us to serve our predictions at a reduced latency.

In [25]:
customers_df = train[['user_id','gender_encoding','age_level_encoding','most_recent_clicked_product_encoding','most_recent_clicked_webpage_encoding']]
customers_df = customers_df.groupby(['user_id'],as_index=False).agg(gender_encoding=('gender_encoding','first'),
                                                    age_level_encoding=('age_level_encoding','first'),
                                                    most_recent_clicked_product_encoding=('most_recent_clicked_product_encoding','max'),
                                                    most_recent_clicked_webpage_encoding=('most_recent_clicked_webpage_encoding','max'))

customers_df['event_time'] = generate_event_timestamp()
customers_df['event_time'] = customers_df['event_time'].astype('string')

In [26]:
customers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117174 entries, 0 to 117173
Data columns (total 6 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   user_id                               117174 non-null  string 
 1   gender_encoding                       117174 non-null  int64  
 2   age_level_encoding                    117174 non-null  int64  
 3   most_recent_clicked_product_encoding  117174 non-null  int64  
 4   most_recent_clicked_webpage_encoding  117174 non-null  float64
 5   event_time                            117174 non-null  string 
dtypes: float64(1), int64(3), string(2)
memory usage: 5.4 MB


In [None]:
customers_feature_group_name = f'{fs_prefix}customers-{current_timestamp}'
%store customers_feature_group_name

customers_feature_group = FeatureGroup(name=customers_feature_group_name, sagemaker_session=sagemaker_session)

In [28]:
customers_feature_group.load_feature_definitions(data_frame=customers_df)

[FeatureDefinition(feature_name='user_id', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='gender_encoding', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='age_level_encoding', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='most_recent_clicked_product_encoding', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='most_recent_clicked_webpage_encoding', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_name='event_time', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None)]

In [29]:
customers_feature_group.create(s3_uri=f's3://{default_bucket}/{prefix}', 
                               record_identifier_name='user_id', 
                               event_time_feature_name='event_time', 
                               role_arn=role, 
                               enable_online_store=True, 
                               table_format=table_format 
                              )

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:995541965594:feature-group/feature-store-predicting-ctr-customers-04-28-17-06',
 'ResponseMetadata': {'RequestId': '6bd2576a-54a2-445c-8b39-3672a5cb4091',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '6bd2576a-54a2-445c-8b39-3672a5cb4091',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '127',
   'date': 'Sun, 28 Apr 2024 17:16:11 GMT'},
  'RetryAttempts': 0}}

In [63]:
customers_df.head()

Unnamed: 0,user_id,gender_encoding,age_level_encoding,most_recent_clicked_product_encoding,most_recent_clicked_webpage_encoding,event_time
0,1000001,1,2,2,8.0,2024-04-23T15:29:56.584Z
1,1000009,1,2,2,3.0,2024-04-23T15:29:56.584Z
2,100001,1,2,2,3.0,2024-04-23T15:29:56.584Z
3,1000010,1,2,9,7.0,2024-04-23T15:29:56.584Z
4,1000026,1,4,7,8.0,2024-04-23T15:29:56.584Z


In [30]:
wait_for_feature_group_creation_complete(customers_feature_group)

Waiting for feature group: feature-store-predicting-ctr-customers-04-28-17-06 to be created ...
Waiting for feature group: feature-store-predicting-ctr-customers-04-28-17-06 to be created ...


Initial status: Creating


Waiting for feature group: feature-store-predicting-ctr-customers-04-28-17-06 to be created ...
Waiting for feature group: feature-store-predicting-ctr-customers-04-28-17-06 to be created ...
Waiting for feature group: feature-store-predicting-ctr-customers-04-28-17-06 to be created ...
Waiting for feature group: feature-store-predicting-ctr-customers-04-28-17-06 to be created ...
Waiting for feature group: feature-store-predicting-ctr-customers-04-28-17-06 to be created ...
Waiting for feature group: feature-store-predicting-ctr-customers-04-28-17-06 to be created ...
Waiting for feature group: feature-store-predicting-ctr-customers-04-28-17-06 to be created ...
Waiting for feature group: feature-store-predicting-ctr-customers-04-28-17-06 to be created ...
Waiting for feature group: feature-store-predicting-ctr-customers-04-28-17-06 to be created ...
Waiting for feature group: feature-store-predicting-ctr-customers-04-28-17-06 to be created ...
FeatureGroup feature-store-predicting-ct

### Ingesting records into Feature Store

In [31]:
%%time

logger.info(f'Ingesting data into feature group: {customers_feature_group.name} ...')
customers_feature_group.ingest(data_frame=customers_df, max_processes=16, wait=True)
logger.info(f'{len(customers_df)} customer records ingested into feature group: {customers_feature_group.name}')

Ingesting data into feature group: feature-store-predicting-ctr-customers-04-28-17-06 ...
Ingesting data into feature group: feature-store-predicting-ctr-customers-04-28-17-06 ...
117174 customer records ingested into feature group: feature-store-predicting-ctr-customers-04-28-17-06
117174 customer records ingested into feature group: feature-store-predicting-ctr-customers-04-28-17-06


CPU times: user 1.12 s, sys: 348 ms, total: 1.46 s
Wall time: 2min 51s


## Accessing records from the online Feature Store

In our Customers Feature Store, user_id is the record identifier and we can retrieve records using the user_id

In [8]:
featurestore_runtime_client = sagemaker_session.boto_session.client('sagemaker-featurestore-runtime', region_name=region)

In [28]:
feature_record = featurestore_runtime_client.get_record(FeatureGroupName='feature-store-predicting-ctr-customers-04-28-17-06', 
                                                        RecordIdentifierValueAsString='1097446')
feature_record

{'ResponseMetadata': {'RequestId': '89f06f8d-880c-4a90-b1a1-b5f03b1e1e49',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '89f06f8d-880c-4a90-b1a1-b5f03b1e1e49',
   'content-type': 'application/json',
   'content-length': '566',
   'date': 'Wed, 01 May 2024 17:42:57 GMT'},
  'RetryAttempts': 0},
 'Record': [{'FeatureName': 'user_id', 'ValueAsString': '1097446'},
  {'FeatureName': 'gender_encoding', 'ValueAsString': '1'},
  {'FeatureName': 'age_level_encoding', 'ValueAsString': '3'},
  {'FeatureName': 'most_recent_clicked_product_encoding',
   'ValueAsString': '0'},
  {'FeatureName': 'most_recent_clicked_webpage_encoding',
   'ValueAsString': '8.0'},
  {'FeatureName': 'event_time', 'ValueAsString': '2024-05-01T17:24:53.650Z'}]}

## Accessing data from Offline Feature Stores

In [20]:
boto_session = boto3.Session(region_name=region)
sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)
featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)

feature_store_session = sagemaker.Session(boto_session=boto_session, 
                                          sagemaker_client=sagemaker_client, 
                                          sagemaker_featurestore_runtime_client=featurestore_runtime)

In [21]:
s3_uri_prefix = f's3://predicting-ctr/sagemaker-feature-store/*'
transactions_feature_group_name = "feature-store-predicting-ctr-transactions-04-28-17-06"

In [22]:
transactions_fg = FeatureGroup(name=transactions_feature_group_name, sagemaker_session=feature_store_session)

In [23]:
transactions_query = transactions_fg.athena_query()
transactions_table = transactions_query.table_name

In [24]:
query_string = f'SELECT * FROM {transactions_table}'

output_location = f's3://{default_bucket}/sagemaker-feature-store/query_results/'

In [25]:
transactions_query.run(query_string=query_string, output_location=output_location)
transactions_query.wait()
output_df = transactions_query.as_dataframe()

In [26]:
output_df.head()

Unnamed: 0,write_time,api_invocation_time,is_deleted,session_id,datetime,user_id,product,campaign_id,webpage_id,product_category_1,...,gender_encoding,product_encoding,age_level_encoding,time_of_day_encoding,product_category_1_encoding,webpage_id_encoding,campaign_id_encoding,most_recent_clicked_product_encoding,most_recent_clicked_webpage_encoding,event_time
0,2024-05-01 17:28:37.104000 UTC,2024-05-01 17:23:40.000000 UTC,False,336807,2017-07-06 17:31:00,59063,H,359520,13787,4,...,1,7,3,0,3,3,4,7,3.0,2024-05-01T17:23:39.123Z
1,2024-04-28 17:17:53.017000 UTC,2024-04-28 17:12:51.000000 UTC,False,146161,2017-07-04 18:17:00,483064,C,359520,13787,4,...,1,2,2,2,3,3,4,2,3.0,2024-04-28T13:37:08.466Z
2,2024-04-28 17:17:52.983000 UTC,2024-04-28 17:12:52.000000 UTC,False,143771,2017-07-03 07:16:00,580267,C,359520,13787,4,...,1,2,3,1,3,3,4,2,3.0,2024-04-28T13:37:08.466Z
3,2024-04-28 17:17:53.017000 UTC,2024-04-28 17:12:51.000000 UTC,False,544261,2017-07-04 21:12:00,45169,I,82320,1734,1,...,1,8,2,2,0,0,0,8,0.0,2024-04-28T13:37:08.466Z
4,2024-04-28 17:17:53.017000 UTC,2024-04-28 17:12:51.000000 UTC,False,517692,2017-07-04 08:11:00,909668,A,405490,60305,2,...,1,0,2,1,1,8,8,0,8.0,2024-04-28T13:37:08.466Z


In [27]:
output_df.sort_values('event_time',ascending=False)

Unnamed: 0,write_time,api_invocation_time,is_deleted,session_id,datetime,user_id,product,campaign_id,webpage_id,product_category_1,...,gender_encoding,product_encoding,age_level_encoding,time_of_day_encoding,product_category_1_encoding,webpage_id_encoding,campaign_id_encoding,most_recent_clicked_product_encoding,most_recent_clicked_webpage_encoding,event_time
0,2024-05-01 17:28:37.104000 UTC,2024-05-01 17:23:40.000000 UTC,False,336807,2017-07-06 17:31:00,59063,H,359520,13787,4,...,1,7,3,0,3,3,4,7,3.0,2024-05-01T17:23:39.123Z
115419,2024-05-01 17:28:41.761000 UTC,2024-05-01 17:24:05.000000 UTC,False,134362,2017-07-06 19:20:00,973621,C,359520,13787,4,...,1,2,3,2,3,3,4,7,3.0,2024-05-01T17:23:39.123Z
115429,2024-05-01 17:28:41.761000 UTC,2024-05-01 17:24:05.000000 UTC,False,592391,2017-07-06 14:55:00,547088,I,118601,28529,4,...,1,8,2,0,3,4,3,8,4.0,2024-05-01T17:23:39.123Z
115428,2024-05-01 17:28:41.761000 UTC,2024-05-01 17:24:05.000000 UTC,False,69917,2017-07-06 20:20:00,682607,C,405490,60305,3,...,1,2,2,2,2,8,8,2,8.0,2024-05-01T17:23:39.123Z
115427,2024-05-01 17:28:41.761000 UTC,2024-05-01 17:24:05.000000 UTC,False,162007,2017-07-06 20:22:00,475174,D,414149,45962,5,...,1,3,2,2,4,5,9,3,5.0,2024-05-01T17:23:39.123Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149969,2024-04-28 17:12:53.680000 UTC,2024-04-28 17:09:42.000000 UTC,False,5966,2017-07-05 11:32:00,499978,I,404347,53587,1,...,1,8,2,3,0,7,7,8,7.0,2024-04-28T13:37:08.466Z
149968,2024-04-28 17:12:53.680000 UTC,2024-04-28 17:09:41.000000 UTC,False,293647,2017-07-04 05:17:00,981441,C,360936,13787,5,...,1,2,2,1,4,3,5,2,3.0,2024-04-28T13:37:08.466Z
149967,2024-04-28 17:12:53.680000 UTC,2024-04-28 17:09:41.000000 UTC,False,118228,2017-07-04 20:17:00,382386,F,414149,45962,2,...,1,5,1,2,1,5,9,5,5.0,2024-04-28T13:37:08.466Z
149966,2024-04-28 17:12:53.680000 UTC,2024-04-28 17:09:41.000000 UTC,False,458202,2017-07-02 06:03:00,543839,C,360936,13787,3,...,0,2,4,1,2,3,5,2,3.0,2024-04-28T13:37:08.466Z
