
## Data Preparation

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc
pd.options.mode.chained_assignment = None

## Reading all data

In [2]:
# Tables read from CSV files in ./data-clarify.
orders = pd.read_csv("./data-clarify/orders.csv")
order_products_train = pd.read_csv('./data-clarify/order_products__train.csv')
aisles = pd.read_csv('./data-clarify/aisles.csv')
products = pd.read_csv('./data-clarify/products.csv')
departments = pd.read_csv('./data-clarify/departments.csv')
# Feature frames loaded from pickle files.
# NOTE(review): unpickling can execute arbitrary code — only load .pkl files
# that come from a trusted source.
product_features = pd.read_pickle('./data-clarify/product_features.pkl')
user_features = pd.read_pickle('./data-clarify/user_features.pkl')

## Merging train order data with orders

In [3]:
# Inner join on order_id: only orders that have matching rows in
# order_products_train are kept.
train_orders = orders.merge(
    right=order_products_train,
    how='inner',
    on='order_id',
)
train_orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
0,1187899,1,train,11,4,8,14.0,196,1,1
1,1187899,1,train,11,4,8,14.0,25133,2,1
2,1187899,1,train,11,4,8,14.0,38928,3,1
3,1187899,1,train,11,4,8,14.0,26405,4,1
4,1187899,1,train,11,4,8,14.0,39657,5,1


Removing unnecessary columns from `train_orders`.

In [4]:
# Drop the three columns in place; train_orders itself is modified.
train_orders.drop(
    columns=['eval_set', 'add_to_cart_order', 'order_id'],
    inplace=True,
)

In [5]:
# (rows, columns) of train_orders after the column drop.
train_orders.shape

(1384617, 7)

## Merging with product_features and user_features

In [6]:
# Left joins keep every row of train_orders: product-level features are
# attached by product_id, then user-level features by user_id.
df = (
    train_orders
    .merge(product_features, how='left', on='product_id')
    .merge(user_features, how='left', on='user_id')
)
df.head()


Unnamed: 0,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,reordered,mean_add_to_cart_order,total_orders,total_reorders,...,department_reorder_percentage,department_unique_users,department,total_orders_by_user,total_products_by_user,total_unique_product_by_user,total_reorders_by_user,reorder_propotion_by_user,average_order_size,reorder_in_order
0,1,11,4,8,14.0,196,1,3.984738,1507,1221,...,0.658155,61482,beverages,1,11,11,10,0.909091,11,0.909091
1,1,11,4,8,14.0,25133,1,6.580645,186,139,...,0.674966,87400,dairy eggs,1,11,11,10,0.909091,11,0.909091
2,1,11,4,8,14.0,38928,1,3.707022,413,329,...,0.674966,87400,dairy eggs,1,11,11,10,0.909091,11,0.909091
3,1,11,4,8,14.0,26405,1,2.761905,105,62,...,0.427166,21647,household,1,11,11,10,0.909091,11,0.909091
4,1,11,4,8,14.0,39657,1,3.88417,259,208,...,0.581363,57302,snacks,1,11,11,10,0.909091,11,0.909091


In [7]:
# (rows, columns) of the merged frame.
df.shape

(1384617, 34)

In [8]:
# Column names of the merged frame.
df.columns

Index(['user_id', 'order_number', 'order_dow', 'order_hour_of_day',
       'days_since_prior_order', 'product_id', 'reordered',
       'mean_add_to_cart_order', 'total_orders', 'total_reorders',
       'reorder_percentage', 'unique_users', 'order_first_time_total_cnt',
       'aisle_mean_add_to_cart_order', 'aisle_std_add_to_cart_order',
       'aisle_total_orders', 'aisle_total_reorders',
       'aisle_reorder_percentage', 'aisle_unique_users', 'aisle',
       'department_mean_add_to_cart_order', 'department_std_add_to_cart_order',
       'department_total_orders', 'department_total_reorders',
       'department_reorder_percentage', 'department_unique_users',
       'department', 'total_orders_by_user', 'total_products_by_user',
       'total_unique_product_by_user', 'total_reorders_by_user',
       'reorder_propotion_by_user', 'average_order_size', 'reorder_in_order'],
      dtype='object')

In [9]:
path = "./data-clarify/final_data.csv"

# index=False keeps the RangeIndex out of the file; with the default
# (index=True) it is written as an unnamed first column, unlike the later
# save to this same path, which already passes index=False.
# NOTE(review): this file is overwritten further down by the dummy-encoded
# frame written to the same path — confirm this intermediate write is needed.
df.to_csv(path, index=False)

In [10]:
# Number of rows for each value of the `reordered` column.
df['reordered'].value_counts()

1    828824
0    555793
Name: reordered, dtype: int64

## Create dummy variables
Convert Day and Hour categorical variables to dummy variables.

In [11]:
# One-hot encode the two categorical time columns. The prefix mapping names
# the new columns Day_* and Hour_*; drop_first=True omits the first level of
# each variable.
final_data = pd.get_dummies(
    data=df,
    columns=['order_dow', 'order_hour_of_day'],
    prefix={'order_dow': 'Day', 'order_hour_of_day': 'Hour'},
    drop_first=True,
)
final_data.head()

Unnamed: 0,user_id,order_number,days_since_prior_order,product_id,reordered,mean_add_to_cart_order,total_orders,total_reorders,reorder_percentage,unique_users,...,Hour_14,Hour_15,Hour_16,Hour_17,Hour_18,Hour_19,Hour_20,Hour_21,Hour_22,Hour_23
0,1,11,14.0,196,1,3.984738,1507,1221,0.810219,1507,...,0,0,0,0,0,0,0,0,0,0
1,1,11,14.0,25133,1,6.580645,186,139,0.747312,186,...,0,0,0,0,0,0,0,0,0,0
2,1,11,14.0,38928,1,3.707022,413,329,0.79661,413,...,0,0,0,0,0,0,0,0,0,0
3,1,11,14.0,26405,1,2.761905,105,62,0.590476,105,...,0,0,0,0,0,0,0,0,0,0
4,1,11,14.0,39657,1,3.88417,259,208,0.803089,259,...,0,0,0,0,0,0,0,0,0,0


### Duplicate rows
 The dataset doesn't have any duplicates.

In [12]:
    
# Detect fully duplicated rows by comparing the frame's shape before and
# after drop_duplicates().
r, c = final_data.shape

# df1 is the de-duplicated copy with a fresh 0..n-1 index.
df1 = final_data.drop_duplicates().reset_index(drop=True)

if df1.shape == (r, c):
    print('\n\033[1mInference:\033[0m The dataset doesn\'t have any duplicates')
else:
    # The dropped-row count must be measured against df1 (the de-duplicated
    # frame). Using df here would always report 0, because df has the same
    # number of rows as final_data.
    print(f'\n\033[1mInference:\033[0m Number of duplicates dropped ---> {r - df1.shape[0]}')


[1mInference:[0m The dataset doesn't have any duplicates


## Save the new dataset for future use
Save the dataset with dummy variables for future tasks.

In [13]:
!pip install --disable-pip-version-check -q PyAthena==2.1.0
from pyathena import connect
import sagemaker
import boto3
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
from pyathena import connect


In [14]:
# Write the dummy-encoded frame to CSV with a header row and without the
# index column.
path = "./data-clarify/final_data.csv"
final_data.to_csv(path, index=False, header=True)

In [15]:
import time

# Whole seconds since the epoch, used to make the S3 key prefix differ
# between runs.
timestamp = int(time.time())

# Upload the local CSV; the call returns the S3 URI of the uploaded object.
dummy_data_s3_uri = sess.upload_data(
    path=path,
    bucket=bucket,
    key_prefix=f"dummy-dataset-{timestamp}",
)
dummy_data_s3_uri

's3://sagemaker-us-east-1-254716743917/dummy-dataset-1650237616/final_data.csv'

In [16]:
# List the uploaded object with the AWS CLI to confirm it is in S3.
!aws s3 ls $dummy_data_s3_uri

2022-04-17 23:20:17  470105099 final_data.csv


In [17]:
# Persist the S3 URI with IPython's %store magic so another notebook/kernel
# can restore it with `%store -r dummy_data_s3_uri`.
%store dummy_data_s3_uri

Stored 'dummy_data_s3_uri' (str)


In [18]:
# With no arguments, %store lists every variable currently persisted.
%store

Stored variables and their in-db values:
balanced_bias_data_jsonlines_s3_uri             -> 's3://sagemaker-us-east-1-254716743917/bias-detect
balanced_bias_data_s3_uri                       -> 's3://sagemaker-us-east-1-254716743917/bias-detect
bias_data_s3_uri                                -> 's3://sagemaker-us-east-1-254716743917/bias-detect
database_name                                   -> 'oneclickaws'
dummy_data_s3_uri                               -> 's3://sagemaker-us-east-1-254716743917/dummy-datas
ingest_create_athena_db_passed                  -> True
s3_public_path_csv                              -> 's3://projectads508'
