In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud import bigquery_storage
import time
import pickle
# Modelling
import xgboost
from sklearn.model_selection import train_test_split

# Load Data From Bigquery

In [2]:
def bq_connector():
    key_path = "C:/Users/HEN1/Projects/Instacart_Market_Basket_Analysis/keys/plucky-mile-327121-255163f80b63.json"
    credentials = service_account.Credentials.from_service_account_file(
        key_path, scopes=["https://www.googleapis.com/auth/cloud-platform"],
    )

    bqclient = bigquery.Client(credentials=credentials, project=credentials.project_id,)
    bqstorageclient = bigquery_storage.BigQueryReadClient(credentials=credentials)
    return bqclient, bqstorageclient

def bq_full_table_df(bqclient, bqstorageclient, table_name):
    sql_query = f"SELECT * FROM instacart.{table_name}"
    query_job = bqclient.query(sql_query)
    time.sleep(30)
    count =0
    while query_job.state !='DONE':
        print("NOT DONE")
        if query_job.state =='PENDING':
            print(f"job from {table_name} is pending")
            break
        if query_job.state =='RUNNING':
            print(f"job from {table_name} is running")
            print(query_job.result())
            time.sleep(60)
            query_job.reload()
            time.sleep(10)
            count += 1
            if count>3:
                break
        else:
            print("may meet an error")
            break
    if query_job.state == 'DONE':
        print(f"successfully finished getting data from {table_name} table")
        df = query_job.to_dataframe(bqstorage_client=bqstorageclient,
                                    progress_bar_type='tqdm_notebook',)
        print("successfully transferred to df")
        time.sleep(3)
    else:
        print("error")

    return df

In [3]:
bqclient, bqstorageclient = bq_connector()

In [4]:
aisles = bq_full_table_df(bqclient, bqstorageclient, 'aisles')
time.sleep(3)

successfully finished getting data from aisles table


Query is running:   0%|          |

Downloading:   0%|          |

successfully transferred to df


In [5]:
departments = bq_full_table_df(bqclient, bqstorageclient, 'departments')
time.sleep(60)

successfully finished getting data from departments table


Query is running:   0%|          |

Downloading:   0%|          |

successfully transferred to df


In [6]:
orders = bq_full_table_df(bqclient, bqstorageclient, 'orders')


NOT DONE
job from orders is running
<google.cloud.bigquery.table.RowIterator object at 0x0000026272850580>
successfully finished getting data from orders table


Query is running:   0%|          |

Downloading:   0%|          |

successfully transferred to df


In [7]:
products = bq_full_table_df(bqclient, bqstorageclient, 'products')


successfully finished getting data from products table


Query is running:   0%|          |

Downloading:   0%|          |

successfully transferred to df


In [8]:
order_products_prior = bq_full_table_df(bqclient, bqstorageclient, 'order_products_prior')
time.sleep(60)

NOT DONE
job from order_products_prior is running
<google.cloud.bigquery.table.RowIterator object at 0x0000026272A2AF70>
successfully finished getting data from order_products_prior table


Query is running:   0%|          |

Downloading:   0%|          |

successfully transferred to df


In [9]:
order_products_train = bq_full_table_df(bqclient, bqstorageclient, 'order_products_train')

NOT DONE
job from order_products_train is running
<google.cloud.bigquery.table.RowIterator object at 0x00000262756DF5E0>
successfully finished getting data from order_products_train table


Query is running:   0%|          |

Downloading:   0%|          |

successfully transferred to df


## Defining train, validation and test set|

In [11]:
order_products_prior.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32434489 entries, 0 to 32434488
Data columns (total 4 columns):
 #   Column             Dtype
---  ------             -----
 0   order_id           Int64
 1   product_id         Int64
 2   add_to_cart_order  Int64
 3   reordered          Int64
dtypes: Int64(4)
memory usage: 1.1 GB


In [12]:
order_products_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1384617 entries, 0 to 1384616
Data columns (total 4 columns):
 #   Column             Non-Null Count    Dtype
---  ------             --------------    -----
 0   order_id           1384617 non-null  Int64
 1   product_id         1384617 non-null  Int64
 2   add_to_cart_order  1384617 non-null  Int64
 3   reordered          1384617 non-null  Int64
dtypes: Int64(4)
memory usage: 47.5 MB


In [10]:
df = orders[orders['eval_set']=='prior']

In [11]:
test = orders[orders['eval_set']=='train']

In [39]:
pred = orders[orders['eval_set']=='test']

In [13]:
pred.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
77,1349715,19750,test,12,0,1,0.0
113,1126303,92284,test,15,0,4,2.0
184,3366376,153495,test,6,0,4,3.0
242,1850652,178490,test,15,0,1,4.0
370,672486,172529,test,16,0,3,6.0


# Feature Engineering

In [41]:
order_merge_prior = orders.merge(order_products_prior, on='order_id', how='left')

In [42]:
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,761287,2928,prior,1,0,1,
1,1039920,4640,prior,1,0,3,
2,2639736,4799,prior,1,0,5,
3,2558046,5038,prior,1,0,3,
4,2934925,5076,prior,1,0,3,


In [43]:
order_merge_prior.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
0,761287,2928,prior,1,0,1,,29589,2,0
1,761287,2928,prior,1,0,1,,18932,3,0
2,761287,2928,prior,1,0,1,,48075,1,0
3,1039920,4640,prior,1,0,3,,40199,1,0
4,1039920,4640,prior,1,0,3,,38684,4,0


In [44]:
order_merge_prior.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32640698 entries, 0 to 32640697
Data columns (total 10 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                Int64  
 1   user_id                 Int64  
 2   eval_set                object 
 3   order_number            Int64  
 4   order_dow               Int64  
 5   order_hour_of_day       Int64  
 6   days_since_prior_order  float64
 7   product_id              Int64  
 8   add_to_cart_order       Int64  
 9   reordered               Int64  
dtypes: Int64(8), float64(1), object(1)
memory usage: 2.9+ GB


### Create DataFrame groupby users and products

In [45]:
## creat user and product combo occur times
user_product = order_merge_prior.groupby(['user_id', 'product_id'])[['order_id']].count().rename(columns={'order_id':'total_bought_freq'}).reset_index()

In [46]:
order_merge = pd.concat([order_products_prior, order_products_train], ignore_index=True)

In [47]:
result = orders.merge(order_merge, on='order_id', how='left')

In [48]:
result.isna().sum()

order_id                        0
user_id                         0
eval_set                        0
order_number                    0
order_dow                       0
order_hour_of_day               0
days_since_prior_order    2078068
product_id                  75000
add_to_cart_order           75000
reordered                   75000
dtype: int64

In [49]:
user_product[user_product.total_bought_freq==1].shape[0]

7982695

In [50]:
user_product[user_product.total_bought_freq==1].product_id.nunique()

49587

In [51]:
## Create user total orders dataframe
user_total = order_merge_prior.groupby('user_id')[['order_id']].count().rename(columns={'order_id':'user_total_bought'}).reset_index()

In [52]:
## Create dataframe that a customer only place order one time, and group by products
## The new column defines how many unique customers buy specific product at least once
item_one = user_product[user_product.total_bought_freq==1].groupby('product_id')[['user_id']].count().rename(columns={'user_id': 'one_order_freq'}).reset_index()

In [53]:
result.product_id.fillna(0, inplace=True)
result = result.merge(item_one, on='product_id', how='left')

In [54]:
result = result.merge(user_product, on=['user_id', 'product_id'], how='left')

In [55]:
## Get the total number of users bought product
## The new column defines how many transactions for specific product(1 customer buy A product once count as once) 
item_cust_freq = user_product.groupby('product_id')[['user_id']].count().rename(columns={'user_id':'item_cust_freq'}).reset_index()

In [56]:
result = result.merge(item_cust_freq, on='product_id', how='left')

In [57]:
item_cust_freq.sort_values(by='item_cust_freq', ascending=False).merge(products, on='product_id', how='left').head()

Unnamed: 0,product_id,item_cust_freq,product_name,aisle_id,department_id
0,24852,73956,Banana,24,4
1,13176,63537,Bag of Organic Bananas,24,4
2,21137,58838,Organic Strawberries,24,4
3,21903,55037,Organic Baby Spinach,123,4
4,47626,46402,Large Lemon,24,4


In [58]:
user_product.sort_values(by='total_bought_freq', ascending=False).merge(products, on='product_id', how='left').head()

Unnamed: 0,user_id,product_id,total_bought_freq,product_name,aisle_id,department_id
0,141736,25133,99,Organic String Cheese,21,16
1,41356,38652,99,Yerba Mate Orange Exuberance Tea,64,7
2,17997,4210,99,Whole Milk,84,16
3,41356,6583,99,Oraganic Lemon Elation Yerba Mate Drink,98,7
4,41356,14366,99,Enlighten Mint Organic,94,7


In [59]:
order_merge_prior['order_number_back'] = order_merge_prior.groupby('user_id')['order_number'].transform(max) - order_merge_prior.order_number +1 

In [60]:
## Creat dataframe that onyl contain last 5 orders
last_five = order_merge_prior[order_merge_prior.order_number_back <=5]

In [61]:
## Calculate how many time a product occur in the most recent 5 times
product_occur_l5 = last_five.groupby(['user_id','product_id'])[['order_id']].count().rename(columns={'order_id':'times_last5'}).reset_index()

In [62]:
result = result.merge(product_occur_l5, on=['user_id', 'product_id'], how='left')

In [63]:
product_occur_l5.head()

Unnamed: 0,user_id,product_id,times_last5
0,1,196,4
1,1,10258,4
2,1,12427,4
3,1,13032,2
4,1,25133,4


In [64]:
## How frequently a customer bought a product after its first purchase
first_order_number = order_merge_prior.groupby(['user_id', 'product_id'])[['order_number']].min().rename(columns={'order_number':'first_order_number'}).reset_index()
total_order_number = order_merge_prior.groupby('user_id')[['order_number']].max().rename(columns={'order_number':'total_order_number'}).reset_index()

In [65]:
first_order_number = first_order_number.merge(total_order_number, on='user_id', how='left')

In [66]:
first_order_number['range'] = first_order_number.total_order_number - first_order_number.first_order_number +1 

In [67]:
user_product = user_product.merge(first_order_number, on=['user_id', 'product_id'], how='left')

In [68]:
user_product['order_fre_ratio_aft_first'] = user_product.total_bought_freq/user_product.range

In [69]:
order_number_df = order_merge_prior.groupby(['user_id', 'product_id'])['order_number'].agg(['min', 'max']).reset_index().rename(columns={'min': 'first_order_number', 'max': 'last_order_number'})

In [70]:
order_merge_prior.product_id.fillna(0, inplace=True)
order_merge_prior = order_merge_prior.merge(order_number_df, on=['user_id', 'product_id'], how='left')
order_merge_prior.product_id.replace(0, np.nan, inplace=True)

In [71]:
order_merge_prior['product_order_ratio'] = (order_merge_prior['order_number']-order_merge_prior['first_order_number'])/(order_merge_prior['last_order_number'] - order_merge_prior['first_order_number'])

In [72]:
result = result.merge(order_merge_prior[['order_id', 'user_id', 'product_id', 'product_order_ratio']], on=['order_id', 'user_id', 'product_id'], how='left')

In [73]:
order_merge_prior.isna().sum()

order_id                        0
user_id                         0
eval_set                        0
order_number                    0
order_dow                       0
order_hour_of_day               0
days_since_prior_order    2078068
product_id                 206209
add_to_cart_order          206209
reordered                  206209
order_number_back               0
first_order_number         206209
last_order_number          206209
product_order_ratio        206209
dtype: int64

In [74]:
order_merge_prior.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,order_number_back,first_order_number,last_order_number,product_order_ratio
0,761287,2928,prior,1,0,1,,29589,2,0,14,1,13,0.0
1,761287,2928,prior,1,0,1,,18932,3,0,14,1,1,
2,761287,2928,prior,1,0,1,,48075,1,0,14,1,13,0.0
3,1039920,4640,prior,1,0,3,,40199,1,0,6,1,1,
4,1039920,4640,prior,1,0,3,,38684,4,0,6,1,1,


In [75]:
result = result.merge(user_product[['user_id', 'product_id', 'order_fre_ratio_aft_first']],  on=['user_id', 'product_id'], how='left')

In [76]:
result.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,one_order_freq,total_bought_freq,item_cust_freq,times_last5,product_order_ratio,order_fre_ratio_aft_first
0,761287,2928,prior,1,0,1,,29589,2,0,308.0,9.0,467.0,4.0,0.0,0.642857
1,761287,2928,prior,1,0,1,,18932,3,0,16.0,1.0,21.0,,,0.071429
2,761287,2928,prior,1,0,1,,48075,1,0,159.0,7.0,223.0,3.0,0.0,0.5
3,1039920,4640,prior,1,0,3,,40199,1,0,3261.0,1.0,5627.0,,,0.166667
4,1039920,4640,prior,1,0,3,,38684,4,0,723.0,1.0,890.0,,,0.166667


In [77]:
prod_feat = order_merge_prior.groupby('product_id')[['reordered']].mean().rename(columns={'reordered':'prod_mean_reorder_ratio'}).reset_index()
mean_add_to_cart_order = order_merge_prior.groupby('product_id')[['add_to_cart_order']].mean().rename(columns={'add_to_cart_order':'prod_mean_add_to_cart_order'}).reset_index()

In [78]:
prod_feat = prod_feat.merge(mean_add_to_cart_order, on='product_id')

In [79]:
user_prod_feat = order_merge_prior.groupby(['user_id', 'product_id'])[['reordered']].mean().rename(columns={'reordered':'mean_reorder_ratio'}).reset_index()
user_prod_mean_add_to_cart_order = order_merge_prior.groupby(['user_id', 'product_id'])[['add_to_cart_order']].mean().rename(columns={'add_to_cart_order':'mean_add_to_cart_order'}).reset_index()

In [80]:
user_prod_feat = user_prod_feat.merge(user_prod_mean_add_to_cart_order, on=['user_id', 'product_id'])

In [81]:
prod_feat.head()

Unnamed: 0,product_id,prod_mean_reorder_ratio,prod_mean_add_to_cart_order
0,1,0.613391,5.801836
1,2,0.133333,9.888889
2,3,0.732852,6.415162
3,4,0.446809,9.507599
4,5,0.6,6.466667


In [82]:
user_prod_feat.head()

Unnamed: 0,user_id,product_id,mean_reorder_ratio,mean_add_to_cart_order
0,1,196,0.9,1.4
1,1,10258,0.888889,3.333333
2,1,10326,0.0,5.0
3,1,12427,0.9,3.3
4,1,13032,0.666667,6.333333


In [83]:
result = result.merge(prod_feat, on='product_id', how='left')

In [84]:
result = result.merge(user_prod_feat, on=['user_id', 'product_id'], how='left')

### Users feature

In [85]:
order_merge_prior.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,order_number_back,first_order_number,last_order_number,product_order_ratio
0,761287,2928,prior,1,0,1,,29589,2,0,14,1,13,0.0
1,761287,2928,prior,1,0,1,,18932,3,0,14,1,1,
2,761287,2928,prior,1,0,1,,48075,1,0,14,1,13,0.0
3,1039920,4640,prior,1,0,3,,40199,1,0,6,1,1,
4,1039920,4640,prior,1,0,3,,38684,4,0,6,1,1,


In [86]:
user_feat = order_merge_prior.groupby('user_id').agg(
    max_order_number=('order_number', 'max'),
    sum_days_since_prior=('days_since_prior_order', 'sum'),
    mean_days_since_prior=('days_since_prior_order', 'mean'),
    count_product_id=('product_id', 'count'),
    unique_count_product_id=('product_id', 'nunique')
).reset_index()

In [87]:
user_feat.head()

Unnamed: 0,user_id,max_order_number,sum_days_since_prior,mean_days_since_prior,count_product_id,unique_count_product_id
0,1,11,1108.0,20.145455,59,18
1,2,15,2936.0,16.043716,195,102
2,3,13,907.0,11.481013,88,33
3,4,6,245.0,16.333333,18,17
4,5,5,383.0,14.185185,37,23


In [88]:
result = result.merge(user_feat, on='user_id', how='left')

In [89]:
result.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,...,order_fre_ratio_aft_first,prod_mean_reorder_ratio,prod_mean_add_to_cart_order,mean_reorder_ratio,mean_add_to_cart_order,max_order_number,sum_days_since_prior,mean_days_since_prior,count_product_id,unique_count_product_id
0,761287,2928,prior,1,0,1,,29589,2,0,...,0.642857,0.54572,8.586576,0.888889,2.777778,14,734.0,7.489796,100,47
1,761287,2928,prior,1,0,1,,18932,3,0,...,0.071429,0.25,6.428571,0.0,3.0,14,734.0,7.489796,100,47
2,761287,2928,prior,1,0,1,,48075,1,0,...,0.5,0.465228,10.342926,0.857143,3.571429,14,734.0,7.489796,100,47
3,1039920,4640,prior,1,0,3,,40199,1,0,...,0.166667,0.602276,6.884577,0.0,1.0,6,265.0,12.045455,26,18
4,1039920,4640,prior,1,0,3,,38684,4,0,...,0.166667,0.252101,9.918487,0.0,4.0,6,265.0,12.045455,26,18


# Preparing dataset

In [12]:

def generate_results(orders, order_products):
    order_merge_prior = orders.merge(order_products, on='order_id', how='left')
    user_product = order_merge_prior.groupby(['user_id', 'product_id'])[['order_id']].count().rename(columns={'order_id':'total_bought_freq'}).reset_index()
    result = orders.merge(order_products, on='order_id', how='left')

    # Create user total orders dataframe
    user_total = order_merge_prior.groupby('user_id')[['order_id']].count().rename(columns={'order_id':'user_total_bought'}).reset_index()
    item_one = user_product[user_product.total_bought_freq==1].groupby('product_id')[['user_id']].count().rename(columns={'user_id': 'one_order_freq'}).reset_index()
    result.product_id.fillna(0, inplace=True)
    result = result.merge(item_one, on='product_id', how='left')
    result = result.merge(user_product, on=['user_id', 'product_id'], how='left')
    item_cust_freq = user_product.groupby('product_id')[['user_id']].count().rename(columns={'user_id':'item_cust_freq'}).reset_index()
    result = result.merge(item_cust_freq, on='product_id', how='left')
    order_merge_prior['order_number_back'] = order_merge_prior.groupby('user_id')['order_number'].transform(max) - order_merge_prior.order_number +1 
    last_five = order_merge_prior[order_merge_prior.order_number_back <=5]
    product_occur_l5 = last_five.groupby(['user_id','product_id'])[['order_id']].count().rename(columns={'order_id':'times_last5'}).reset_index()
    result = result.merge(product_occur_l5, on=['user_id', 'product_id'], how='left')
    first_order_number = order_merge_prior.groupby(['user_id', 'product_id'])[['order_number']].min().rename(columns={'order_number':'first_order_number'}).reset_index()
    total_order_number = order_merge_prior.groupby('user_id')[['order_number']].max().rename(columns={'order_number':'total_order_number'}).reset_index()
    first_order_number = first_order_number.merge(total_order_number, on='user_id', how='left')
    first_order_number['range'] = first_order_number.total_order_number - first_order_number.first_order_number +1 
    user_product = user_product.merge(first_order_number, on=['user_id', 'product_id'], how='left')
    user_product['order_fre_ratio_aft_first'] = user_product.total_bought_freq/user_product.range
    order_number_df = order_merge_prior.groupby(['user_id', 'product_id'])['order_number'].agg(['min', 'max']).reset_index().rename(columns={'min': 'first_order_number', 'max': 'last_order_number'})
    order_merge_prior.product_id.fillna(0, inplace=True)
    order_merge_prior = order_merge_prior.merge(order_number_df, on=['user_id', 'product_id'], how='left')
    order_merge_prior.product_id.replace(0, np.nan, inplace=True)
    order_merge_prior['product_order_ratio'] = (order_merge_prior['order_number']-order_merge_prior['first_order_number'])/(order_merge_prior['last_order_number'] - order_merge_prior['first_order_number'])
    result = result.merge(order_merge_prior[['order_id', 'user_id', 'product_id', 'product_order_ratio']], on=['order_id', 'user_id', 'product_id'], how='left')
    result = result.merge(user_product[['user_id', 'product_id', 'order_fre_ratio_aft_first']],  on=['user_id', 'product_id'], how='left')
    prod_feat = order_merge_prior.groupby('product_id')[['reordered']].mean().rename(columns={'reordered':'prod_mean_reorder_ratio'}).reset_index()
    mean_add_to_cart_order = order_merge_prior.groupby('product_id')[['add_to_cart_order']].mean().rename(columns={'add_to_cart_order':'prod_mean_add_to_cart_order'}).reset_index()
    prod_feat = prod_feat.merge(mean_add_to_cart_order, on='product_id')
    user_prod_feat = order_merge_prior.groupby(['user_id', 'product_id'])[['reordered']].mean().rename(columns={'reordered':'mean_reorder_ratio'}).reset_index()
    user_prod_mean_add_to_cart_order = order_merge_prior.groupby(['user_id', 'product_id'])[['add_to_cart_order']].mean().rename(columns={'add_to_cart_order':'mean_add_to_cart_order'}).reset_index()
    user_prod_feat = user_prod_feat.merge(user_prod_mean_add_to_cart_order, on=['user_id', 'product_id'])
    result = result.merge(prod_feat, on='product_id', how='left')
    result = result.merge(user_prod_feat, on=['user_id', 'product_id'], how='left')
    user_feat = order_merge_prior.groupby('user_id').agg(
        max_order_number=('order_number', 'max'),
        sum_days_since_prior=('days_since_prior_order', 'sum'),
        mean_days_since_prior=('days_since_prior_order', 'mean'),
        count_product_id=('product_id', 'count'),
        unique_count_product_id=('product_id', 'nunique')
    ).reset_index()
    result = result.merge(user_feat, on='user_id', how='left')
    result.product_id.replace(0, np.nan, inplace=True)
    return result



In [13]:
# Call the function with your dataframes and variables
generated_results = generate_results(orders, order_products_prior)


In [14]:
with open("../feature_train.pkl", "wb") as file: 
    pickle.dump(generated_results, file) 

## Dump features dataframe

In [90]:
result.product_id.replace(0, np.nan, inplace=True)

In [91]:
result.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,...,order_fre_ratio_aft_first,prod_mean_reorder_ratio,prod_mean_add_to_cart_order,mean_reorder_ratio,mean_add_to_cart_order,max_order_number,sum_days_since_prior,mean_days_since_prior,count_product_id,unique_count_product_id
0,761287,2928,prior,1,0,1,,29589,2,0,...,0.642857,0.54572,8.586576,0.888889,2.777778,14,734.0,7.489796,100,47
1,761287,2928,prior,1,0,1,,18932,3,0,...,0.071429,0.25,6.428571,0.0,3.0,14,734.0,7.489796,100,47
2,761287,2928,prior,1,0,1,,48075,1,0,...,0.5,0.465228,10.342926,0.857143,3.571429,14,734.0,7.489796,100,47
3,1039920,4640,prior,1,0,3,,40199,1,0,...,0.166667,0.602276,6.884577,0.0,1.0,6,265.0,12.045455,26,18
4,1039920,4640,prior,1,0,3,,38684,4,0,...,0.166667,0.252101,9.918487,0.0,4.0,6,265.0,12.045455,26,18


In [92]:
result.isna().sum()

order_id                              0
user_id                               0
eval_set                              0
order_number                          0
order_dow                             0
order_hour_of_day                     0
days_since_prior_order          2078068
product_id                        75000
add_to_cart_order                 75000
reordered                         75000
one_order_freq                    75684
total_bought_freq                630793
item_cust_freq                    75009
times_last5                    15129401
product_order_ratio             1459617
order_fre_ratio_aft_first        630793
prod_mean_reorder_ratio           75009
prod_mean_add_to_cart_order       75009
mean_reorder_ratio               630793
mean_add_to_cart_order           630793
max_order_number                      0
sum_days_since_prior                  0
mean_days_since_prior                 0
count_product_id                      0
unique_count_product_id               0


In [93]:
with open("../feature.pkl", "wb") as file: 
    pickle.dump(result, file) 