In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

# Base directory (a Google Drive path) that every CSV / pickle below is read from and written to.
root = '/content/drive/MyDrive/instacart-market-basket-analysis/'

Reading all data

In [None]:
# Order-level table, read with explicit narrow dtypes.
orders = pd.read_csv(
    f'{root}orders.csv',
    dtype={
        'order_id': 'int32',
        'user_id': 'int64',
        'eval_set': 'category',
        'order_number': 'int16',
        'order_dow': 'int8',
        'order_hour_of_day': 'int8',
        'days_since_prior_order': 'float32',
    },
)

# Product rows of the train orders.
order_products_train = pd.read_csv(
    f'{root}order_products__train.csv',
    dtype={
        'order_id': 'int32',
        'product_id': 'uint16',
        'add_to_cart_order': 'int16',
        'reordered': 'int8',
    },
)

# Product rows of the prior orders, read with pandas' nullable integer dtypes.
order_products_prior = pd.read_csv(
    f'{root}order_products__prior.csv',
    dtype={
        'order_id': 'Int32',
        'product_id': 'Int32',
        'add_to_cart_order': 'Int16',
        'reordered': 'Int8',
    },
)

# Pre-computed feature tables stored as pickles.
product_features = pd.read_pickle(f'{root}product_features.pkl')
user_features = pd.read_pickle(f'{root}user_features.pkl')
user_product_features = pd.read_pickle(f'{root}user_product_features.pkl')

# Lookup tables, read with default dtypes.
products = pd.read_csv(f'{root}products.csv')
aisles = pd.read_csv(f'{root}aisles.csv')
departments = pd.read_csv(f'{root}departments.csv')



```
# This is formatted as code
```

Merge the train order products with the orders table (inner join on `order_id`).


In [None]:
# Attach the order-level columns to every product row of the train orders.
train_orders = pd.merge(orders, order_products_train, how='inner', on='order_id')
train_orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
0,1187899,1,train,11,4,8,14.0,196,1,1
1,1187899,1,train,11,4,8,14.0,25133,2,1
2,1187899,1,train,11,4,8,14.0,38928,3,1
3,1187899,1,train,11,4,8,14.0,26405,4,1
4,1187899,1,train,11,4,8,14.0,39657,5,1


In [None]:
# Remove the columns that are not carried into the final dataset.
# The result is reassigned (rather than dropped with inplace=True) and missing
# labels are ignored, so re-running this cell does not raise KeyError once the
# columns are already gone.
train_orders = train_orders.drop(
    columns=['eval_set', 'add_to_cart_order', 'order_id'],
    errors='ignore',
)




```
# This is formatted as code
```

Unique user IDs in the train data

In [None]:
# Distinct user ids present in the train orders; show the first ten.
train_users = train_orders['user_id'].unique()
train_users[:10]

array([ 1,  2,  5,  7,  8,  9, 10, 13, 14, 17])

Keep only the rows of `user_product_features` that belong to train users

In [None]:
# Size of the user-product feature table before it is restricted to train users.
user_product_features.shape


(7788811, 11)

In [None]:
# Boolean mask of feature rows whose user has a train order, then select those rows.
is_train_user = user_product_features['user_id'].isin(train_users)
df = user_product_features.loc[is_train_user]
df.head()

Unnamed: 0,user_id,product_id,total_product_orders_by_user,total_product_reorders_by_user,user_product_reorder_percentage,avg_add_to_cart_by_user,avg_days_since_last_bought,last_ordered_in,is_reorder_3,is_reorder_2,is_reorder_1
0,1,196,3,3,1.0,1.0,23.0,7,1.0,1.0,1.0
1,1,10258,3,3,1.0,2.666667,23.0,7,1.0,1.0,1.0
2,1,10326,1,0,0.0,5.0,28.0,5,0.0,0.0,0.0
3,1,12427,3,3,1.0,2.333333,23.0,7,1.0,1.0,1.0
4,1,13032,1,1,1.0,5.0,20.0,7,1.0,0.0,0.0


In [None]:
# Outer join on (user_id, product_id): keeps feature rows with no matching
# train-order row as well as train-order rows with no matching feature row.
df = pd.merge(df, train_orders, how='outer', on=['user_id', 'product_id'])
df.head()

Unnamed: 0,user_id,product_id,total_product_orders_by_user,total_product_reorders_by_user,user_product_reorder_percentage,avg_add_to_cart_by_user,avg_days_since_last_bought,last_ordered_in,is_reorder_3,is_reorder_2,is_reorder_1,order_number,order_dow,order_hour_of_day,days_since_prior_order,reordered
0,1,196,3,3,1.0,1.0,23.0,7.0,1.0,1.0,1.0,11.0,4.0,8.0,14.0,1.0
1,1,10258,3,3,1.0,2.666667,23.0,7.0,1.0,1.0,1.0,11.0,4.0,8.0,14.0,1.0
2,1,10326,1,0,0.0,5.0,28.0,5.0,0.0,0.0,0.0,,,,,
3,1,12427,3,3,1.0,2.333333,23.0,7.0,1.0,1.0,1.0,,,,,
4,1,13032,1,1,1.0,5.0,20.0,7.0,1.0,0.0,0.0,11.0,4.0,8.0,14.0,1.0


For `order_number`, `order_dow`, `order_hour_of_day` and `days_since_prior_order`, impute the null values with the per-user mean, because these rows are also potential candidates for the order.

In [None]:
# The order-level columns are NaN on rows that had no matching train-order row
# in the outer merge; fill them with the per-user mean of the non-null values.
#
# The result is assigned back to df instead of calling
# `df.<col>.fillna(..., inplace=True)`: that chained in-place form emits a
# FutureWarning and, per the warning, no longer modifies df in pandas 3.0.
order_level_cols = [
    'order_number',
    'order_dow',
    'order_hour_of_day',
    'days_since_prior_order',
]
# One groupby for all four columns; the result is index-aligned with df.
user_means = df.groupby('user_id')[order_level_cols].transform('mean')
df[order_level_cols] = df[order_level_cols].fillna(user_means)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.order_number.fillna(df.groupby('user_id')['order_number'].transform('mean'), inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.order_dow.fillna(df.groupby('user_id')['order_dow'].transform('mean'), inplace = True)
The behavior will change in pandas 3.0. This i



```
# This is formatted as code
```

Remove the products that a user bought for the first time in their last (train) order, i.e. the rows with `reordered == 0`.



In [None]:
# Distribution of the 'reordered' flag; missing values are not included in these counts.
df.reordered.value_counts()

Unnamed: 0_level_0,count
reordered,Unnamed: 1_level_1
1.0,828824
0.0,555793


In [None]:
# Number of rows with no 'reordered' value, i.e. rows with no matching train-order row.
df.reordered.isnull().sum()


4342548

In [None]:
# Discard rows whose 'reordered' flag is exactly 0. Rows with a missing flag
# compare as "not equal to 0" and are therefore kept.
df = df.loc[df['reordered'].ne(0)]


In [None]:
# Size of the dataset after removing the rows with reordered == 0.
df.shape

(5171372, 16)

In [None]:
# Rows with no 'reordered' value had no matching train-order row, so they get 0.
# Build the column with assign() instead of `df.reordered.fillna(0, inplace=True)`:
# the chained in-place call emits a FutureWarning and, per the warning, no longer
# modifies df in pandas 3.0.
df = df.assign(reordered=df['reordered'].fillna(0))

# Remaining null counts per column.
df.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.reordered.fillna(0, inplace = True)


Unnamed: 0,0
user_id,0
product_id,0
total_product_orders_by_user,213844
total_product_reorders_by_user,213844
user_product_reorder_percentage,213844
avg_add_to_cart_by_user,213844
avg_days_since_last_bought,213844
last_ordered_in,213844
is_reorder_3,213844
is_reorder_2,213844


In [None]:
df.head()

Unnamed: 0,user_id,product_id,total_product_orders_by_user,total_product_reorders_by_user,user_product_reorder_percentage,avg_add_to_cart_by_user,avg_days_since_last_bought,last_ordered_in,is_reorder_3,is_reorder_2,is_reorder_1,order_number,order_dow,order_hour_of_day,days_since_prior_order,reordered
0,1,196,3,3,1.0,1.0,23.0,7.0,1.0,1.0,1.0,11.0,4.0,8.0,14.0,1.0
1,1,10258,3,3,1.0,2.666667,23.0,7.0,1.0,1.0,1.0,11.0,4.0,8.0,14.0,1.0
2,1,10326,1,0,0.0,5.0,28.0,5.0,0.0,0.0,0.0,11.0,4.0,8.0,14.0,0.0
3,1,12427,3,3,1.0,2.333333,23.0,7.0,1.0,1.0,1.0,11.0,4.0,8.0,14.0,0.0
4,1,13032,1,1,1.0,5.0,20.0,7.0,1.0,0.0,0.0,11.0,4.0,8.0,14.0,1.0


Merging product and user features

In [None]:
# Preview the product-level feature table (joined on product_id below).
product_features.head()

Unnamed: 0,product_id,order_first_time_total_cnt,order_second_time_total_cnt,second_time_percent,aisle_mean_add_to_cart_order,aisle_std_add_to_cart_order,aisle_total_orders,aisle_total_reorders,aisle_reorder_percentage,aisle_unique_users,...,department_std_add_to_cart_order,department_total_orders,department_total_reorders,department_reorder_percentage,department_unique_users,department_0,department_1,department_2,department_3,department_4
0,1,323.0,472.0,1.4613,9.287208,7.820526,101195,55495,0.548397,36061,...,7.681034,1252447,719323,0.574334,144403,0,0,0,0,1
1,2,35.0,5.0,0.142857,9.973083,7.949225,91245,13970,0.153104,45756,...,7.869214,811451,282131,0.347687,138915,0,0,0,1,0
2,3,31.0,81.0,2.612903,8.555624,7.915439,108199,56907,0.525948,34687,...,6.7162,1164495,760722,0.653263,144648,0,0,0,1,1
3,4,70.0,71.0,1.014286,9.237562,7.473914,169341,94357,0.557201,41090,...,7.392699,969271,525355,0.54201,133681,0,0,1,0,0
4,5,1.0,6.0,6.0,10.282027,8.245271,27313,7644,0.279867,18119,...,7.869214,811451,282131,0.347687,138915,0,0,0,1,0


In [None]:
# Preview the user-level feature table (joined on user_id below).
user_features.head()

Unnamed: 0,user_id,avg_dow,std_dow,avg_doh,std_doh,avg_since_order,std_since_order,total_orders_by_user,total_products_by_user,total_unique_product_by_user,total_reorders_by_user,reorder_propotion_by_user,average_order_size,reorder_in_order,orders_3,orders_2,orders_1,reorder_3,reorder_2,reorder_1
0,1,2.888889,1.278275,12.5,2.572479,23.833334,3.853951,3,18,10,13,0.722222,6.0,0.741667,5.0,8.0,5.0,0.6,0.625,1.0
1,2,1.733333,0.746528,10.755556,2.062567,16.9,8.784627,6,90,55,52,0.577778,15.0,0.587457,9.0,15.0,16.0,0.888889,0.8,0.625
2,3,1.694444,1.166667,16.305556,2.025825,9.111111,6.390966,5,36,19,19,0.527778,7.2,0.633333,6.0,5.0,6.0,0.833333,1.0,1.0
3,4,4.222222,0.440959,14.555556,0.881917,19.666666,2.645751,2,9,9,1,0.111111,4.5,0.071429,7.0,2.0,,0.142857,0.0,
4,5,1.0,0.0,18.0,0.0,19.0,0.0,1,12,12,8,0.666667,12.0,0.666667,12.0,,,0.666667,,


In [None]:
# Left joins keep every row of df and add product-level, then user-level features.
df = (
    df
    .merge(product_features, how='left', on='product_id')
    .merge(user_features, how='left', on='user_id')
)
df.head()

Unnamed: 0,user_id,product_id,total_product_orders_by_user,total_product_reorders_by_user,user_product_reorder_percentage,avg_add_to_cart_by_user,avg_days_since_last_bought,last_ordered_in,is_reorder_3,is_reorder_2,...,total_reorders_by_user,reorder_propotion_by_user,average_order_size,reorder_in_order,orders_3,orders_2,orders_1,reorder_3,reorder_2,reorder_1
0,1,196,3,3,1.0,1.0,23.0,7.0,1.0,1.0,...,13,0.722222,6.0,0.741667,5.0,8.0,5.0,0.6,0.625,1.0
1,1,10258,3,3,1.0,2.666667,23.0,7.0,1.0,1.0,...,13,0.722222,6.0,0.741667,5.0,8.0,5.0,0.6,0.625,1.0
2,1,10326,1,0,0.0,5.0,28.0,5.0,0.0,0.0,...,13,0.722222,6.0,0.741667,5.0,8.0,5.0,0.6,0.625,1.0
3,1,12427,3,3,1.0,2.333333,23.0,7.0,1.0,1.0,...,13,0.722222,6.0,0.741667,5.0,8.0,5.0,0.6,0.625,1.0
4,1,13032,1,1,1.0,5.0,20.0,7.0,1.0,0.0,...,13,0.722222,6.0,0.741667,5.0,8.0,5.0,0.6,0.625,1.0


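The dataframe still has null values: the user–product feature columns are null for (user, product) pairs from the train order that have no row in `user_product_features`, and some user-level columns (e.g. `orders_1`, `reorder_1`) are already null in `user_features`.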



In [None]:
# Size of the dataset after adding the product and user features.
df.shape

(5171372, 63)

In [None]:
# Null count per column, largest first.
df.isnull().sum().sort_values(ascending = False)


Unnamed: 0,0
reorder_1,591220
orders_1,591220
avg_days_since_last_bought,213844
is_reorder_2,213844
is_reorder_3,213844
...,...
days_since_prior_order,0
order_hour_of_day,0
order_dow,0
order_number,0


In [None]:
# Persist the assembled dataset under the same base directory as the inputs.
df.to_pickle(f'{root}Finaldata.pkl')

In [None]:
# Read the saved pickle back and preview it.
df2 = pd.read_pickle(f'{root}Finaldata.pkl')
df2.head()

Unnamed: 0,user_id,product_id,total_product_orders_by_user,total_product_reorders_by_user,user_product_reorder_percentage,avg_add_to_cart_by_user,avg_days_since_last_bought,last_ordered_in,is_reorder_3,is_reorder_2,...,total_reorders_by_user,reorder_propotion_by_user,average_order_size,reorder_in_order,orders_3,orders_2,orders_1,reorder_3,reorder_2,reorder_1
0,1,196,3,3,1.0,1.0,23.0,7.0,1.0,1.0,...,13,0.722222,6.0,0.741667,5.0,8.0,5.0,0.6,0.625,1.0
1,1,10258,3,3,1.0,2.666667,23.0,7.0,1.0,1.0,...,13,0.722222,6.0,0.741667,5.0,8.0,5.0,0.6,0.625,1.0
2,1,10326,1,0,0.0,5.0,28.0,5.0,0.0,0.0,...,13,0.722222,6.0,0.741667,5.0,8.0,5.0,0.6,0.625,1.0
3,1,12427,3,3,1.0,2.333333,23.0,7.0,1.0,1.0,...,13,0.722222,6.0,0.741667,5.0,8.0,5.0,0.6,0.625,1.0
4,1,13032,1,1,1.0,5.0,20.0,7.0,1.0,0.0,...,13,0.722222,6.0,0.741667,5.0,8.0,5.0,0.6,0.625,1.0
