In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
from tqdm.notebook import tqdm
from sklearn.calibration import CalibratedClassifierCV
import xgboost as xgb

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

import lightgbm
from sklearn.model_selection import RandomizedSearchCV

%matplotlib inline 

pd.options.mode.chained_assignment = None
#!pip3 install dask[complete]
#import dask.dataframe as dd

### Merging datasets here to form large datasets master_prior and master_train

In [7]:
# Lookup tables read from the working directory; they are joined below on
# 'aisle_id' and 'department_id'.
products = pd.read_csv("products.csv")
aisles = pd.read_csv("aisles.csv")
departments = pd.read_csv("departments.csv")

In [None]:
# Attach the aisle row and then the department row to every product
# (default inner joins on the shared id columns).
Products_aisles_departments = (
    products
    .merge(aisles, on='aisle_id')
    .merge(departments, on='department_id')
)

In [5]:
# Raw order tables. They must be loaded here: no earlier cell defines them, so
# on a fresh kernel the merges below would otherwise fail with a NameError.
orders=pd.read_csv("orders.csv")
order_products__prior=pd.read_csv("order_products__prior.csv")
order_products__train=pd.read_csv("order_products__train.csv")

# Attach the order header (user, order number, day, hour, ...) to every
# ordered product line, separately for the prior and the train line items.
orders_and_prior=pd.merge(orders,order_products__prior,on='order_id')
orders_and_train=pd.merge(orders,order_products__train,on='order_id')

# Add product / aisle / department information to every line.
master_train=pd.merge(orders_and_train,Products_aisles_departments,on='product_id')
master_prior=pd.merge(orders_and_prior,Products_aisles_departments,on='product_id')

In [7]:
# Persist the merged tables (without the index) so later sessions can read
# them back instead of repeating the merges.
master_prior.to_csv('master_prior.csv',index=False)
master_train.to_csv('master_train.csv',index=False)

### Merged data is saved and then loaded in csv format

In [2]:
master_train=pd.read_csv('master_train.csv')
master_prior=pd.read_csv('master_prior.csv')
# `orders` is read directly by later cells (the train/test user split and the
# feature-building functions), so it has to be loaded together with the merged
# tables; leaving it commented out makes those cells fail on a fresh kernel.
orders=pd.read_csv("orders.csv")
# The raw line-item tables are only needed to rebuild the merged tables:
#order_products__prior=pd.read_csv("order_products__prior.csv")
#order_products__train=pd.read_csv("order_products__train.csv")

### Loading the features created

In [2]:
# Previously generated feature tables, read from the working directory.
# NOTE(review): the cells that write these two files are not visible in this
# part of the notebook.
final_78_features=pd.read_csv('final_78_features.csv')
feature_data_test=pd.read_csv('feature_data_test.csv')

## test and train users

In [10]:
# Distinct user ids that own an order flagged eval_set == 'test'.
test_users = orders.loc[orders['eval_set'] == 'test', 'user_id'].unique()

In [6]:
# Distinct user ids that own an order flagged eval_set == 'train'.
train_users = orders.loc[orders['eval_set'] == 'train', 'user_id'].unique()

## Data cleaning and preprocessing 

In [3]:
# True when at least one cell anywhere in master_prior is missing.
master_prior.isna().to_numpy().any()

True

In [6]:
# Same check with 'days_since_prior_order' left out, to see whether any other
# column contains missing values.
master_prior.drop(columns='days_since_prior_order').isna().to_numpy().any()

False

###### It is clear that only 'days_since_prior_order' contains null values. These null values occur on a user's first order, which has no prior order. I will take care of this while featurizing the data.

In [11]:
# Render floats with five decimals in displayed frames.
pd.set_option('display.float_format', '{:.5f}'.format)

In [14]:
# Summary statistics of the order-level numeric columns, computed on the rows
# that have no missing value.
(
    master_prior
    .dropna(axis=0)
    .loc[:, ['order_number','add_to_cart_order','days_since_prior_order','order_hour_of_day','order_dow']]
    .describe()
)


Unnamed: 0,order_number,add_to_cart_order,days_since_prior_order,order_hour_of_day,order_dow
count,30356421.0,30356421.0,30356421.0,30356421.0,30356421.0
mean,18.24706,8.35415,11.10407,13.41125,2.74078
std,17.5917,7.13354,8.77891,4.24682,2.08792
min,2.0,1.0,0.0,0.0,0.0
25%,6.0,3.0,5.0,10.0,1.0
50%,12.0,6.0,8.0,13.0,3.0
75%,25.0,11.0,15.0,16.0,5.0
max,99.0,145.0,30.0,23.0,6.0


#### The data looks clean: the only missing values are in 'days_since_prior_order', and they will be handled during feature engineering. There do not appear to be any outliers apart from add_to_cart_order, whose maximum is 145; we keep these rows because an add_to_cart_order of 145 is a possible value.

### Featurization starts from here

In [11]:
# Number of prior line items per product, aisle and department.
product_name_count, aisle_count, department_count = (
    master_prior[key].value_counts()
    for key in ('product_id', 'aisle_id', 'department_id')
)

# The same counts expressed as a share of all prior line items.
product_freq_ratio = product_name_count.div(len(master_prior))
aisle_freq_ratio = aisle_count.div(len(master_prior))
department_freq_ratio = department_count.div(len(master_prior))

In [12]:
# Mean of the 'reordered' flag per department, aisle and product; the group key
# is moved back into a regular column by reset_index().
dep_reorder_ratio = master_prior.groupby("department_id")["reordered"].mean().reset_index()
aisle_reorder_ratio = master_prior.groupby("aisle_id")["reordered"].mean().reset_index()
product_reorder_ratio = master_prior.groupby("product_id")["reordered"].mean().reset_index()

In [13]:
# Index each ratio table by its id so a row can be fetched with .loc[<id>].
# Each step is guarded so the cell can be re-run safely: once the id column has
# become the index, calling set_index() again would raise a KeyError.
if 'department_id' in dep_reorder_ratio.columns:
    dep_reorder_ratio=dep_reorder_ratio.set_index('department_id')
if 'aisle_id' in aisle_reorder_ratio.columns:
    aisle_reorder_ratio=aisle_reorder_ratio.set_index('aisle_id')
if 'product_id' in product_reorder_ratio.columns:
    product_reorder_ratio=product_reorder_ratio.set_index('product_id')

In [14]:
# Mean 'reordered' flag for every add-to-cart position (Series indexed by position).
reorder_ratio_cart_universal = master_prior.groupby("add_to_cart_order")["reordered"].mean()
# Mean add-to-cart position of every product (one-column frame indexed by product_id).
product_add_order_universal = master_prior.groupby("product_id")[["add_to_cart_order"]].mean()

In [15]:
# Mean 'reordered' flag for every hour of the day (Series indexed by hour).
all_user_reorder_ratio_hour = master_prior.groupby("order_hour_of_day")["reordered"].mean()
# Mean ordering hour of every product (one-column frame indexed by product_id).
product_ordering_hour = master_prior.groupby("product_id")[["order_hour_of_day"]].mean()

In [8]:
# Narrow view of the train line items: only the columns named below.
mini_master_train = master_train.loc[
    :,
    ['user_id','product_id','order_number','order_dow','order_hour_of_day','days_since_prior_order','reordered'],
]

### Splitting train and test users for multiprocessing

In [13]:
# Cut the train user ids into eight slices for parallel processing: seven
# slices of 6300 ids followed by one slice holding everything from 44100 on.
# NOTE(review): `train_user2` is not defined in any visible cell (only
# `train_users` is) - confirm where it comes from before running this cell.
train_users_list = [train_user2[start:start + 6300] for start in range(0, 44100, 6300)]
train_users_list.append(train_user2[44100:])

In [17]:
# Cut the test user ids into eight slices for parallel processing: seven
# slices of 9400 ids followed by one slice holding everything from 65800 on.
test_users_list = [test_users[start:start + 9400] for start in range(0, 65800, 9400)]
test_users_list.append(test_users[65800:])

### Process to create the features; The below firstprocess function is created to design features and to use with multiprocessing

### 1st set of features

<ol>
<li>max_order_number = latest order number ordered by the user </li>
<li>avg_products_per_order5 = average number of products ordered in user's last five orders</li>
<li>avg_products_per_ordered = average number of products per order </li>
<li>avg_times_product = average number of times the user ordered each product</li>
<li>max_times_product = maximum number of times any single product was ordered by the user</li>
<li>max_reorder_any_product = highest share of the user's last five orders that contain any single product (the number of those orders containing the product, divided by 5)</li>
    
<li>reorder_ratio = share of all the user's prior purchases that were reorders</li>
    
<li>product_add_order_universal = the product's average add-to-cart position across all users</li>
<li>reorder_ratio_cart_universal = the reorder ratio, across all users, for a particular add-to-cart position</li>
    
<li>product_add_order_local = the product's mean add-to-cart position for a particular user</li>
<li>reorder_ratio_cart_local = the reorder ratio of a particular user for a particular add-to-cart position</li>
    
<li>product_time_order = Usually at what time product is ordered for all users</li>
<li>order_dow = what day of the week user ordered</li>
<li> order_hour_of_day = In which hour user ordered</li>
<li> days_since_prior_order = how many days elapsed since last order</li>
    
<li>reorder_ratio_user_hour = the user's reorder ratio at a particular hour of the day</li>
<li>reorder_ratio_all_user_hour = the reorder ratio of all users at a particular hour of the day</li>
<li>reorder_ratio_user_week = the user's reorder ratio on a particular day of the week</li>
<li>order_ratio = fraction of the user's total orders that contain the product</li>

<li>department_id of the product</li>
<li>aisle_id of the product</li>
<li>pf = product frequency, i.e. the ratio of a particular product's count to the count of all products ordered</li>
<li>af = aisle frequency (defined like product frequency)</li>
<li>df = department frequency (defined like product frequency)</li>
<li>dep_reorder_ratio = reorder ratio of a particular department across all users</li>
<li>aisle_reorder_ratio = reorder ratio of a particular aisle across all users</li>
<li>product_reorder_ratio = reorder ratio of a particular product across all users</li>
<li>ordered_last_5 = how many times user ordered product in his last five orders</li>
<li>atco = product's average add to cart order in users last 5 orders</li>
<li>dspo = number of days it took to order this product from previous order of any product</li>
<li>ohod = average order hour in last few orders</li>
<li>odow = average day of week order was placed  in last few orders</li>
<li>max_no_purchase = maximum number of days spent without buying, by the user.</li>
<li>tot_chance_buy = number of orders the user has placed starting from the order in which the product was bought for the first time</li>
<li>median_n5 = median number of days the user has gone without buying the product after the previous order</li>
<li>order_steak_5 = 'bin1','bin2','bin3','bin4','bin5' = binary 'ordered' / 'not ordered' flags for the product in the user's last five orders</li>
<li>bin2dec = decimal representation of the previous feature (order_steak_5)</li>
</ol>



## Feature_creating_machine for Train set1 

In [17]:
def firstprocess(train_users,i):
    """Build the first feature set (plus label) for the given train users.

    For every user id the latest row of that user in ``orders`` supplies the
    day-of-week / hour / days-since-prior-order context, and the user's rows in
    ``master_prior`` supply the purchase history.  One feature row is produced
    for every distinct product in that history.  The label ``reordered`` is 1
    when the same (user, product) pair occurs in ``mini_master_train`` and 0
    otherwise.  All rows are written to ``'featured_data<i>.csv'``.

    Reads these notebook globals: ``orders``, ``master_prior``,
    ``mini_master_train``, ``product_add_order_universal``,
    ``reorder_ratio_cart_universal``, ``product_ordering_hour``,
    ``all_user_reorder_ratio_hour``, ``product_freq_ratio``,
    ``aisle_freq_ratio``, ``department_freq_ratio``, ``dep_reorder_ratio``,
    ``aisle_reorder_ratio``, ``product_reorder_ratio``, ``columns37``, ``tqdm``.

    NOTE(review): this notebook defines ``firstprocess`` again in later cells;
    only the most recently executed definition is live.

    Parameters
    ----------
    train_users : iterable
        User ids to process.
    i : object
        Converted with ``str()`` and appended to the output file name.

    Returns
    -------
    None
        The result is written to disk, nothing is returned.
    """
    def _scalar(value):
        # .loc on a one-column frame yields a one-element Series; unwrap it
        # explicitly because int()/float() on such a Series is deprecated.
        return np.asarray(value).reshape(-1)[0]

    def _nan_to_zero(value):
        # Missing statistics are stored as 0.
        return 0 if pd.isna(value) else value

    # Every row built below ends with the label, while columns37 as defined in
    # this notebook carries no label column; add it so names and values line up.
    output_columns=list(columns37)
    if 'reordered' not in output_columns:
        output_columns.append('reordered')

    row_data=[]
    for uid in tqdm(train_users):
        # Context of the user's latest order (filter `orders` once per user).
        user_orders=orders[orders['user_id']==uid]
        max_order=user_orders['order_number'].max()
        new_temp_orders=user_orders[user_orders['order_number']==max_order]
        order_dow = int(new_temp_orders['order_dow'].iloc[0])
        order_hour_of_day = int(new_temp_orders['order_hour_of_day'].iloc[0])
        days_since_prior_order = int(new_temp_orders['days_since_prior_order'].iloc[0])

        oneuser=master_prior[master_prior['user_id']==uid]
        pids = pd.unique(oneuser['product_id'])

        # Products present in the user's train rows, looked up once per user
        # instead of scanning mini_master_train again for every product.
        train_products=set(mini_master_train.loc[mini_master_train['user_id']==uid,'product_id'])

        # Products per order, sorted by order number so that [-5:] really is
        # the five most recent orders regardless of row order.
        products_per_order=oneuser['order_number'].value_counts(sort=False).sort_index()
        max_order_number = oneuser['order_number'].max()
        avg_products_per_order5=products_per_order.values[-5:].mean()
        avg_pro_order=products_per_order.mean()

        times_per_product=oneuser['product_id'].value_counts()
        avg_times_pro=times_per_product.mean()
        max_times_pro = times_per_product.max()

        # Rows of the last five orders; per product, count of those rows / 5.
        latest=oneuser[oneuser['order_number']>max_order_number-5]
        max_reorder_ratio=(latest[['product_id','order_number']].groupby(['product_id']).count()/5).values.max()
        # Share of the user's prior rows whose 'reordered' flag is non-zero.
        reorder_ratio=np.count_nonzero((oneuser['reordered']))/oneuser['reordered'].shape[0]

        # Per-user lookup tables.
        product_add_order_local  = oneuser[['product_id','add_to_cart_order']].groupby(['product_id']).aggregate("mean")
        reorder_ratio_cart_local = oneuser.groupby(["add_to_cart_order"])["reordered"].aggregate("mean")
        one_user_reorder_ratio_hour = oneuser.groupby(["order_hour_of_day"])["reordered"].aggregate("mean")
        one_user_reorder_ratio_week = oneuser.groupby(["order_dow"])["reordered"].aggregate("mean")

        user_tot_order=np.arange(1,max_order_number+1)

        # Ratios at the hour / weekday of the latest order; 0 when the user has
        # no prior order at that hour / on that weekday.
        if order_hour_of_day in one_user_reorder_ratio_hour.index:
            reorder_ratio_user_hour=one_user_reorder_ratio_hour.loc[order_hour_of_day]
        else:
            reorder_ratio_user_hour=0
        reorder_ratio_all_user_hour = all_user_reorder_ratio_hour.loc[order_hour_of_day]
        if order_dow in one_user_reorder_ratio_week.index:
            reorder_ratio_user_week = one_user_reorder_ratio_week.loc[order_dow]
        else:
            reorder_ratio_user_week = 0

        for pid in pids:
            oneuserproduct=oneuser[oneuser['product_id']==pid]

            # '1'/'0' flag per order number: did that order contain the product?
            user_pro_order=set(pd.unique(oneuserproduct['order_number']))
            order_steak=['1' if order_no in user_pro_order else '0' for order_no in user_tot_order]

            # Flags of the last five orders, left-padded with '0' to exactly
            # five entries so every row has the same number of bin columns.
            order_steak_5=order_steak[-5:]
            order_steak_5=['0']*(5-len(order_steak_5))+order_steak_5
            bin2dec=int(''.join(order_steak_5),2)

            # Add-to-cart position features (the mean position is truncated to int).
            p_add_order_universal = int(_scalar(product_add_order_universal.loc[pid]))
            reorder_ratio_universal = reorder_ratio_cart_universal.loc[p_add_order_universal]

            p_add_order_local = int(_scalar(product_add_order_local.loc[pid]))
            if p_add_order_local in reorder_ratio_cart_local.index:
                reorder_ratio_local = reorder_ratio_cart_local.loc[p_add_order_local]
            else:
                reorder_ratio_local=0
            product_time_order = int(_scalar(product_ordering_hour.loc[pid]))

            # Product features.
            order_ratio=len(user_pro_order)/len(user_tot_order)

            department_id=oneuserproduct['department_id'].iloc[0]
            aisle_id=oneuserproduct['aisle_id'].iloc[0]

            pf=product_freq_ratio[pid]
            af=aisle_freq_ratio[aisle_id]
            dept_freq=department_freq_ratio[department_id]

            order_steak_5=[int(flag) for flag in order_steak_5]
            ordered_last_5=np.count_nonzero(order_steak_5)

            # Statistics over the last five rows of this user/product pair.
            atco=oneuserproduct['add_to_cart_order'].tail().mean()
            recent_days=oneuserproduct['days_since_prior_order'].tail()
            dspo=recent_days.mean()
            ohod=oneuserproduct['order_hour_of_day'].tail().mean()
            odow=oneuserproduct['order_dow'].tail().mean()
            max_no_purchase=recent_days.max()

            # Orders placed from the first purchase of the product onwards.
            tot_chance_buy=len(order_steak)-order_steak.index('1')

            # np.median yields NaN as soon as one value is missing (stored as 0 below).
            median_n5=np.median(recent_days)

            reordered = 1 if pid in train_products else 0

            feature=[
                uid,
                pid,
                # user features
                max_order_number,
                avg_products_per_order5,
                np.round(avg_pro_order,4),
                np.round(avg_times_pro,4),
                max_times_pro,
                np.round(max_reorder_ratio,4),
                np.round(reorder_ratio,3),
                p_add_order_universal,
                np.round(reorder_ratio_universal,3),
                p_add_order_local,
                np.round(reorder_ratio_local,4),
                product_time_order,
                order_dow,
                order_hour_of_day,
                days_since_prior_order,
                np.round(reorder_ratio_user_hour,3),
                np.round(reorder_ratio_all_user_hour,3),
                np.round(reorder_ratio_user_week,3),
                # product features
                np.round(order_ratio,4),
                department_id,
                aisle_id,
                np.round(pf,3),
                np.round(af,3),
                np.round(dept_freq,3),
                np.round(float(_scalar(dep_reorder_ratio.loc[department_id])),4),
                np.round(float(_scalar(aisle_reorder_ratio.loc[aisle_id])),4),
                np.round(float(_scalar(product_reorder_ratio.loc[pid])),4),
                ordered_last_5,
                np.round(atco,4),
                0 if pd.isna(dspo) else np.round(dspo,4),
                np.round(ohod,4),
                np.round(odow,4),
                _nan_to_zero(max_no_purchase),
                tot_chance_buy,
                _nan_to_zero(median_n5),
            ]
            feature.extend(order_steak_5)
            feature.append(bin2dec)
            feature.append(reordered)

            row_data.append(feature)
    temp=pd.DataFrame(row_data,columns=output_columns)
    temp.to_csv('featured_data'+str(i)+'.csv',index=False)

### 1st set of features for test

In [3]:
# Names of the id columns and the user / cart-position features that open
# every feature row.
columns12 = [
    'user_id', 'product_id',
    'max_order_number',
    'avg_products_per_ordered_5', 'avg_products_per_ordered',
    'avg_times_product', 'max_times_product',
    'max_reorder_any_product', 'total_reorder_ratio',
    'product_add_order_universal', 'reorder_ratio_cart_universal',
    'product_add_order_local', 'reorder_ratio_cart_local',
    'product_time_order',
]

# Names of the order-context, product and order-streak features that follow.
columns25 = [
    'order_dow', 'order_hour_of_day', 'days_since_prior_order',
    'reorder_ratio_user_hour', 'reorder_ratio_all_user_hour', 'reorder_ratio_user_week',
    'order_ratio', 'department_id', 'aisle_id',
    'pf', 'af', 'df',
    'dep_reorder_ratio', 'aisle_reorder_ratio', 'product_reorder_ratio',
    'ordered_last_5', 'atco', 'dspo', 'ohod', 'odow',
    'max_no_purchase', 'tot_chance_buy', 'median_n5',
    'bin1', 'bin2', 'bin3', 'bin4', 'bin5',
    'bin2dec_order_steak5',
]

# Full header: first list followed by the second.
columns37 = [*columns12, *columns25]

In [18]:
def firstprocess(test_users,i):
    """Build the first feature set (no label) for the given test users.

    For every user id the latest row of that user in ``orders`` supplies the
    day-of-week / hour / days-since-prior-order context, and the user's rows in
    ``master_prior`` supply the purchase history.  One feature row is produced
    for every distinct product in that history.  All rows are written to
    ``'featured_data_test<i>.csv'``.

    Reads these notebook globals: ``orders``, ``master_prior``,
    ``product_add_order_universal``, ``reorder_ratio_cart_universal``,
    ``product_ordering_hour``, ``all_user_reorder_ratio_hour``,
    ``product_freq_ratio``, ``aisle_freq_ratio``, ``department_freq_ratio``,
    ``dep_reorder_ratio``, ``aisle_reorder_ratio``, ``product_reorder_ratio``,
    ``columns37``, ``tqdm``.

    NOTE(review): this notebook defines ``firstprocess`` in several cells;
    only the most recently executed definition is live.

    Parameters
    ----------
    test_users : iterable
        User ids to process.
    i : object
        Converted with ``str()`` and appended to the output file name.

    Returns
    -------
    None
        The result is written to disk, nothing is returned.
    """
    def _scalar(value):
        # .loc on a one-column frame yields a one-element Series; unwrap it
        # explicitly because int()/float() on such a Series is deprecated.
        return np.asarray(value).reshape(-1)[0]

    def _nan_to_zero(value):
        # Missing statistics are stored as 0.
        return 0 if pd.isna(value) else value

    # Test rows carry no label, so a label name must not appear in the header.
    output_columns=[name for name in columns37 if name != 'reordered']

    row_data=[]
    for uid in tqdm(test_users):
        # Context of the user's latest order (filter `orders` once per user).
        user_orders=orders[orders['user_id']==uid]
        max_order=user_orders['order_number'].max()
        new_temp_orders=user_orders[user_orders['order_number']==max_order]
        order_dow = int(new_temp_orders['order_dow'].iloc[0])
        order_hour_of_day = int(new_temp_orders['order_hour_of_day'].iloc[0])
        days_since_prior_order = int(new_temp_orders['days_since_prior_order'].iloc[0])

        oneuser=master_prior[master_prior['user_id']==uid]
        pids = pd.unique(oneuser['product_id'])

        # Products per order, sorted by order number so that [-5:] really is
        # the five most recent orders regardless of row order.
        products_per_order=oneuser['order_number'].value_counts(sort=False).sort_index()
        max_order_number = oneuser['order_number'].max()
        avg_products_per_order5=products_per_order.values[-5:].mean()
        avg_pro_order=products_per_order.mean()

        times_per_product=oneuser['product_id'].value_counts()
        avg_times_pro=times_per_product.mean()
        max_times_pro = times_per_product.max()

        # Rows of the last five orders; per product, count of those rows / 5.
        latest=oneuser[oneuser['order_number']>max_order_number-5]
        max_reorder_ratio=(latest[['product_id','order_number']].groupby(['product_id']).count()/5).values.max()
        # Share of the user's prior rows whose 'reordered' flag is non-zero.
        reorder_ratio=np.count_nonzero((oneuser['reordered']))/oneuser['reordered'].shape[0]

        # Per-user lookup tables.
        reorder_ratio_cart_local = oneuser.groupby(["add_to_cart_order"])["reordered"].aggregate("mean")
        product_add_order_local  = oneuser[['product_id','add_to_cart_order']].groupby(['product_id']).aggregate("mean")
        one_user_reorder_ratio_hour = oneuser.groupby(["order_hour_of_day"])["reordered"].aggregate("mean")
        one_user_reorder_ratio_week = oneuser.groupby(["order_dow"])["reordered"].aggregate("mean")

        user_tot_order=np.arange(1,max_order_number+1)

        # Ratios at the hour / weekday of the latest order; 0 when the user has
        # no prior order at that hour / on that weekday.
        if order_hour_of_day in one_user_reorder_ratio_hour.index:
            reorder_ratio_user_hour=one_user_reorder_ratio_hour.loc[order_hour_of_day]
        else:
            reorder_ratio_user_hour=0
        reorder_ratio_all_user_hour = all_user_reorder_ratio_hour.loc[order_hour_of_day]
        if order_dow in one_user_reorder_ratio_week.index:
            reorder_ratio_user_week = one_user_reorder_ratio_week.loc[order_dow]
        else:
            reorder_ratio_user_week = 0

        for pid in pids:
            oneuserproduct=oneuser[oneuser['product_id']==pid]

            # '1'/'0' flag per order number: did that order contain the product?
            user_pro_order=set(pd.unique(oneuserproduct['order_number']))
            order_steak=['1' if order_no in user_pro_order else '0' for order_no in user_tot_order]

            # Flags of the last five orders, left-padded with '0' to exactly
            # five entries so every row has the same number of bin columns.
            order_steak_5=order_steak[-5:]
            order_steak_5=['0']*(5-len(order_steak_5))+order_steak_5
            bin2dec=int(''.join(order_steak_5),2)

            # Add-to-cart position features (the mean position is truncated to int).
            p_add_order_universal = int(_scalar(product_add_order_universal.loc[pid]))
            reorder_ratio_universal = reorder_ratio_cart_universal.loc[p_add_order_universal]

            p_add_order_local = int(_scalar(product_add_order_local.loc[pid]))
            if p_add_order_local in reorder_ratio_cart_local.index:
                reorder_ratio_local = reorder_ratio_cart_local.loc[p_add_order_local]
            else:
                reorder_ratio_local=0
            product_time_order = int(_scalar(product_ordering_hour.loc[pid]))

            # Product features.
            order_ratio=len(user_pro_order)/len(user_tot_order)

            department_id=oneuserproduct['department_id'].iloc[0]
            aisle_id=oneuserproduct['aisle_id'].iloc[0]

            pf=product_freq_ratio[pid]
            af=aisle_freq_ratio[aisle_id]
            dept_freq=department_freq_ratio[department_id]

            order_steak_5=[int(flag) for flag in order_steak_5]
            ordered_last_5=np.count_nonzero(order_steak_5)

            # Statistics over the last five rows of this user/product pair.
            atco=oneuserproduct['add_to_cart_order'].tail().mean()
            recent_days=oneuserproduct['days_since_prior_order'].tail()
            dspo=recent_days.mean()
            ohod=oneuserproduct['order_hour_of_day'].tail().mean()
            odow=oneuserproduct['order_dow'].tail().mean()
            max_no_purchase=recent_days.max()

            # Orders placed from the first purchase of the product onwards.
            tot_chance_buy=len(order_steak)-order_steak.index('1')

            # np.median yields NaN as soon as one value is missing (stored as 0 below).
            median_n5=np.median(recent_days)

            feature=[
                uid,
                pid,
                # user features
                max_order_number,
                avg_products_per_order5,
                np.round(avg_pro_order,4),
                np.round(avg_times_pro,4),
                max_times_pro,
                np.round(max_reorder_ratio,4),
                np.round(reorder_ratio,3),
                p_add_order_universal,
                np.round(reorder_ratio_universal,3),
                p_add_order_local,
                np.round(reorder_ratio_local,4),
                product_time_order,
                order_dow,
                order_hour_of_day,
                days_since_prior_order,
                np.round(reorder_ratio_user_hour,3),
                np.round(reorder_ratio_all_user_hour,3),
                np.round(reorder_ratio_user_week,3),
                # product features
                np.round(order_ratio,4),
                department_id,
                aisle_id,
                np.round(pf,3),
                np.round(af,3),
                np.round(dept_freq,3),
                np.round(float(_scalar(dep_reorder_ratio.loc[department_id])),4),
                np.round(float(_scalar(aisle_reorder_ratio.loc[aisle_id])),4),
                np.round(float(_scalar(product_reorder_ratio.loc[pid])),4),
                ordered_last_5,
                np.round(atco,4),
                0 if pd.isna(dspo) else np.round(dspo,4),
                np.round(ohod,4),
                np.round(odow,4),
                _nan_to_zero(max_no_purchase),
                tot_chance_buy,
                _nan_to_zero(median_n5),
            ]
            feature.extend(order_steak_5)
            feature.append(bin2dec)

            row_data.append(feature)
    temp=pd.DataFrame(row_data,columns=output_columns)
    temp.to_csv('featured_data_test'+str(i)+'.csv',index=False)

### Train 2nd set of features

Assume that a product's presence across all of a user's orders is 0010101010. This means the user has placed 10 orders in total so far, of which the 3rd, 5th, 7th and 9th orders contain the product.<br>

So in the user's last five orders, the product's ordered / not-ordered sequence is 01010. That
means that, within the last five orders, the user ordered the product only in the 2nd and 4th transactions.


<ol>
    
<li>ordere_ratio_last_5: The order ratio over the last 5 orders, i.e. the fraction of the last 5 orders that contain the product. In the above example it is 2/5.</li>
<li>count_of_ones_after_first_one: How many times the product was ordered again after it was ordered for the first time. In the example it is 3.</li>
<li>count_of_zeros_after_first_one: How many orders did not contain the product after it was ordered for the first time. In the example it is 4.</li>
<li>one_exceed_zeros_count:how many times product ordered exceeds not ordered after order of the product is placed for the first time.In the above example it is -1</li>
<li>len_of_ordersteak_after_first_one: How many orders there were after the product was ordered for the first time. In the above example it is 7.</li>
<li>reorder_ratio_after_first_one: count_of_ones_after_first_one divided by len_of_ordersteak_after_first_one. In the above example it is 3/7.</li>
<li>Is_it_reordered:This variables tells if the product is reordered atleast once after it is ordered for the first time.'1'='YES' for our example</li>
<li>Is_the_first_one_last_order:Is the product ordered only in the previous order. Answer is 0 ='No' for our example, For sequence 0000001 answer is 'YES'</li>
<li>Is_the_first_one_last_but_one_order : Is the product ordered only in the last but one order. Answer is 0 ='No' for our example,For sequence 00010 or 00011 it is 'YES'</li>
<li>last_time_ordered: Whether the product was ordered in the last order. For our example it is 'NO'.</li>
<li>last_two_times_ordered: Whether the product was ordered in both of the last two orders.</li>
<li>coun_greater_2:Was the product reordered more than two times? 'YES' for our example</li>
<li>coun_greater_3:Was the product reordered more than three times? 'YES' for our example</li>
<li>first_index_of_one: What is the order number that the product was ordered for the first time? Answer=3</li>
<li>orders_per_day: It is the ratio of total orders to number of days took to make these many orders </li>
<li>days_taken_for_product_reorders_avg:It is average days took for the user to make reorder of the product since his first order of the product</li>
<li>days_remain_to_probable_buy:For example for the current order for which our task is to predict the reorder,If the days elapsed since last order is 14 days and for user it takes on an average 16 days to reorder the product ; days remaing for probable buy is 2 </li>
<li>ordperday_order_ratio:It is multiplication of number of orders made per day and order ratio;These are the previous features</li>
<li>order_steak_days_weighted5_1:It is the weighted average of number of order units happend in the last 5  to total number of days taken in last five orders. Here More recent reorders of the product gets more weightage </li>
<li>order_steak_days_weighted5_2:It is the weighted average of number of order units happend to total number of days taken to make number of orders once the product was ordered for the first time. Here More recent reorders of the product gets more weightage</li>
<li>median_days_no_buy5: Within the last five transactions, the median number of days spent without buying the product since it was ordered for the first time.</li>
<li>max_days_no_buy5: Within the last five transactions, the maximum number of days spent without buying the product since it was ordered for the first time.</li>
<li>days_spent_no_buy_last: The number of days that have elapsed so far without buying the product since it was last ordered.</li>
</ol>

In [14]:
def _streak_features(order_steak,days_per_order,days_list,trains_days_since_prior_order):
    """Compute the 23 order-streak features of one (user, product) pair.

    Parameters
    ----------
    order_steak : list of int
        One entry per prior order of the user, oldest first: 1 if that order
        contained the product, else 0. Must contain at least one 1.
    days_per_order : pandas.Series
        `days_since_prior_order` of every prior order of the user, indexed
        0..n-1 in the same order as `order_steak`.
    days_list : list of float
        `days_since_prior_order` of the user's last (up to five) rows in
        `orders`. Presumably entry i is the gap that follows
        `order_steak[-5:][i]` -- this assumes `orders` is sorted by
        order_number and ends with the order being predicted; TODO confirm.
    trains_days_since_prior_order : int
        `days_since_prior_order` of the user's latest order.

    Returns
    -------
    list
        The feature values in the order of `columns23[2:]`. Ratios are
        rounded to four decimals; -0.5 marks a value that cannot be computed.
    """
    first_index_of_one=order_steak.index(1)
    order_steak5=order_steak[-5:]
    after_first=order_steak[first_index_of_one:]

    # Share of the last five orders containing the product (always divided by 5).
    ordere_ratio_last_5=order_steak5.count(1)/5

    # Every 1 sits at or after the first purchase, so "ones minus the first" is the reorder count.
    count_of_ones_after_first_one=order_steak.count(1)-1
    count_of_zeros_after_first_one=after_first.count(0)
    one_exceed_zeros_count=count_of_ones_after_first_one-count_of_zeros_after_first_one
    # after_first starts at the first purchase, so its length is at least 1.
    len_of_ordersteak_after_first_one=len(after_first)
    reorder_ratio_after_first_one=count_of_ones_after_first_one/len_of_ordersteak_after_first_one

    Is_it_reordered=1 if count_of_ones_after_first_one else 0
    coun_greater_2=1 if count_of_ones_after_first_one>=2 else 0
    coun_greater_3=1 if count_of_ones_after_first_one>=3 else 0
    Is_the_first_one_last_order=1 if first_index_of_one==len(order_steak)-1 else 0
    Is_the_first_one_last_but_one_order=1 if first_index_of_one==len(order_steak)-2 else 0
    last_time_ordered=order_steak[-1:].count(1)
    last_two_times_ordered=1 if order_steak[-2:].count(1)==2 else 0

    # Days covered by the prior orders placed after the first purchase (label slice on a 0..n-1 index).
    total_days_since_first_order=days_per_order.loc[first_index_of_one+1:].sum()

    if total_days_since_first_order!=0:
        orders_per_day=count_of_ones_after_first_one/total_days_since_first_order
    else:
        orders_per_day=-0.5

    if count_of_ones_after_first_one!=0:
        days_taken_for_product_reorders_avg=total_days_since_first_order/count_of_ones_after_first_one
        days_remain_to_probable_buy=int(days_taken_for_product_reorders_avg-trains_days_since_prior_order)
    else:
        # never reordered: no average gap, hence no remaining-days estimate either
        days_taken_for_product_reorders_avg=-0.5
        days_remain_to_probable_buy=-0.5

    ordperday_order_ratio=orders_per_day*reorder_ratio_after_first_one

    if sum(order_steak5)!=0:
        first_occur_one=order_steak5.index(1)
        # Position weights 0.1, 0.2, ...: later (more recent) orders count more.
        weighted_orders=sum((np.arange(1,len(order_steak5)+1,1)/10)*order_steak5)
        scaled_days_all=sum(np.array(days_list)*0.01)
        scaled_days_since_occur=sum(np.array(days_list[first_occur_one:])*0.01)
        order_steak_days_weighted5_1=weighted_orders/scaled_days_all if scaled_days_all!=0.0 else -0.5
        order_steak_days_weighted5_2=weighted_orders/scaled_days_since_occur if scaled_days_since_occur!=0.0 else -0.5
    else:
        order_steak_days_weighted5_1=0
        order_steak_days_weighted5_2=0

    # Days between consecutive purchases inside the last-five window; the
    # final entry is open-ended and runs to the end of days_list.
    purchase_positions=[i for i,flag in enumerate(order_steak5) if flag==1]
    listOfSumDays=[sum(days_list[start:stop]) for start,stop in zip(purchase_positions,purchase_positions[1:]+[None])]

    if listOfSumDays:
        median_days_no_buy5=np.median(listOfSumDays)
        max_days_no_buy5=np.max(listOfSumDays)
        days_spent_no_buy_last=listOfSumDays[-1]
    else:
        median_days_no_buy5=-0.5
        max_days_no_buy5=-0.5
        days_spent_no_buy_last=-0.5

    return [
        np.round(ordere_ratio_last_5,4),
        count_of_ones_after_first_one,
        count_of_zeros_after_first_one,
        one_exceed_zeros_count,
        len_of_ordersteak_after_first_one,
        np.round(reorder_ratio_after_first_one,4),
        Is_it_reordered,
        Is_the_first_one_last_order,
        Is_the_first_one_last_but_one_order,
        last_time_ordered,
        last_two_times_ordered,
        coun_greater_2,
        coun_greater_3,
        first_index_of_one,
        np.round(orders_per_day,4),
        np.round(days_taken_for_product_reorders_avg,4),
        np.round(days_remain_to_probable_buy,4),
        np.round(ordperday_order_ratio,4),
        np.round(order_steak_days_weighted5_1,4),
        np.round(order_steak_days_weighted5_2,4),
        np.round(median_days_no_buy5,4),
        np.round(max_days_no_buy5,4),
        np.round(days_spent_no_buy_last,4),
    ]


def firstprocess(train_users,nocopy):
    """Build the 2nd feature set, plus the `reordered` label, for train users.

    For every user in `train_users` and every product found in that user's
    rows of `master_prior`, one row is produced: user_id, product_id, the 23
    streak features and the label. The label is 1 when the (user, product)
    pair occurs in `mini_master_train`, else 0. The rows are written to
    'featured_data_temp2<nocopy>.csv'; nothing is returned.

    Reads the notebook globals `master_prior`, `orders`, `mini_master_train`
    and `columns23`.

    NOTE: the notebook defines a second function with this same name for the
    test users; whichever cell ran last is the one that gets called.

    Parameters
    ----------
    train_users : iterable
        user_id values to process.
    nocopy : int or str
        Suffix of the output file name (the chunk index when run in parallel).
    """
    row_data=[]
    for uid in tqdm(train_users):
        oneuser=master_prior[master_prior['user_id']==uid]
        if oneuser.empty:
            # No prior purchases means no (user, product) pair to describe.
            continue
        user_orders=orders[orders['user_id']==uid]

        # The order with the highest order_number is the one being predicted.
        train_order_number=user_orders['order_number'].max()
        # .iloc[0] instead of int(Series): converting a Series directly is deprecated in pandas.
        trains_days_since_prior_order=int(user_orders.loc[user_orders['order_number']==train_order_number,'days_since_prior_order'].iloc[0])

        user_tot_order=np.arange(1,oneuser['order_number'].max()+1)

        # Everything below depends on the user only, so it is computed once
        # per user rather than once per product.
        days_per_order=oneuser.groupby('order_number')['days_since_prior_order'].mean().reset_index(drop=True)
        days_list=user_orders['days_since_prior_order'].tolist()[1:][-5:]
        train_products=set(mini_master_train.loc[mini_master_train['user_id']==uid,'product_id'].tolist())

        for pid in pd.unique(oneuser['product_id']):
            ordered_in=set(oneuser.loc[oneuser['product_id']==pid,'order_number'].tolist())
            order_steak=[1 if i in ordered_in else 0 for i in user_tot_order]

            reordered=1 if pid in train_products else 0
            row_data.append([uid,pid]+_streak_features(order_steak,days_per_order,days_list,trains_days_since_prior_order)+[reordered])

    # Each row carries the label after the columns23 values, so the frame needs
    # one more column name than the 25-name columns23 provides.
    columns=list(columns23)
    if 'reordered' not in columns:
        columns.append('reordered')
    temp=pd.DataFrame(row_data,columns=columns)
    temp.to_csv('featured_data_temp2'+str(nocopy)+'.csv',index=False)

#### Test 2nd set of features 

In [23]:
def _streak_features(order_steak,days_per_order,days_list,trains_days_since_prior_order):
    """Compute the 23 order-streak features of one (user, product) pair.

    Parameters
    ----------
    order_steak : list of int
        One entry per prior order of the user, oldest first: 1 if that order
        contained the product, else 0. Must contain at least one 1.
    days_per_order : pandas.Series
        `days_since_prior_order` of every prior order of the user, indexed
        0..n-1 in the same order as `order_steak`.
    days_list : list of float
        `days_since_prior_order` of the user's last (up to five) rows in
        `orders`. Presumably entry i is the gap that follows
        `order_steak[-5:][i]` -- this assumes `orders` is sorted by
        order_number and ends with the order being predicted; TODO confirm.
    trains_days_since_prior_order : int
        `days_since_prior_order` of the user's latest order.

    Returns
    -------
    list
        The feature values in the order of `columns23[2:]`. Ratios are
        rounded to four decimals; -0.5 marks a value that cannot be computed.
    """
    first_index_of_one=order_steak.index(1)
    order_steak5=order_steak[-5:]
    after_first=order_steak[first_index_of_one:]

    # Share of the last five orders containing the product (always divided by 5).
    ordere_ratio_last_5=order_steak5.count(1)/5

    # Every 1 sits at or after the first purchase, so "ones minus the first" is the reorder count.
    count_of_ones_after_first_one=order_steak.count(1)-1
    count_of_zeros_after_first_one=after_first.count(0)
    one_exceed_zeros_count=count_of_ones_after_first_one-count_of_zeros_after_first_one
    # after_first starts at the first purchase, so its length is at least 1.
    len_of_ordersteak_after_first_one=len(after_first)
    reorder_ratio_after_first_one=count_of_ones_after_first_one/len_of_ordersteak_after_first_one

    Is_it_reordered=1 if count_of_ones_after_first_one else 0
    coun_greater_2=1 if count_of_ones_after_first_one>=2 else 0
    coun_greater_3=1 if count_of_ones_after_first_one>=3 else 0
    Is_the_first_one_last_order=1 if first_index_of_one==len(order_steak)-1 else 0
    Is_the_first_one_last_but_one_order=1 if first_index_of_one==len(order_steak)-2 else 0
    last_time_ordered=order_steak[-1:].count(1)
    last_two_times_ordered=1 if order_steak[-2:].count(1)==2 else 0

    # Days covered by the prior orders placed after the first purchase (label slice on a 0..n-1 index).
    total_days_since_first_order=days_per_order.loc[first_index_of_one+1:].sum()

    if total_days_since_first_order!=0:
        orders_per_day=count_of_ones_after_first_one/total_days_since_first_order
    else:
        orders_per_day=-0.5

    if count_of_ones_after_first_one!=0:
        days_taken_for_product_reorders_avg=total_days_since_first_order/count_of_ones_after_first_one
        days_remain_to_probable_buy=int(days_taken_for_product_reorders_avg-trains_days_since_prior_order)
    else:
        # never reordered: no average gap, hence no remaining-days estimate either
        days_taken_for_product_reorders_avg=-0.5
        days_remain_to_probable_buy=-0.5

    ordperday_order_ratio=orders_per_day*reorder_ratio_after_first_one

    if sum(order_steak5)!=0:
        first_occur_one=order_steak5.index(1)
        # Position weights 0.1, 0.2, ...: later (more recent) orders count more.
        weighted_orders=sum((np.arange(1,len(order_steak5)+1,1)/10)*order_steak5)
        scaled_days_all=sum(np.array(days_list)*0.01)
        scaled_days_since_occur=sum(np.array(days_list[first_occur_one:])*0.01)
        order_steak_days_weighted5_1=weighted_orders/scaled_days_all if scaled_days_all!=0.0 else -0.5
        order_steak_days_weighted5_2=weighted_orders/scaled_days_since_occur if scaled_days_since_occur!=0.0 else -0.5
    else:
        order_steak_days_weighted5_1=0
        order_steak_days_weighted5_2=0

    # Days between consecutive purchases inside the last-five window; the
    # final entry is open-ended and runs to the end of days_list.
    purchase_positions=[i for i,flag in enumerate(order_steak5) if flag==1]
    listOfSumDays=[sum(days_list[start:stop]) for start,stop in zip(purchase_positions,purchase_positions[1:]+[None])]

    if listOfSumDays:
        median_days_no_buy5=np.median(listOfSumDays)
        max_days_no_buy5=np.max(listOfSumDays)
        days_spent_no_buy_last=listOfSumDays[-1]
    else:
        median_days_no_buy5=-0.5
        max_days_no_buy5=-0.5
        days_spent_no_buy_last=-0.5

    return [
        np.round(ordere_ratio_last_5,4),
        count_of_ones_after_first_one,
        count_of_zeros_after_first_one,
        one_exceed_zeros_count,
        len_of_ordersteak_after_first_one,
        np.round(reorder_ratio_after_first_one,4),
        Is_it_reordered,
        Is_the_first_one_last_order,
        Is_the_first_one_last_but_one_order,
        last_time_ordered,
        last_two_times_ordered,
        coun_greater_2,
        coun_greater_3,
        first_index_of_one,
        np.round(orders_per_day,4),
        np.round(days_taken_for_product_reorders_avg,4),
        np.round(days_remain_to_probable_buy,4),
        np.round(ordperday_order_ratio,4),
        np.round(order_steak_days_weighted5_1,4),
        np.round(order_steak_days_weighted5_2,4),
        np.round(median_days_no_buy5,4),
        np.round(max_days_no_buy5,4),
        np.round(days_spent_no_buy_last,4),
    ]


def firstprocess(test_users,nocopy):
    """Build the 2nd feature set for test users (no label column).

    For every user in `test_users` and every product found in that user's
    rows of `master_prior`, one row is produced: user_id, product_id and the
    23 streak features, i.e. exactly the columns named in `columns23`. The
    rows are written to 'featured_data_test2<nocopy>.csv'; nothing is returned.

    Reads the notebook globals `master_prior`, `orders` and `columns23`.

    NOTE: this definition shares its name with the train-user version defined
    earlier in the notebook and replaces it once this cell has run.

    Parameters
    ----------
    test_users : iterable
        user_id values to process.
    nocopy : int or str
        Suffix of the output file name (the chunk index when run in parallel).
    """
    row_data=[]
    for uid in tqdm(test_users):
        oneuser=master_prior[master_prior['user_id']==uid]
        if oneuser.empty:
            # No prior purchases means no (user, product) pair to describe.
            continue
        user_orders=orders[orders['user_id']==uid]

        # The order with the highest order_number is the one being predicted.
        train_order_number=user_orders['order_number'].max()
        # .iloc[0] instead of int(Series): converting a Series directly is deprecated in pandas.
        trains_days_since_prior_order=int(user_orders.loc[user_orders['order_number']==train_order_number,'days_since_prior_order'].iloc[0])

        user_tot_order=np.arange(1,oneuser['order_number'].max()+1)

        # Everything below depends on the user only, so it is computed once
        # per user rather than once per product.
        days_per_order=oneuser.groupby('order_number')['days_since_prior_order'].mean().reset_index(drop=True)
        days_list=user_orders['days_since_prior_order'].tolist()[1:][-5:]

        for pid in pd.unique(oneuser['product_id']):
            ordered_in=set(oneuser.loc[oneuser['product_id']==pid,'order_number'].tolist())
            order_steak=[1 if i in ordered_in else 0 for i in user_tot_order]

            row_data.append([uid,pid]+_streak_features(order_steak,days_per_order,days_list,trains_days_since_prior_order))

    temp=pd.DataFrame(row_data,columns=columns23)
    temp.to_csv('featured_data_test2'+str(nocopy)+'.csv',index=False)

In [24]:
# Column names of the 2nd feature set, in the order in which `firstprocess`
# appends the values: the two id columns followed by the 23 streak features.
columns23=[
    'user_id',
    'product_id',
    'ordere_ratio_last_5',
    'count_of_ones_after_first_one',
    'count_of_zeros_after_first_one',
    'one_exceed_zeros_count',
    'len_of_ordersteak_after_first_one',
    'reorder_ratio_after_first_one',
    'Is_it_reordered',
    'Is_the_first_one_last_order',
    'Is_the_first_one_last_but_one_order',
    'last_time_ordered',
    'last_two_times_ordered',
    'coun_greater_2',
    'coun_greater_3',
    'first_index_of_one',
    'orders_per_day',
    'days_taken_for_product_reorders_avg',
    'days_remain_to_probable_buy',
    'ordperday_order_ratio',
    'order_steak_days_weighted5_1',
    'order_steak_days_weighted5_2',
    'median_days_no_buy5',
    'max_days_no_buy5',
    'days_spent_no_buy_last',
]

### 3rd set of features: reorder probability

Assume that if a product is ordered it is represented by 1, else 0
<ol>
<li>'weight_bin1': whether the product was ordered in the last-but-4th transaction, multiplied by 0.1; for example, if it was ordered the value is 1*0.1</li>
<li>'weight_bin2': whether the product was ordered in the last-but-3rd transaction, multiplied by 0.2</li>
<li>'weight_bin3': whether the product was ordered in the last-but-2nd transaction, multiplied by 0.3</li>
<li>'weight_bin4': whether the product was ordered in the last-but-1st transaction, multiplied by 0.4</li>
<li>'weight_bin5': whether the product was ordered in the last transaction, multiplied by 0.5; i.e. more recent transactions are given more weight</li>
<li>department_reorder_probability: What is the department-wise reorder probability of the product; it is different from the reorder ratio</li>
<li>product_reorder_probability: What is the product-wise reorder probability</li>
<li>aisle_reorder_probability: What is the aisle-wise reorder probability of the product</li>
<li>days_gap_reorder_probability: What is the reorder probability for the number of days elapsed since the prior order</li>
<li>orderhour_reorder_probability: What is the hour-wise reorder probability of the product</li>
<li>cart_reorder_probability: Reorder probability of the product based on the average add-to-cart number of the product for a particular user</li>
<li>cart_reorder_probability_universal: Reorder probability of the product based on the average add-to-cart number of the product for all users (the column is stored as 'cart_reorder_probability_univesal')</li>
</ol>

### loading probability features to create 3rd set of features

In [None]:
# Weight the last-five-orders flags so that more recent orders count more:
# bin1 (oldest of the five) gets 0.1 ... bin5 (most recent) gets 0.5. All five
# bins used to be scaled by 0.1, which contradicts the feature description and
# the weight_bin values in the saved feature tables.
for bin_no,weight in enumerate((0.1,0.2,0.3,0.4,0.5),start=1):
    feature_data_test['weight_bin'+str(bin_no)]=feature_data_test['bin'+str(bin_no)]*weight

In [16]:
def _read_probability(csv_name,feature_name):
    """Read a probability lookup CSV and rename its `probability` column, if it has one, to `feature_name`."""
    return pd.read_csv(csv_name).rename(columns={'probability':feature_name})

# The add-to-cart table is loaded twice: once with its columns as stored in the
# file, and once with `probability` renamed for the all-users ("universal") feature.
add_cart_reorder_prob=pd.read_csv('add_cart_reorder_prob.csv')
add_cart_reorder_prob1=_read_probability('add_cart_reorder_prob.csv','cart_reorder_probability_univesal')
aisle_probability=_read_probability('aisle_probability.csv','aisle_reorder_probability')
# The `department` column is dropped from this table right after loading.
department_probability=_read_probability('department_probability.csv','department_reorder_probability').drop(columns=['department'])
days_since_probability=_read_probability('days_since_probability.csv','days_gap_reorder_probability')
orderhour_probability=_read_probability('orderhour_probability.csv','orderhour_reorder_probability')
product_probability=_read_probability('product_probability.csv','product_reorder_probability')

In [17]:
# Attach every probability lookup table to the test feature table, one after
# the other. All joins are inner joins, so a row without a match in a lookup
# table is dropped from the result.
probability_joins=[
    (department_probability,{'on':['department_id']}),
    (product_probability,{'on':['product_id']}),
    (aisle_probability,{'on':['aisle_id']}),
    (days_since_probability,{'on':['days_since_prior_order']}),
    (orderhour_probability,{'on':['order_hour_of_day']}),
    (add_cart_reorder_prob,{'left_on':'product_add_order_local','right_on':'add_to_cart_order'}),
    (add_cart_reorder_prob1,{'left_on':'product_add_order_universal','right_on':'add_to_cart_order'}),
]
for lookup_table,join_keys in probability_joins:
    feature_data_test=feature_data_test.merge(lookup_table,how='inner',**join_keys)

## Multiprocessing to featurize data 

In [25]:
%%time
from multiprocessing import Process
import multiprocessing
def main(num_workers=8):
    """Run `firstprocess` in `num_workers` parallel OS processes and wait for all of them.

    Worker i is started with `test_users_list[i]` and the index i, which
    `firstprocess` uses as the suffix of the CSV file it writes.

    Parameters
    ----------
    num_workers : int, default 8
        Number of processes to start. `test_users_list` must hold at least
        this many chunks; choose it according to the number of CPU cores.

    Raises
    ------
    RuntimeError
        If any worker ends with a non-zero exit code. `join()` alone does not
        report a crashed child, so a failed chunk would otherwise go unnoticed.
    """
    # One process per chunk of users (these are processes, not threads).
    workers=[Process(target=firstprocess,args=(test_users_list[i],i)) for i in range(num_workers)]

    for worker in workers:
        worker.start()

    # Block until every worker has finished.
    for worker in workers:
        worker.join()

    failed=[i for i,worker in enumerate(workers) if worker.exitcode!=0]
    if failed:
        raise RuntimeError('firstprocess failed for chunk(s) '+str(failed)+'; their CSV files may be missing')

if __name__=="__main__":
    main()

CPU times: user 232 ms, sys: 249 ms, total: 481 ms
Wall time: 1h 47min 11s


### <font color='red'>Features are summarized here</font>

In [3]:
# Load the saved feature tables: first the train table, then the test table.
train78features,test78features=map(pd.read_csv,('final_78_features.csv','feature_data_test.csv'))

#### The train features take up around 4GB and the test features around 2GB so far. The features were created on Google Cloud with 8 processors and 32GB RAM; with 8 processors, feature creation took around 12 hours. I found that XGBoost gets stuck if we load all the features into RAM at once. With LightGBM the data fits well and training is extremely fast compared to XGBoost. However, hyperparameter tuning is still difficult with this much data, so I will try hyperparameter tuning with row sampling and column sampling multiple times to get the parameters

In [4]:
# Number of rows and columns of the train feature table.
train78features.shape

(8474661, 79)

In [8]:
# Preview the first rows of the train feature table.
train78features.head()

Unnamed: 0,user_id,product_id,max_order_number,avg_products_per_ordered_5,avg_products_per_ordered,avg_times_product,max_times_product,max_reorder_any_product,total_reorder_ratio,product_add_order_universal,...,weight_bin4,weight_bin5,department_reorder_probability,product_reorder_probability,aisle_reorder_probability,days_gap_reorder_probability,orderhour_reorder_probability,cart_reorder_probability,cart_reorder_probability_univesal,reordered
0,1,196,10,6.0,5.9,3.2778,10,1.0,0.695,3,...,0.4,0.5,0.356632,0.470259,0.349587,0.2952,0.281428,0.37,0.298406,1
1,166234,196,23,2.6,2.6522,6.1,18,1.0,0.836,3,...,0.0,0.0,0.356632,0.470259,0.349587,0.2952,0.281428,0.37,0.298406,0
2,176205,196,25,1.4,2.16,3.1765,11,0.6,0.685,3,...,0.0,0.0,0.356632,0.470259,0.349587,0.2952,0.281428,0.37,0.298406,0
3,183553,196,28,2.4,3.2143,2.3077,11,0.4,0.567,3,...,0.0,0.0,0.356632,0.470259,0.349587,0.2952,0.281428,0.37,0.298406,0
4,175812,40939,15,1.6,1.7333,1.4444,3,0.2,0.308,3,...,0.0,0.5,0.356632,0.363946,0.439581,0.2952,0.281428,0.37,0.298406,0


In [5]:
# Number of rows and columns of the test feature table.
test78features.shape

(4833292, 78)

In [9]:
# Preview the first rows of the test feature table.
test78features.head()

Unnamed: 0,user_id,product_id,max_order_number,avg_products_per_ordered_5,avg_products_per_ordered,avg_times_product,max_times_product,max_reorder_any_product,total_reorder_ratio,product_add_order_universal,...,weight_bin3,weight_bin4,weight_bin5,department_reorder_probability,product_reorder_probability,aisle_reorder_probability,days_gap_reorder_probability,orderhour_reorder_probability,cart_reorder_probability,cart_reorder_probability_univesal
0,3,47766,12,6.0,7.3333,2.6667,10,1.0,0.625,6,...,0.3,0.4,0.5,0.309702,0.436097,0.393833,0.280343,0.265305,0.298406,0.245529
1,38488,47766,5,10.8,10.8,1.3171,3,0.6,0.241,6,...,0.0,0.4,0.0,0.309702,0.436097,0.393833,0.280343,0.265305,0.298406,0.245529
2,49686,47209,33,6.6,4.9697,1.907,22,1.0,0.476,6,...,0.0,0.0,0.0,0.309702,0.438232,0.393833,0.280343,0.265305,0.298406,0.245529
3,87661,21288,3,14.666667,14.6667,1.0732,2,0.4,0.068,6,...,0.3,0.0,1.0,0.309702,0.343467,0.393833,0.280343,0.265305,0.298406,0.245529
4,199370,21288,9,11.2,10.2222,1.7358,6,1.0,0.424,6,...,0.0,0.0,0.0,0.309702,0.343467,0.393833,0.280343,0.265305,0.298406,0.245529


##### In total 76 features are created for train and test. I have already created the features for the test data. As we do not have labels for the test data, we can check its accuracy only on Kaggle. For this case study I will temporarily divide 'train78features' into train and test sets and evaluate the model

In [6]:
# List the column names of the train feature table.
train78features.columns

Index(['user_id', 'product_id', 'max_order_number',
       'avg_products_per_ordered_5', 'avg_products_per_ordered',
       'avg_times_product', 'max_times_product', 'max_reorder_any_product',
       'total_reorder_ratio', 'product_add_order_universal',
       'reorder_ratio_cart_universal', 'product_add_order_local',
       'reorder_ratio_cart_local', 'product_time_order', 'order_dow',
       'order_hour_of_day', 'days_since_prior_order',
       'reorder_ratio_user_hour', 'reorder_ratio_all_user_hour',
       'reorder_ratio_user_week', 'order_ratio', 'department_id', 'aisle_id',
       'pf', 'af', 'df', 'dep_reorder_ratio', 'aisle_reorder_ratio',
       'product_reorder_ratio', 'ordered_last_5', 'atco', 'dspo', 'ohod',
       'odow', 'max_no_purchase', 'tot_chance_buy', 'median_n5', 'bin1',
       'bin2', 'bin3', 'bin4', 'bin5', 'bin2dec_order_steak5',
       'ordere_ratio_last_5', 'count_of_ones_after_first_one',
       'count_of_zeros_after_first_one', 'one_exceed_zeros_count',
    

#### Feature summarization(total features =76)

<p style="color:red">Assumptions.</p><br>
<ul>
<li>Assume that the product's presence across all of a user's orders is 0010101010. It means that a total of 10 orders have been placed by the user so far, out of which the 3rd, 5th, 7th and 9th orders contain the product.</li><br>

<li>So in the user's last five orders, the product's ordered / not-ordered sequence is 01010. That
means that, in the last five orders, the user ordered the product only in the 2nd and 4th transactions.</li><br>

<li>Assume that if a product is ordered it is represented by 1, else 0</li><br>
</ul>

<ol>
<li>max_order_number = latest order number of the user</li>
<li>avg_products_per_ordered_5 = average number of products ordered in the user's last five orders</li>
<li>avg_products_per_ordered = average number of products per order</li>
<li>avg_times_product = number of times a product is ordered on average by the user</li>
<li>max_times_product = maximum number of times any product is ordered by the user</li>
<li>max_reorder_any_product = maximum reorder ratio of any product by the user; here reorders refers to the number of reorders in the user's total orders</li>
    
<li>total_reorder_ratio = total reorder ratio of that product</li>
    
<li>product_add_order_universal = the product's average add-to-cart number over all users</li>
<li>reorder_ratio_cart_universal = for a particular add-to-cart number, what is the reorder ratio over all users</li>
    
<li>product_add_order_local = what is the product's mean add_to_cart_order number for a particular user?</li>
<li>reorder_ratio_cart_local = for a particular user and a particular add-to-cart number, what is the reorder ratio?</li>
    
<li>product_time_order = at what time the product is usually ordered, over all users</li>
<li>order_dow = on which day of the week the user ordered</li>
<li>order_hour_of_day = in which hour the user ordered</li>
<li>days_since_prior_order = how many days elapsed since the last order</li>
    
<li>reorder_ratio_user_hour = what is the reorder ratio of the user at a particular hour</li>
<li>reorder_ratio_all_user_hour = in what ratio all users order at a particular hour</li>
<li>reorder_ratio_user_week = in what ratio the user reorders on a particular day of the week</li>
<li>order_ratio = ratio of how many times the user bought the product to the user's total orders</li>

<li>department_id of the product</li>
<li>aisle_id of the product</li>
<li>pf = product frequency, i.e. the ratio of a particular product's count to the count of all products</li>
<li>af = aisle frequency (similar to product frequency)</li>
<li>df = department frequency (similar to product frequency)</li>
<li>dep_reorder_ratio = reorder ratio of a particular department over all users</li>
<li>aisle_reorder_ratio = reorder ratio of a particular aisle over all users</li>
<li>product_reorder_ratio = reorder ratio of a particular product over all users</li>
<li>ordered_last_5 = how many times the user ordered the product in the last five orders</li>
<li>atco = the product's average add-to-cart order in the user's last 5 orders</li>
<li>dspo = number of days it took to order this product from the previous order of any product</li>
<li>ohod = average order hour in the last few orders</li>
<li>odow = average day of the week on which orders were placed in the last few orders</li>
<li>max_no_purchase = maximum number of days spent by the user without buying.</li>
<li>tot_chance_buy = once a particular product has been bought for the first time, how many orders did the user place?</li>
<li>median_n5 = median number of days the user has gone without buying the product after the previous order</li>
<li>order_steak_5 = 'bin1','bin2','bin3','bin4','bin5' = 'ordered' or 'not ordered' binary sequence of the product in the last five orders</li>
<li>bin2dec_order_steak5 = decimal representation of the previous feature (order_steak_5)</li>





    
<li>ordere_ratio_last_5: It is just the order ratio, indicating how many of the last 5 orders contain the product. In the above example it is 2/5</li>
<li>count_of_ones_after_first_one: How many times the product was ordered again after it was ordered for the first time. In the example it is 3.</li>
<li>count_of_zeros_after_first_one: How many orders did not contain the product after it was ordered for the first time. In the example it is 4</li>
<li>one_exceed_zeros_count: By how many the 'ordered' count exceeds the 'not ordered' count after the product is ordered for the first time. In the above example it is -1</li>
<li>len_of_ordersteak_after_first_one: How many orders there were from the order in which the product was placed for the first time onwards (that order included). In the above example it is 8</li>
<li>reorder_ratio_after_first_one: count_of_ones_after_first_one divided by len_of_ordersteak_after_first_one. In the above example it is 3/8</li>
<li>Is_it_reordered: This variable tells whether the product is reordered at least once after it is ordered for the first time. '1'='YES' for our example</li>
<li>Is_the_first_one_last_order: Was the product ordered for the first time in the most recent order? Answer is 0='No' for our example; for sequence 0000001 the answer is 'YES'</li>
<li>Is_the_first_one_last_but_one_order: Was the product ordered for the first time in the last-but-one order? Answer is 0='No' for our example; for sequence 00010 or 00011 it is 'YES'</li>
<li>last_time_ordered: Whether the product was ordered last time. For our example it is 'NO'</li>
<li>last_two_times_ordered: Whether the product was ordered in both of the last two orders. For our example it is 'NO'</li>
<li>coun_greater_2: Was the product reordered at least two times? 'YES' for our example</li>
<li>coun_greater_3: Was the product reordered at least three times? 'YES' for our example (it was reordered exactly three times)</li>
<li>first_index_of_one: Zero-based position of the order in which the product was ordered for the first time. In our example the product first appears in the 3rd order, so the value is 2</li>
<li>orders_per_day: Ratio of the number of reorders of the product to the number of days elapsed since the product was first ordered</li>
<li>days_taken_for_product_reorders_avg: Average number of days the user takes to reorder the product, counted from the user's first order of the product</li>
<li>days_remain_to_probable_buy: For the current order, for which our task is to predict the reorder: if 14 days have elapsed since the last order and the user takes on average 16 days to reorder the product, the days remaining for a probable buy is 2</li>
<li>ordperday_order_ratio: Product of two of the previous features, orders_per_day and reorder_ratio_after_first_one</li>
<li>order_steak_days_weighted5_1: Weighted count of the orders containing the product in the last five orders, divided by the total number of days taken by those last five orders. More recent orders of the product get more weight</li>
<li>order_steak_days_weighted5_2: The same weighted count, divided by the number of days elapsed since the product first appeared within the last five orders. More recent orders of the product get more weight</li>
<li>median_days_no_buy5: Within the last five orders, the median number of days spent without buying the product after each order that contained it</li>
<li>max_days_no_buy5: Within the last five orders, the maximum number of days spent without buying the product after each order that contained it</li>
<li>days_spent_no_buy_last: Number of days elapsed until now without buying the product since the product was ordered for the last time</li>
    

<li>'weight_bin1': whether the product was ordered in the last-but-4th transaction, multiplied by 0.1; for example, if it was ordered the value is 1*0.1</li>
<li>'weight_bin2': whether the product was ordered in the last-but-3rd transaction, multiplied by 0.2</li>
<li>'weight_bin3': whether the product was ordered in the last-but-2nd transaction, multiplied by 0.3</li>
<li>'weight_bin4': whether the product was ordered in the last-but-1st transaction, multiplied by 0.4</li>
<li>'weight_bin5': whether the product was ordered in the last transaction, multiplied by 0.5; i.e. more recent transactions are given more weight</li>
<li>department_reorder_probability: What is the department-wise reorder probability of the product; it is different from the reorder ratio</li>
<li>product_reorder_probability: What is the product-wise reorder probability</li>
<li>aisle_reorder_probability: What is the aisle-wise reorder probability of the product</li>
<li>days_gap_reorder_probability: What is the reorder probability for the number of days elapsed since the prior order</li>
<li>orderhour_reorder_probability: What is the hour-wise reorder probability of the product</li>
<li>cart_reorder_probability: Reorder probability of the product based on the average add-to-cart number of the product for a particular user</li>
<li>cart_reorder_probability_universal: Reorder probability of the product based on the average add-to-cart number of the product for all users (the column is stored as 'cart_reorder_probability_univesal')</li>
<li>reordered: It is our target label. It tells whether the product is reordered or not for the train data. We get it when the designed features are properly combined with the given train information.</li>
</ol>



<p style="color:blue">References made</p> 

kaggle competition url:https://www.kaggle.com/c/instacart-market-basket-analysis/discussion/<br>
seaborn:https://seaborn.pydata.org/generated/seaborn.barplot.html <br>
pandas:https://pandas.pydata.org/pandas-docs/ <br>
numpy:https://numpy.org/<br>
winners interview:https://medium.com/kaggle-blog/instacart-market-basket-analysis-feda2700cded
