# In [5]:
import pandas as pd
from datetime import datetime
import numpy as np
import os
import gc
pd.set_option('display.max_columns', 100)
from scipy import stats
# Here we calculate user-based features:
#   1. Number of orders per user
#   2. How frequently a user has reordered products
#   3. Average days between a user's purchases
#   4. Standard deviation of a user's days_since_prior_order
#   5. Day of the week on which the user orders the most
#   6. Hour of the day at which the user orders the most
#   7. Total products bought per user
#   8. Total unique products ordered per user
#   9. Sum, mean and standard deviation of the user's basket size per order
#  10. Most recent order date per user, excluding the last order
def _most_frequent(values):
    """Return the most frequent non-null value of a Series (smallest on ties)."""
    modes = values.mode()
    # Series.mode() is sorted, so the first entry is the smallest of the tied
    # modes; it is empty when every value in the group is missing.
    return modes.iloc[0] if not modes.empty else np.nan


def get_user_features(df_prior_final, orders):
    """Compute one row of aggregate features per user.

    Args:
        df_prior_final: Users' prior order history. Must contain the columns
            ``user_id``, ``order_id``, ``order_number``, ``product_id``,
            ``reordered``, ``days_since_prior_order``, ``order_dow`` and
            ``order_hour_of_day``. Presumably one row per product in an
            order -- verify against the caller.
        orders: Users' order data. Must contain the columns ``user_id``,
            ``order_number_reverse`` and ``date``.

    Returns:
        DataFrame with one row per ``user_id`` found in ``df_prior_final`` and
        the columns ``u_total_orders``, ``u_reordered_ratio``,
        ``u_average_days_between_orders``, ``u_days_between_orders_std``,
        ``u_dow_most_orders``, ``u_hod_most_orders``, ``u_total_items_bought``,
        ``u_total_unique_prod``, ``u_basket_sum``, ``u_avg_basket_size``,
        ``u_basket_std`` and ``u_date_inscription`` (in that order).

    Side effects:
        Prints the head of the basket-size frame and of the final frame.

    NOTE(review): every statistic in the first aggregation is taken over the
    rows of ``df_prior_final`` as given. If that frame has one row per
    product, the "per order" statistics (days between orders, most frequent
    day/hour) are weighted by basket size -- confirm this is intended.
    """
    # Named aggregation ties each output column to its source column and
    # function explicitly instead of renaming the result positionally.
    user = df_prior_final.groupby('user_id').agg(
        u_total_orders=('order_number', 'max'),
        u_reordered_ratio=('reordered', 'mean'),
        u_average_days_between_orders=('days_since_prior_order', 'mean'),
        u_days_between_orders_std=('days_since_prior_order', 'std'),
        # pandas-native mode: unlike ``scipy.stats.mode(x)[0]`` the result
        # does not depend on the installed SciPy version.
        u_dow_most_orders=('order_dow', _most_frequent),
        u_hod_most_orders=('order_hour_of_day', _most_frequent),
        u_total_items_bought=('order_id', 'count'),
        # Counts distinct non-null product ids.
        u_total_unique_prod=('product_id', 'nunique'),
    ).reset_index()

    # Basket size of each (user, order): number of non-null product rows.
    basket_size_per_order = (
        df_prior_final.groupby(['user_id', 'order_id'])['product_id']
        .count()
        .rename('basket_size_per_order')
        .reset_index()
    )
    # Per-user sum / mean / std of those basket sizes.
    avg_basket_size = (
        basket_size_per_order.groupby('user_id')['basket_size_per_order']
        .agg(u_basket_sum='sum', u_avg_basket_size='mean', u_basket_std='std')
        .reset_index()
    )
    # The per-order intermediate is no longer needed; release it before the
    # merges below.
    del basket_size_per_order
    gc.collect()
    print(avg_basket_size.head())

    # Attach the basket statistics to the per-user features.
    user = user.merge(avg_basket_size, on='user_id', how='left')

    # u_date_inscription: latest ``date`` per user among orders whose
    # ``order_number_reverse`` is not 0 (i.e. excluding the last order,
    # assuming reverse number 0 marks it -- confirm with the caller).
    earlier_orders = orders.loc[orders['order_number_reverse'] != 0]
    users_fe = (
        earlier_orders.groupby('user_id')['date']
        .max()
        .rename('u_date_inscription')
        .reset_index()
    )
    # Left merge: users without any qualifying order get a missing date.
    user = user.merge(users_fe, on='user_id', how='left')
    print(user.head())
    return user