# In [3]:
import pandas as pd
from datetime import datetime
import numpy as np
import os
import gc
pd.set_option('display.max_columns', 100)
from scipy import stats
def is_organic(text):
    '''
    Flags whether 'organic' occurs anywhere in the given text.

    The input is passed through str() and lower-cased before the check, so
    the match is case-insensitive and non-string inputs are accepted.

    Parameters:
    text : value to inspect (converted to a string internally)
    Returns : 1 if 'organic' is a substring of the lower-cased text
              0 otherwise
    '''
    lowered = str(text).lower()
    return 1 if 'organic' in lowered else 0
def encoding(values,max_val):
    '''
        Sine and cosine transformation of a cyclic feature.

        Every value is mapped to the angle 2*pi*value/max_val, and the sine
        and cosine of that angle are returned.

        Parameters:
        values : numeric value(s) to transform (the callers in this file
                 pass a pandas Series)
        max_val : divisor applied to the values when computing the angle
        Returns : tuple (x_sin, x_cos)
    '''
    # same evaluation order as a single expression: ((2*pi)*values)/max_val
    angle = 2*np.pi*values/max_val
    return np.sin(angle), np.cos(angle)

def get_cyclic_encoding(df):
    '''
        Replaces order_dow and order_hour_of_day with their cyclic encoding.

        order_dow and order_hour_of_day are both cyclic features: order_dow
        repeats over 0 to 6 and order_hour_of_day repeats over 0 to 23.
        Cyclic features cause problems if they are not encoded properly.
        For example the difference between hour 23 and 22 is 1, but between
        hour 23 and 0 it is 23, although the real distance is 1 hour in both
        cases. The encoding must keep hour 23 and hour 0 close to each other.
        A common way to achieve this is a sine and cosine transformation:
        Xsin = sin(2*pi*x/period)
        Xcos = cos(2*pi*x/period)
        where period is the number of distinct values in one cycle
        (7 for order_dow, 24 for order_hour_of_day). Dividing by the maximum
        value (6 or 23) instead would map the first and the last value of the
        cycle to the angles 0 and 2*pi, i.e. to the very same point, which
        would make day 0 and day 6 (or hour 0 and hour 23) indistinguishable.

        Reference : http://blog.davidkaleko.com/feature-engineering-cyclical-features.html

        Args:
            df : DataFrame with order_dow and order_hour_of_day columns.
                 It is modified in place: the four encoded columns are added
                 and the two original columns are dropped.
        Returns:
            a new DataFrame (result of fillna) holding order_dow_sin,
            order_dow_cos, order_hour_of_day_sin and order_hour_of_day_cos
            instead of the original columns, with every NaN replaced by 0
    '''
    # order_dow takes 7 distinct values (0..6), so one full cycle is 7 days
    df['order_dow_sin'], df['order_dow_cos'] = encoding(df['order_dow'], 7)
    # order_hour_of_day takes 24 distinct values (0..23), so one full cycle is 24 hours
    df['order_hour_of_day_sin'], df['order_hour_of_day_cos'] = encoding(df['order_hour_of_day'], 24)
    # dropping original features as we don't need them
    df.drop(['order_dow', 'order_hour_of_day'], axis=1, inplace=True)
    # filling NaN values with 0 (fillna returns a new frame, the input keeps its NaNs)
    df = df.fillna(0)
    return df
def get_train_data(df_orders,df_train,data):
    '''
        Builds the training frame with one row per (user_id, product_id).

        Args:
            df_orders : order history; read columns are eval_set and user_id
                        (presumably it also supplies order_id, order_dow and
                        order_hour_of_day, which are used further down -
                        confirm against the caller)
            df_train : users last ordered data; read columns are product_id,
                       order_id and reordered
            data : featurized DF with all features; must contain user_id,
                   product_id and product_name
        Returns:
            train data indexed by (user_id, product_id), with the cyclic
            encoded order features, the is_organic flag and the reordered
            label; every NaN is replaced by 0
    '''
    # keep only the train and test orders
    train_mask = df_orders.eval_set == 'train'
    test_mask = df_orders.eval_set == 'test'
    latest_orders = df_orders[train_mask | test_mask]

    # attach those orders to the featurized user-product DF and keep the train rows
    merged = data.merge(latest_orders, on='user_id', how='left')
    train_rows = merged[merged.eval_set == 'train']

    # user-product combinations missing from the train orders get no label here;
    # their NaN becomes 0 below because they were not reordered in the latest order
    labels = df_train[['product_id', 'order_id', 'reordered']]
    train_rows = train_rows.merge(labels, on=['product_id', 'order_id'], how='left')
    train_rows = train_rows.drop(['order_id', 'eval_set'], axis=1)
    train_rows = train_rows.fillna(0)

    train_rows = get_cyclic_encoding(train_rows)

    # product_name is only needed to derive the is_organic flag
    train_rows['is_organic'] = train_rows['product_name'].apply(is_organic)
    train_rows = train_rows.drop(['product_name'], axis=1)

    # user_id and product_id become the index of the train DF
    return train_rows.set_index(['user_id', 'product_id'])

def get_test_data(data):
    '''
        Prepares test data
        Args:
            data : featurized DF with all features; must contain the columns
                   eval_set, order_id, order_dow, order_hour_of_day,
                   product_name, user_id and product_id (presumably the
                   features already merged with the orders - confirm against
                   the caller)
        Returns:
            test data indexed by (user_id, product_id), with the cyclic
            encoded order features and the is_organic flag; every NaN is
            replaced by 0
    '''
    # keep only the rows that belong to test orders
    data_test = data[data.eval_set=='test']
    data_test = data_test.fillna(0)
    data_test = data_test.drop(['eval_set', 'order_id'], axis=1)
    data_test = get_cyclic_encoding(data_test)
    # product_name is only needed to derive the is_organic flag
    data_test['is_organic'] = data_test['product_name'].apply(is_organic)
    data_test.drop(['product_name'],axis=1,inplace=True)
    # user_id and product_id become the index of the test DF
    data_test = data_test.set_index(['user_id', 'product_id'])
    return data_test