In [1]:
import pickle
import lightgbm as lgb
from sklearn.externals import joblib 
import pandas as pd
import numpy as np
import time
from pandas import json_normalize
import json
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing

# Function 1: `final_fun_1` — returns the model's predictions for the given query point(s)

In [43]:
def final_fun_1(query_pt):
    """Return the regressor's predictions for one or more raw session rows.

    The rows are preprocessed with the persisted artifacts (a fitted scaler
    and one saved label-encoder class array per categorical column),
    aggregated into one feature row per ``fullVisitorId`` and fed to the
    pickled regressor.

    Parameters
    ----------
    query_pt : pandas.Series or pandas.DataFrame
        A single row (Series) or several rows (DataFrame) that contain at
        least the columns listed in ``data_columns``.

    Returns
    -------
    array-like
        Whatever ``lgb_regressor.predict`` returns, with negative values
        clipped to 0: one prediction per distinct ``fullVisitorId``, in the
        sorted key order produced by ``groupby``.

    Notes
    -----
    Reads '../lgb_agg_final.pickle', 'standarising_scaler.pkl' and one
    '<column>.npy' file per categorical column from disk on every call and
    prints progress messages.
    """
    start = time.time()

    # NOTE(review): pickle/joblib can execute arbitrary code while loading;
    # only point these paths at artifacts produced by a trusted training run.
    with open('../lgb_agg_final.pickle', 'rb') as f:
        lgb_regressor = pickle.load(f)
    scaler = joblib.load('standarising_scaler.pkl')

    # DATA COLUMNS
    data_columns = ['channelGrouping', 'customDimensions', 'date', 'fullVisitorId', 'visitId', 'visitNumber',
                    'visitStartTime', 'device.browser', 'device.operatingSystem', 'device.isMobile',
                    'device.deviceCategory', 'geoNetwork.continent', 'geoNetwork.subContinent',
                    'geoNetwork.country', 'geoNetwork.region', 'geoNetwork.metro', 'geoNetwork.city',
                    'geoNetwork.networkDomain', 'totals.hits', 'totals.pageviews', 'totals.sessionQualityDim',
                    'totals.timeOnSite', 'totals.transactions', 'totals.transactionRevenue',
                    'trafficSource.campaign', 'trafficSource.source', 'trafficSource.medium',
                    'trafficSource.keyword', 'trafficSource.referralPath']
    test_df = query_pt[data_columns].copy()
    # A single row arrives as a Series; promote it to a one-row DataFrame.
    if isinstance(test_df, pd.Series):
        test_df = pd.DataFrame([test_df])

    # NUMERIC DATA
    numeric_feat = ['totals.hits', 'totals.pageviews', 'totals.timeOnSite', 'totals.transactions',
                    'totals.transactionRevenue', 'totals.sessionQualityDim']
    for feature in numeric_feat:
        # Assign the result back instead of calling fillna(inplace=True) on a
        # column selection: the chained in-place form is not guaranteed to
        # modify test_df (and never does under pandas Copy-on-Write).
        test_df[feature] = test_df[feature].astype('float32').fillna(0)
        if feature != 'totals.transactionRevenue':
            print(f'Normalising {feature} numeric feature....')
            # The same scaler is applied to every column as a single-feature
            # input; ravel() turns the (n, 1) result back into a 1-D column.
            test_df[feature] = scaler.transform(test_df[feature].values.reshape(-1, 1)).ravel()
    print(f'DONE: Standarization of numeric features completed.')

    # CATEGORICAL DATA
    # Captured *before* the placeholder replacement below, exactly as in the
    # original pipeline, so the set of encoded columns is unchanged.
    object_cols = list(test_df.select_dtypes(include=['object', 'bool']).columns)

    # Map the placeholder / missing values onto a single 'others' category.
    placeholder_values = ['not available in demo dataset', '(not set)']
    for feature in ('geoNetwork.region', 'geoNetwork.metro', 'geoNetwork.city'):
        test_df[feature] = test_df[feature].replace(placeholder_values, 'others')
    test_df['trafficSource.keyword'] = test_df['trafficSource.keyword'].replace(
        [np.nan, '(not provided)'], 'others')
    test_df['trafficSource.referralPath'] = test_df['trafficSource.referralPath'].fillna('others')

    # source : https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn
    for feature in object_cols:
        if feature != 'fullVisitorId':
            label_encoder = preprocessing.LabelEncoder()
            # Restore the classes saved for this column as '<column>.npy'.
            label_encoder.classes_ = np.load(feature+'.npy')
            print(f"Label encoding {feature}....")
            test_df[feature] = label_encoder.transform(list(test_df[feature].values.astype('str')))
    print(f"DONE: Label encoding for categorical features completed.")

    # FEATURE ENGINEERING
    # categorical features whose per-visitor median becomes a new feature
    median_features = ['channelGrouping', 'device.browser', 'device.operatingSystem', 'geoNetwork.country',
                       'customDimensions', 'geoNetwork.continent', 'geoNetwork.subContinent', 'geoNetwork.region',
                       'geoNetwork.metro', 'geoNetwork.city', 'geoNetwork.networkDomain']
    # numeric features whose per-visitor sum / mean / standard deviation become new features
    sum_features = ['totals.hits', 'totals.pageviews', 'totals.timeOnSite', 'totals.transactions']
    mean_features = ['totals.hits', 'totals.pageviews', 'totals.sessionQualityDim']
    std_features = ['totals.hits', 'totals.pageviews']

    # Reference timestamp for the time features.
    # Target periods start every 2 months from 2016-08-01 to 2018-12-01.
    target_period = pd.date_range(start='2016-08-01', end='2018-12-01', freq='2MS')
    # shift(freq=...) moves the index labels back by 168 days; the values of
    # the series remain the target-period dates.
    train_period = target_period.to_series().shift(periods=-168, freq='D')
    # Keep entries whose shifted label falls after 2016-08-01 and convert the
    # values to POSIX seconds. 'int64' is explicit because plain 'int' is
    # platform dependent (32-bit on Windows) and cannot hold nanoseconds.
    start_window = train_period[train_period.index > np.datetime64('2016-08-01')].astype('int64') // 10**9
    reference_time = start_window.values[-1]

    # GROUPING DATA FOR EACH USER
    per_visitor = test_df.groupby('fullVisitorId')

    median_features_data = per_visitor[median_features].median().add_suffix('_median')

    # TIME FEATURES
    # absolute distance of the first and last visit from the reference timestamp
    visit_time = per_visitor['visitStartTime'].agg(['first', 'last']) \
                            .sub(reference_time).abs().add_suffix('_time')

    # span between the earliest and latest visit of each visitor
    time_diff = per_visitor['visitStartTime'].apply(lambda x: x.max() - x.min()) \
                            .rename('time_diff')

    sum_numerical_features = per_visitor[sum_features].sum().add_suffix('_sum')
    mean_numerical_features = per_visitor[mean_features].mean().add_suffix('_mean')
    # ddof=0 so a visitor with a single row gets 0 rather than NaN
    std_numerical_features = per_visitor[std_features].std(ddof=0).add_suffix('_std')

    # combining the new features into one frame (one row per visitor)
    test_x = pd.concat([median_features_data, visit_time, time_diff, sum_numerical_features,
                        mean_numerical_features, std_numerical_features], axis=1).reset_index()

    # the visitor id is a grouping key, not a model feature
    test_x.drop(['fullVisitorId'], inplace=True, axis=1, errors='ignore')
    # NOTE(review): this truncates the aggregated (scaled) values to integers;
    # presumably it mirrors the training pipeline - confirm before changing.
    test_x = test_x.astype('int')
    print(f'DONE: Time window feature engineering completed.')

    # get predictions; negative predictions are clipped to zero
    predictions = lgb_regressor.predict(test_x)
    predictions[predictions < 0] = 0
    end = time.time()
    print(f'Prediction done in {end-start} secs.')
    return predictions

In [75]:
# Read 'test_5000.csv' and take the row labelled 0 as a single query point (a Series).
test_org = pd.read_csv('test_5000.csv')
query_pt = test_org.loc[0]
# Run the prediction pipeline on that single row.
preds = final_fun_1(query_pt)
# Bare expression so the notebook displays the returned predictions.
preds

Normalising totals.hits numeric feature....
Normalising totals.pageviews numeric feature....
Normalising totals.timeOnSite numeric feature....
Normalising totals.transactions numeric feature....
Normalising totals.sessionQualityDim numeric feature....
DONE: Standarization of numeric features completed.
Label encoding channelGrouping....
Label encoding customDimensions....
Label encoding device.browser....
Label encoding device.operatingSystem....
Label encoding device.isMobile....
Label encoding device.deviceCategory....
Label encoding geoNetwork.continent....
Label encoding geoNetwork.subContinent....
Label encoding geoNetwork.country....
Label encoding geoNetwork.region....
Label encoding geoNetwork.metro....
Label encoding geoNetwork.city....
Label encoding geoNetwork.networkDomain....
Label encoding trafficSource.campaign....
Label encoding trafficSource.source....
Label encoding trafficSource.medium....
Label encoding trafficSource.keyword....
Label encoding trafficSource.referral

array([0.])

# Function 2: `final_fun_2` — returns the RMSE between the predictions and the log1p-transformed target

In [66]:
def final_fun_2(test_df, original_target):
    """Return the RMSE between the model's predictions and ``log1p`` of the target.

    Parameters
    ----------
    test_df : pandas.Series or pandas.DataFrame
        Raw session row(s); forwarded unchanged to ``final_fun_1``, which
        performs the preprocessing, per-visitor aggregation and prediction.
    original_target : scalar, array-like or pandas.Series
        Untransformed target value(s). Either one value per row of
        ``test_df`` or one value per prediction.

    Returns
    -------
    float
        Root mean squared error of the predictions against
        ``log1p(target)``.

    Raises
    ------
    ValueError
        If the number of target values matches neither the number of rows
        nor the number of predictions.
    """
    # Use the argument that was passed in (not a notebook-global query point)
    # and reuse the prediction pipeline instead of duplicating it.
    predictions = final_fun_1(test_df)

    if not isinstance(original_target, pd.Series):
        original_target = pd.Series(original_target)
    actual = original_target.reset_index(drop=True).astype('float64')

    # final_fun_1 yields one prediction per fullVisitorId in sorted key order.
    # When the target is given per row, collapse it the same way so that both
    # sides line up. min_count=1 keeps an all-missing group as NaN instead of
    # silently turning it into 0.
    # NOTE(review): summing assumes the model predicts log1p of a visitor's
    # total revenue - confirm against the training target.
    if isinstance(test_df, pd.DataFrame) and len(actual) == len(test_df):
        actual = actual.groupby(test_df['fullVisitorId'].values).sum(min_count=1)

    if len(actual) != len(predictions):
        raise ValueError(
            f'original_target has {len(actual)} value(s) but the model produced '
            f'{len(predictions)} prediction(s).'
        )

    # sqrt of the MSE rather than mean_squared_error(..., squared=False):
    # the `squared` keyword is not available in every scikit-learn release.
    rmse = np.sqrt(mean_squared_error(np.log1p(actual), predictions))
    return rmse

In [76]:
# Read 'test_5000.csv' and take the row labelled 0 as a single query point (a Series).
test_org = pd.read_csv('test_5000.csv')
query_pt = test_org.loc[0]
# Score that row, passing its raw 'totals.transactionRevenue' value as the target.
rmse = final_fun_2(query_pt,query_pt['totals.transactionRevenue'])
# Bare expression so the notebook displays the score.
rmse

Normalising totals.hits numeric feature....
Normalising totals.pageviews numeric feature....
Normalising totals.timeOnSite numeric feature....
Normalising totals.transactions numeric feature....
Normalising totals.sessionQualityDim numeric feature....
DONE: Standarization of numeric features completed.
Label encoding channelGrouping....
Label encoding customDimensions....
Label encoding device.browser....
Label encoding device.operatingSystem....
Label encoding device.isMobile....
Label encoding device.deviceCategory....
Label encoding geoNetwork.continent....
Label encoding geoNetwork.subContinent....
Label encoding geoNetwork.country....
Label encoding geoNetwork.region....
Label encoding geoNetwork.metro....
Label encoding geoNetwork.city....
Label encoding geoNetwork.networkDomain....
Label encoding trafficSource.campaign....
Label encoding trafficSource.source....
Label encoding trafficSource.medium....
Label encoding trafficSource.keyword....
Label encoding trafficSource.referral

0.0