In [2]:

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

from sklearn.ensemble import ExtraTreesRegressor

#See example in http://nbviewer.jupyter.org/github/trevorstephens/gplearn/blob/master/doc/gp_examples.ipynb
from gplearn.genetic import SymbolicRegressor

import time
start_time = time.time()
tcurrent   = start_time

np.random.seed(1234)   

data = {
    'tra':   pd.read_csv('input/air_visit_data.csv'),
    'as':    pd.read_csv('input/air_store_info.csv'),
    'hs':    pd.read_csv('input/hpg_store_info.csv'),
    'ar':    pd.read_csv('input/air_reserve.csv'),
    'hr':    pd.read_csv('input/hpg_reserve.csv'),
    'id':    pd.read_csv('input/store_id_relation.csv'),
    'tes':   pd.read_csv('input/sample_submission.csv'),
    'hol':   pd.read_csv('input/date_info.csv').rename(columns={'calendar_date': 'visit_date' })
}


In [3]:
data['hr'] = pd.merge(data['hr'], data['id'], how='inner', on=['hpg_store_id'])

for df in ['ar', 'hr']:
    data[df]['visit_datetime']   = pd.to_datetime(data[df]['visit_datetime'])
    data[df]['visit_datetime']   = data[df]['visit_datetime'].dt.date
    data[df]['reserve_datetime'] = pd.to_datetime(data[df]['reserve_datetime'])
    data[df]['reserve_datetime'] = data[df]['reserve_datetime'].dt.date
    data[df]['reserve_datetime_diff'] = data[df].apply(
        lambda r: (r['visit_datetime'] - r['reserve_datetime']).days, axis=1)
        
    #--- begin  new features    
    data[df]['reserve_datetime_diff_2'] = data[df].apply(
        lambda r: ( (r['visit_datetime'] - r['reserve_datetime']).days)**2.1, axis=1)
    data[df]['reserve_datetime_diff_3'] = data[df].apply(
        lambda r: ( (r['visit_datetime'] - r['reserve_datetime']).days)**3.2, axis=1)
    #--- end new features        
        
    data[df] = data[df].groupby(
        ['air_store_id', 'visit_datetime'], as_index=False)[[
            'reserve_datetime_diff', 'reserve_visitors'
        ]].sum().rename(columns={
            'visit_datetime': 'visit_date'
        })
        
    show_data = 0    
    if (show_data==1):
        print(data[df].head())

data['tra']['visit_date'] = pd.to_datetime(data['tra']['visit_date'])
data['tra']['dow']        = data['tra']['visit_date'].dt.dayofweek
data['tra']['year']       = data['tra']['visit_date'].dt.year
data['tra']['month']      = data['tra']['visit_date'].dt.month
data['tra']['visit_date'] = data['tra']['visit_date'].dt.date

data['tes']['visit_date'] = data['tes']['id'].map(
    lambda x: str(x).split('_')[2])
data['tes']['air_store_id'] = data['tes']['id'].map(
    lambda x: '_'.join(x.split('_')[:2]))
data['tes']['visit_date'] = pd.to_datetime(data['tes']['visit_date'])
data['tes']['dow'] = data['tes']['visit_date'].dt.dayofweek
data['tes']['year'] = data['tes']['visit_date'].dt.year
data['tes']['month'] = data['tes']['visit_date'].dt.month
data['tes']['visit_date'] = data['tes']['visit_date'].dt.date

In [4]:
unique_stores = data['tes']['air_store_id'].unique()
stores = pd.concat(
    [
        pd.DataFrame({
            'air_store_id': unique_stores,
            'dow': [i] * len(unique_stores)
        }) for i in range(7)
    ],
    axis=0,
    ignore_index=True).reset_index(drop=True)

#sure it can be compressed...
tmp = data['tra'].groupby(
    ['air_store_id', 'dow'],
    as_index=False)['visitors'].min().rename(columns={
        'visitors': 'min_visitors'
    })
stores = pd.merge(stores, tmp, how='left', on=['air_store_id', 'dow'])
tmp = data['tra'].groupby(
    ['air_store_id', 'dow'],
    as_index=False)['visitors'].mean().rename(columns={
        'visitors': 'mean_visitors'
    })
stores = pd.merge(stores, tmp, how='left', on=['air_store_id', 'dow'])
tmp = data['tra'].groupby(
    ['air_store_id', 'dow'],
    as_index=False)['visitors'].median().rename(columns={
        'visitors': 'median_visitors'
    })
stores = pd.merge(stores, tmp, how='left', on=['air_store_id', 'dow'])
tmp = data['tra'].groupby(
    ['air_store_id', 'dow'],
    as_index=False)['visitors'].max().rename(columns={
        'visitors': 'max_visitors'
    })
stores = pd.merge(stores, tmp, how='left', on=['air_store_id', 'dow'])
tmp = data['tra'].groupby(
    ['air_store_id', 'dow'],
    as_index=False)['visitors'].count().rename(columns={
        'visitors': 'count_observations'
    })
stores = pd.merge(stores, tmp, how='left', on=['air_store_id', 'dow'])

In [5]:
stores = pd.merge(stores, data['as'], how='left', on=['air_store_id'])
lbl = preprocessing.LabelEncoder()
stores['air_genre_name'] = lbl.fit_transform(stores['air_genre_name'])
stores['air_area_name'] = lbl.fit_transform(stores['air_area_name'])

data['hol']['visit_date'] = pd.to_datetime(data['hol']['visit_date'])
data['hol']['day_of_week'] = lbl.fit_transform(data['hol']['day_of_week'])
data['hol']['visit_date'] = data['hol']['visit_date'].dt.date

train = pd.merge(data['tra'], data['hol'], how='left', on=['visit_date'])
test = pd.merge(data['tes'], data['hol'], how='left', on=['visit_date'])

train = pd.merge(data['tra'], stores, how='left', on=['air_store_id', 'dow'])
test = pd.merge(data['tes'], stores, how='left', on=['air_store_id', 'dow'])

for df in ['ar', 'hr']:
    train = pd.merge(
        train, data[df], how='left', on=['air_store_id', 'visit_date'])
    test = pd.merge(
        test, data[df], how='left', on=['air_store_id', 'visit_date'])

col = [
    c for c in train
    if c not in ['id', 'air_store_id', 'visit_date', 'visitors']
]
train = train.fillna(-1)
test = test.fillna(-1)

In [6]:
for c, dtype in zip(train.columns, train.dtypes):
    if dtype == np.float64:
        train[c] = train[c].astype(np.float32)

for c, dtype in zip(test.columns, test.dtypes):
    if dtype == np.float64:
        test[c] = test[c].astype(np.float32)

train_x = train.drop(['air_store_id', 'visit_date', 'visitors'], axis=1)
train_y = np.log1p(train['visitors'].values)

if (show_data==1):
    print(train_x.shape, train_y.shape)
    
test_x  = test.drop(['id', 'air_store_id', 'visit_date', 'visitors'], axis=1)

In [7]:
regressor = 4

print('\n\nAdopted regressor = ', regressor,'\n')

if (regressor == 1):
    print('Starting XGBoost')
    boost_params = {'eval_metric': 'rmse'}
    xgb0 = xgb.XGBRegressor(
        max_depth        = 8,
        learning_rate    = 0.01,
        n_estimators     = 10000,
        objective        = 'reg:linear',
        gamma            = 0,
        min_child_weight = 1,
        subsample        = 1,
        colsample_bytree = 1,
        scale_pos_weight = 1,
        seed             = 27,
        **boost_params)
        
    xgb0.fit(train_x, train_y)
    predict_y = xgb0.predict(test_x)
    print('Finished XGBoost')
    
    
if (regressor == 2):    
    print('Starting Extra trees')
    et = ExtraTreesRegressor (n_estimators         = 10000, 
                                 max_depth         = 8, 
                                 n_jobs            = -1, 
                                 random_state      = 11, 
                                 verbose           = 0, 
                                 warm_start        = True,
                                 min_samples_leaf  = 120, 
                                 max_features      = 0.8)    
    et.fit(train_x, train_y)
    predict_y = et.predict(test_x)
    print('Finished Extra trees')
    



Adopted regressor =  4 



In [8]:
if (regressor == 3):    
    print('Starting Genetic Programming')
    
    '''
    http://gplearn.readthedocs.io/en/stable/reference.html
    
    The sum of p_crossover, p_subtree_mutation, p_hoist_mutation and p_point_mutation 
    should total to 1.0 or less.
    '''
    
    gp = SymbolicRegressor(function_set=('add', 'sub', 'mul', 'div','max','min','log','sqrt'),
                           population_size       = 100, 
                           const_range           = (-10, 100),
                           generations           = 100, 
                           stopping_criteria     = 0.001,
                           p_crossover           = 0.5, 
                           p_subtree_mutation    = 0.25,
                           p_hoist_mutation      = 0.05, 
                           p_point_mutation      = 0.20, 
                           init_depth            = (6, 12),
                           max_samples           = 0.7, 
                           verbose               = 1, 
                           n_jobs                = -1, 
                           metric                = 'rmse',
                           parsimony_coefficient = 0.0001, 
                           random_state          = 1121)  
                           # seed is relevant to stochastic approaches such as genetic programming
                           
    gp.fit(train_x, train_y)
    predict_y = gp.predict(test_x)  
    predict_y[predict_y < 0] = 0        # only positive values
    
    print ('\nDetails about the results using Genetic Programming\n')  
    print (gp._program)
    print ('R2(max) = ',gp.score(train_x, train_y))  

    # summary of the results
    print('Raw fitness = ',gp._program.raw_fitness_)    
    #print('Fitness     = ',gp._program.fitness_)    
    print('OOB fitness = ',gp._program.oob_fitness_)    
    print('Depth       = ',gp._program.depth_)    
    print('Length      = ',gp._program.length_,'\n')    

    '''
    Comments:
    raw_fitness_ : The raw fitness of the individual program.
    fitness_     : The penalized fitness of the individual program.
    oob_fitness_ : The out-of-bag raw fitness of the individual program for the held-out samples. 
                     Only present when sub-sampling was used in the estimator by 
                     specifying max_samples < 1.0.
    depth_       : The maximum depth of the program tree.
    length_      : The number of functions and terminals in the program.
    '''
    print('Finished Genetic Programming')


if (regressor == 4):    
    print('Starting Genetic Programming')
    
    '''
    http://gplearn.readthedocs.io/en/stable/reference.html
    
    The sum of p_crossover, p_subtree_mutation, p_hoist_mutation and p_point_mutation 
    should total to 1.0 or less.
    '''
    
    gp = SymbolicRegressor(function_set=('add', 'sub', 'mul', 'div','max','min','log','sqrt'),
                           population_size       = 100, 
                           const_range           = (-10, 100),
                           generations           = 400, 
                           stopping_criteria     = 0.001,
                           p_crossover           = 0.55, 
                           p_subtree_mutation    = 0.20,
                           p_hoist_mutation      = 0.05, 
                           p_point_mutation      = 0.20, 
                           init_depth            = (6, 12),
                           max_samples           = 0.9, 
                           verbose               = 1, 
                           n_jobs                = -1, 
                           metric                = 'rmse',
                           parsimony_coefficient = 0.0001, 
                           random_state          = 343)  
                           # seed is relevant to stochastic approaches such as genetic programming
                           
    gp.fit(train_x, train_y)
    predict_y = gp.predict(test_x)  
    predict_y[predict_y < 0] = 0        # only positive values
    
    print ('\nDetails about the results using Genetic Programming\n')  
    print (gp._program)
    print ('R2(max) = ',gp.score(train_x, train_y))  

    # summary of the results
    print('Raw fitness = ',gp._program.raw_fitness_)    
    #print('Fitness     = ',gp._program.fitness_)    
    print('OOB fitness = ',gp._program.oob_fitness_)    
    print('Depth       = ',gp._program.depth_)    
    print('Length      = ',gp._program.length_,'\n')    

    '''
    Comments:
    raw_fitness_ : The raw fitness of the individual program.
    fitness_     : The penalized fitness of the individual program.
    oob_fitness_ : The out-of-bag raw fitness of the individual program for the held-out samples. 
                     Only present when sub-sampling was used in the estimator by 
                     specifying max_samples < 1.0.
    depth_       : The maximum depth of the program tree.
    length_      : The number of functions and terminals in the program.
    '''
    print('Finished Genetic Programming')


#------------------------------------------------ end regressors

test['visitors'] = np.expm1(predict_y)

fname = 'submissionr v01 regressor ' + str(regressor) + '.csv'

test[['id', 'visitors']].to_csv(fname, index=False, float_format='%.3f')  

nm=(time.time() - start_time)/60
print ("Total processing time %s min" % nm)

Starting Genetic Programming
    |    Population Average   |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0   405.46    2862704645.31        2   0.581068049643   0.575271763408    548.03m
   1    96.57    208.911352395        2   0.579387358561   0.590331051658    346.39m
   2      9.7    162.785674983        2   0.579329968907   0.590837736395    241.13m
   3     3.49    23.3025837326        2   0.578978245865   0.593932514375    186.37m
   4     2.78     65.787448749        2   0.579187688739   0.592091820966    153.25m
   5     2.73    12.3187149265        2   0.579375149217   0.590438886761    131.15m
   6     4.32    3064.66896666        2   0.579126151504   0.592633304568    115.69m
   7     2.24    98.9883109355        2   0.579152213935   0.592404041283    103.78m
   8     3.17    223.788266727        2   0.57

  94    23.18    1.52804126246       20   0.522715484165   0.526740035344     30.20m
  95    20.07    6341.41694445       20   0.522192323491   0.524970884042     30.11m
  96    19.62    1.70543580708       20   0.522143558902   0.534090335394     30.03m
  97    18.68    1.19490212945       34   0.521676019136   0.527601260648     29.94m
  98    19.29     10.250513628       23   0.521472754607   0.531349434257     29.88m
  99    21.86    2.12709034728       48   0.521344459283   0.529407421941     29.84m
 100     18.9     2.9747097936       17   0.521626955307    0.52998548442     29.76m
 101    17.79    1.48844840276       35   0.521033719136   0.521755355946     29.67m
 102    18.05    2.10350433744       23   0.521592926313   0.523888601133     29.58m
 103     19.8    1.89670464712       23   0.521355031738   0.526015448116     29.50m
 104    18.55    10.2960089645       19   0.520763551731   0.531262396113     29.41m
 105     19.8    1.74581210719       23   0.521242052397   0.5265

 191    17.62    1.38929789014       17   0.520497437544   0.524511739015     21.09m
 192    21.16    2.17349096363       17   0.520079191296   0.528232415963     21.00m
 193    16.23    2.33258268782       17   0.520250919766   0.526708278614     20.91m
 194    15.92    21.0676944856       17   0.520429371377   0.530178083414     20.80m
 195    15.78    20.1617052472       17   0.519959567586   0.529291219513     20.69m
 196    16.66    7.42430100727       17   0.519527483354   0.533096119946     20.58m
 197    16.41     3.3142452362       17   0.520254038984   0.526680549254     20.48m
 198     17.4    10.6018109753       17   0.520347777041   0.525846472621     20.38m
 199    16.32    3.79345507719       17   0.520318502835   0.526107111205     20.27m
 200    16.65    11.5179567664       17   0.520020488399   0.528752297592     20.17m
 201    16.29     5.2845647217       17   0.520045698444   0.528529103501     20.06m
 202    16.42    1.52852533098       17   0.520092760275   0.5281

 288    16.47    21.9275434571       17   0.520131563317   0.527013413962     11.28m
 289    23.48    23.0192008393       17   0.520075925431   0.527507353903     11.19m
 290    15.03    23.4486342132       17   0.520062832918   0.527623511235     11.09m
 291    16.55    1.62768159975       17   0.519619093431   0.531543661736     10.98m
 292    16.25    71.6377179132       17   0.519799613335   0.529952787836     10.88m
 293    17.09    26.3998899972       17   0.520397346269   0.524646706684     10.78m
 294    17.57    41.9545144333       17   0.520191525433   0.526480506462     10.67m
 295    16.57    1.50507144279       17   0.520051072277   0.523809530605     10.57m
 296    16.75    41.7643347605       17   0.520042019164   0.527808113118     10.47m
 297     17.2    267.744704944       17   0.520414312575   0.520552594584     10.37m
 298    16.65    5.68327292357       17   0.520120064771   0.523192654804     10.26m
 299    20.06    81.4523527793       17    0.51951186425   0.5286

 385    15.99    103.885351622       18   0.519636518345   0.526810529948      1.42m
 386    17.31    1.42493716516       17   0.519187522155   0.531463323942      1.32m
 387    15.95    25.3488045437       17   0.520064268679   0.523691601441      1.22m
 388    19.83    22.9352484853       17   0.519415342862   0.529456085354      1.12m
 389    19.33    3.36796952649       19    0.51991390424   0.525033582383      1.01m
 390    17.46    49.5515243393       18   0.519951140058   0.524009225838     54.77s
 391    18.23    4.73518852338       17   0.519976515667   0.524475248783     48.69s
 392    17.44     24.992621569       18   0.519466558347   0.528316925779     42.60s
 393    18.65    21.3971487118       17   0.519723305756   0.526729177462     36.52s
 394    17.81    44.0541388495       17   0.519792840585   0.526111289491     30.44s
 395    16.17     31.176603778       17   0.519964092413      0.524586085     24.34s
 396    17.22    4.66010652277       17   0.519717191426   0.5267