In [2]:
VALIDATE = False

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import gc

path = './input/' 
path_train = path + 'train.csv'
path_test = path + 'test.csv'

train_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']
test_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']

dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        'click_id'      : 'uint32'
        }
        
skip = range(1, 80000000)

print('Loading the training data...')
train = pd.read_csv(path_train, skiprows=skip, dtype=dtypes, header=0, usecols=train_cols)
print('Loading the test data...')
test = pd.read_csv(path_test, dtype=dtypes, header=0, usecols=test_cols)

len_train = len(train)
print('The initial size of the train set is', len_train)
print('Binding the training and test set together...')
train=train.append(test)

target = 'is_attributed'
train.loc[train[target].isnull(),target] = 99
train[target] = train[target].astype('uint8')
train.info()


del test
gc.collect()

print("Creating new time features: 'hour' and 'day'...")
train['hour'] = pd.to_datetime(train.click_time).dt.hour.astype('uint8')
train['day'] = pd.to_datetime(train.click_time).dt.day.astype('uint8')

train.drop( 'click_time', axis=1, inplace=True )
gc.collect()

print("Creating new count features: 'n_channels', 'ip_app_count', 'ip_app_os_count'...")

print('Computing the number of channels associated with ')
print('a given IP address within each hour...')
n_chans = train[['ip','day','hour','channel']].groupby(by=['ip','day',
          'hour'])[['channel']].count().reset_index().rename(columns={'channel': 'n_channels'})
print('Merging the channels data with the main data set...')
train = train.merge(n_chans, on=['ip','day','hour'], how='left')
train['n_channels'] = train['n_channels'].astype('uint16')
del n_chans
gc.collect()

print('Computing the number of channels associated with ')
print('a given IP address and app...')
n_chans = train[['ip','app', 'channel']].groupby(by=['ip', 
          'app'])[['channel']].count().reset_index().rename(columns={'channel': 'ip_app_count'})
print('Merging the channels data with the main data set...')
train = train.merge(n_chans, on=['ip','app'], how='left')
train['ip_app_count'] = train['ip_app_count'].astype('uint16')
del n_chans
gc.collect()

print('Computing the number of channels associated with ')
print('a given IP address, app, and os...')
n_chans = train[['ip','app', 'os', 'channel']].groupby(by=['ip', 'app', 
          'os'])[['channel']].count().reset_index().rename(columns={'channel': 'ip_app_os_count'})
print('Merging the channels data with the main data set...')       
train = train.merge(n_chans, on=['ip','app', 'os'], how='left')
train['ip_app_os_count'] = train['ip_app_os_count'].astype('uint16')
del n_chans
gc.collect()

train.info()

test = train[len_train:].copy().drop( target, axis=1 )
print('The size of the test set is ', len(test))



predictors = ['ip', 'device', 'app', 'os', 'channel', 'hour', 'n_channels', 'ip_app_count', 'ip_app_os_count']
categorical = ['ip', 'app', 'device', 'os', 'channel', 'hour']

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.1,
    'num_leaves': 255,  
    'max_depth': 8,  
    'min_child_samples': 100,  
    'max_bin': 100,  
    'subsample': 0.7,  
    'subsample_freq': 1,  
    'colsample_bytree': 0.7,  
    'min_child_weight': 5,  
    'subsample_for_bin': 200000,  
    'min_split_gain': 0,  
    'reg_alpha': 0,  
    'reg_lambda': 0,  
   # 'nthread': 8,
    'verbose': 0,
    'scale_pos_weight':99 
    }


if VALIDATE:
    
    r = 0.1 # the fraction of the train data to be used for validation
    val = train[(len_train-round(r*len_train)):len_train]
    print('The size of the validation set is ', len(val))

    train = train[:(len_train-round(r*len_train))]
    print('The size of the train set is ', len(train))


    gc.collect()

    print("Preparing the datasets for training...")

    
    dtrain = lgb.Dataset(train[predictors].values, label=train[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical
                          )
    del train
    gc.collect()

    dvalid = lgb.Dataset(val[predictors].values, label=val[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical
                          )
    del val
    gc.collect()
                      

    evals_results = {}

    print("Training the model...")

    lgb_model = lgb.train(params, 
                     dtrain, 
                     valid_sets=[dtrain, dvalid], 
                     valid_names=['train','valid'], 
                     evals_result=evals_results, 
                     num_boost_round=1000,
                     early_stopping_rounds=50,
                     verbose_eval=True, 
                     feval=None)
                     
    pred = lgb_model.predict(test[predictors], num_iteration=lgb_model.best_iteration)
                     
else:

    train = train[:len_train]
    print('The size of the train set is ', len(train))

    gc.collect()

    print("Preparing the datasets for training...")

    
    dtrain = lgb.Dataset(train[predictors].values, label=train[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical
                          )
    del train
    gc.collect()

    evals_results = {}

    print("Training the model...")

    num_iteration=390
    lgb_model = lgb.train(params, 
                     dtrain, 
                     valid_sets=[dtrain], 
                     valid_names=['train'], 
                     evals_result=evals_results, 
                     num_boost_round=num_iteration,
                     verbose_eval=True, 
                     feval=None)
                     
    pred = lgb_model.predict(test[predictors])
                     
# Feature names:
print('Feature names:', lgb_model.feature_name())

# Feature importances:
print('Feature importances:', list(lgb_model.feature_importance()))

print("Preparing data for submission...")

submit = pd.read_csv(path_test, dtype='int', usecols=['click_id'])

print("Predicting the submission data...")

submit['is_attributed'] = pred

print("Writing the submission data into a csv file...")

submit.to_csv('submission.csv', index=False)

print("All done...")

Loading the training data...
Loading the test data...
('The initial size of the train set is', 104903891)
Binding the training and test set together...
<class 'pandas.core.frame.DataFrame'>
Int64Index: 123694360 entries, 0 to 18790468
Data columns (total 7 columns):
app              uint16
channel          uint16
click_time       object
device           uint16
ip               uint32
is_attributed    uint8
os               uint16
dtypes: object(1), uint16(4), uint32(1), uint8(1)
memory usage: 3.3+ GB
Creating new time features: 'hour' and 'day'...
Creating new count features: 'n_channels', 'ip_app_count', 'ip_app_os_count'...
Computing the number of channels associated with 
a given IP address within each hour...
Merging the channels data with the main data set...
Computing the number of channels associated with 
a given IP address and app...
Merging the channels data with the main data set...
Computing the number of channels associated with 
a given IP address, app, and os...
Merging 



[1]	train's auc: 0.939881
[2]	train's auc: 0.959505
[3]	train's auc: 0.966028
[4]	train's auc: 0.96774
[5]	train's auc: 0.96904
[6]	train's auc: 0.969614
[7]	train's auc: 0.97003
[8]	train's auc: 0.969978
[9]	train's auc: 0.970571
[10]	train's auc: 0.970912
[11]	train's auc: 0.971286
[12]	train's auc: 0.971576
[13]	train's auc: 0.971941
[14]	train's auc: 0.971865
[15]	train's auc: 0.971924
[16]	train's auc: 0.972248
[17]	train's auc: 0.972314
[18]	train's auc: 0.972645
[19]	train's auc: 0.972918
[20]	train's auc: 0.973276
[21]	train's auc: 0.973296
[22]	train's auc: 0.97355
[23]	train's auc: 0.973961
[24]	train's auc: 0.974158
[25]	train's auc: 0.974644
[26]	train's auc: 0.97477
[27]	train's auc: 0.975109
[28]	train's auc: 0.9755
[29]	train's auc: 0.975743
[30]	train's auc: 0.975807
[31]	train's auc: 0.976073
[32]	train's auc: 0.97639
[33]	train's auc: 0.976918
[34]	train's auc: 0.97731
[35]	train's auc: 0.977655
[36]	train's auc: 0.977936
[37]	train's auc: 0.978427
[38]	train's auc: 0

[300]	train's auc: 0.989709
[301]	train's auc: 0.989725
[302]	train's auc: 0.989729
[303]	train's auc: 0.989731
[304]	train's auc: 0.989752
[305]	train's auc: 0.989757
[306]	train's auc: 0.989784
[307]	train's auc: 0.989788
[308]	train's auc: 0.989793
[309]	train's auc: 0.989795
[310]	train's auc: 0.989799
[311]	train's auc: 0.989802
[312]	train's auc: 0.989818
[313]	train's auc: 0.989823
[314]	train's auc: 0.989825
[315]	train's auc: 0.98986
[316]	train's auc: 0.989864
[317]	train's auc: 0.989891
[318]	train's auc: 0.989908
[319]	train's auc: 0.98991
[320]	train's auc: 0.989913
[321]	train's auc: 0.989915
[322]	train's auc: 0.989918
[323]	train's auc: 0.989926
[324]	train's auc: 0.989961
[325]	train's auc: 0.989968
[326]	train's auc: 0.989971
[327]	train's auc: 0.989977
[328]	train's auc: 0.989982
[329]	train's auc: 0.990004
[330]	train's auc: 0.990008
[331]	train's auc: 0.990012
[332]	train's auc: 0.990015
[333]	train's auc: 0.990041
[334]	train's auc: 0.990044
[335]	train's auc: 0.9