In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split



In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
# check missing values per column
train.isnull().sum(axis=0)/train.shape[0]

ID             0.000000
datetime       0.000000
siteid         0.099896
offerid        0.000000
category       0.000000
merchant       0.000000
countrycode    0.000000
browserid      0.050118
devid          0.149969
click          0.000000
dtype: float64

In [3]:
# imputing missing values
train['siteid'].fillna(-999, inplace=True)
test['siteid'].fillna(-999, inplace=True)

train['browserid'].fillna("None",inplace=True)
test['browserid'].fillna("None", inplace=True)

print ('Blank filling started')
index=train.browserid[train.browserid=='Firefox'].index.tolist()
train.iloc[index,8]='Mobile'

index=train.browserid[train.browserid=='Google Chrome'].index.tolist()
train.iloc[index,8]='Mobile'

index=train.browserid[train.browserid=='IE'].index.tolist()
train.iloc[index,8]='Mobile'

index=train.browserid[train.browserid=='Opera'].index.tolist()
train.iloc[index,8]='Mobile'
print ('Mobile part done for TRAIN')

index=train.browserid[train.browserid=='Chrome'].index.tolist()
train.iloc[index,8]='Desktop'

index=train.browserid[train.browserid=='InternetExplorer'].index.tolist()
train.iloc[index,8]='Desktop'

index=train.browserid[train.browserid=='Mozilla'].index.tolist()
train.iloc[index,8]='Desktop'

index=train.browserid[train.browserid=='Mozilla Firefox'].index.tolist()
train.iloc[index,8]='Desktop'
print ('Desktop part done for TRAIN')

index=train.browserid[train.browserid=='Edge'].index.tolist()
train.iloc[index,8]='Tablet'

index=train.browserid[train.browserid=='Internet Explorer'].index.tolist()
train.iloc[index,8]='Tablet'

index=train.browserid[train.browserid=='Safari'].index.tolist()
train.iloc[index,8]='Tablet'

train['devid'].fillna("None",inplace=True)
print ('Desktop part done for TRAIN')

Blank filling started
Mobile part done for TRAIN
Desktop part done for TRAIN
Desktop part done for TRAIN


In [4]:
print ('Blank filling started')
index=test.browserid[test.browserid=='Firefox'].index.tolist()
test.iloc[index,8]='Mobile'

index=test.browserid[test.browserid=='Google Chrome'].index.tolist()
test.iloc[index,8]='Mobile'

index=test.browserid[test.browserid=='IE'].index.tolist()
test.iloc[index,8]='Mobile'

index=test.browserid[test.browserid=='Opera'].index.tolist()
test.iloc[index,8]='Mobile'
print ('Mobile part done for TRAIN')

index=test.browserid[test.browserid=='Chrome'].index.tolist()
test.iloc[index,8]='Desktop'

index=test.browserid[test.browserid=='InternetExplorer'].index.tolist()
test.iloc[index,8]='Desktop'

index=test.browserid[test.browserid=='Mozilla'].index.tolist()
test.iloc[index,8]='Desktop'

index=test.browserid[test.browserid=='Mozilla Firefox'].index.tolist()
test.iloc[index,8]='Desktop'
print ('Desktop part done for TRAIN')

index=test.browserid[test.browserid=='Edge'].index.tolist()
test.iloc[index,8]='Tablet'

index=test.browserid[test.browserid=='Internet Explorer'].index.tolist()
test.iloc[index,8]='Tablet'

index=test.browserid[test.browserid=='Safari'].index.tolist()
test.iloc[index,8]='Tablet'

Blank filling started
Mobile part done for TRAIN
Desktop part done for TRAIN


In [5]:
# impute missing values

train['siteid'].fillna(-999, inplace=True)
test['siteid'].fillna(-999, inplace=True)

train['browserid'].fillna("None", inplace=True)
test['browserid'].fillna("None", inplace=True)

train['devid'].fillna("None", inplace=True)
test['devid'].fillna("None", inplace=True)

In [6]:
train.browserid.replace(['Mozilla', 'Firefox'], ['Mozilla Firefox', 'Mozilla Firefox'], inplace=True)
train.browserid.replace(['Internet Explorer', 'InternetExplorer'], ['IE', 'IE'], inplace=True)
train.browserid.replace(['Google Chrome'], ['Chrome'], inplace=True)

print(train.browserid.unique())

['Mozilla Firefox' 'Edge' 'Chrome' 'None' 'IE' 'Opera' 'Safari']


In [7]:
test.browserid.replace(['Mozilla', 'Firefox'], ['Mozilla Firefox', 'Mozilla Firefox'], inplace=True)
test.browserid.replace(['Internet Explorer', 'InternetExplorer'], ['IE', 'IE'], inplace=True)
test.browserid.replace(['Google Chrome'], ['Chrome'], inplace=True)
print(test.browserid.unique())

['Mozilla Firefox' 'Edge' 'Chrome' 'IE' 'Safari' 'None' 'Opera']


In [8]:
# set datatime
train['datetime'] = pd.to_datetime(train['datetime'])
test['datetime'] = pd.to_datetime(test['datetime'])

In [9]:
# create datetime variable
train['tweekday'] = train['datetime'].dt.weekday
train['thour'] = train['datetime'].dt.hour
train['tminute'] = train['datetime'].dt.minute

test['tweekday'] = test['datetime'].dt.weekday
test['thour'] = test['datetime'].dt.hour
test['tminute'] = test['datetime'].dt.minute

In [10]:
cols = ['siteid','offerid','category','merchant']

for x in cols:
    train[x] = train[x].astype('object')
    test[x] = test[x].astype('object')

In [11]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder
for c in list(train.select_dtypes(include=['object']).columns):
    if c != 'ID':
        lbl = LabelEncoder()
        lbl.fit(list(train[c].values) + list(test[c].values))
        train[c] = lbl.transform(list(train[c].values))
        test[c] = lbl.transform(list(test[c].values))



# CATBOOST

In [12]:
cols_to_use = list(set(train.columns) - set(['ID','datetime','click','tminute']))

# catboost accepts categorical variables as indexes
cat_cols = [0,1,2,3,4,6,7,8]
#train[cat_cols].head()

In [23]:
train.head()

Unnamed: 0,ID,datetime,siteid,offerid,category,merchant,countrycode,browserid,devid,click,tweekday,thour,tminute
0,IDsrk7SoW,2017-01-14 09:42:09,128865,784773,48,127,4,3,1,0,5,9,42
1,IDmMSxHur,2017-01-18 17:50:53,142053,157563,59,65,1,3,0,0,2,17,50
2,IDVLNN0Ut,2017-01-11 12:46:49,2618,458279,69,15,0,1,3,0,2,12,46
3,ID32T6wwQ,2017-01-17 10:18:43,243406,345067,117,507,2,3,1,0,1,10,18
4,IDqUShzMg,2017-01-14 16:02:33,154278,417948,36,276,3,3,0,0,5,16,2


In [13]:
# modeling on sampled (1e6) rows
sampled_train = train.ix[:4000000, :]

In [26]:
trainX = sampled_train[cols_to_use]
trainY = sampled_train['click']

In [27]:
X_train, X_test, y_train, y_test = train_test_split(trainX, trainY, test_size = 0.5)
model = CatBoostClassifier(depth=10, iterations=30, learning_rate=0.1, eval_metric='AUC', random_seed=1)

#model = CatBoostClassifier(depth=15, iterations=40, learning_rate=0.1, eval_metric='AUC', random_seed=140)
#0.67683 #4*1e6

#model = CatBoostClassifier(depth=10, iterations=20, learning_rate=0.1, eval_metric='AUC', random_seed=140)
#0.67642 #4*1e6

In [28]:
model.fit(X_train
          ,y_train
          ,cat_features=cat_cols
          ,eval_set = (X_test, y_test)
          ,use_best_model = True
         )

<catboost.core.CatBoostClassifier at 0x7f932f295eb8>

In [29]:
pred = model.predict_proba(test[cols_to_use])[:,1]
sub = pd.DataFrame({'ID':test['ID'],'click':pred})
sub.to_csv('cb_1.csv',index=False)


In [30]:
del pred
del sub
del trainX
del trainY
del X_train
del y_train

# LightGBM

In [31]:
import lightgbm as lgb


In [32]:
#sampled_train = train.ix[4000001:8000000, :]
sampled_train = train.ix[:1000,:]

In [33]:
X_train, X_test, y_train, y_test = train_test_split(sampled_train[cols_to_use], sampled_train['click'], test_size = 0.5)

dtrain = lgb.Dataset(X_train, y_train)
dval = lgb.Dataset(X_test, y_test)

In [34]:
params = {
    
    'num_leaves' : 256,
    'learning_rate':0.05,
    'metric':'auc',
    'objective':'binary',
    'early_stopping_round': 40,
    'max_depth':14,
    'bagging_fraction':0.5,
    'feature_fraction':0.6,
    'bagging_seed':2017,
    'feature_fraction_seed':2017,
    'verbose' : 1
    
    
}



In [35]:
clf = lgb.train(params, dtrain,num_boost_round=500,valid_sets=dval,verbose_eval=20)

[20]	valid_0's auc: 0.883023
[40]	valid_0's auc: 0.889694
[60]	valid_0's auc: 0.882103
[80]	valid_0's auc: 0.89211
[100]	valid_0's auc: 0.89234
[120]	valid_0's auc: 0.881873
[140]	valid_0's auc: 0.880492
[160]	valid_0's auc: 0.872556
[180]	valid_0's auc: 0.862434
[200]	valid_0's auc: 0.856223
[220]	valid_0's auc: 0.855418
[240]	valid_0's auc: 0.852542
[260]	valid_0's auc: 0.852082
[280]	valid_0's auc: 0.847941
[300]	valid_0's auc: 0.845411
[320]	valid_0's auc: 0.84288
[340]	valid_0's auc: 0.840235
[360]	valid_0's auc: 0.84081
[380]	valid_0's auc: 0.84058
[400]	valid_0's auc: 0.839199
[420]	valid_0's auc: 0.841385
[440]	valid_0's auc: 0.842535
[460]	valid_0's auc: 0.84196
[480]	valid_0's auc: 0.84081
[500]	valid_0's auc: 0.84081


In [36]:
#LightGBM
preds = clf.predict(test[cols_to_use])
sub = pd.DataFrame({'ID':test['ID'], 'click':preds})
sub.to_csv('lgb_1.csv', index=False)

In [39]:
del preds
del sub


NameError: name 'preds' is not defined

# 3rd Model

In [13]:
    sampled_train = train.ix[8000001:10000000, :]

In [14]:
trainX = sampled_train[cols_to_use]
trainY = sampled_train['click']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(trainX, trainY, test_size = 0.5)
model = CatBoostClassifier(depth=10, iterations=20, learning_rate=0.2, eval_metric='AUC', random_seed=1)


In [16]:
model.fit(X_train
          ,y_train
          ,cat_features=cat_cols
          ,eval_set = (X_test, y_test)
          ,use_best_model = True
         )

<catboost.core.CatBoostClassifier at 0x7fd730c020b8>

In [17]:
pred = model.predict_proba(test[cols_to_use])[:,1]
sub = pd.DataFrame({'ID':test['ID'],'click':pred})
sub.to_csv('cb_2.csv',index=False)


In [18]:
del pred
del sub
del trainX
del trainY
del X_train
del y_train

In [19]:

#--------------------------------------------------------
cb_1 = pd.read_csv("cb_1.csv")
lbg_1 = pd.read_csv("lgb_1.csv")
cb_2 = pd.read_csv("cb_2.csv")
#--------------------------------------------------------


In [22]:
submit=cb_1.copy()
A_mul = cb_1.ix[:,1]*0.41
B_mul = lbg_1.ix[:,1]*0.59
C_mul = cb_2.ix[:,1]*0.59
D_mul = (A_mul+B_mul)*0.41
submit.ix[:,1] = C_mul+D_mul

submit.to_csv('ensemble_sub_1.csv', index=False)