In [1]:
import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Using TensorFlow backend.


In [2]:
train = pd.read_csv('DATASET/train.csv')
test = pd.read_csv('DATASET/test.csv')

In [3]:
print ('The train data has {} rows and {} columns'.format(train.shape[0],train.shape[1]))
print ('The test data has {} rows and {} columns'.format(test.shape[0],test.shape[1]))

The train data has 12137810 rows and 10 columns
The test data has 3706907 rows and 9 columns


In [4]:
train.head()

Unnamed: 0,ID,datetime,siteid,offerid,category,merchant,countrycode,browserid,devid,click
0,IDsrk7SoW,2017-01-14 09:42:09,4709696.0,887235,17714,20301556,e,Firefox,,0
1,IDmMSxHur,2017-01-18 17:50:53,5189467.0,178235,21407,9434818,b,Mozilla Firefox,Desktop,0
2,IDVLNN0Ut,2017-01-11 12:46:49,98480.0,518539,25085,2050923,a,Edge,,0
3,ID32T6wwQ,2017-01-17 10:18:43,8896401.0,390352,40339,72089744,c,Firefox,Mobile,0
4,IDqUShzMg,2017-01-14 16:02:33,5635120.0,472937,12052,39507200,d,Mozilla Firefox,Desktop,0


In [5]:
# imputing missing values
train['siteid'].fillna(-999, inplace=True)
test['siteid'].fillna(-999, inplace=True)

train['browserid'].fillna("None",inplace=True)
test['browserid'].fillna("None", inplace=True)

train['devid'].fillna("None",inplace=True)
test['devid'].fillna("None",inplace=True)

In [6]:
train['browserid'] = train['browserid'].map(lambda x: 'Firefox' if(x=='Mozilla Firefox') else x)
train['browserid'] = train['browserid'].map(lambda x: 'Firefox' if(x=='Mozilla') else x)

train['browserid'] = train['browserid'].map(lambda x: 'Chrome' if(x=='Chrome') else x)
train['browserid'] = train['browserid'].map(lambda x: 'Chrome' if(x=='Google Chrome') else x)

train['browserid'] = train['browserid'].map(lambda x: 'IE' if(x=='InternetExplorer') else x)
train['browserid'] = train['browserid'].map(lambda x: 'IE' if(x=='Internet Explorer') else x)

test['browserid'] = test['browserid'].map(lambda x: 'Firefox' if(x=='Mozilla Firefox') else x)
test['browserid'] = test['browserid'].map(lambda x: 'Firefox' if(x=='Mozilla') else x)

test['browserid'] = test['browserid'].map(lambda x: 'Chrome' if(x=='Chrome') else x)
test['browserid'] = test['browserid'].map(lambda x: 'Chrome' if(x=='Google Chrome') else x)

test['browserid'] = test['browserid'].map(lambda x: 'IE' if(x=='InternetExplorer') else x)
test['browserid'] = test['browserid'].map(lambda x: 'IE' if(x=='Internet Explorer') else x)



In [7]:
# create timebased features
train['datetime'] = pd.to_datetime(train['datetime'])
test['datetime'] = pd.to_datetime(test['datetime'])

#train['tweekday'] = train['datetime'].dt.weekday
#test['tweekday'] = test['datetime'].dt.weekday

train['tyear'] = train['datetime'].dt.year
test['tyear'] = test['datetime'].dt.year

train['month'] = train['datetime'].dt.month
test['month'] = test['datetime'].dt.month

#train['tday'] = train['datetime'].dt.day
#test['tday'] = test['datetime'].dt.day

train['thour'] = train['datetime'].dt.hour
test['thour'] = test['datetime'].dt.hour

train['tminute'] = train['datetime'].dt.minute
test['tminute'] = test['datetime'].dt.minute

In [8]:
# create aggregate features
site_offer_count = train.groupby(['siteid','offerid']).size().reset_index()
site_offer_count.columns = ['siteid','offerid','site_offer_count']

site_offer_count_test = test.groupby(['siteid','offerid']).size().reset_index()
site_offer_count_test.columns = ['siteid','offerid','site_offer_count']

site_cat_count = train.groupby(['siteid','category']).size().reset_index()
site_cat_count.columns = ['siteid','category','site_cat_count']

site_cat_count_test = test.groupby(['siteid','category']).size().reset_index()
site_cat_count_test.columns = ['siteid','category','site_cat_count']

site_mcht_count = train.groupby(['siteid','merchant']).size().reset_index()
site_mcht_count.columns = ['siteid','merchant','site_mcht_count']

site_mcht_count_test = test.groupby(['siteid','merchant']).size().reset_index()
site_mcht_count_test.columns = ['siteid','merchant','site_mcht_count']

In [9]:
site_dev_count = train.groupby(['siteid','devid']).size().reset_index()
site_dev_count.columns = ['siteid','devid','site_dev_count']

site_dev_count_test = test.groupby(['siteid','devid']).size().reset_index()
site_dev_count_test.columns = ['siteid','devid','site_dev_count']

site_brows_count = train.groupby(['siteid','browserid']).size().reset_index()
site_brows_count.columns = ['siteid','browserid','site_brows_count']

site_brows_count_test = test.groupby(['siteid','browserid']).size().reset_index()
site_brows_count_test.columns = ['siteid','browserid','site_brows_count']

site_country_count = train.groupby(['siteid','countrycode']).size().reset_index()
site_country_count.columns = ['siteid','countrycode','site_country_count']

site_country_count_test = test.groupby(['siteid','countrycode']).size().reset_index()
site_country_count_test.columns = ['siteid','countrycode','site_country_count']


In [10]:

# joining all files
agg_df = [site_offer_count,site_cat_count,site_mcht_count,site_dev_count,site_brows_count,site_country_count]
agg_df_test = [site_offer_count_test,site_cat_count_test,site_mcht_count_test,site_dev_count_test,site_brows_count_test,site_country_count_test]

for x in agg_df:
    train = train.merge(x)
    
for x in agg_df_test:
    test = test.merge(x)

In [11]:
#from sklearn.preprocessing import OneHotEncoder
#enc = OneHotEncoder()
#enc.fit
train = pd.get_dummies(train,columns=['browserid','devid','countrycode'])
test = pd.get_dummies(test,columns=['browserid','devid','countrycode'])

In [12]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12137810 entries, 0 to 12137809
Data columns (total 34 columns):
ID                    object
datetime              datetime64[ns]
siteid                float64
offerid               int64
category              int64
merchant              int64
click                 int64
tyear                 int64
month                 int64
thour                 int64
tminute               int64
site_offer_count      int64
site_cat_count        int64
site_mcht_count       int64
site_dev_count        int64
site_brows_count      int64
site_country_count    int64
browserid_Chrome      uint8
browserid_Edge        uint8
browserid_Firefox     uint8
browserid_IE          uint8
browserid_None        uint8
browserid_Opera       uint8
browserid_Safari      uint8
devid_Desktop         uint8
devid_Mobile          uint8
devid_None            uint8
devid_Tablet          uint8
countrycode_a         uint8
countrycode_b         uint8
countrycode_c         uint8
count

In [13]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3706907 entries, 0 to 3706906
Data columns (total 33 columns):
ID                    object
datetime              datetime64[ns]
siteid                float64
offerid               int64
category              int64
merchant              int64
tyear                 int64
month                 int64
thour                 int64
tminute               int64
site_offer_count      int64
site_cat_count        int64
site_mcht_count       int64
site_dev_count        int64
site_brows_count      int64
site_country_count    int64
browserid_Chrome      uint8
browserid_Edge        uint8
browserid_Firefox     uint8
browserid_IE          uint8
browserid_None        uint8
browserid_Opera       uint8
browserid_Safari      uint8
devid_Desktop         uint8
devid_Mobile          uint8
devid_None            uint8
devid_Tablet          uint8
countrycode_a         uint8
countrycode_b         uint8
countrycode_c         uint8
countrycode_d         uint8
country

In [16]:
train.head()

Unnamed: 0,ID,datetime,siteid,offerid,category,merchant,click,tyear,month,thour,...,devid_Desktop,devid_Mobile,devid_None,devid_Tablet,countrycode_a,countrycode_b,countrycode_c,countrycode_d,countrycode_e,countrycode_f
0,IDsrk7SoW,2017-01-14 09:42:09,4709696.0,887235,17714,20301556,0,2017,1,9,...,0,0,1,0,0,0,0,0,1,0
1,IDgRh6riJ,2017-01-14 16:27:58,4709696.0,262117,5602,55277834,0,2017,1,16,...,0,0,1,0,0,0,0,0,1,0
2,IDTdpK2Wo,2017-01-14 16:15:31,4709696.0,735140,82877,48615529,0,2017,1,16,...,0,0,1,0,0,0,0,0,1,0
3,ID3yGigGd,2017-01-13 06:09:26,4709696.0,347710,48498,7864011,0,2017,1,6,...,0,0,1,0,0,0,0,0,1,0
4,IDeXQJMLe,2017-01-14 10:08:14,4709696.0,248024,9764,84522677,0,2017,1,10,...,0,0,1,0,0,0,0,0,1,0


In [17]:
test.head()

Unnamed: 0,ID,datetime,siteid,offerid,category,merchant,tyear,month,thour,tminute,...,devid_Desktop,devid_Mobile,devid_None,devid_Tablet,countrycode_a,countrycode_b,countrycode_c,countrycode_d,countrycode_e,countrycode_f
0,IDFDJVI,2017-01-22 09:55:48,755610.0,808980,17714,26391770,2017,1,9,55,...,1,0,0,0,0,1,0,0,0,0
1,ID4mzgU,2017-01-22 09:55:48,755610.0,550246,17714,26391770,2017,1,9,55,...,1,0,0,0,0,1,0,0,0,0
2,IDLyixf,2017-01-22 09:55:48,755610.0,390677,17714,61682792,2017,1,9,55,...,1,0,0,0,0,1,0,0,0,0
3,IDJxmu8,2017-01-22 09:54:57,755610.0,30278,33638,8958548,2017,1,9,54,...,1,0,0,0,0,1,0,0,0,0
4,IDZGhMA,2017-01-22 09:54:57,755610.0,965657,33638,79906734,2017,1,9,54,...,1,0,0,0,0,1,0,0,0,0


In [18]:

# Label Encoding
from sklearn.preprocessing import LabelEncoder
for c in list(train.select_dtypes(include=['object']).columns):
    if c != 'ID':
        lbl = LabelEncoder()
        lbl.fit(list(train[c].values) + list(test[c].values))
        train[c] = lbl.transform(list(train[c].values))
        test[c] = lbl.transform(list(test[c].values))

In [19]:

# sample 10% data - to avoid memory troubles
# if you have access to large machines, you can use more data for training

#train = train.sample()
print (train.shape)

(12137810, 34)


In [20]:
# select columns to choose
cols_to_use = [x for x in train.columns if x not in list(['ID','datetime','click'])]

In [21]:

# standarise data before training
scaler = StandardScaler().fit(train[cols_to_use])

strain = scaler.transform(train[cols_to_use])
stest = scaler.transform(test[cols_to_use])

In [29]:
# train validation split
X_train, X_valid, Y_train, Y_valid = train_test_split(strain, train.click, test_size = 0.3, random_state=2017)

In [30]:
print (X_train.shape)
print (X_valid.shape)
print (Y_train.shape)
print (Y_valid.shape)

(8496467, 31)
(3641343, 31)
(8496467,)
(3641343,)


In [31]:
# model architechture
def keras_model(train):
    
    input_dim = train.shape[1]
    classes = 2
    
    model = Sequential()
    model.add(Dense(300, activation = 'relu', input_shape = (input_dim,)))
    model.add(Dense(100, activation = 'relu'))
    model.add(Dense(30, activation = 'relu'))
    model.add(Dense(classes, activation = 'softmax'))
    model.compile(optimizer = 'adam', loss='binary_crossentropy',metrics = ['accuracy','binary_accuracy'])
    return model

callback = EarlyStopping(monitor='val_acc',patience=3)

In [32]:
# one hot target columns
Y_train = to_categorical(Y_train)
Y_valid = to_categorical(Y_valid)

In [33]:
# train model
model = keras_model(X_train)
model.fit(X_train, Y_train, 4000, 100, callbacks=[callback],validation_data=(X_valid, Y_valid),shuffle=True)

Train on 8496467 samples, validate on 3641343 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100


<keras.callbacks.History at 0x7f94d9c65650>

In [34]:
#from keras.utils import plot_model
#plot_model(model, to_file='model.png')

In [35]:
# check validation accuracy
vpreds = model.predict_proba(X_valid)[:,1]
from sklearn.metrics import roc_auc_score
roc_auc_score(y_true = Y_valid[:,1], y_score=vpreds)



0.97711232483557764

In [36]:
# predict on test data
test_preds = model.predict_proba(stest)[:,1]



In [37]:
# create submission file
submit = pd.DataFrame({'ID':test.ID, 'click':test_preds})
submit.to_csv('Submission/Keras_sub_new_new1.csv', index=False)

In [None]:
from xgboost import XGBClassifier

In [None]:
test.head()

In [None]:
import gc
del X_valid
del Y_valid
del X_train
del Y_train
#del test
#del model
del submit
#del 
gc.collect()

In [None]:
XGB_model = XGBClassifier()

In [None]:
XGB_model.fit(strain, train.click)

In [None]:
test_preds2 = XGB_model.predict_proba(stest)[:,1]

In [None]:
from xgboost import plot_importance
import matplotlib.pyplot as plt
plot_importance(XGB_model)
plt.show()

In [None]:
test_preds2[1]

In [None]:
test_preds2[2]

In [None]:
test = pd.read_csv('DATASET/test.csv')

In [None]:
submitXGB = pd.DataFrame({'ID':test.ID, 'click':test_preds2})
submitXGB.to_csv('Submission/XGB2new2.csv', index=False)

In [None]:
test_pred_ensemble = (test_preds2 + test_preds)/2.0
submitE = pd.DataFrame({'ID':test.ID, 'click':test_pred_ensemble})
submitE.to_csv('Submission/ENSEMBLE2new2.csv', index=False)

In [None]:
#X_train.shape

In [None]:
vpreds.shape

In [None]:
Y_train_pred = model.predict(strain)[:,1]

In [None]:
roc_auc_score(train.click,Y_train_pred)

In [None]:
strain.shape

In [None]:
Y_train_pred.shape

In [None]:
Data_l1 = pd.DataFrame(data=strain)

In [None]:
Data_l1["Y_pred"]=Y_train_pred

In [None]:
Data_l1.shape

In [None]:
Test_l1 = pd.DataFrame(data=stest)

In [None]:
Test_l1["Y_pred"] = test_preds

In [None]:
XGB_model_L2 = XGBClassifier()
XGB_model_L2.fit(Data_l1, train.click)

In [None]:
test_preds_L2 = XGB_model_L2.predict_proba(Test_l1)[:,1]

In [None]:
submitXGB_l2 = pd.DataFrame({'ID':test.ID, 'click':test_preds_L2})
#submitXGB_l2.to_csv('Submission/XGB_L2_2.csv', index=False)

In [None]:
test_preds_L2[1]

In [None]:
from xgboost import plot_importance
import matplotlib.pyplot as plt
plot_importance(XGB_model_L2)
plt.show()