In [1]:
import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Using TensorFlow backend.


In [2]:
train = pd.read_csv('DATASET/train.csv')
test = pd.read_csv('DATASET/test.csv')

In [3]:
print ('The train data has {} rows and {} columns'.format(train.shape[0],train.shape[1]))
print ('The test data has {} rows and {} columns'.format(test.shape[0],test.shape[1]))

The train data has 12137810 rows and 10 columns
The test data has 3706907 rows and 9 columns


In [4]:
train.head()

Unnamed: 0,ID,datetime,siteid,offerid,category,merchant,countrycode,browserid,devid,click
0,IDsrk7SoW,2017-01-14 09:42:09,4709696.0,887235,17714,20301556,e,Firefox,,0
1,IDmMSxHur,2017-01-18 17:50:53,5189467.0,178235,21407,9434818,b,Mozilla Firefox,Desktop,0
2,IDVLNN0Ut,2017-01-11 12:46:49,98480.0,518539,25085,2050923,a,Edge,,0
3,ID32T6wwQ,2017-01-17 10:18:43,8896401.0,390352,40339,72089744,c,Firefox,Mobile,0
4,IDqUShzMg,2017-01-14 16:02:33,5635120.0,472937,12052,39507200,d,Mozilla Firefox,Desktop,0


In [5]:
# imputing missing values
train['siteid'].fillna(-999, inplace=True)
test['siteid'].fillna(-999, inplace=True)

train['browserid'].fillna("None",inplace=True)
test['browserid'].fillna("None", inplace=True)

train['devid'].fillna("None",inplace=True)
test['devid'].fillna("None",inplace=True)

In [6]:

# create timebased features

train['datetime'] = pd.to_datetime(train['datetime'])
test['datetime'] = pd.to_datetime(test['datetime'])

train['tweekday'] = train['datetime'].dt.weekday
test['tweekday'] = test['datetime'].dt.weekday

train['tyear'] = train['datetime'].dt.year
test['tyear'] = test['datetime'].dt.year

train['month'] = train['datetime'].dt.month
test['month'] = test['datetime'].dt.month

train['tday'] = train['datetime'].dt.day
test['tday'] = test['datetime'].dt.day

train['thour'] = train['datetime'].dt.hour
test['thour'] = test['datetime'].dt.hour

train['tminute'] = train['datetime'].dt.minute
test['tminute'] = test['datetime'].dt.minute

In [7]:
# create aggregate features
site_offer_count = train.groupby(['siteid','offerid']).size().reset_index()
site_offer_count.columns = ['siteid','offerid','site_offer_count']

site_offer_count_test = test.groupby(['siteid','offerid']).size().reset_index()
site_offer_count_test.columns = ['siteid','offerid','site_offer_count']

site_cat_count = train.groupby(['siteid','category']).size().reset_index()
site_cat_count.columns = ['siteid','category','site_cat_count']

site_cat_count_test = test.groupby(['siteid','category']).size().reset_index()
site_cat_count_test.columns = ['siteid','category','site_cat_count']

site_mcht_count = train.groupby(['siteid','merchant']).size().reset_index()
site_mcht_count.columns = ['siteid','merchant','site_mcht_count']

site_mcht_count_test = test.groupby(['siteid','merchant']).size().reset_index()
site_mcht_count_test.columns = ['siteid','merchant','site_mcht_count']

In [8]:

# joining all files
agg_df = [site_offer_count,site_cat_count,site_mcht_count]
agg_df_test = [site_offer_count_test,site_cat_count_test,site_mcht_count_test]

for x in agg_df:
    train = train.merge(x)
    
for x in agg_df_test:
    test = test.merge(x)

In [9]:

# Label Encoding
from sklearn.preprocessing import LabelEncoder
for c in list(train.select_dtypes(include=['object']).columns):
    if c != 'ID':
        lbl = LabelEncoder()
        lbl.fit(list(train[c].values) + list(test[c].values))
        train[c] = lbl.transform(list(train[c].values))
        test[c] = lbl.transform(list(test[c].values))

In [10]:

# sample 10% data - to avoid memory troubles
# if you have access to large machines, you can use more data for training

#train = train.sample()
print (train.shape)

(12137810, 19)


In [11]:
# select columns to choose
cols_to_use = [x for x in train.columns if x not in list(['ID','datetime','click'])]

In [29]:
training = train[cols_to_use]
testing =test[cols_to_use]
training = training.sample(7662, random_state=12)
testing = test.sample(7000, random_state=11)

In [31]:
training.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7662 entries, 1405982 to 1595177
Data columns (total 16 columns):
siteid              7662 non-null float64
offerid             7662 non-null int64
category            7662 non-null int64
merchant            7662 non-null int64
countrycode         7662 non-null int64
browserid           7662 non-null int64
devid               7662 non-null int64
tweekday            7662 non-null int64
tyear               7662 non-null int64
month               7662 non-null int64
tday                7662 non-null int64
thour               7662 non-null int64
tminute             7662 non-null int64
site_offer_count    7662 non-null int64
site_cat_count      7662 non-null int64
site_mcht_count     7662 non-null int64
dtypes: float64(1), int64(15)
memory usage: 1017.6 KB


In [30]:
testing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7000 entries, 1497740 to 29887
Data columns (total 18 columns):
ID                  7000 non-null object
datetime            7000 non-null datetime64[ns]
siteid              7000 non-null float64
offerid             7000 non-null int64
category            7000 non-null int64
merchant            7000 non-null int64
countrycode         7000 non-null int64
browserid           7000 non-null int64
devid               7000 non-null int64
tweekday            7000 non-null int64
tyear               7000 non-null int64
month               7000 non-null int64
tday                7000 non-null int64
thour               7000 non-null int64
tminute             7000 non-null int64
site_offer_count    7000 non-null int64
site_cat_count      7000 non-null int64
site_mcht_count     7000 non-null int64
dtypes: datetime64[ns](1), float64(1), int64(15), object(1)
memory usage: 1.0+ MB


In [13]:
training['origin'] = 0
testing['origin'] = 1

In [20]:
combi = training.append(testing)
y = combi['origin']
combi.drop('origin',axis=1,inplace=True)

KeyError: 'origin'

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.preprocessing import LabelEncoder
model = RandomForestClassifier(n_estimators = 50, max_depth = 5,min_samples_leaf = 5)
drop_list = []
for i in combi.columns:
    score = cross_val_score(model,pd.DataFrame(combi[i]),y,cv=2,scoring='roc_auc')
    if (np.mean(score) > 0.8):
        drop_list.append(i)
        print(i,np.mean(score))

ValueError: could not convert string to float: ID9yLW0