In [1]:
import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

Using TensorFlow backend.


In [2]:
# read the labelled training set and the unlabelled test set from DATASET/
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('DATASET/train.csv', 'DATASET/test.csv')
)

In [3]:
# report the size of each frame; .shape is a (rows, columns) pair, so it can
# be unpacked straight into the two placeholders
print('The train data has {} rows and {} columns'.format(*train.shape))
print('The test data has {} rows and {} columns'.format(*test.shape))

The train data has 12137810 rows and 10 columns
The test data has 3706907 rows and 9 columns


In [4]:
# preview the first rows of the raw training frame (columns as read from the CSV)
train.head()

Unnamed: 0,ID,datetime,siteid,offerid,category,merchant,countrycode,browserid,devid,click
0,IDsrk7SoW,2017-01-14 09:42:09,4709696.0,887235,17714,20301556,e,Firefox,,0
1,IDmMSxHur,2017-01-18 17:50:53,5189467.0,178235,21407,9434818,b,Mozilla Firefox,Desktop,0
2,IDVLNN0Ut,2017-01-11 12:46:49,98480.0,518539,25085,2050923,a,Edge,,0
3,ID32T6wwQ,2017-01-17 10:18:43,8896401.0,390352,40339,72089744,c,Firefox,Mobile,0
4,IDqUShzMg,2017-01-14 16:02:33,5635120.0,472937,12052,39507200,d,Mozilla Firefox,Desktop,0


In [5]:
# imputing missing values
# siteid gets the numeric sentinel -999; the two text columns get the literal
# string "None" so that a missing value becomes a category of its own.
#
# The filled column is assigned back to the frame instead of calling
# fillna(..., inplace=True) on the column selection: the in-place form acts on
# an intermediate object, is deprecated in pandas 2.x, and does not modify the
# frame once Copy-on-Write is enabled.
fill_values = {'siteid': -999, 'browserid': "None", 'devid': "None"}

for frame in (train, test):
    for col, value in fill_values.items():
        frame[col] = frame[col].fillna(value)

In [8]:

# create timebased features

# (new column, attribute of the .dt accessor) pairs, listed in the order the
# columns are appended to each frame
time_parts = [
    ('tweekday', 'weekday'),
    ('tyear', 'year'),
    ('tmonth', 'month'),
    ('tday', 'day'),
    ('thour', 'hour'),
    ('tminute', 'minute'),
]

for frame in (train, test):
    # parse the timestamp strings once, then derive every part from the result
    frame['datetime'] = pd.to_datetime(frame['datetime'])
    for new_col, dt_attr in time_parts:
        frame[new_col] = getattr(frame['datetime'].dt, dt_attr)

In [9]:
# preview the training frame again to check the derived time columns
train.head()

Unnamed: 0,ID,datetime,siteid,offerid,category,merchant,countrycode,browserid,devid,click,tweekday,thour,tminute,tyear,tmonth,tday
0,IDsrk7SoW,2017-01-14 09:42:09,4709696.0,887235,17714,20301556,e,Firefox,,0,5,9,42,2017,1,14
1,IDmMSxHur,2017-01-18 17:50:53,5189467.0,178235,21407,9434818,b,Mozilla Firefox,Desktop,0,2,17,50,2017,1,18
2,IDVLNN0Ut,2017-01-11 12:46:49,98480.0,518539,25085,2050923,a,Edge,,0,2,12,46,2017,1,11
3,ID32T6wwQ,2017-01-17 10:18:43,8896401.0,390352,40339,72089744,c,Firefox,Mobile,0,1,10,18,2017,1,17
4,IDqUShzMg,2017-01-14 16:02:33,5635120.0,472937,12052,39507200,d,Mozilla Firefox,Desktop,0,5,16,2,2017,1,14


In [10]:
# create aggregate features
def _site_pair_counts(frame, other_col, count_col):
    """Count the rows of `frame` for every (siteid, other_col) combination.

    Returns a three-column frame: 'siteid', `other_col`, and `count_col`
    holding the number of rows in which that pair occurs.
    """
    pair_sizes = frame.groupby(['siteid', other_col]).size()
    return pair_sizes.reset_index(name=count_col)


# counts are computed separately for train and test, each from its own rows
site_offer_count = _site_pair_counts(train, 'offerid', 'site_offer_count')
site_offer_count_test = _site_pair_counts(test, 'offerid', 'site_offer_count')

site_cat_count = _site_pair_counts(train, 'category', 'site_cat_count')
site_cat_count_test = _site_pair_counts(test, 'category', 'site_cat_count')

site_mcht_count = _site_pair_counts(train, 'merchant', 'site_mcht_count')
site_mcht_count_test = _site_pair_counts(test, 'merchant', 'site_mcht_count')

In [11]:

# joining all files
# merge() is called without arguments, so each count table is inner-joined on
# whichever column names it shares with the frame (the siteid/<key> pair).
agg_df = [site_offer_count, site_cat_count, site_mcht_count]
agg_df_test = [site_offer_count_test, site_cat_count_test, site_mcht_count_test]

for count_table in agg_df:
    train = train.merge(count_table)

for count_table in agg_df_test:
    test = test.merge(count_table)

In [12]:

# Label Encoding
# Every text column except the row identifier 'ID' is replaced by integer
# codes. One encoder per column is fitted on the values seen in train and test
# together, so a category that occurs only in test still receives a code.
#
# The encoder is fitted on the distinct values only and applied to the Series
# directly; this yields the same classes and codes as fitting on the full
# concatenated data, without first copying millions of values into Python
# lists.
for col in list(train.select_dtypes(include=['object']).columns):
    if col == 'ID':
        continue
    lbl = LabelEncoder()
    lbl.fit(pd.concat([train[col], test[col]], ignore_index=True).unique())
    train[col] = lbl.transform(train[col])
    test[col] = lbl.transform(test[col])

In [13]:

# shuffle (and optionally subsample) the training rows
# With SAMPLE_FRACTION = 1.0 every row is kept and only the row order changes,
# which is what the previous train.sample(train.shape[0]) call did despite its
# "sample 10%" comment. Lower the fraction (e.g. 0.1) to avoid memory troubles
# on a small machine; keep 1.0 if you have access to a large one.
# The fixed seed makes the row order identical on every run.
SAMPLE_FRACTION = 1.0
SAMPLE_SEED = 42

train = train.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)
print(train.shape)

(12137810, 19)


In [14]:
# select the feature columns: everything except the row identifier, the raw
# timestamp and the target
non_feature_cols = ('ID', 'datetime', 'click')
cols_to_use = [col for col in train.columns if col not in non_feature_cols]

In [15]:

# standardise data before training
# the scaler is fitted on the training features only; the fitted scaler is
# then applied to the test features
scaler = StandardScaler()
strain = scaler.fit_transform(train[cols_to_use])
stest = scaler.transform(test[cols_to_use])

In [16]:
from sklearn.decomposition import PCA,FastICA

In [18]:
#pca = PCA(random_state=42)
#strain_pca = pca.fit_transform(strain)
#stest_pca = pca.transform(stest)

#ica = FastICA(random_state=42)
#strain_ica = ica.fit_transform(strain)
#stest_ica = ica.transform(stest)

In [23]:
#np.savetxt("strain1.csv", strain, delimiter=",")
#np.savetxt("stest1.csv", stest, delimiter=",")

AttributeError: 'numpy.ndarray' object has no attribute 'savetxt'

In [24]:
# wrap the scaled training array in a DataFrame; no column names are passed,
# so the columns are labelled with their integer positions
train_data = pd.DataFrame(strain)

In [25]:
# show row count, per-column dtypes and memory footprint of the scaled frame
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12137810 entries, 0 to 12137809
Data columns (total 16 columns):
0     float64
1     float64
2     float64
3     float64
4     float64
5     float64
6     float64
7     float64
8     float64
9     float64
10    float64
11    float64
12    float64
13    float64
14    float64
15    float64
dtypes: float64(16)
memory usage: 1.4 GB


In [26]:
# write the scaled training features to disk; index=False leaves the row index out of the file
train_data.to_csv("train_new1.csv",index=False)

In [27]:
# same treatment for the scaled test features: wrap the array in a DataFrame
# (integer column labels) and write it out without the row index
test_data = pd.DataFrame(stest)
test_data.to_csv("test_new1.csv", index=False)