In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import gc

plt.style.use('ggplot')

import warnings as w
w.filterwarnings(action='ignore')
pd.set_option('display.max_columns',None)

In [2]:
from sklearn.preprocessing import LabelEncoder

In [3]:
encoder = LabelEncoder()

In [4]:
dtype={
    'CampaignId':np.dtype(int),
    'AdGroupId':np.dtype(int),
    'KeywordId':np.dtype(int),
    'Query':np.dtype(str),
    'QueryMatchTypeWithVariant':np.dtype(str),
    'Device':np.dtype(str),
    'Date':np.dtype(str),
    'AveragePosition':np.dtype(float),
    'Clicks':np.dtype(float),
    'Impressions':np.dtype(float),
}

In [5]:
train = pd.read_csv('babushka-ctr/train.csv.zip', dtype=dtype)

In [6]:
test = pd.read_csv('babushka-ctr/test.csv.zip', dtype=dtype)

In [7]:
df = pd.concat([train, test])

In [8]:
train['Date'] = pd.to_datetime(train['Date']).astype(int)/10**9
test['Date'] = pd.to_datetime(test['Date']).astype(int)/10**9

In [9]:
train

Unnamed: 0,CampaignId,AdGroupId,KeywordId,Query,QueryMatchTypeWithVariant,Device,Date,AveragePosition,Clicks,Impressions
0,150950690,7953835370,116881275,may loan,NEAR_EXACT,HIGH_END_MOBILE,1.472170e+09,1.0,1.0,1.0
1,150950690,7953835370,116881275,my lloan,NEAR_EXACT,DESKTOP,1.473984e+09,1.0,1.0,1.0
2,150950690,7953835370,116881275,my loan,EXACT,DESKTOP,1.459814e+09,2.0,1.0,1.0
3,150950690,7953835370,116881275,my loan,EXACT,DESKTOP,1.460419e+09,2.0,1.0,1.0
4,150950690,7953835370,116881275,my loan,EXACT,DESKTOP,1.460938e+09,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
56217,150951650,7953852170,4761117171,finansiering mc,EXACT,TABLET,1.463443e+09,1.0,1.0,1.0
56218,150951650,7953852290,10765411427,snöskoterlån,EXACT,DESKTOP,1.456790e+09,2.0,0.0,1.0
56219,150951650,7953852290,10765411427,snöskoterlån,EXACT,DESKTOP,1.457914e+09,2.3,0.0,4.0
56220,150951650,7953852290,10765411427,snöskoterlån,EXACT,DESKTOP,1.458518e+09,2.5,0.0,2.0


In [None]:
for idx, column in enumerate(train.columns):
    # print(f'{column:30}\t{train[column].nunique()}')
    if idx >=3 and column != 'Clicks' and column != 'Impressions' and train[column].nunique() < 1000:
        encoder = LabelEncoder()
        encoder.fit(df[column])
        train[column] = encoder.transform(train[column])
        test[column] = encoder.transform(test[column])

In [11]:
train.drop(columns=['Impressions'], inplace=True)
# test.drop(columns=['RowId'], inplace=True)

In [12]:
train['Date'] = train['Date'].astype(int)
test['Date'] = test['Date'].astype(int)

In [13]:
train

Unnamed: 0,CampaignId,AdGroupId,KeywordId,Query,QueryMatchTypeWithVariant,Device,Date,AveragePosition,Clicks
0,150950690,7953835370,116881275,may loan,2,1,1472169600,1.0,1.0
1,150950690,7953835370,116881275,my lloan,2,0,1473984000,1.0,1.0
2,150950690,7953835370,116881275,my loan,0,0,1459814400,2.0,1.0
3,150950690,7953835370,116881275,my loan,0,0,1460419200,2.0,1.0
4,150950690,7953835370,116881275,my loan,0,0,1460937600,1.0,0.0
...,...,...,...,...,...,...,...,...,...
56217,150951650,7953852170,4761117171,finansiering mc,0,2,1463443200,1.0,1.0
56218,150951650,7953852290,10765411427,snöskoterlån,0,0,1456790400,2.0,0.0
56219,150951650,7953852290,10765411427,snöskoterlån,0,0,1457913600,2.3,0.0
56220,150951650,7953852290,10765411427,snöskoterlån,0,0,1458518400,2.5,0.0


In [14]:
df

Unnamed: 0,CampaignId,AdGroupId,KeywordId,Query,QueryMatchTypeWithVariant,Device,Date,AveragePosition,Clicks,Impressions,RowId
0,150950690,7953835370,116881275,may loan,NEAR_EXACT,HIGH_END_MOBILE,2016-08-26,1.0,1.0,1.0,
1,150950690,7953835370,116881275,my lloan,NEAR_EXACT,DESKTOP,2016-09-16,1.0,1.0,1.0,
2,150950690,7953835370,116881275,my loan,EXACT,DESKTOP,2016-04-05,2.0,1.0,1.0,
3,150950690,7953835370,116881275,my loan,EXACT,DESKTOP,2016-04-12,2.0,1.0,1.0,
4,150950690,7953835370,116881275,my loan,EXACT,DESKTOP,2016-04-18,1.0,0.0,1.0,
...,...,...,...,...,...,...,...,...,...,...,...
87799,150950810,24573479450,24155728639,marginalen bank,EXACT,HIGH_END_MOBILE,2017-02-03,2.0,,,87799.0
87800,150950810,7953835490,4181417588,lendo,EXACT,DESKTOP,2017-02-15,4.9,,,87800.0
87801,150950810,24573479450,29884667991,marginalen bank varning,PHRASE,DESKTOP,2017-01-17,2.3,,,87801.0
87802,150950810,7953835490,4181417588,lendo,EXACT,DESKTOP,2017-02-17,5.4,,,87802.0


In [15]:
encoder = LabelEncoder()
encoder.fit(df['AveragePosition'])
train['AveragePosition'] = encoder.transform(train['AveragePosition'])
test['AveragePosition'] = encoder.transform(test['AveragePosition'])

In [16]:
train

Unnamed: 0,CampaignId,AdGroupId,KeywordId,Query,QueryMatchTypeWithVariant,Device,Date,AveragePosition,Clicks
0,150950690,7953835370,116881275,may loan,2,1,1472169600,0,1.0
1,150950690,7953835370,116881275,my lloan,2,0,1473984000,0,1.0
2,150950690,7953835370,116881275,my loan,0,0,1459814400,10,1.0
3,150950690,7953835370,116881275,my loan,0,0,1460419200,10,1.0
4,150950690,7953835370,116881275,my loan,0,0,1460937600,0,0.0
...,...,...,...,...,...,...,...,...,...
56217,150951650,7953852170,4761117171,finansiering mc,0,2,1463443200,0,1.0
56218,150951650,7953852290,10765411427,snöskoterlån,0,0,1456790400,10,0.0
56219,150951650,7953852290,10765411427,snöskoterlån,0,0,1457913600,13,0.0
56220,150951650,7953852290,10765411427,snöskoterlån,0,0,1458518400,15,0.0


In [18]:
df["hash"] = pd.util.hash_array(df["Query"].to_numpy())

In [19]:
train['Query'] = df['hash'][:56222]
test['Query'] = df['hash'][56222:]

In [22]:
sparse_features = train.iloc[::,:-1].columns.values.tolist()
target = ['Clicks']

In [23]:
sparse_features

['CampaignId',
 'AdGroupId',
 'KeywordId',
 'Query',
 'QueryMatchTypeWithVariant',
 'Device',
 'Date',
 'AveragePosition']

In [24]:
test_sparse_features = test.iloc[::,:-1].columns.values.tolist()

In [25]:
test_sparse_features

['CampaignId',
 'AdGroupId',
 'KeywordId',
 'Query',
 'QueryMatchTypeWithVariant',
 'Device',
 'Date',
 'AveragePosition']

In [26]:
import torch
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *

In [27]:
fixlen_feature_columns = [SparseFeat(feat, df[feat].nunique()) for feat in sparse_features]

In [28]:
for idx, column in enumerate(test.columns):
    print(f'{column:30}\t{test[column].nunique()}')

CampaignId                    	3
AdGroupId                     	13
KeywordId                     	82
Query                         	894
QueryMatchTypeWithVariant     	5
Device                        	3
Date                          	50
AveragePosition               	60
RowId                         	87804


In [29]:
fixlen_feature_columns

[SparseFeat(name='CampaignId', vocabulary_size=9, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='CampaignId', group_name='default_group'),
 SparseFeat(name='AdGroupId', vocabulary_size=120, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='AdGroupId', group_name='default_group'),
 SparseFeat(name='KeywordId', vocabulary_size=469, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='KeywordId', group_name='default_group'),
 SparseFeat(name='Query', vocabulary_size=4784, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='Query', group_name='default_group'),
 SparseFeat(name='QueryMatchTypeWithVariant', vocabulary_size=5, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='QueryMatchTypeWithVariant', group_name='default_group'),
 SparseFeat(name='Device', vocabulary_size=3, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='Device', group_name='default_group'),
 SparseFeat(name='Date', vocabulary_size=413, emb

In [30]:
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

In [31]:
linear_feature_columns

[SparseFeat(name='CampaignId', vocabulary_size=9, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='CampaignId', group_name='default_group'),
 SparseFeat(name='AdGroupId', vocabulary_size=120, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='AdGroupId', group_name='default_group'),
 SparseFeat(name='KeywordId', vocabulary_size=469, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='KeywordId', group_name='default_group'),
 SparseFeat(name='Query', vocabulary_size=4784, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='Query', group_name='default_group'),
 SparseFeat(name='QueryMatchTypeWithVariant', vocabulary_size=5, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='QueryMatchTypeWithVariant', group_name='default_group'),
 SparseFeat(name='Device', vocabulary_size=3, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='Device', group_name='default_group'),
 SparseFeat(name='Date', vocabulary_size=413, emb

In [32]:
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
feature_names

['CampaignId',
 'AdGroupId',
 'KeywordId',
 'Query',
 'QueryMatchTypeWithVariant',
 'Device',
 'Date',
 'AveragePosition']

In [33]:
train_model_input = {name: train[name].values for name in feature_names}

In [34]:
train_model_input

{'CampaignId': array([150950690, 150950690, 150950690, ..., 150951650, 150951650,
        150951650]),
 'AdGroupId': array([7953835370, 7953835370, 7953835370, ..., 7953852290, 7953852290,
        7953852290]),
 'KeywordId': array([  116881275,   116881275,   116881275, ..., 10765411427,
        10765411427, 10765411427]),
 'Query': array([10869759176141822138,  5197190318043947393, 13907592294344053726,
        ...,  6588020951202118443,  6588020951202118443,
         6588020951202118443], dtype=uint64),
 'QueryMatchTypeWithVariant': array([2, 2, 0, ..., 0, 0, 0]),
 'Device': array([1, 0, 0, ..., 0, 0, 2]),
 'Date': array([1472169600, 1473984000, 1459814400, ..., 1457913600, 1458518400,
        1456790400]),
 'AveragePosition': array([ 0,  0, 10, ..., 13, 15, 10])}

In [35]:
test_model_input = {name: test[name].values for name in feature_names}

In [36]:
test_model_input

{'CampaignId': array([150950810, 150951290, 150950810, ..., 150950810, 150950810,
        150950810]),
 'AdGroupId': array([15401617250,  7953845090, 24573357890, ..., 24573479450,
         7953835490,  7953835490]),
 'KeywordId': array([62521739084,   191069676,   570188752, ..., 29884667991,
         4181417588, 42819814963]),
 'Query': array([15212485243717812661,  2591963728756964115, 12969578894422650563,
        ..., 18277780985400508378,   496844593792521870,
         4366996627797246793], dtype=uint64),
 'QueryMatchTypeWithVariant': array([0, 0, 0, ..., 4, 0, 4]),
 'Device': array([0, 0, 1, ..., 0, 0, 1]),
 'Date': array([1486944000, 1486684800, 1485043200, ..., 1484611200, 1487289600,
        1483574400]),
 'AveragePosition': array([ 0,  5,  6, ..., 13, 44, 34])}

In [37]:
device = ('cuda' if torch.cuda.is_available() else 'cpu')

In [38]:
model = DeepFM(linear_feature_columns,dnn_feature_columns,
               task='binary',device=device,dnn_dropout=0.7)
model

DeepFM(
  (embedding_dict): ModuleDict(
    (CampaignId): Embedding(9, 4)
    (AdGroupId): Embedding(120, 4)
    (KeywordId): Embedding(469, 4)
    (Query): Embedding(4784, 4)
    (QueryMatchTypeWithVariant): Embedding(5, 4)
    (Device): Embedding(3, 4)
    (Date): Embedding(413, 4)
    (AveragePosition): Embedding(100, 4)
  )
  (linear_model): Linear(
    (embedding_dict): ModuleDict(
      (CampaignId): Embedding(9, 1)
      (AdGroupId): Embedding(120, 1)
      (KeywordId): Embedding(469, 1)
      (Query): Embedding(4784, 1)
      (QueryMatchTypeWithVariant): Embedding(5, 1)
      (Device): Embedding(3, 1)
      (Date): Embedding(413, 1)
      (AveragePosition): Embedding(100, 1)
    )
  )
  (out): PredictionLayer()
  (fm): FM()
  (dnn): DNN(
    (dropout): Dropout(p=0.7, inplace=False)
    (linears): ModuleList(
      (0): Linear(in_features=32, out_features=256, bias=True)
      (1): Linear(in_features=256, out_features=128, bias=True)
    )
    (activation_layers): ModuleList(
  

In [39]:
model.compile(optimizer = 'adam',
              loss = 'binary_crossentropy',
              metrics = ['binary_crossentropy','auc'])

In [40]:
train

Unnamed: 0,CampaignId,AdGroupId,KeywordId,Query,QueryMatchTypeWithVariant,Device,Date,AveragePosition,Clicks
0,150950690,7953835370,116881275,10869759176141822138,2,1,1472169600,0,1.0
1,150950690,7953835370,116881275,5197190318043947393,2,0,1473984000,0,1.0
2,150950690,7953835370,116881275,13907592294344053726,0,0,1459814400,10,1.0
3,150950690,7953835370,116881275,13907592294344053726,0,0,1460419200,10,1.0
4,150950690,7953835370,116881275,13907592294344053726,0,0,1460937600,0,0.0
...,...,...,...,...,...,...,...,...,...
56217,150951650,7953852170,4761117171,3006963520013987137,0,2,1463443200,0,1.0
56218,150951650,7953852290,10765411427,6588020951202118443,0,0,1456790400,10,0.0
56219,150951650,7953852290,10765411427,6588020951202118443,0,0,1457913600,13,0.0
56220,150951650,7953852290,10765411427,6588020951202118443,0,0,1458518400,15,0.0


In [41]:
train_model_input

{'CampaignId': array([150950690, 150950690, 150950690, ..., 150951650, 150951650,
        150951650]),
 'AdGroupId': array([7953835370, 7953835370, 7953835370, ..., 7953852290, 7953852290,
        7953852290]),
 'KeywordId': array([  116881275,   116881275,   116881275, ..., 10765411427,
        10765411427, 10765411427]),
 'Query': array([10869759176141822138,  5197190318043947393, 13907592294344053726,
        ...,  6588020951202118443,  6588020951202118443,
         6588020951202118443], dtype=uint64),
 'QueryMatchTypeWithVariant': array([2, 2, 0, ..., 0, 0, 0]),
 'Device': array([1, 0, 0, ..., 0, 0, 2]),
 'Date': array([1472169600, 1473984000, 1459814400, ..., 1457913600, 1458518400,
        1456790400]),
 'AveragePosition': array([ 0,  0, 10, ..., 13, 15, 10])}

In [None]:
%%time
history = model.fit(train_model_input, train[target].astype(int).values, batch_size=1024, epochs=5, verbose=1,
                        validation_split=0.2)

In [74]:
from catboost import CatBoostClassifier

In [90]:
#cat_features=[x for x in train.columns if x not in ['Query', 'Clicks', 'Date']]
model = CatBoostClassifier()

In [91]:
model.fit(train.drop(columns=['Clicks']), train['Clicks'].astype(int), verbose=False, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x2a9f420a0>

In [83]:
out = model.predict(test)

CatBoostError: There is no trained model to use predict(). Use fit() to train model. Then use this method.

In [67]:
test

Unnamed: 0,CampaignId,AdGroupId,KeywordId,Query,QueryMatchTypeWithVariant,Device,Date,AveragePosition,RowId
0,150950810,15401617250,62521739084,15212485243717812661,0,0,1486944000,0,0
1,150951290,7953845090,191069676,2591963728756964115,0,0,1486684800,5,1
2,150950810,24573357890,570188752,12969578894422650563,0,1,1485043200,6,2
3,150950810,24573357890,570188752,12969578894422650563,0,0,1484697600,2,3
4,150950810,15401617250,62521739084,15212485243717812661,0,1,1487030400,0,4
...,...,...,...,...,...,...,...,...,...
87799,150950810,24573479450,24155728639,14680085322823795760,0,1,1486080000,10,87799
87800,150950810,7953835490,4181417588,496844593792521870,0,0,1487116800,39,87800
87801,150950810,24573479450,29884667991,18277780985400508378,4,0,1484611200,13,87801
87802,150950810,7953835490,4181417588,496844593792521870,0,0,1487289600,44,87802


In [68]:
df[len(train):]

Unnamed: 0,CampaignId,AdGroupId,KeywordId,Query,QueryMatchTypeWithVariant,Device,Date,AveragePosition,Clicks,Impressions,RowId,hash
0,150950810,15401617250,62521739084,consida,EXACT,DESKTOP,2017-02-13,1.0,,,0.0,15212485243717812661
1,150951290,7953845090,191069676,billån,EXACT,DESKTOP,2017-02-10,1.5,,,1.0,2591963728756964115
2,150950810,24573357890,570188752,wasa kredit,EXACT,HIGH_END_MOBILE,2017-01-22,1.6,,,2.0,12969578894422650563
3,150950810,24573357890,570188752,wasa kredit,EXACT,DESKTOP,2017-01-18,1.2,,,3.0,12969578894422650563
4,150950810,15401617250,62521739084,consida,EXACT,HIGH_END_MOBILE,2017-02-14,1.0,,,4.0,15212485243717812661
...,...,...,...,...,...,...,...,...,...,...,...,...
87799,150950810,24573479450,24155728639,marginalen bank,EXACT,HIGH_END_MOBILE,2017-02-03,2.0,,,87799.0,14680085322823795760
87800,150950810,7953835490,4181417588,lendo,EXACT,DESKTOP,2017-02-15,4.9,,,87800.0,496844593792521870
87801,150950810,24573479450,29884667991,marginalen bank varning,PHRASE,DESKTOP,2017-01-17,2.3,,,87801.0,18277780985400508378
87802,150950810,7953835490,4181417588,lendo,EXACT,DESKTOP,2017-02-17,5.4,,,87802.0,496844593792521870


In [69]:
ans = pd.DataFrame(columns=['RowId', 'ClickProbability'])

In [70]:
ans['RowId'] = df[len(train):]['RowId'].astype(int)

In [71]:
ans['ClickProbability'] = out

In [72]:
ans.to_csv('submission_2.csv', index=False)

In [73]:
ans

Unnamed: 0,RowId,ClickProbability
0,0,0.469354
1,1,2.358965
2,2,1.626329
3,3,0.798699
4,4,1.063572
...,...,...
87799,87799,2.496973
87800,87800,0.456688
87801,87801,0.390525
87802,87802,0.469047
