In [256]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import gc

plt.style.use('ggplot')
import warnings as w
w.filterwarnings(action='ignore')
pd.set_option('display.max_columns',None)

In [257]:
from sklearn.preprocessing import LabelEncoder

In [258]:
encoder = LabelEncoder()

In [259]:
dtype={
    'CampaignId':np.dtype(int),
    'AdGroupId':np.dtype(int),
    'KeywordId':np.dtype(int),
    'Query':np.dtype(str),
    'QueryMatchTypeWithVariant':np.dtype(str),
    'Device':np.dtype(str),
    'AveragePosition':np.dtype(float),
    'Clicks':np.dtype(float),
    'Impressions':np.dtype(float),
}

train = pd.read_csv('babushka-ctr/train.csv.zip', dtype=dtype, parse_dates=['Date'])
test = pd.read_csv('babushka-ctr/test.csv.zip', dtype=dtype, parse_dates=['Date'])
df = pd.concat([train, test])

In [260]:
train['Date'] = train['Date'].astype(int)/10**9
test['Date'] = test['Date'].astype(int)/10**9
df['Date'] = df['Date'].astype(int)/10**9

In [261]:
train

Unnamed: 0,CampaignId,AdGroupId,KeywordId,Query,QueryMatchTypeWithVariant,Device,Date,AveragePosition,Clicks,Impressions
0,150950690,7953835370,116881275,may loan,NEAR_EXACT,HIGH_END_MOBILE,1.472170e+09,1.0,1.0,1.0
1,150950690,7953835370,116881275,my lloan,NEAR_EXACT,DESKTOP,1.473984e+09,1.0,1.0,1.0
2,150950690,7953835370,116881275,my loan,EXACT,DESKTOP,1.459814e+09,2.0,1.0,1.0
3,150950690,7953835370,116881275,my loan,EXACT,DESKTOP,1.460419e+09,2.0,1.0,1.0
4,150950690,7953835370,116881275,my loan,EXACT,DESKTOP,1.460938e+09,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
56217,150951650,7953852170,4761117171,finansiering mc,EXACT,TABLET,1.463443e+09,1.0,1.0,1.0
56218,150951650,7953852290,10765411427,snöskoterlån,EXACT,DESKTOP,1.456790e+09,2.0,0.0,1.0
56219,150951650,7953852290,10765411427,snöskoterlån,EXACT,DESKTOP,1.457914e+09,2.3,0.0,4.0
56220,150951650,7953852290,10765411427,snöskoterlån,EXACT,DESKTOP,1.458518e+09,2.5,0.0,2.0


In [262]:
df.nunique()

CampaignId                       9
AdGroupId                      120
KeywordId                      469
Query                         4784
QueryMatchTypeWithVariant        5
Device                           3
Date                           413
AveragePosition                100
Clicks                          37
Impressions                    420
RowId                        87804
dtype: int64

In [263]:
df.Clicks.unique()

array([ 1.,  0.,  3.,  2.,  5.,  4.,  6., 13.,  7.,  8., 15.,  9., 22.,
       18., 24., 12., 25., 17., 14., 11., 10., 19., 16., 21., 27., 20.,
       28., 23., 32., 52., 47., 43., 33., 30., 36., 38., 29., nan])

In [264]:
train['Clicks'] = (train['Clicks'] > 0).astype(int)
df['Clicks'] = (df['Clicks'] > 0).astype(int)
df.Clicks.unique()

array([1, 0])

In [265]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56222 entries, 0 to 56221
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   CampaignId                 56222 non-null  int64  
 1   AdGroupId                  56222 non-null  int64  
 2   KeywordId                  56222 non-null  int64  
 3   Query                      56222 non-null  object 
 4   QueryMatchTypeWithVariant  56222 non-null  object 
 5   Device                     56222 non-null  object 
 6   Date                       56222 non-null  float64
 7   AveragePosition            56222 non-null  float64
 8   Clicks                     56222 non-null  int64  
 9   Impressions                56222 non-null  float64
dtypes: float64(3), int64(4), object(3)
memory usage: 4.3+ MB


In [266]:
features_to_labelencode = ['CampaignId', 'AdGroupId', 'KeywordId', 'QueryMatchTypeWithVariant', 'Device']
features_to_hash = ['Query']

In [267]:
# for idx, column in enumerate(train.columns):
#     # print(f'{column:30}\t{train[column].nunique()}')
#     if idx >=3 and column != 'Clicks' and column != 'Impressions' and train[column].nunique() < 1000:
for feat in features_to_labelencode:
        print(feat)
        encoder = LabelEncoder()
        df[feat] = encoder.fit_transform(df[feat])
        train[feat] = encoder.transform(train[feat])
        test[feat] = encoder.transform(test[feat])

CampaignId
AdGroupId
KeywordId
QueryMatchTypeWithVariant
Device


In [268]:
from sklearn.feature_extraction.text import HashingVectorizer

N_FEATURES = 16

for feat in features_to_hash:
    print(feat)
    vectorizer = HashingVectorizer(n_features=N_FEATURES)
    # vectorizer will produce N_FEATURES new columns
    new_features = [f"{feat}_{i}" for i in range(N_FEATURES)]
    df[new_features] = vectorizer.fit_transform(df[feat].values).todense()
    train[new_features] = vectorizer.transform(train[feat].values).todense()
    test[new_features] = vectorizer.transform(test[feat].values).todense()
    # remove old feature
    df.drop(columns=[feat], inplace=True)
    train.drop(columns=[feat], inplace=True)
    test.drop(columns=[feat], inplace=True)

Query


In [269]:
train.drop(columns=['Impressions'], inplace=True)
test.drop(columns=['RowId'], inplace=True)
df.drop(columns=['Impressions', 'RowId'], inplace=True)

In [270]:
train.nunique()

CampaignId                     9
AdGroupId                    120
KeywordId                    465
QueryMatchTypeWithVariant      5
Device                         3
Date                         363
AveragePosition              100
Clicks                         2
Query_0                       42
Query_1                       38
Query_2                       43
Query_3                       36
Query_4                       45
Query_5                       47
Query_6                       33
Query_7                       46
Query_8                       36
Query_9                       37
Query_10                      46
Query_11                      42
Query_12                      38
Query_13                      41
Query_14                      35
Query_15                      50
dtype: int64

In [271]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56222 entries, 0 to 56221
Data columns (total 24 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   CampaignId                 56222 non-null  int64  
 1   AdGroupId                  56222 non-null  int64  
 2   KeywordId                  56222 non-null  int64  
 3   QueryMatchTypeWithVariant  56222 non-null  int64  
 4   Device                     56222 non-null  int64  
 5   Date                       56222 non-null  float64
 6   AveragePosition            56222 non-null  float64
 7   Clicks                     56222 non-null  int64  
 8   Query_0                    56222 non-null  float64
 9   Query_1                    56222 non-null  float64
 10  Query_2                    56222 non-null  float64
 11  Query_3                    56222 non-null  float64
 12  Query_4                    56222 non-null  float64
 13  Query_5                    56222 non-null  flo

In [272]:
train

Unnamed: 0,CampaignId,AdGroupId,KeywordId,QueryMatchTypeWithVariant,Device,Date,AveragePosition,Clicks,Query_0,Query_1,Query_2,Query_3,Query_4,Query_5,Query_6,Query_7,Query_8,Query_9,Query_10,Query_11,Query_12,Query_13,Query_14,Query_15
0,0,0,6,2,1,1.472170e+09,1.0,1,0.0,0.0,-0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,-0.707107,0.0,0.0,0.0,0.0
1,0,0,6,2,0,1.473984e+09,1.0,1,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.000000,0.0,0.0,0.0,0.0
2,0,0,6,0,0,1.459814e+09,2.0,1,0.0,0.0,-0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.000000,0.0,0.0,0.0,0.0
3,0,0,6,0,0,1.460419e+09,2.0,1,0.0,0.0,-0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.000000,0.0,0.0,0.0,0.0
4,0,0,6,0,0,1.460938e+09,1.0,0,0.0,0.0,-0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56217,8,85,117,0,2,1.463443e+09,1.0,1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.707107,0.0,0.0,0.0,0.0
56218,8,86,148,0,0,1.456790e+09,2.0,0,0.0,0.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
56219,8,86,148,0,0,1.457914e+09,2.3,0,0.0,0.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
56220,8,86,148,0,0,1.458518e+09,2.5,0,0.0,0.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0


In [273]:
# encoder = LabelEncoder()
# encoder.fit(df['AveragePosition'])

# train['AveragePosition'] = encoder.transform(train['AveragePosition'])
# test['AveragePosition'] = encoder.transform(test['AveragePosition'])

In [274]:
# sparse_features = train.iloc[::,:-1].columns.values.tolist()
sparse_features = ['CampaignId', 'AdGroupId', 'KeywordId', 'QueryMatchTypeWithVariant', 'Device']
dense_features = [f'Query_{i}' for i in range(N_FEATURES)] + ['Date', 'AveragePosition']
target = ['Clicks']

In [275]:
sparse_features

['CampaignId', 'AdGroupId', 'KeywordId', 'QueryMatchTypeWithVariant', 'Device']

In [276]:
dense_features

['Query_0',
 'Query_1',
 'Query_2',
 'Query_3',
 'Query_4',
 'Query_5',
 'Query_6',
 'Query_7',
 'Query_8',
 'Query_9',
 'Query_10',
 'Query_11',
 'Query_12',
 'Query_13',
 'Query_14',
 'Query_15',
 'Date',
 'AveragePosition']

In [277]:
import torch
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *

In [278]:
fixlen_feature_columns = [SparseFeat(feat, df[feat].nunique()) for feat in sparse_features]
fixlen_feature_columns.extend([DenseFeat(feat, 1) for feat in dense_features])

In [279]:
for idx, column in enumerate(test.columns):
    print(f'{column:30}\t{test[column].nunique()}')

CampaignId                    	3
AdGroupId                     	13
KeywordId                     	82
QueryMatchTypeWithVariant     	5
Device                        	3
Date                          	50
AveragePosition               	60
Query_0                       	22
Query_1                       	14
Query_2                       	21
Query_3                       	15
Query_4                       	19
Query_5                       	22
Query_6                       	15
Query_7                       	17
Query_8                       	13
Query_9                       	14
Query_10                      	21
Query_11                      	19
Query_12                      	17
Query_13                      	14
Query_14                      	15
Query_15                      	21


In [280]:
fixlen_feature_columns

[SparseFeat(name='CampaignId', vocabulary_size=9, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='CampaignId', group_name='default_group'),
 SparseFeat(name='AdGroupId', vocabulary_size=120, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='AdGroupId', group_name='default_group'),
 SparseFeat(name='KeywordId', vocabulary_size=469, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='KeywordId', group_name='default_group'),
 SparseFeat(name='QueryMatchTypeWithVariant', vocabulary_size=5, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='QueryMatchTypeWithVariant', group_name='default_group'),
 SparseFeat(name='Device', vocabulary_size=3, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='Device', group_name='default_group'),
 DenseFeat(name='Query_0', dimension=1, dtype='float32'),
 DenseFeat(name='Query_1', dimension=1, dtype='float32'),
 DenseFeat(name='Query_2', dimension=1, dtype='float32'),
 DenseFeat(name='Query_3

In [281]:
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

In [282]:
linear_feature_columns

[SparseFeat(name='CampaignId', vocabulary_size=9, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='CampaignId', group_name='default_group'),
 SparseFeat(name='AdGroupId', vocabulary_size=120, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='AdGroupId', group_name='default_group'),
 SparseFeat(name='KeywordId', vocabulary_size=469, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='KeywordId', group_name='default_group'),
 SparseFeat(name='QueryMatchTypeWithVariant', vocabulary_size=5, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='QueryMatchTypeWithVariant', group_name='default_group'),
 SparseFeat(name='Device', vocabulary_size=3, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='Device', group_name='default_group'),
 DenseFeat(name='Query_0', dimension=1, dtype='float32'),
 DenseFeat(name='Query_1', dimension=1, dtype='float32'),
 DenseFeat(name='Query_2', dimension=1, dtype='float32'),
 DenseFeat(name='Query_3

In [283]:
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
feature_names

['CampaignId',
 'AdGroupId',
 'KeywordId',
 'QueryMatchTypeWithVariant',
 'Device',
 'Query_0',
 'Query_1',
 'Query_2',
 'Query_3',
 'Query_4',
 'Query_5',
 'Query_6',
 'Query_7',
 'Query_8',
 'Query_9',
 'Query_10',
 'Query_11',
 'Query_12',
 'Query_13',
 'Query_14',
 'Query_15',
 'Date',
 'AveragePosition']

In [284]:
train_model_input = {name: train[name].values for name in feature_names}
test_model_input = {name: test[name].values for name in feature_names}

In [285]:
device = ('cuda' if torch.cuda.is_available() else 'cpu')

In [286]:
model = DeepFM(
    linear_feature_columns, dnn_feature_columns, task='binary', device=device, dnn_dropout=0.7
)
model

DeepFM(
  (embedding_dict): ModuleDict(
    (CampaignId): Embedding(9, 4)
    (AdGroupId): Embedding(120, 4)
    (KeywordId): Embedding(469, 4)
    (QueryMatchTypeWithVariant): Embedding(5, 4)
    (Device): Embedding(3, 4)
  )
  (linear_model): Linear(
    (embedding_dict): ModuleDict(
      (CampaignId): Embedding(9, 1)
      (AdGroupId): Embedding(120, 1)
      (KeywordId): Embedding(469, 1)
      (QueryMatchTypeWithVariant): Embedding(5, 1)
      (Device): Embedding(3, 1)
    )
  )
  (out): PredictionLayer()
  (fm): FM()
  (dnn): DNN(
    (dropout): Dropout(p=0.7, inplace=False)
    (linears): ModuleList(
      (0): Linear(in_features=38, out_features=256, bias=True)
      (1): Linear(in_features=256, out_features=128, bias=True)
    )
    (activation_layers): ModuleList(
      (0-1): 2 x ReLU(inplace=True)
    )
  )
  (dnn_linear): Linear(in_features=128, out_features=1, bias=False)
)

In [287]:
model.compile(
    optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['binary_crossentropy', 'auc']
)

In [288]:
%%time
history = model.fit(
    train_model_input,
    train[target].astype(int).values,
    batch_size=1024,
    epochs=5,
    verbose=1,
    validation_split=0.2
)

cpu
Train on 44977 samples, validate on 11245 samples, 44 steps per epoch


44it [00:00, 49.60it/s]


Epoch 1/5
0s - loss:  49.5742 - binary_crossentropy:  17.8798 - auc:  0.5000 - val_binary_crossentropy:  27.3252 - val_auc:  0.5000


44it [00:00, 52.20it/s]


Epoch 2/5
0s - loss:  54.9391 - binary_crossentropy:  19.8245 - auc:  0.4979 - val_binary_crossentropy:  27.3252 - val_auc:  0.5000


44it [00:00, 51.59it/s]


Epoch 3/5
0s - loss:  67.3500 - binary_crossentropy:  24.2740 - auc:  0.5000 - val_binary_crossentropy:  27.3252 - val_auc:  0.5000


44it [00:00, 61.18it/s]


Epoch 4/5
0s - loss:  67.3500 - binary_crossentropy:  24.2748 - auc:  0.5000 - val_binary_crossentropy:  27.3252 - val_auc:  0.5000


44it [00:00, 101.64it/s]


Epoch 5/5
0s - loss:  67.3500 - binary_crossentropy:  24.2766 - auc:  0.5000 - val_binary_crossentropy:  27.3252 - val_auc:  0.5000
CPU times: user 40.7 s, sys: 0 ns, total: 40.7 s
Wall time: 4.11 s
