<a href="https://colab.research.google.com/github/DaryaTereshchenko/AdClick/blob/main/Click_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from pprint import pprint
import re

In [22]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.layers import IntegerLookup, Concatenate, Flatten
from tensorflow.python.ops.numpy_ops import np_config

In [3]:
# load data
from urllib.request import urlretrieve
import os

def download(url, file):
    if not os.path.isfile(file):
        print("Download file... " + file + " ...")
        urlretrieve(url,file)
        print("File downloaded")

# d10m_url = "https://home.ipipan.waw.pl/sj/TIB_PAN_Adv/AdClick/D100k.tsv.gz"
# download(d10m_url,'D10M.tsv.gz')
# print("All the files are downloaded")

d100k_url = "https://home.ipipan.waw.pl/sj/TIB_PAN_Adv/AdClick/D100k.tsv.gz"
download(d100k_url,'D100k.tsv.gz')
print("All the files are downloaded")

Download file... D100k.tsv.gz ...
File downloaded
All the files are downloaded


In [4]:
ds = tf.data.experimental.make_csv_dataset("D100k.tsv.gz",
                                           field_delim="\t",
                                           compression_type="GZIP",
                                           num_epochs=1,
                                           batch_size=1024,  
                                           label_name="Click", shuffle=False)


In [5]:
def get_dataset_partitions_tf(ds, ds_size=10000000, train_split=0.8, val_split=0.1, test_split=0.1, shuffle=True, shuffle_size=10000):
    assert (train_split + test_split + val_split) == 1
    
    if shuffle:
        # Specify seed to always have the same split distribution between runs
        ds = ds.shuffle(shuffle_size, seed=12)
    
    train_size = int(train_split * ds_size)
    val_size = int(val_split * ds_size)
    
    train_ds = ds.take(train_size)    
    val_ds = ds.skip(train_size).take(val_size)
    test_ds = ds.skip(train_size).skip(val_size)
    
    return train_ds, val_ds, test_ds

In [6]:
train_ds, val_ds, test_ds = get_dataset_partitions_tf(ds)

In [35]:
train = train_ds.unbatch()

needed_vocab = {"AdKeyword_tokens": [], "AdDescription_tokens": [], "Query_tokens": []}

for dic in train.as_numpy_iterator():
  for k, v in dic[0].items():
    if k in needed_vocab.keys():
      needed_vocab[k].append(v)


In [64]:
def transform_to_list(parameter):
  list_of_tokens = list(map(lambda x: x.replace(b"|", b" ").split(b" "), parameter))
  flatten_list = [item for subl in list_of_tokens for item in subl]
  return np.asarray(flatten_list)


In [65]:
keyword_tokens = transform_to_list(needed_vocab["AdKeyword_tokens"])
description_tokens = transform_to_list(needed_vocab["AdDescription_tokens"])
query_tokens = transform_to_list(needed_vocab["Query_tokens"])


In [68]:
print(type(description_tokens))
print(len(keyword_tokens))
print(len(query_tokens))

<class 'numpy.ndarray'>
209037
291570


In [69]:
total_vocab = np.unique(np.concatenate((keyword_tokens, description_tokens, query_tokens), axis=0))

In [71]:
len(total_vocab)

32788

In [None]:
maxlen = 128
max_features = 25000 # max word number

In [None]:
def split_on_slash(input_data):
  return tf.strings.regex_replace(input_data, "\|", " ")

In [None]:
def build_model1():
  inputs = {
      "Position": Input(shape=(), dtype=tf.int32),
      "Age": Input(shape=(), dtype=tf.int32),
      "Depth": Input(shape=(), dtype=tf.int32),
      "Gender": Input(shape=(), dtype=tf.int32),
      "AdvertiserId": Input(shape=(), dtype=tf.int32),
      "AdDescription_tokens": Input(shape=(), dtype=tf.string),
      "AdTitle_tokens": Input(shape=(), dtype=tf.string),
      "AdKeyword_tokens": Input(shape=(), dtype=tf.string)}

  # Integer part
  input_age = IntegerLookup(vocabulary=[1,2,3,4,5,6], output_mode="one_hot", num_oov_indices=0)(inputs["Age"])
  ci_p = IntegerLookup(vocabulary=[1,2,3], output_mode="one_hot", num_oov_indices=0)(inputs["Position"])
  ci_d = IntegerLookup(vocabulary=[1,2,3], output_mode="one_hot", num_oov_indices=0)(inputs["Depth"])
  ci_g = IntegerLookup(vocabulary=[0,1,2], output_mode="one_hot", num_oov_indices=0)(inputs["Gender"])
  encoded = Concatenate()([ci_p, input_age, ci_d, ci_g])
  layer = tf.keras.layers.Hashing(num_bins=128, output_mode="one_hot")(inputs["AdvertiserId"])

  conc = tf.keras.layers.concatenate([layer, encoded])
  num = Dropout(rate=0.3)(conc)
  num = Dense(128)(num)
  num = Flatten()(num)

# String part
  description = tf.keras.layers.TextVectorization(max_tokens=max_features, output_mode="int", pad_to_max_tokens=maxlen, standardize=split_on_slash, output_sequence_length=maxlen, vocabulary=vocab_description)(inputs["AdDescription_tokens"])
  x = tf.keras.layers.Embedding(max_features, output_dim=20, input_length=maxlen)(description)
  
  title = tf.keras.layers.TextVectorization(max_tokens=max_features, output_mode="int", pad_to_max_tokens=maxlen, standardize=split_on_slash, output_sequence_length=maxlen, vocabulary=vocab_title)(inputs["AdTitle_tokens"])
  y = tf.keras.layers.Embedding(max_features, output_dim=20, input_length=maxlen)(title)

  keywords = tf.keras.layers.TextVectorization(max_tokens=max_features, output_mode="int", pad_to_max_tokens=maxlen, standardize=split_on_slash, output_sequence_length=maxlen, vocabulary=total_vocab)(inputs["AdKeyword_tokens"])
  o = tf.keras.layers.Embedding(max_features, output_dim=20, input_length=maxlen)(keywords)

  c = Concatenate()([x, y, o])
  # x = Flatten()(x)
  # y = Flatten()(y)
  t = Dropout(rate=0.3)(c)
  t = Dense(128)(t)
  t = Flatten()(t)

  z = Concatenate()([t, num])
  z = tf.keras.layers.Dense(64, activation='relu')(z)
  z = tf.keras.layers.Dense(1, activation='sigmoid')(z)
  

  model = Model(inputs=inputs, outputs=z)
  model.compile(optimizer="Adam", loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
          metrics=["AUC"], run_eagerly=True)
  return model

In [None]:
model1 = build_model1()
model1.summary()
tf.keras.utils.plot_model(model1)

In [None]:
model1.fit(train_ds.batch(128), epochs=5, verbose=1, validation_data=test_ds.batch(128))

Epoch 1/5


  inputs = self._flatten_to_reference_inputs(inputs)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f553b3dff70>

In [None]:
model1.predict(dt_test.batch(128).take(1))

In [None]:
# load data
from urllib.request import urlretrieve
import os

def download(url, file):
    if not os.path.isfile(file):
        print("Download file... " + file + " ...")
        urlretrieve(url,file)
        print("File downloaded")

d100k_url = "https://home.ipipan.waw.pl/sj/TIB_PAN_Adv/AdClick/D5M_test_x.tsv.gz"
download(d100k_url,'D5M_test_x.tsv.gz')
print("All the files are downloaded")

All the files are downloaded


In [None]:
test_data = tf.data.experimental.make_csv_dataset("D5M_test_x.tsv.gz",
                                           field_delim="\t",
                                           compression_type="GZIP",
                                           num_epochs=1,
                                           batch_size=128,
                                           label_name="Click",
                                           shuffle=False)


In [None]:
d100k_url = "https://home.ipipan.waw.pl/sj/TIB_PAN_Adv/AdClick/D10M.tsv.gz"
download(d100k_url,'D10M.tsv.gz')
print("All the files are downloaded")

All the files are downloaded


In [None]:
tenm_data = tf.data.experimental.make_csv_dataset("D10M.tsv.gz",
                                           field_delim="\t",
                                           compression_type="GZIP",
                                           num_epochs=1,
                                           batch_size=128,
                                           label_name="Click",
                                           shuffle=False)


In [None]:
ds_size=10000000
train_split=0.8
test_split=0.2
train_size = int(train_split * ds_size)
test_size = int(test_split * ds_size)


In [None]:
dt = tenm_data.unbatch()

In [None]:
shuffled = dt.shuffle(100)

In [None]:
dt_train = shuffled.take(train_size) 
dt_test = shuffled.skip(train_size).take(test_size)

In [None]:
y_pred = model2.predict(dt_test.batch(256).take(1))

  inputs = self._flatten_to_reference_inputs(inputs)




In [None]:
y_pred

In [None]:
pprint(test_data.element_spec)