## Preparation

In [1]:
import os
import warnings
import io
import json
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

#os.chdir('/Users/wang/Documents/git/lets_learn/infclean/data')
os.chdir('C:/git/lets_learn/infclean/data')
#os.chdir('C:/Users/wangd/git/lets_learn/infclean/data')

def dump_history(hist, filename):
    """Write the metrics stored on a training-history object to a JSON file.

    Args:
        hist: Object exposing a JSON-serializable ``history`` attribute
            (the value returned by ``model.fit`` is used for this below).
        filename: Path of the JSON file to create or overwrite.
    """
    # The context manager closes (and thereby flushes) the handle even when
    # serialization fails; previously the file object was never closed.
    with open(filename, 'w') as out_file:
        json.dump(hist.history, out_file)
    
def load_history(filename):
    """Read a JSON file and return the decoded content.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The object decoded by ``json.load``.
    """
    # Close the handle deterministically instead of leaving it to the
    # garbage collector.
    with open(filename, 'r') as in_file:
        return json.load(in_file)


def load_dataset(train_path, valid_path, categorical_attributes, omit_attributes, target_attr):
    """Read a train/validation CSV pair and wrap each in a shuffled tf.data pipeline.

    The categorical columns of both files are integer-encoded with one shared
    vocabulary that is built from the training file only.

    Args:
        train_path: CSV file holding the training rows.
        valid_path: CSV file holding the validation rows.
        categorical_attributes: List of column names to integer-encode.
        omit_attributes: Column names removed from both frames.
        target_attr: Column popped from the features and used as the label.

    Returns:
        Tuple ``(train_dataset, valid_dataset, category_dict, vocab_size)``:
        the two datasets (shuffle buffer as long as the frame, batch size 1),
        the code -> string mapping, and the length of the concatenated
        per-column value list (a value occurring in two columns counts twice).
    """
    train_df, valid_df = pd.read_csv(train_path), pd.read_csv(valid_path)

    # One flat vocabulary: the distinct string values of every categorical
    # column of the training frame, concatenated column after column.
    per_column_values = [train_df[name].astype(str).unique() for name in categorical_attributes]
    values = np.concatenate(per_column_values, axis=None)
    keys = pd.Categorical(values).codes
    category_dict = dict(zip(keys, values))

    for frame in (train_df, valid_df):
        as_text = frame[categorical_attributes].astype(str)
        frame[categorical_attributes] = as_text.replace(values, keys)
    for omitted in omit_attributes:
        for frame in (train_df, valid_df):
            frame.pop(omitted)

    # Split the label column off before the remaining columns become features.
    train_target, valid_target = train_df.pop(target_attr), valid_df.pop(target_attr)
    train_slices = (train_df.values, train_target.values)
    valid_slices = (valid_df.values, valid_target.values)
    train_dataset = tf.data.Dataset.from_tensor_slices(train_slices)
    valid_dataset = tf.data.Dataset.from_tensor_slices(valid_slices)
    train_dataset = train_dataset.shuffle(len(train_df)).batch(1)
    valid_dataset = valid_dataset.shuffle(len(valid_df)).batch(1)
    return train_dataset, valid_dataset, category_dict, len(values)

def negative_sampling(df, target_attr, ignore_attrs, n_per_target_attr):
    """Generate artificial "bad" rows that pair target values with foreign attribute values.

    For every distinct value of ``target_attr``, ``n_per_target_attr`` rows are
    produced. In each row the columns listed in ``ignore_attrs`` receive a
    value taken from a real row with that target value, while every remaining
    column receives a value that never occurs together with the target value
    in ``df`` (falling back to a co-occurring value when no other exists).

    Args:
        df: DataFrame holding the real observations.
        target_attr: Name of the column whose values anchor the samples.
        ignore_attrs: Column names that keep realistic (positive) values.
        n_per_target_attr: Number of rows generated per distinct target value.

    Returns:
        DataFrame with the columns ``[target_attr, *ignore_attrs, *others]``
        plus a ``'good'`` column filled with ``0.0``. The target value is
        stored in its string form, and because all sampled values pass through
        one NumPy array they end up with a common dtype.
    """
    # Work on the string form of the target column throughout. Matching the
    # stringified unique values against the raw column selects no rows at all
    # for a non-string target, which silently turned every value into a
    # "negative" candidate (and crashed when ignore_attrs was non-empty).
    target_as_str = df[target_attr].astype(str)
    target_attr_vals = target_as_str.unique()
    other_attrs = [attr for attr in df if attr != target_attr and attr not in ignore_attrs]
    positive_samples = [[] for _ in range(len(ignore_attrs) + 1)]
    negative_samples = [[] for _ in other_attrs]

    for target_val in target_attr_vals:
        # All real rows carrying this target value.
        target_val_df = df[target_as_str == target_val].reset_index(drop=True)
        n_rows = target_val_df.shape[0]

        # Per remaining column: the values that never co-occur with target_val.
        neg_attr_vals_list = []
        for attr in other_attrs:
            pos_attr_vals = target_val_df[attr].unique()
            # Vectorized membership test instead of scanning pos_attr_vals
            # once per row of df.
            neg_attr_vals = df.loc[~df[attr].isin(pos_attr_vals), attr].unique()
            # Every value of the column co-occurs with target_val: nothing
            # foreign to draw from, so fall back to the positive values.
            neg_attr_vals_list.append(neg_attr_vals if len(neg_attr_vals) else pos_attr_vals)

        for _ in range(n_per_target_attr):
            for j, candidates in enumerate(neg_attr_vals_list):
                negative_samples[j].append(candidates[np.random.randint(0, len(candidates))])
            positive_samples[0].append(target_val)
            # Ignored attributes keep a value from a real row of this target.
            for j, attr in enumerate(ignore_attrs):
                positive_samples[j + 1].append(target_val_df[attr].iloc[np.random.randint(0, n_rows)])

    # Column layout: [target, ignored attrs..., other attrs...]
    column_names = [target_attr, *ignore_attrs, *other_attrs]
    samples = np.transpose(np.concatenate((positive_samples, negative_samples)))
    negative_samples_df = pd.DataFrame(samples, columns=column_names)
    negative_samples_df['good'] = np.zeros(negative_samples_df.shape[0])
    return negative_samples_df

# Report the TensorFlow version and configure the GPUs (if any) for this session.
print('TENSORFLOW VERSION: {}'.format(tf.__version__))

# Configure memory growth first: it must be set before the GPUs have been
# initialized, and the tf.test.gpu_device_name() probe below may already
# trigger that initialization (NOTE(review): confirm for the TF version in use).
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as e:
    # Raised when the GPUs were already initialized; report and continue.
    print(e)

# Query the device name once instead of once for the check and once for the print.
gpu_device_name = tf.test.gpu_device_name()
if not gpu_device_name:
    warnings.warn('NO GPU FOUND')
else:
    print('DEFAULT GPU DEVICE: {}'.format(gpu_device_name))
    tf.debugging.set_log_device_placement(True)
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")

## w2v models

In [2]:
def get_w2v_single_vocab_model(cat_input_length, num_input_length, voc_size, e_dim):
    """Build and compile a pairwise-similarity classifier over one shared embedding.

    Every categorical input is looked up in the same embedding table. The
    normalized dot product of each pair of embedded inputs is concatenated
    with the raw numerical inputs and fed to a single sigmoid unit.

    Args:
        cat_input_length: Number of categorical inputs (one id each).
        num_input_length: Number of numerical inputs (one value each).
        voc_size: Number of rows of the shared embedding table.
        e_dim: Width of each embedding vector.

    Returns:
        The compiled ``keras.Model`` (Adam, binary cross-entropy, accuracy).
        A diagram of it is written to ``w2v_single_vocab.png`` as a side effect.
    """
    word_input_name = 'word_input_{}'
    num_input_name = 'num_input_{}'
    word_input_list = [keras.layers.Input(shape=(1, ), name=word_input_name.format(pos))
                       for pos in range(cat_input_length)]
    num_input_list = [keras.layers.Input(shape=(1, ), name=num_input_name.format(pos))
                      for pos in range(num_input_length)]

    # A single table shared by all categorical inputs.
    embedding = keras.layers.Embedding(input_dim=voc_size,
                                       output_dim=e_dim,
                                       input_length=cat_input_length,
                                       name='embedding')
    word_reshaped_name = 'encode_reshape_{}'
    word_reshaped_list = []
    for pos, word_input in enumerate(word_input_list):
        encoded = embedding(word_input)
        flatten = keras.layers.Reshape((e_dim, ), name=word_reshaped_name.format(pos))
        word_reshaped_list.append(flatten(encoded))

    # One similarity per unordered pair (i < j) of categorical inputs.
    dot_sim_name = 'dot_sim_{}_{}'
    index_pairs = [(i, j)
                   for i in range(cat_input_length - 1)
                   for j in range(i + 1, cat_input_length)]
    dot_sim_list = [keras.layers.dot([word_reshaped_list[i], word_reshaped_list[j]],
                                     axes=1,
                                     normalize=True,
                                     name=dot_sim_name.format(i, j))
                    for i, j in index_pairs]

    # Concatenation is only applied when there is more than one tensor to join.
    if len(dot_sim_list) > 1:
        merge_sim = keras.layers.concatenate(dot_sim_list, axis=1)
    else:
        merge_sim = dot_sim_list[0]
    if len(num_input_list) > 1:
        merge_num = keras.layers.concatenate(num_input_list, axis=1)
    else:
        merge_num = num_input_list[0]
    merge_final = keras.layers.concatenate([merge_sim, merge_num], axis=1)
    output = keras.layers.Dense(units=1, activation='sigmoid')(merge_final)

    # Model inputs: categorical inputs first, numerical inputs after them.
    m = keras.Model(inputs=word_input_list + num_input_list, outputs=output, name='cbow_model')
    m.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

    keras.utils.plot_model(m, to_file='w2v_single_vocab.png')
    return m

def get_w2v_multi_vocab_model(cat_input_length, num_input_length, voc_size, e_dim):
    """Build and compile a pairwise-similarity classifier with one embedding per input.

    Each categorical input owns a separate embedding table. The normalized
    dot product of each pair of embedded inputs is concatenated with the raw
    numerical inputs and fed to a single sigmoid unit.

    Args:
        cat_input_length: Number of categorical inputs; must be greater than 1.
        num_input_length: Number of numerical inputs (one value each).
        voc_size: Sequence with the vocabulary size of each categorical input.
        e_dim: Width of each embedding vector.

    Returns:
        The compiled ``keras.Model`` (Adam, binary cross-entropy, accuracy).
        Its summary is printed and a diagram is written to
        ``w2v_multi_vocab_tt_3.png`` as side effects.
    """
    assert cat_input_length > 1, 'Input length must be greater than 1, current: %i' % cat_input_length
    word_input_name = 'cat_attr_{}'
    num_input_name = 'num_attr_{}'
    # Input names are numbered starting at 1.
    word_input_list = [keras.layers.Input(shape=(1, ), name=word_input_name.format(pos + 1))
                       for pos in range(cat_input_length)]
    num_input_list = [keras.layers.Input(shape=(1, ), name=num_input_name.format(pos + 1))
                      for pos in range(num_input_length)]

    # Separate embedding table per categorical input, flattened to (e_dim,).
    word_encode_name = 'encode_reshape_{}'
    word_encode_list = []
    for pos, word_input in enumerate(word_input_list):
        encoded = keras.layers.Embedding(input_dim=voc_size[pos],
                                         output_dim=e_dim,
                                         input_length=1)(word_input)
        flatten = keras.layers.Reshape((e_dim, ), name=word_encode_name.format(pos))
        word_encode_list.append(flatten(encoded))

    # One similarity per unordered pair (i < j) of categorical inputs.
    index_pairs = [(i, j)
                   for i in range(cat_input_length - 1)
                   for j in range(i + 1, cat_input_length)]
    dot_sim_list = [keras.layers.dot([word_encode_list[i], word_encode_list[j]],
                                     axes=1,
                                     normalize=True)
                    for i, j in index_pairs]

    # Concatenation is only applied when there is more than one tensor to join.
    if len(dot_sim_list) > 1:
        merge_sim = keras.layers.concatenate(dot_sim_list, axis=1)
    else:
        merge_sim = dot_sim_list[0]
    if len(num_input_list) > 1:
        merge_num = keras.layers.concatenate(num_input_list, axis=1)
    else:
        merge_num = num_input_list[0]
    merge_final = keras.layers.concatenate([merge_sim, merge_num], axis=1)
    output = keras.layers.Dense(units=1, activation='sigmoid')(merge_final)

    # Model inputs: categorical inputs first, numerical inputs after them.
    m = keras.Model(inputs=word_input_list + num_input_list,
                    outputs=output,
                    name='embedding_training_model')
    m.summary()
    m.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

    keras.utils.plot_model(m, to_file='w2v_multi_vocab_tt_3.png', show_shapes=True, show_layer_names=True)
    return m
    
# Marker printed once the model-builder definitions of this cell have run.
print('SUCCESS')

## Enriched rent data - Multi vocab

In [4]:
# Cell purpose: load the rent CSV, add generated negative samples, label and
# shuffle the rows. The result is left in the global train_df.

# Column roles for the rent data set (entries that are commented out are disabled).
CATEGORICAL_ATTR = ['state',
                    'city',
                    'zip',
                    'construct_year'
                    #'heating_type',
                    ]
NUMERICAL_ATTR = [#'zip',
    'living_space',
    'rent']
BOOLEAN_ATTR = [#'has_parking',
                #'has_balcony',
                ]

#TRAIN_FILENAME = 'C:/git/lets_learn/infclean/data/rent_data/enriched_rent_data_100000.csv'
TRAIN_FILENAME = 'C:/git/lets_learn/infclean/data/rent_data/neo_enriched_rent_30_per_city.csv'
#TRAIN_FILENAME = 'C:/git/lets_learn/infclean/data/rent_data/simple_rent_5_per_city.csv'
#TRAIN_FILENAME = '/Users/wang/Documents/git/lets_learn/infclean/data/rent_data/NEGSAMP_neo_rent_60_per_city.csv'
#EMBEDDING_DIM = 7
#TRAIN_FILENAME = 'C:/git/lets_learn/infclean/data/rent_data/NEGSAMP_simple_rent_20_per_city.csv'
EMBEDDING_DIM = 4

train_df = pd.read_csv(TRAIN_FILENAME)
# ignore negative samples from julia
#train_df = train_df[train_df['negative_sample'] != 1]
# The file's own 'negative_sample' column is dropped; negatives are generated below instead.
train_df = train_df.drop(columns=['negative_sample'])
# 5 generated rows per distinct city, no ignored attributes.
neg_df = negative_sampling(train_df, 'city', [], 5)
# Real rows are labelled good=1; negative_sampling labels its rows good=0.
train_df['good'] = np.ones(train_df.shape[0])
train_df = pd.concat([train_df, neg_df], ignore_index=True)
# shuffle
train_df = train_df.sample(frac=1).reset_index(drop=True)
print('SUCCESS')

In [5]:
# Cell purpose: integer-encode the selected categorical columns of train_df
# (one vocabulary per column) and build the multi-vocabulary model.
CATEGORICAL_ATTR = ['state',
                    'city',
                    #'zip',
                    #'construct_year',
                    #'heating_type',
                    ]
NUMERICAL_ATTR = [#'zip',
    #'living_space',
    'rent']
# Distinct string values of each categorical column.
unique_values = [train_df[cat_attr].astype(str).unique() for cat_attr in CATEGORICAL_ATTR]
# Integer code of each distinct value within its own column.
keys = [pd.Categorical(u).codes for u in unique_values]
# One code -> original string lookup per column, in CATEGORICAL_ATTR order.
category_dict = []
for i, key_set in enumerate(keys):
    category_dict.append(dict(zip(key_set, unique_values[i])))
    # Overwrites the column in train_df with its integer codes.
    train_df[CATEGORICAL_ATTR[i]] = train_df[CATEGORICAL_ATTR[i]].astype(str).replace(unique_values[i], key_set)

# multi vocab
vocabulary_size = [len(u) for u in unique_values]
# adaptive_embedding_dim = min(EMBEDDING_DIM, int(sum(vocabulary_size)**0.25))
# Fixed width used here; EMBEDDING_DIM is not consulted in this cell.
adaptive_embedding_dim = 100
model = get_w2v_multi_vocab_model(len(CATEGORICAL_ATTR), len(NUMERICAL_ATTR), vocabulary_size, adaptive_embedding_dim)

In [None]:
# Cell purpose: encode the categorical columns, build the multi-vocabulary
# model and train it on the labelled rows of train_df.
# NOTE(review): the preceding cell applies the same encoding to train_df. If
# both cells are executed, this pass sees already-encoded columns and
# category_dict would map codes to code strings - confirm the intended order.

# Load dataset into separate vocabularies, integer-encoded
unique_values = [train_df[cat_attr].astype(str).unique() for cat_attr in CATEGORICAL_ATTR]
keys = [pd.Categorical(u).codes for u in unique_values]
# One code -> original string lookup per column, in CATEGORICAL_ATTR order.
category_dict = []
for i, key_set in enumerate(keys):
    category_dict.append(dict(zip(key_set, unique_values[i])))
    # Overwrites the column in train_df with its integer codes.
    train_df[CATEGORICAL_ATTR[i]] = train_df[CATEGORICAL_ATTR[i]].astype(str).replace(unique_values[i], key_set)

# multi vocab
vocabulary_size = [len(u) for u in unique_values]
# adaptive_embedding_dim = min(EMBEDDING_DIM, int(sum(vocabulary_size)**0.25))
adaptive_embedding_dim = EMBEDDING_DIM
# Boolean attributes are passed to the model as additional numerical inputs.
model = get_w2v_multi_vocab_model(len(CATEGORICAL_ATTR), len(NUMERICAL_ATTR) + len(BOOLEAN_ATTR), vocabulary_size, adaptive_embedding_dim)

# Labels come from the 'good' column of train_df.
# train_target = np.ones(train_df.shape[0])
train_target = train_df['good'].to_numpy().astype(int)

# One array per model input: categorical columns first, then boolean, then numerical.
train_np = [train_df[cat_attr].to_numpy().astype(int) for cat_attr in CATEGORICAL_ATTR]
for bool_attr in BOOLEAN_ATTR:
    train_np.append(train_df[bool_attr].to_numpy().astype(int))
for num_attr in NUMERICAL_ATTR:
    train_np.append(train_df[num_attr].to_numpy().astype(float))


history = model.fit(x=train_np,
                    y=train_target,
                    batch_size=32,
                    epochs=300)
print('SUCCESS')

## EK onekey data - Multi vocab

In [None]:
# Cell purpose: load the catalog CSV, add generated negative samples, encode
# the categorical columns per column and build the model and its input arrays.

# Column roles for the catalog data set (entries that are commented out are disabled).
CATEGORICAL_ATTR = [#'catalog_id',
                #'article_id',
                #'lower_bound',
                'unit',
                'keywords',
                'manufacturer_name',
                #'ean',
                'set_id',
                    ]
NUMERICAL_ATTR = ['ek_amount',
                #'vk_amount',
                    ]

#TRAIN_FILENAME = '/Users/wang/Documents/git/lets_learn/infclean/data/mercateo/NEGSAMP_50000_onekey_ek_no_ean.csv'
TRAIN_FILENAME = 'C:/git/lets_learn/infclean/data/mercateo/NEGSAMP_catalog_5497.csv'
EMBEDDING_DIM = 7

train_df = pd.read_csv(TRAIN_FILENAME)
# The file's own 'negative_sample' column is dropped; negatives are generated below instead.
train_df = train_df.drop(columns=['negative_sample'])
# 5 generated rows per distinct 'keywords' value; 'article_id' and 'ek_amount'
# keep values taken from real rows.
neg_df = negative_sampling(train_df, 'keywords', ['article_id', 'ek_amount'], 5)
# Real rows are labelled good=1; negative_sampling labels its rows good=0.
train_df['good'] = np.ones(train_df.shape[0])
train_df = pd.concat([train_df, neg_df], ignore_index=True)
# shuffle
train_df = train_df.sample(frac=1).reset_index(drop=True)

# Load dataset into separate vocabularies, integer-encoded
unique_values = [train_df[cat_attr].astype(str).unique() for cat_attr in CATEGORICAL_ATTR]
keys = [pd.Categorical(u).codes for u in unique_values]
# One code -> original string lookup per column, in CATEGORICAL_ATTR order.
category_dict = []
for i, key_set in enumerate(keys):
    category_dict.append(dict(zip(key_set, unique_values[i])))
    # Overwrites the column in train_df with its integer codes.
    train_df[CATEGORICAL_ATTR[i]] = train_df[CATEGORICAL_ATTR[i]].astype(str).replace(unique_values[i], key_set)

# multi vocab
vocabulary_size = [len(u) for u in unique_values]
# adaptive_embedding_dim = min(EMBEDDING_DIM, int(sum(vocabulary_size)**0.25))
adaptive_embedding_dim = EMBEDDING_DIM
model = get_w2v_multi_vocab_model(len(CATEGORICAL_ATTR), len(NUMERICAL_ATTR), vocabulary_size, adaptive_embedding_dim)


# Labels come from the 'good' column of train_df.
# train_target = np.ones(train_df.shape[0])
train_target = train_df['good'].to_numpy().astype(int)

# One array per model input: categorical columns first, then numerical.
train_np = [train_df[cat_attr].to_numpy().astype(int) for cat_attr in CATEGORICAL_ATTR]
for num_attr in NUMERICAL_ATTR:
    train_np.append(train_df[num_attr].to_numpy().astype(float))

In [4]:
# Train the current global `model` on the arrays prepared in train_np and
# keep the returned history object in `history`.
history = model.fit(x=train_np,
                    y=train_target,
                    batch_size=32,
                    epochs=50)
print('SUCCESS')

## Vector IO

In [5]:
# Save embeddings from one vocab
'''e = model.layers[3]
weights = e.get_weights()[0]
out_v = io.open('w2v_vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('w2v_meta.tsv', 'w', encoding='utf-8')

for i in range(0, vocabulary_size):
    vec = weights[i]
    out_m.write(category_dict[i] + "\n")
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
out_v.close()
out_m.close()'''

# Save for multiple vocabs: per categorical attribute one TSV with the
# embedding vectors and one TSV with the matching labels, row-aligned.
tsv_name = 'NEGSAMP_catalog_5497_5_col_{}_{}.tsv'
for i, cat_attr in enumerate(CATEGORICAL_ATTR, start=0):
    # Assumes the embedding layers directly follow the categorical input
    # layers in model.layers - TODO confirm against model.summary().
    e = model.layers[len(CATEGORICAL_ATTR) + i]
    weights = e.get_weights()[0]
    # Context managers close both files even if a lookup below raises;
    # previously a failure in the loop left both handles open.
    with io.open(tsv_name.format(cat_attr, 'vec'), 'w', encoding='utf-8') as out_v, \
            io.open(tsv_name.format(cat_attr, 'meta'), 'w', encoding='utf-8') as out_m:
        for j in range(0, vocabulary_size[i]):
            vec = weights[j]
            out_m.write(category_dict[i][j] + "\n")
            out_v.write('\t'.join([str(x) for x in vec]) + "\n")

print('SUCCESS')

-----------------------------
## Enriched rent data - Single vocab

In [None]:
# Cell purpose: encode all categorical columns with one shared vocabulary,
# build the single-vocabulary model and train it.

# Column roles for the enriched rent data set.
CATEGORICAL_ATTR = ['state',
                    'city',
                    'zip',
                    'construct_year',
                    'heating_type']
NUMERICAL_ATTR = ['living_space',
                  'rent']
BOOLEAN_ATTR = ['has_parking',
                'has_balcony']

TRAIN_FILENAME = 'enriched_rent_data.csv'
EMBEDDING_DIM = 50

train_df = pd.read_csv(TRAIN_FILENAME)
# Load dataset into one vocab for all columns, integer-encoded
unique_values = np.concatenate(([train_df[cat_attr].astype(str).unique()
                                 for cat_attr in CATEGORICAL_ATTR]), axis=None)
keys = pd.Categorical(unique_values).codes
# Code -> original string lookup shared by all categorical columns.
category_dict = dict(zip(keys, unique_values))
train_df[CATEGORICAL_ATTR] = train_df[CATEGORICAL_ATTR].astype(str).replace(unique_values, keys)

# Generate labels, i.e. we label all observations as positive samples
train_target = np.ones(train_df.shape[0])

# Load data into model, single vocab
# Length of the concatenated per-column value list, used as embedding table size.
vocabulary_size = len(unique_values)
adaptive_embedding_dim = EMBEDDING_DIM
# Boolean attributes are passed to the model as additional numerical inputs.
model = get_w2v_single_vocab_model(len(CATEGORICAL_ATTR), len(NUMERICAL_ATTR) + len(BOOLEAN_ATTR), vocabulary_size, adaptive_embedding_dim)

# One array per model input: categorical columns first, then boolean, then numerical.
train_np = [train_df[cat_attr].to_numpy().astype(int) for cat_attr in CATEGORICAL_ATTR]
for bool_attr in BOOLEAN_ATTR:
    train_np.append(train_df[bool_attr].to_numpy().astype(int))
for num_attr in NUMERICAL_ATTR:
    train_np.append(train_df[num_attr].to_numpy().astype(float))

history = model.fit(x=train_np,
                    y=train_target,
                    batch_size=1,
                    epochs=100)

In [None]:
# Split single vocab
class Category:
    """Collects the code -> label entries that belong to one categorical column.

    Attributes:
        col: Name of the column this category describes.
        key_dict: Mapping filled by the caller with the entries of this column.
    """

    # Class-level default kept so ``Category.col`` stays resolvable.
    col = None

    def __init__(self, col):
        # Create the dict per instance. As a class attribute it was a single
        # object shared by every Category, so an entry added for one column
        # showed up in all of them.
        self.key_dict = {}
        self.col = col
        
# Split the shared vocabulary back into one code -> label mapping per
# categorical column, then export each column's vectors and labels.
categories = {}
for cat_attr in CATEGORICAL_ATTR:
    categories[cat_attr] = Category(cat_attr)
    # Give every category its own mapping; assigning on the instance shadows
    # any dict that would otherwise be shared between the instances.
    categories[cat_attr].key_dict = {}

df = pd.read_csv(TRAIN_FILENAME)
# category_dict stores the string form of every value (the vocabulary was
# built with astype(str)), so membership must be tested against the string
# form of each column; comparing with the raw column never matches numeric
# columns. The sets are built once instead of scanning a column per key.
column_values = {name: set(df[name].astype(str)) for name in CATEGORICAL_ATTR}
for key, cur_val in category_dict.items():
    for cat_attr in CATEGORICAL_ATTR:
        if cur_val in column_values[cat_attr]:
            categories[cat_attr].key_dict[key] = cur_val
print("split finished")

# Save for multiple vocabs
# Assumes layer 5 is the shared embedding layer - TODO confirm against model.summary().
e = model.layers[5]
weights = e.get_weights()[0]

for cat_attr in CATEGORICAL_ATTR:
    cat = categories[cat_attr]
    # Context managers close both files even if a write fails.
    with io.open('{}_vecs.tsv'.format(cat_attr), 'w', encoding='utf-8') as out_v, \
            io.open('{}_meta.tsv'.format(cat_attr), 'w', encoding='utf-8') as out_m:
        for key in cat.key_dict:
            vec = weights[key]
            out_v.write('\t'.join([str(x) for x in vec]) + "\n")
            out_m.write(cat.key_dict[key] + "\n")

print('SUCCESS')

## Embedding rent data

In [None]:
# cbow style model for all columns
def get_ek_prediction_model(voc_size, e_dim):
    """Build and compile a two-layer sequential model: embedding, then one ReLU unit.

    Args:
        voc_size: Number of rows of the embedding table.
        e_dim: Width of each embedding vector.

    Returns:
        The compiled ``tf.keras.Sequential`` model (Adam optimizer, mean
        squared error as both loss and metric).
    """
    m = tf.keras.Sequential([
        layers.Embedding(voc_size, e_dim),
        layers.Dense(1, activation='relu'),
    ])
    m.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=['mean_squared_error'])
    return m


# Cell purpose: train the prediction model on the shared-vocabulary rent data
# and export the learned embedding vectors together with their labels.
CATEGORICAL_ATTR = ['dirty_state',
                    'dirty_city',
                    'dirty_zip']
TARGET = 'rent'
OMIT = ['intended_state', 'intended_city', 'intended_zip']
TRAIN_FILENAME = 'rent_data_low_error.csv'
VALID_FILENAME = 'rent_data_low_error_validation.csv'
EMBEDDING_DIM = 50

# Load dataset, separate vocab
# df = pd.read_csv(FILENAME)
'''for cat_attr in CATEGORICAL_ATTR:
    df[cat_attr] = pd.Categorical(df[cat_attr])
    df[cat_attr] = df[cat_attr].cat.codes
for o in OMIT:
    df.pop(o)'''

# One vocab for all columns, integer-encoded
train_ds, val_ds, cat_encode_dict, vocabulary_size = load_dataset(TRAIN_FILENAME,
                                                                  VALID_FILENAME,
                                                                  CATEGORICAL_ATTR,
                                                                  OMIT,
                                                                  TARGET)

# Embedding width: fourth root of the vocabulary size, capped at EMBEDDING_DIM.
adaptive_embedding_dim = min(EMBEDDING_DIM, int(vocabulary_size**0.25))
model = get_ek_prediction_model(vocabulary_size, adaptive_embedding_dim)
history = model.fit(train_ds,
                    epochs=10,
                    validation_data=val_ds,
                    validation_steps=20)

# The embedding is the first layer of the sequential model.
e = model.layers[0]
weights = e.get_weights()[0]
# Context managers close both files even if a lookup below raises.
with io.open('new_clean_vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('new_clean_meta.tsv', 'w', encoding='utf-8') as out_m:
    # Iterate over the codes that actually exist. vocabulary_size counts a
    # value once per column it occurs in, while such a value has only one
    # code, so range(vocabulary_size) can run past the last key of
    # cat_encode_dict and raise KeyError.
    for i in range(len(cat_encode_dict)):
        vec = weights[i]
        out_m.write(cat_encode_dict[i] + "\n")
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")

print('SUCCESS')

## Advanced rent data embedding

In [None]:
# Skip-gram style model, one vocab
# https://blog.cambridgespark.com/tutorial-build-your-own-embedding-and-use-it-in-a-neural-network-e9cde4a81296
def get_sg_model(voc_size, e_dim):
    """Placeholder for the skip-gram style model; not implemented, returns None.

    Args:
        voc_size: Unused.
        e_dim: Unused.
    """
    return None

def get_w2v_multi_vocab_model(cat_input_length, num_input_length, voc_size, e_dim):
    """Build and compile a pairwise-similarity classifier with one embedding per input.

    Each categorical input owns a separate embedding table. The normalized
    dot product of each pair of embedded inputs is concatenated with the raw
    numerical inputs and fed to a single sigmoid unit.

    Args:
        cat_input_length: Number of categorical inputs; must be greater than 1.
        num_input_length: Number of numerical inputs (one value each).
        voc_size: Sequence with the vocabulary size of each categorical input.
        e_dim: Width of each embedding vector.

    Returns:
        The compiled ``keras.Model`` (Adam, binary cross-entropy, accuracy).
    """
    assert cat_input_length > 1, 'Input length must be greater than 1, current: %i' % cat_input_length
    word_input_name = 'word_input_{}'
    num_input_name = 'num_input_{}'
    word_input_list = [keras.layers.Input(shape=(1, ), name=word_input_name.format(pos))
                       for pos in range(cat_input_length)]
    num_input_list = [keras.layers.Input(shape=(1, ), name=num_input_name.format(pos))
                      for pos in range(num_input_length)]

    # Separate embedding table per categorical input, flattened to (e_dim,).
    word_encode_name = 'encode_reshape_{}'
    word_encode_list = []
    for pos, word_input in enumerate(word_input_list):
        encoded = keras.layers.Embedding(input_dim=voc_size[pos],
                                         output_dim=e_dim,
                                         input_length=1)(word_input)
        flatten = keras.layers.Reshape((e_dim, ), name=word_encode_name.format(pos))
        word_encode_list.append(flatten(encoded))

    # One similarity per unordered pair (i < j) of categorical inputs.
    dot_sim_name = 'dot_sim_{}_{}'
    index_pairs = [(i, j)
                   for i in range(cat_input_length - 1)
                   for j in range(i + 1, cat_input_length)]
    dot_sim_list = [keras.layers.dot([word_encode_list[i], word_encode_list[j]],
                                     axes=1,
                                     normalize=True,
                                     name=dot_sim_name.format(i, j))
                    for i, j in index_pairs]

    # Concatenation is only applied when there is more than one tensor to join.
    if len(dot_sim_list) > 1:
        merge_sim = keras.layers.concatenate(dot_sim_list, axis=1)
    else:
        merge_sim = dot_sim_list[0]
    if len(num_input_list) > 1:
        merge_num = keras.layers.concatenate(num_input_list, axis=1)
    else:
        merge_num = num_input_list[0]
    merge_final = keras.layers.concatenate([merge_sim, merge_num], axis=1)
    output = keras.layers.Dense(units=1, activation='sigmoid')(merge_final)

    # Model inputs: categorical inputs first, numerical inputs after them.
    m = keras.Model(inputs=word_input_list + num_input_list, outputs=output, name='cbow_model')
    m.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
    return m

# word2vec style model, one vocabulary
# input_length ... number of input cat. attributes excl. rent/ek
# voc_size ... size of the single vocabulary
def get_w2v_model(input_length, voc_size, e_dim):
    """Build and compile a pairwise-similarity classifier with one shared embedding.

    All categorical inputs are looked up in the same embedding table. The
    normalized dot product of each pair of embedded inputs is concatenated
    with a single numerical input and fed to one sigmoid unit.

    Args:
        input_length: Number of categorical inputs; must be greater than 1.
        voc_size: Number of rows of the shared embedding table.
        e_dim: Width of each embedding vector.

    Returns:
        The compiled ``keras.Model`` (Adam, binary cross-entropy, accuracy).
        A diagram of it is written to ``w2v_one_vocab.png`` as a side effect.
    """
    assert input_length > 1, 'Input length must be greater than 1, current: %i' % input_length
    word_input_name = 'word_input_{}'
    word_input_list = [keras.layers.Input(shape=(1, ), name=word_input_name.format(pos))
                       for pos in range(input_length)]
    numerical_input = keras.layers.Input(shape=(1, ), name='numerical_input')

    # A single table shared by all categorical inputs.
    embedding = keras.layers.Embedding(input_dim=voc_size,
                                       output_dim=e_dim,
                                       input_length=input_length,
                                       name='embedding')
    word_reshaped_name = 'encode_reshape_{}'
    word_reshaped_list = []
    for pos, word_input in enumerate(word_input_list):
        encoded = embedding(word_input)
        flatten = keras.layers.Reshape((e_dim, ), name=word_reshaped_name.format(pos))
        word_reshaped_list.append(flatten(encoded))

    # One similarity per unordered pair (i < j) of categorical inputs.
    dot_sim_name = 'dot_sim_{}_{}'
    index_pairs = [(i, j)
                   for i in range(input_length - 1)
                   for j in range(i + 1, input_length)]
    dot_sim_list = [keras.layers.dot([word_reshaped_list[i], word_reshaped_list[j]],
                                     axes=1,
                                     normalize=True,
                                     name=dot_sim_name.format(i, j))
                    for i, j in index_pairs]

    # Concatenation is only applied when there is more than one similarity.
    if len(dot_sim_list) > 1:
        merge_sim = keras.layers.concatenate(dot_sim_list, axis=1)
    else:
        merge_sim = dot_sim_list[0]
    merge_final = keras.layers.concatenate([merge_sim, numerical_input], axis=1)
    output = keras.layers.Dense(units=1, activation='sigmoid')(merge_final)

    # Model inputs: categorical inputs first, the numerical input last.
    m = keras.Model(inputs=word_input_list + [numerical_input], outputs=output, name='cbow_model')
    m.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
    keras.utils.plot_model(m, to_file='w2v_one_vocab.png')
    return m


# Cell purpose: encode the dirty rent columns with one vocabulary per column
# and train the multi-vocabulary model on them.
CATEGORICAL_ATTR = ['dirty_state',
                    'dirty_city',
                    'dirty_zip']
# A list here, so train_df[TARGET] below yields a two-dimensional array.
TARGET = ['rent']
OMIT = ['intended_state', 'intended_city', 'intended_zip']
TRAIN_FILENAME = 'rent_data_low_error.csv'
VALID_FILENAME = 'rent_data_low_error_validation.csv'
EMBEDDING_DIM = 50

# Load dataset into one vocab for all columns, integer-encoded
train_df = pd.read_csv(TRAIN_FILENAME)
valid_df = pd.read_csv(VALID_FILENAME)
'''unique_values = np.concatenate(([train_df[cat_attr].astype(str).unique()
                                 for cat_attr in CATEGORICAL_ATTR]), axis=None)
keys = pd.Categorical(unique_values).codes
category_dict = dict(zip(keys, unique_values))
train_df[CATEGORICAL_ATTR] = train_df[CATEGORICAL_ATTR].astype(str).replace(unique_values, keys)
valid_df[CATEGORICAL_ATTR] = valid_df[CATEGORICAL_ATTR].astype(str).replace(unique_values, keys)
for o in OMIT:
    train_df.pop(o)
    valid_df.pop(o)'''

# Load dataset into separate vocabularies, integer-encoded
unique_values = [train_df[cat_attr].astype(str).unique() for cat_attr in CATEGORICAL_ATTR]
keys = [pd.Categorical(u).codes for u in unique_values]
# One code -> original string lookup per column, in CATEGORICAL_ATTR order.
category_dict = []
for i, key_set in enumerate(keys):
    category_dict.append(dict(zip(key_set, unique_values[i])))
    # Encode only column i with vocabulary i. Re-assigning all categorical
    # columns on every pass applied each vocabulary to every column and
    # turned the codes written by earlier passes back into strings.
    train_df[CATEGORICAL_ATTR[i]] = train_df[CATEGORICAL_ATTR[i]].astype(str).replace(unique_values[i], key_set)
for o in OMIT:
    train_df.pop(o)
    valid_df.pop(o)
# Generate labels, i.e. we label all observations as positive samples
train_target = np.ones(train_df.shape[0])
# NOTE(review): valid_df is neither encoded nor passed to fit below.
valid_target = np.ones(valid_df.shape[0])

# Load data into model, single vocab
# vocabulary_size = len(unique_values)
# adaptive_embedding_dim = min(EMBEDDING_DIM, int(vocabulary_size ** 0.25))
# model = get_w2v_model(len(CATEGORICAL_ATTR), vocabulary_size, adaptive_embedding_dim)

# multi vocab
vocabulary_size = [len(u) for u in unique_values]
# adaptive_embedding_dim = min(EMBEDDING_DIM, int(sum(vocabulary_size)**0.25))
adaptive_embedding_dim = 10
model = get_w2v_multi_vocab_model(len(CATEGORICAL_ATTR), len(TARGET), vocabulary_size, adaptive_embedding_dim)
# Cast the codes to int explicitly (as the other training cells do) so the
# embedding inputs never arrive with object dtype.
train_np = [train_df[cat_attr].to_numpy().astype(int) for cat_attr in CATEGORICAL_ATTR]
train_np.append(train_df[TARGET].to_numpy())

history = model.fit(x=train_np,
                    y=train_target,
                    batch_size=64,
                    epochs=100,
                    verbose=1)

# The export code below is disabled: it sits inside a string literal that is
# evaluated and discarded, so only the final print runs.
# Save embeddings from one vocab
'''e = model.layers[3]
weights = e.get_weights()[0]
out_v = io.open('w2v_vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('w2v_meta.tsv', 'w', encoding='utf-8')

for i in range(0, vocabulary_size):
    vec = weights[i]
    out_m.write(category_dict[i] + "\n")
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
out_v.close()
out_m.close()

# Save for multiple vocabs
tsv_name = '{}_{}.tsv'
for i, cat_attr in enumerate(CATEGORICAL_ATTR, start=0):
    e = model.layers[3 + i]
    weights = e.get_weights()[0]
    out_v = io.open(tsv_name.format(cat_attr, 'vec'), 'w', encoding='utf-8')
    out_m = io.open(tsv_name.format(cat_attr, 'meta'), 'w', encoding='utf-8')
    
    for j in range(0, vocabulary_size[i]):
        vec = weights[j]
        out_m.write(category_dict[i][j] + "\n")
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_v.close()
    out_m.close()
'''
print('SUCCESS')

## Embedding unpatched dataset

In [None]:
# Cell purpose: integer-encode the catalog columns, wrap them in a tf.data
# pipeline and train a small sequential embedding model on TARGET.
CATEGORICAL_ATTR = ['catalog_id',
                    'article_id',
                    'destination',
                    'lower_bound',
                    'currency',
                    'unit',
                    'set_id',
                    'duplicate_set_rating']
TARGET = 'ek_amount'
# 'currency' and 'destination' are listed as categorical above as well; they
# are encoded first and then removed again by the OMIT loop.
OMIT = ['vk_amount', 'currency', 'destination']
FILENAME = 'DE_unpatched_100000.csv'
EMBEDDING_DIM = 50

# Load dataset
df = pd.read_csv(FILENAME)
# Replace every categorical column by its per-column category codes.
for cat_attr in CATEGORICAL_ATTR:
    df[cat_attr] = pd.Categorical(df[cat_attr])
    df[cat_attr] = df[cat_attr].cat.codes
for o in OMIT:
    df.pop(o)
target = df.pop(TARGET)
dataset = tf.data.Dataset.from_tensor_slices((df.values, target.values))
train_dataset = dataset.shuffle(len(df)).batch(1)
# Sum of the distinct-value counts of the remaining categorical columns.
# NOTE(review): codes are assigned per column, so equal codes from different
# columns address the same embedding row - confirm this sharing is intended.
vocab_size = sum([len(df[cat_attr].unique()) for cat_attr in CATEGORICAL_ATTR if cat_attr not in OMIT])
'''for feat, targ in dataset.take(5):
    print('Features: {}, Target: {}'.format(feat, targ))
'''

model = tf.keras.Sequential()
model.add(layers.Embedding(vocab_size, EMBEDDING_DIM))
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dense(EMBEDDING_DIM, activation='relu'))
model.add(layers.Dense(1, activation='relu'))
#model.summary()

# NOTE(review): 'accuracy' is tracked alongside a mean-squared-error loss -
# confirm that this metric is meaningful for the TARGET column.
model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=['accuracy'])
history = model.fit(train_dataset, epochs=10)



In [None]:
##############################################################################
# Persist the metrics of the most recent model.fit call (global `history`).
dump_history(history, 'hist.json')

In [None]:
# Cell purpose: re-load the dirty rent data, encode its categorical columns
# with one shared vocabulary and keep the result as a NumPy array.
CATEGORICAL_ATTR = ['dirty_state',
                    'dirty_city',
                    'dirty_zip']
TARGET = 'rent'
OMIT = ['intended_state', 'intended_city', 'intended_zip']
TRAIN_FILENAME = 'rent_data_low_error.csv'
VALID_FILENAME = 'rent_data_low_error_validation.csv'
EMBEDDING_DIM = 50

train_df = pd.read_csv(TRAIN_FILENAME)
# Distinct string values of all categorical columns, concatenated column after column.
unique_values = np.concatenate(([train_df[cat_attr].astype(str).unique()
                                 for cat_attr in CATEGORICAL_ATTR]), axis=None)
keys = pd.Categorical(unique_values).codes
# Code -> original string lookup shared by all categorical columns.
category_dict = dict(zip(keys, unique_values))
train_df[CATEGORICAL_ATTR] = train_df[CATEGORICAL_ATTR].astype(str).replace(unique_values, keys)

for o in OMIT:
    train_df.pop(o)

# Remaining columns (encoded categoricals plus everything not omitted) as an array.
hihi = train_df.to_numpy()

In [None]:
# Export, per categorical attribute, one TSV with the embedding vectors and
# one TSV with the matching labels (row-aligned).
# Expects the per-column list forms of vocabulary_size and category_dict.
tsv_name = '{}_{}.tsv'
for i, cat_attr in enumerate(CATEGORICAL_ATTR, start=0):
    # Assumes the embedding layers start at layer index 3 - TODO confirm
    # against model.summary() for the model currently in `model`.
    e = model.get_layer(index=3+i)
    weights = e.get_weights()[0]
    # Context managers close both files even if a lookup below raises;
    # previously a failure in the loop left both handles open.
    with io.open(tsv_name.format(cat_attr, 'test_vec'), 'w', encoding='utf-8') as out_v, \
            io.open(tsv_name.format(cat_attr, 'test_meta'), 'w', encoding='utf-8') as out_m:
        for j in range(0, vocabulary_size[i]):
            vec = weights[j]
            out_m.write(category_dict[i][j] + "\n")
            out_v.write('\t'.join([str(x) for x in vec]) + "\n")

In [None]:
def get_cbow_model(input_length, voc_size, e_dim):
    """Build (without compiling) a pairwise-similarity classifier with one shared embedding.

    All categorical inputs are looked up in the same embedding table. The
    normalized dot product of each pair of embedded inputs is concatenated
    with a single numerical input and fed to one sigmoid unit.

    Args:
        input_length: Number of categorical inputs; must be greater than 1.
        voc_size: Number of rows of the shared embedding table.
        e_dim: Width of each embedding vector.

    Returns:
        The uncompiled ``keras.Model``; its summary is printed as a side effect.
    """
    assert input_length > 1, 'Input length must be greater than 1, current: %i' % input_length
    word_input_name = 'word_input_{}'
    word_input_list = [keras.layers.Input(shape=(1, ), name=word_input_name.format(pos))
                       for pos in range(input_length)]
    numerical_input = keras.layers.Input(shape=(1, ))

    # A single table shared by all categorical inputs.
    embedding = keras.layers.Embedding(input_dim=voc_size,
                                       output_dim=e_dim,
                                       input_length=input_length,
                                       name='embedding')
    word_reshaped_name = 'encode_reshape_{}'
    word_reshaped_list = []
    for pos, word_input in enumerate(word_input_list):
        encoded = embedding(word_input)
        flatten = keras.layers.Reshape((e_dim, ), name=word_reshaped_name.format(pos))
        word_reshaped_list.append(flatten(encoded))

    # One similarity per unordered pair (i < j) of categorical inputs.
    dot_sim_name = 'dot_sim_{}_{}'
    index_pairs = [(i, j)
                   for i in range(input_length - 1)
                   for j in range(i + 1, input_length)]
    dot_sim_list = [keras.layers.dot([word_reshaped_list[i], word_reshaped_list[j]],
                                     axes=1,
                                     normalize=True,
                                     name=dot_sim_name.format(i, j))
                    for i, j in index_pairs]

    # Concatenation is only applied when there is more than one similarity.
    if len(dot_sim_list) > 1:
        merge_sim = keras.layers.concatenate(dot_sim_list, axis=1)
    else:
        merge_sim = dot_sim_list[0]
    merge_final = keras.layers.concatenate([merge_sim, numerical_input], axis=1)
    output = keras.layers.Dense(units=1, activation='sigmoid')(merge_final)

    # Model inputs: categorical inputs first, the numerical input last.
    m = keras.Model(inputs=word_input_list + [numerical_input], outputs=output, name='cbow_model')
    m.summary()
    return m

# Build a sample model: 5 categorical inputs, 200 vocabulary entries, embedding width 50.
hihi = get_cbow_model(5, 200, 50)