### Preparation

In [None]:
import os
import warnings
import io
import json
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# The CSV file names used further down are bare file names, so switch the
# working directory to the data folder (path is relative to the current
# working directory at the time this cell runs).
os.chdir('../data')
# Alternative absolute path for one specific local Windows checkout (disabled):
#os.chdir('C:\\Users\\wangd\\git\\lets_learn\\infclean\\data')

def dump_history(hist, filename):
    """Write the ``history`` attribute of *hist* to *filename* as JSON.

    Args:
        hist: Object exposing a JSON-serialisable ``history`` attribute
            (presumably the ``History`` object returned by ``model.fit`` --
            confirm against callers).
        filename: Path of the file to create or overwrite.
    """
    # The context manager closes (and therefore flushes) the file even when
    # serialisation raises; the bare open() used before leaked the handle.
    with open(filename, 'w', encoding='utf-8') as fp:
        json.dump(hist.history, fp)
    
def load_history(filename):
    """Read a JSON file and return the decoded object.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` decodes from the file.
    """
    # Close the handle deterministically instead of relying on garbage
    # collection, as the bare open() used before did.
    with open(filename, 'r', encoding='utf-8') as fp:
        return json.load(fp)

def load_dataset(train_path, valid_path, categorical_attributes, omit_attributes, target_attr):
    """Read two CSV files and turn them into shuffled, batched tf.data datasets.

    The categorical columns are integer-encoded with ONE vocabulary shared by
    all of them; the vocabulary is built from the training file only.

    Args:
        train_path: Path of the training CSV.
        valid_path: Path of the validation CSV.
        categorical_attributes: List of column names to integer-encode.
        omit_attributes: Column names removed from both frames.
        target_attr: Column used as the label of each dataset element.

    Returns:
        A tuple ``(train_dataset, valid_dataset, category_dict, vocab_size)``.
        Both datasets yield ``(features, target)`` pairs with batch size 1.
        ``category_dict`` maps integer code -> original string value.
        ``vocab_size`` is the length of the concatenated per-column unique
        values, so a value occurring in several columns is counted once per
        column.

    NOTE(review): validation values that never occur in the training file are
    presumably left as strings by ``replace`` -- confirm this is acceptable.
    """
    frames = [pd.read_csv(train_path), pd.read_csv(valid_path)]

    # Vocabulary: unique string values of every categorical training column,
    # flattened into a single 1-D array.
    per_column_values = [frames[0][name].astype(str).unique()
                         for name in categorical_attributes]
    values = np.concatenate(per_column_values, axis=None)
    keys = pd.Categorical(values).codes
    category_dict = dict(zip(keys, values))

    datasets = []
    for frame in frames:
        encoded = frame[categorical_attributes].astype(str).replace(values, keys)
        frame[categorical_attributes] = encoded
        for name in omit_attributes:
            frame.pop(name)
        target = frame.pop(target_attr)
        dataset = tf.data.Dataset.from_tensor_slices((frame.values, target.values))
        # Shuffle buffer spans the whole frame; elements are emitted one per batch.
        datasets.append(dataset.shuffle(len(frame)).batch(1))

    train_dataset, valid_dataset = datasets
    return train_dataset, valid_dataset, category_dict, len(values)

# Report the TensorFlow version in use and whether it reports a GPU device.
print('TENSORFLOW VERSION: {}'.format(tf.__version__))
if tf.test.gpu_device_name():
    print('DEFAULT GPU DEVICE: {}'.format(tf.test.gpu_device_name()))
else:
    # An empty device name means no GPU was found; warn and carry on.
    warnings.warn('NO GPU FOUND')

# Embedding Categorical Features
- A feed-forward NN that learns embedding vectors
- Model derived from word2vec

### 1. Word2Vec
Given a text corpus, it learns word vectors such that words that share similar context are located closer in the vector space.
![title](../resource/word2vec.png)

### 2. Our model
Given an observation dataset, it learns vector representations for the categorical columns, such that categorical values that frequently co-occur get similar vector representations. Each categorical column has its own vocabulary.
![title](../resource/w2v_multi_vocab.png)

In [None]:
def get_w2v_multi_vocab_model(input_length, voc_size, e_dim):
    """Build and compile the multi-vocabulary embedding model.

    Each categorical column gets its own scalar input and its own embedding
    table. The normalised dot product of every pair of column embeddings is
    concatenated with one scalar numerical input and fed to a single sigmoid
    unit.

    Args:
        input_length: Number of categorical columns; must be greater than 1.
        voc_size: Sequence indexed by column position that gives the
            ``input_dim`` of that column's embedding table.
        e_dim: Output dimension of every embedding table.

    Returns:
        A compiled ``keras.Model`` named ``'cbow_model'``. Its inputs are the
        categorical inputs in column order followed by the numerical input.

    Side effects:
        Writes a diagram of the model to ``w2v_multi_vocab.png``.
    """
    assert input_length > 1, 'Input length must be greater than 1, current: %i' % input_length
    columns = range(input_length)

    # One scalar input per categorical column, then the numerical input.
    categorical_inputs = [
        keras.layers.Input(shape=(1, ), name='word_input_{}'.format(col))
        for col in columns
    ]
    numerical_input = keras.layers.Input(shape=(1, ), name='numerical_input')

    # Embed every column with its own table and reshape to a flat (e_dim,) vector.
    encoded = []
    for col in columns:
        embedded = keras.layers.Embedding(input_dim=voc_size[col],
                                          output_dim=e_dim,
                                          input_length=1)(categorical_inputs[col])
        flatten = keras.layers.Reshape((e_dim, ), name='encode_reshape_{}'.format(col))
        encoded.append(flatten(embedded))

    # Normalised dot product for every unordered pair (a, b) with a < b.
    similarities = [
        keras.layers.dot([encoded[a], encoded[b]],
                         axes=1,
                         normalize=True,
                         name='dot_sim_{}_{}'.format(a, b))
        for a in columns
        for b in range(a + 1, input_length)
    ]

    # With exactly two columns there is a single similarity and nothing to merge.
    if len(similarities) > 1:
        merged_similarities = keras.layers.concatenate(similarities, axis=1)
    else:
        merged_similarities = similarities[0]
    features = keras.layers.concatenate([merged_similarities, numerical_input], axis=1)
    prediction = keras.layers.Dense(units=1, activation='sigmoid')(features)

    model = keras.Model(inputs=categorical_inputs + [numerical_input],
                        outputs=prediction,
                        name='cbow_model')
    # model.summary()
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    keras.utils.plot_model(model, to_file='w2v_multi_vocab.png')
    return model

### 3. Training

In [None]:
# Input files, resolved against the current working directory
TRAIN_FILENAME = 'rent_data_low_error.csv'
VALID_FILENAME = 'rent_data_low_error_validation.csv'

# Columns that are integer-encoded and embedded, one vocabulary per column
CATEGORICAL_ATTR = ['dirty_state', 'dirty_city', 'dirty_zip']
# Columns dropped from both data frames before training
OMIT = ['intended_state', 'intended_city', 'intended_zip']
# Numerical column passed to the model as its last input
TARGET = 'rent'

# Upper bound used by the (currently disabled) adaptive embedding-size rule
EMBEDDING_DIM = 50

# Read the raw training and validation tables
train_df = pd.read_csv(TRAIN_FILENAME)
valid_df = pd.read_csv(VALID_FILENAME)

# Single-vocabulary alternative (disabled): one shared integer encoding for
# all categorical columns.
# unique_values = np.concatenate(([train_df[cat_attr].astype(str).unique()
#                                  for cat_attr in CATEGORICAL_ATTR]), axis=None)
# keys = pd.Categorical(unique_values).codes
# category_dict = dict(zip(keys, unique_values))
# train_df[CATEGORICAL_ATTR] = train_df[CATEGORICAL_ATTR].astype(str).replace(unique_values, keys)
# valid_df[CATEGORICAL_ATTR] = valid_df[CATEGORICAL_ATTR].astype(str).replace(unique_values, keys)
# for o in OMIT:
#     train_df.pop(o)
#     valid_df.pop(o)

# Load dataset into separate vocabularies, integer-encoded.
# unique_values[i] / keys[i] hold the distinct string values of column i and
# their integer codes; category_dict[i] maps code -> original string.
unique_values = [train_df[cat_attr].astype(str).unique() for cat_attr in CATEGORICAL_ATTR]
keys = [pd.Categorical(u).codes for u in unique_values]
category_dict = []
for cat_attr, column_values, column_keys in zip(CATEGORICAL_ATTR, unique_values, keys):
    category_dict.append(dict(zip(column_keys, column_values)))
    # Encode ONLY this column with its own vocabulary. Applying each vocabulary
    # to all categorical columns at once (as was done before) encodes a value
    # that occurs in two columns with the wrong vocabulary, and the repeated
    # astype(str) turns already-encoded columns back into digit strings.
    encoding = dict(zip(column_values, column_keys))
    train_df[cat_attr] = train_df[cat_attr].astype(str).map(encoding)
# NOTE(review): the categorical columns of valid_df are left un-encoded here;
# encode them before feeding validation data to the model.
for o in OMIT:
    train_df.pop(o)
    valid_df.pop(o)
# Generate labels, i.e. we label all observations as positive samples
train_target = np.ones(train_df.shape[0])
valid_target = np.ones(valid_df.shape[0])

# Single-vocabulary variant (disabled)
# vocabulary_size = len(unique_values)
# adaptive_embedding_dim = min(EMBEDDING_DIM, int(vocabulary_size ** 0.25))
# model = get_w2v_model(len(CATEGORICAL_ATTR), vocabulary_size, adaptive_embedding_dim)

# Multi-vocabulary variant: one vocabulary size per categorical column
vocabulary_size = list(map(len, unique_values))
# The embedding width is fixed; the adaptive rule is kept for reference
# adaptive_embedding_dim = min(EMBEDDING_DIM, int(sum(vocabulary_size)**0.25))
adaptive_embedding_dim = 10
model = get_w2v_multi_vocab_model(len(CATEGORICAL_ATTR),
                                  vocabulary_size,
                                  adaptive_embedding_dim)

# Model inputs: one array per categorical column (in CATEGORICAL_ATTR order)
# followed by the numerical TARGET column
train_np = ([train_df[cat_attr].to_numpy() for cat_attr in CATEGORICAL_ATTR]
            + [train_df[TARGET].to_numpy()])

history = model.fit(train_np,
                    train_target,
                    batch_size=64,
                    epochs=50,
                    verbose=1)

# NOTE: everything between the triple quotes below is a bare string literal,
# so none of this export code is executed. It holds two disabled variants:
# the first writes the embedding weights of a single shared vocabulary, the
# second writes one vector/metadata TSV pair per categorical column.
# NOTE(review): both variants pick the embedding layers by position
# (model.layers[3] / model.layers[3 + i]) -- confirm those indices against
# the actual model before re-enabling.
# Save embeddings from one vocab
'''e = model.layers[3]
weights = e.get_weights()[0]
out_v = io.open('w2v_vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('w2v_meta.tsv', 'w', encoding='utf-8')

for i in range(0, vocabulary_size):
    vec = weights[i]
    out_m.write(category_dict[i] + "\n")
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
out_v.close()
out_m.close()

# Save for multiple vocabs
tsv_name = '{}_{}.tsv'
for i, cat_attr in enumerate(CATEGORICAL_ATTR, start=0):
    e = model.layers[3 + i]
    weights = e.get_weights()[0]
    out_v = io.open(tsv_name.format(cat_attr, 'vec'), 'w', encoding='utf-8')
    out_m = io.open(tsv_name.format(cat_attr, 'meta'), 'w', encoding='utf-8')
    
    for j in range(0, vocabulary_size[i]):
        vec = weights[j]
        out_m.write(category_dict[i][j] + "\n")
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_v.close()
    out_m.close()
'''
# Marks the end of the cell; reached once the statements above have run.
print('SUCCESS')

### 4. Results
![title](../resource/city_emb.png)

- The model clearly learns that cities from the same state are "similar"

- Erlangen, Ingolstadt and Freising are the most Bavarian cities except München

    $\to$ Rent is incorporated into the notion of similarity

### Next steps
- Plug embeddings into generative model and evaluate performance change
- Think about whether negative sampling could improve learning results
- Reason about the learning results and the meaning of similarity
- Check out other possible models for embedding learning