# Libraries

In [64]:
import os
import re
from collections import Counter

import numpy as np
import pandas as pd
from geohash import decode_exactly as dec_exa_fn,decode as dec_fn

from tensorflow.keras import Model
from tensorflow.keras.layers import Input,Dense,Dropout,LeakyReLU as lrelu
from tensorflow.keras.initializers import he_normal,constant
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.optimizers import Adamax
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.metrics import mae

# Load Data

In [2]:
# Read the raw training table from a CSV located relative to the notebook.
# NOTE(review): the preview further down shows an 'Unnamed: 0' column, so the
# file presumably carries a saved index column — confirm and consider index_col=0.
data = pd.read_csv('../Data/train.csv')

In [3]:
def defix(x):
    """Strip the leading ``w`` from a geohash and return the 6-character remainder.

    Parameters
    ----------
    x : str
        Geohash string expected to be ``'w'`` followed by exactly 6 characters.

    Returns
    -------
    str or int
        The 6 characters after the leading ``w``. When ``x`` does not start
        with ``w`` or the remainder is not exactly 6 characters long, ``x`` is
        printed and ``-1`` is returned.
    """
    groups = re.match("w(.*)", x)
    # A remainder of the wrong length used to fall through and return None
    # silently; report it the same way as a missing prefix.
    if groups is None or len(groups[1]) != 6:
        print(x)
        return -1
    return groups[1]

In [4]:
# decode_exactly returns (latitude, longitude, lat_error, lon_error), i.e.
# index 0 is the latitude and index 1 is the longitude. Decode every hash once
# and reuse the tuple rather than decoding twice per location column.
start_decoded = data['geohashed_start_loc'].map(dec_exa_fn)
end_decoded = data['geohashed_end_loc'].map(dec_exa_fn)

data['start_lon'] = start_decoded.map(lambda coords: coords[1])
data['start_lat'] = start_decoded.map(lambda coords: coords[0])
data['end_lon'] = end_decoded.map(lambda coords: coords[1])
data['end_lat'] = end_decoded.map(lambda coords: coords[0])

data['starttime'] = pd.to_datetime(data['starttime'])
# Keep the prefix-stripped hashes; they feed the character-level encoding later on.
data['defix_start'] = data['geohashed_start_loc'].map(defix)
data['defix_end'] = data['geohashed_end_loc'].map(defix)
# The raw hash columns are dropped in place, so this cell cannot be re-run
# without reloading `data` first.
data.drop(['geohashed_start_loc','geohashed_end_loc'],axis = 1,inplace = True)

In [5]:
# Preview the first rows of the transformed frame.
data.head()

Unnamed: 0,orderid,userid,bikeid,biketype,starttime,defix_start,defix_end
0,1893973,451147,210617,2,2017-05-14 22:16:50,x4snhx,x4snhj
1,4657992,1061133,465394,1,2017-05-14 22:16:52,x4dr59,x4dquz
2,2965085,549189,310572,1,2017-05-14 22:16:51,x4fgur,x4fu5n
3,4548579,489720,456688,1,2017-05-14 22:16:51,x4d5r5,x4d5r4
4,3936364,467449,403224,1,2017-05-14 22:16:50,x4g27p,x4g266


# EDA

In [6]:
# Occurrence counts per distinct value for each identifier column.
orderID, userID, bikeID, bikeType = (
    Counter(data[column])
    for column in ('orderid', 'userid', 'bikeid', 'biketype')
)

In [7]:
# (distinct orders, distinct orders per distinct user, distinct orders per
# distinct bike, distinct bike types); both ratios are truncated to integers.
n_orders = len(orderID)
n_orders, int(n_orders / len(userID)), int(n_orders / len(bikeID)), len(bikeType)

(3214096, 9, 6, 2)

# Hashed Location Encoding

In [11]:
def tokenizer(col):
    """Split every string of ``data[col]`` into its individual characters.

    Reads the module-level ``data`` frame.

    Returns
    -------
    tuple
        ``(char_list, unique_chars)``: one list of characters per row, and the
        list of distinct characters as produced by ``np.unique``.
    """
    char_list = [list(value) for value in data[col]]
    unique_chars = np.unique(np.array(char_list))
    return char_list, list(unique_chars)

def Vocabularizer(vocab_keys):
    """Build the two-way lookup between tokens and their positions in ``vocab_keys``.

    Returns a dict with ``'token_to_idx'`` (token -> position) and
    ``'idx_to_token'`` (position -> token).
    """
    indexed = list(enumerate(vocab_keys))
    return {
        'token_to_idx': {token: position for position, token in indexed},
        'idx_to_token': {position: token for position, token in indexed},
    }

def Numberizer(start_seq, vocab=None):
    """Convert token sequences into an integer array of vocabulary indices.

    Parameters
    ----------
    start_seq : sequence of sequences
        One sequence of tokens per row; all rows are expected to have the
        same length.
    vocab : dict, optional
        Mapping with a ``'token_to_idx'`` dict. When omitted, the module-level
        ``Vocab`` is used, which keeps existing one-argument calls working.

    Returns
    -------
    numpy.ndarray
        Integer array holding ``token_to_idx[token]`` for every token.

    Raises
    ------
    KeyError
        If a token is missing from the vocabulary.
    """
    if vocab is None:
        # Fall back to the notebook-level vocabulary (original behaviour).
        vocab = Vocab
    token_to_idx = vocab['token_to_idx']
    # Build the integer array directly instead of filling a zeros array
    # element by element.
    return np.array(
        [[token_to_idx[tk] for tk in seq] for seq in start_seq],
        dtype=int,
    )

def OneHotEncoder(vocab_keys, source_encoded, seq_len=6):
    """One-hot encode integer token sequences into one flat vector per row.

    Each row is laid out as ``seq_len`` consecutive slots of ``len(vocab_keys)``
    entries; position ``j`` holding token id ``t`` sets column
    ``len(vocab_keys) * j + t``.

    Parameters
    ----------
    vocab_keys : sequence
        Vocabulary; only its length is used.
    source_encoded : array-like of int, shape (n_rows, n_positions)
        0-based token ids.
    seq_len : int, default 6
        Number of slots in the output vector (previously hard-coded to 6).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_rows, seq_len * len(vocab_keys))``.

    Raises
    ------
    ValueError
        If a row has more than ``seq_len`` positions or an id lies outside
        ``[0, len(vocab_keys))``.
    """
    onehot_token_len = len(vocab_keys)
    source_encoded = np.asarray(source_encoded)
    n_rows, n_positions = source_encoded.shape

    if n_positions > seq_len:
        raise ValueError(
            "sequences have %d positions but seq_len is %d" % (n_positions, seq_len)
        )
    # An out-of-range id would silently land in a neighbouring slot.
    if source_encoded.size and (
        source_encoded.min() < 0 or source_encoded.max() >= onehot_token_len
    ):
        raise ValueError("token ids must lie in [0, %d)" % onehot_token_len)

    source_onehot_encoded = np.zeros((n_rows, seq_len * onehot_token_len))

    # Token ids are 0-based, so the hot column is slot_start + id. The former
    # "- 1" shifted every token by one column: id 0 spilled into the previous
    # slot (or wrapped to the last column for the first position).
    slot_starts = onehot_token_len * np.arange(n_positions)
    hot_columns = slot_starts + source_encoded
    row_index = np.arange(n_rows)[:, None]
    source_onehot_encoded[row_index, hot_columns] = 1

    return source_onehot_encoded

In [9]:
# Character-level tokens for both trip endpoints; the vocabulary is taken
# from the start column only and shared with the end column.
start_seq, vocab_keys = tokenizer('defix_start')
end_seq, _ = tokenizer('defix_end')

Vocab = Vocabularizer(vocab_keys)
onehot_token_len = len(vocab_keys)

# Integer-encode both token tables with the shared vocabulary.
source_encoded, target_encoded = [
    Numberizer(sequences) for sequences in (start_seq, end_seq)
]

In [12]:
# One-hot encode the start (source) and end (target) index arrays in turn.
source_onehot_encoded, target_onehot_encoded = (
    OneHotEncoder(vocab_keys, encoded)
    for encoded in (source_encoded, target_encoded)
)

# Embedding by Autoencoder (AE)

In [162]:
def build_AE():
    """Build and compile the dense autoencoder (192 -> 128 -> 64 -> 128 -> 192).

    Every Dense layer, including the output, uses a LeakyReLU(0.1) activation
    and He-normal kernel initialisation. The two decoder layers are named
    ``dec_1`` and ``dec_2``.

    Returns
    -------
    tensorflow.keras.Model
        Model compiled with Adamax (learning rate 3e-4), MSE loss and an MAE
        metric.
    """
    x = Input(shape=(192, ))
    h = Dense(128, activation=lrelu(0.1), kernel_initializer=he_normal())(x)
    h = Dense(64, activation=lrelu(0.1), kernel_initializer=he_normal())(h)
    h = Dense(128,
              activation=lrelu(0.1),
              kernel_initializer=he_normal(),
              name='dec_1')(h)
    h = Dense(192,
              activation=lrelu(0.1),
              kernel_initializer=he_normal(),
              name='dec_2')(h)

    model = Model(inputs=x, outputs=h)
    # `lr` is the deprecated spelling of this argument; `learning_rate` is the
    # supported keyword.
    model.compile(optimizer=Adamax(learning_rate=3e-4), loss='mse', metrics=[mae])

    return model


def build_embedder(model):
    """Return a model mapping ``model``'s input to the output of ``model.layers[2]``.

    NOTE(review): for a model from ``build_AE`` this is presumably the 64-unit
    bottleneck layer (with the input layer counted as ``layers[0]``) — confirm.
    """
    embedding_layer = model.layers[2]
    return Model(inputs=model.input, outputs=embedding_layer.output)


def build_disembedder(model):
    """Build a stand-alone 64 -> 128 -> 192 decoder using ``model``'s last two layers' weights.

    Two fresh Dense layers (``dec_1``, ``dec_2``) are created on a new 64-wide
    input, then the weights of ``model.layers[-1]`` and ``model.layers[-2]``
    are copied into the corresponding layers of the new model.
    """
    emb = Input(shape=(64,))
    hidden = Dense(128,
                   activation=lrelu(0.1),
                   kernel_initializer=he_normal(),
                   name='dec_1')(emb)
    restored = Dense(192,
                     activation=lrelu(0.1),
                     kernel_initializer=he_normal(),
                     name='dec_2')(hidden)
    DisEmbedder = Model(inputs=emb, outputs=restored)

    # Copy the trained weights, last layer first.
    for offset in (-1, -2):
        trained_weights = model.layers[offset].get_weights()
        DisEmbedder.layers[offset].set_weights(trained_weights)

    return DisEmbedder


def train_AE(source_onehot_encoded,target_onehot_encoded,batch_size,epochs):
    """Train a fresh autoencoder alternately on the source and target one-hot sets.

    For each of ``epochs`` rounds the model is fitted for one epoch to
    reconstruct ``source_onehot_encoded`` and then for one epoch to reconstruct
    ``target_onehot_encoded``, each with a 20% validation split. The model is
    saved to ``./AE.h5`` and returned.
    """
    model = build_AE()
    fit_options = dict(batch_size=batch_size,
                       epochs=1,
                       validation_split=0.2,
                       verbose=1)

    for _ in range(epochs):
        # Source first, then target; input and reconstruction target coincide.
        for onehot in (source_onehot_encoded, target_onehot_encoded):
            model.fit(onehot, onehot, **fit_options)

    model.save('./AE.h5')

    return model

In [156]:
# train_AE returns the fitted autoencoder. Bind it to `model`, otherwise the
# two builders below reference a name that does not exist on a fresh kernel.
model = train_AE(source_onehot_encoded,target_onehot_encoded,batch_size=1024,epochs = 10)

# Split the trained autoencoder into its encoder and decoder halves.
Embedder = build_embedder(model)
DisEmbedder = build_disembedder(model)

Train on 2571276 samples, validate on 642820 samples
Train on 2571276 samples, validate on 642820 samples
Train on 2571276 samples, validate on 642820 samples
Train on 2571276 samples, validate on 642820 samples
Train on 2571276 samples, validate on 642820 samples
Train on 2571276 samples, validate on 642820 samples
Train on 2571276 samples, validate on 642820 samples
Train on 2571276 samples, validate on 642820 samples
Train on 2571276 samples, validate on 642820 samples
Train on 2571276 samples, validate on 642820 samples
Train on 2571276 samples, validate on 642820 samples
Train on 2571276 samples, validate on 642820 samples
Train on 2571276 samples, validate on 642820 samples
Train on 2571276 samples, validate on 642820 samples
Train on 2571276 samples, validate on 642820 samples
Train on 2571276 samples, validate on 642820 samples
Train on 2571276 samples, validate on 642820 samples
Train on 2571276 samples, validate on 642820 samples
Train on 2571276 samples, validate on 642820 s

In [158]:
# Project both one-hot sets through the embedder, source first.
source_embed, target_embed = (
    Embedder.predict(onehot)
    for onehot in (source_onehot_encoded, target_onehot_encoded)
)

# Model

In [165]:
# The first 3,000,000 rows are used for training; the remainder is held out.
split_at = int(3e+6)
X_train, X_test = source_embed[:split_at], source_embed[split_at:]
y_train, y_test = target_embed[:split_at], target_embed[split_at:]

In [177]:
def characterizer(y_pred, DisEmbedder, vocab=None):
    """Decode embeddings back into geohash characters.

    Parameters
    ----------
    y_pred : array-like
        Embeddings accepted by ``DisEmbedder.predict``.
    DisEmbedder : object with ``predict``
        Decoder returning one flat vector per row, laid out as consecutive
        slots of ``len(vocab['idx_to_token'])`` entries.
    vocab : dict, optional
        Mapping with an ``'idx_to_token'`` dict keyed ``0..n-1``. When omitted,
        the module-level ``Vocab`` is used, so two-argument calls keep working.

    Returns
    -------
    numpy.ndarray of str, shape (n_rows, n_slots)
        The token with the largest decoded value in each slot.

    Raises
    ------
    ValueError
        If the decoded width is not a multiple of the vocabulary size.
    """
    if vocab is None:
        # Fall back to the notebook-level vocabulary (original behaviour).
        vocab = Vocab
    idx_to_token = vocab['idx_to_token']
    n_tokens = len(idx_to_token)

    # NOTE(review): abs(round(.)) is kept as in the original. It creates ties
    # (argmax then picks the first) and lets strongly negative outputs win;
    # consider taking argmax on the raw decoder output instead.
    y_pred_onehot = np.abs(np.round(DisEmbedder.predict(y_pred)))

    n_rows, width = y_pred_onehot.shape
    if n_tokens == 0 or width % n_tokens != 0:
        raise ValueError(
            "decoded width %d is not a multiple of the vocabulary size %d"
            % (width, n_tokens)
        )

    # One argmax per slot, computed for all rows at once; the result is an
    # integer array, so it can index the token table directly.
    target_encoded = y_pred_onehot.reshape(
        n_rows, width // n_tokens, n_tokens
    ).argmax(axis=2)

    # idx to geohash token
    token_table = np.array([idx_to_token[idx] for idx in range(n_tokens)])
    target_geohash = token_table[target_encoded]

    return target_geohash


def Acc(y_pred,y_test):
    """Fraction of rows whose decoded prediction matches the decoded reference in at least 6 positions.

    Both inputs are decoded with ``characterizer`` using the module-level
    ``DisEmbedder``; the count of qualifying rows is divided by
    ``y_test.shape[0]``.
    """
    decoded_pred = characterizer(y_pred, DisEmbedder)
    decoded_true = characterizer(y_test, DisEmbedder)
    matches_per_row = (decoded_true == decoded_pred).sum(axis=1)
    n_exact = (matches_per_row >= 6).sum()
    return n_exact / y_test.shape[0]

In [166]:
# Dense regressor from a 64-d start embedding to a 64-d end embedding.
x = Input(shape = (64,))
h = Dense(128,activation='relu',kernel_initializer=he_normal())(x)
h = Dense(64,activation='relu',kernel_initializer=he_normal())(h)
h = Dense(128,activation='relu',kernel_initializer=he_normal())(h)
# NOTE(review): the regression targets come from a LeakyReLU(0.1) layer and can
# presumably be negative, which a ReLU output cannot produce — confirm whether a
# linear or LeakyReLU output is intended.
h = Dense(64,activation='relu',kernel_initializer=he_normal())(h)

model = Model(inputs = x,outputs = h)
# The optimizer imported at the top is `Adamax`; `Adamx` is an undefined name.
model.compile(optimizer = Adamax(3e-4),loss = 'mse')

In [None]:
# Train one epoch at a time and report exact-match accuracy on the held-out
# rows after every epoch.
for epoch in range(20):
    hist = model.fit(X_train,y_train,batch_size=1024,epochs=1,verbose=1)
    # Pass the continuous predicted embeddings to the decoder unchanged.
    # Rounding them to integers first quantised the embedding before decoding,
    # while the reference embeddings in y_test were decoded unrounded.
    y_pred = model.predict(X_test)
    print('Epoch: %d, Accuracy: %.2f%%' %(epoch + 1,100 * Acc(y_pred,y_test)))