# Libraries

In [76]:
import os
import re
from collections import Counter

import numpy as np
import pandas as pd
from geohash import decode_exactly as dec_exa_fn,decode as dec_fn

from tensorflow.keras import Model
from tensorflow.keras.layers import Input,Dense,Dropout
from tensorflow.keras.initializers import he_normal,constant
from tensorflow.keras.losses import CategoricalCrossentropy

# Load Data

In [15]:
data = pd.read_csv('../Data/train.csv')

In [6]:
def defix(x):
    groups = re.match("w(.*)",x)
    if groups == None:
        print(x)
        return -1
    elif len(groups[1]) == 6:
        return groups[1]

In [16]:
data['starttime'] = pd.to_datetime(data['starttime'])
data['defix_start'] = data['geohashed_start_loc'].map(lambda x:defix(x))
data['defix_end'] = data['geohashed_end_loc'].map(lambda x:defix(x))
data.drop(['geohashed_start_loc','geohashed_end_loc'],axis = 1,inplace = True)

In [17]:
data.head()

Unnamed: 0,orderid,userid,bikeid,biketype,starttime,defix_start,defix_end
0,1893973,451147,210617,2,2017-05-14 22:16:50,x4snhx,x4snhj
1,4657992,1061133,465394,1,2017-05-14 22:16:52,x4dr59,x4dquz
2,2965085,549189,310572,1,2017-05-14 22:16:51,x4fgur,x4fu5n
3,4548579,489720,456688,1,2017-05-14 22:16:51,x4d5r5,x4d5r4
4,3936364,467449,403224,1,2017-05-14 22:16:50,x4g27p,x4g266


# EDA

In [83]:
orderID = Counter(data['orderid'])
userID = Counter(data['userid'])
bikeID = Counter(data['bikeid'])
bikeType = Counter(data['biketype'])

In [86]:
len(orderID),int(len(orderID)/len(userID)),int(len(orderID)/len(bikeID)),len(bikeType)

(3214096, 9, 6, 2)

# Hashed Location Encoding

In [41]:
def tokenizer(col):
    char_list = []
    for ind,i in enumerate(data[col]):
        char_list.append(list(i))

    return char_list,list(np.unique(np.array(char_list)))

def Vocabularizer(vocab_keys):
    Vocab = {'token_to_idx':{},'idx_to_token':{}}
    for ind, i in enumerate(vocab_keys):
        Vocab['token_to_idx'][i] = ind
        Vocab['idx_to_token'][ind] = i
    return Vocab

In [42]:
start_seq, vocab_keys = tokenizer('defix_start')
end_seq, _ = tokenizer('defix_end')
Vocab = Vocabularizer(vocab_keys)

In [59]:
def numberizer(start_seq):
    source_encoded = np.zeros_like(start_seq)
    for idx_i, seq in enumerate(start_seq):
        for idx_j, tk in enumerate(seq):
            source_encoded[idx_i,idx_j] = Vocab['token_to_idx'][tk]
    source_encoded = source_encoded.astype(int)
    return source_encoded

In [60]:
source_encoded = numberizer(start_seq)
target_encoded = numberizer(end_seq)

In [66]:
source_encoded.shape,3e+6

((3214096, 6), 3000000.0)

# Model

In [70]:
X_train,y_train = source_encoded[:int(3e+6)],target_encoded[:int(3e+6)]
X_test,y_test = source_encoded[int(3e+6):],target_encoded[int(3e+6):]

In [79]:
x = Input(shape = (6,))
h = Dense(32,activation='relu',kernel_initializer=he_normal())(x)
h = Dense(64,activation='relu',kernel_initializer=he_normal())(h)
h = Dense(6,activation='relu',kernel_initializer=he_normal())(h)

model = Model(inputs = x,outputs = h)
model.compile(optimizer = 'adam',loss = 'mse')

In [124]:
for epoch in range(20):
    hist = model.fit(X_train,y_train,batch_size=256,epochs=1,verbose=1)
    y_pred = model.predict(X_test)
    y_pred = np.round(y_pred).astype(int)
    print('Epoch: %d, Accuracy: %.2f%%' %(epoch + 1,100 * Acc(y_pred,y_test)))

Epoch: 1, Accuracy: 2.98%
Epoch: 2, Accuracy: 3.34%
Epoch: 3, Accuracy: 3.84%
Epoch: 4, Accuracy: 3.88%
Epoch: 5, Accuracy: 3.46%
Epoch: 6, Accuracy: 2.83%
Epoch: 7, Accuracy: 3.48%
Epoch: 8, Accuracy: 3.90%
Epoch: 9, Accuracy: 4.28%
Epoch: 10, Accuracy: 5.19%
Epoch: 11, Accuracy: 4.30%
Epoch: 12, Accuracy: 2.54%
Epoch: 13, Accuracy: 4.97%
Epoch: 14, Accuracy: 2.98%
Epoch: 15, Accuracy: 5.39%
Epoch: 16, Accuracy: 3.25%
Epoch: 17, Accuracy: 2.94%
Epoch: 18, Accuracy: 5.34%
Epoch: 19, Accuracy: 3.43%
Epoch: 20, Accuracy: 3.64%


In [125]:
y_pred = model.predict(X_test)
y_pred = np.round(y_pred).astype(int)
print('Accuracy: %.2f%%' %(100 * Acc(y_pred,y_test)))

Accuracy: 3.64%


In [108]:
def characterizer(y_pred):
    target_encoded = np.zeros_like(y_pred).astype(str)
    for idx_i, seq in enumerate(list(y_pred)):
        for idx_j, st in enumerate(seq):
            target_encoded[idx_i,idx_j] = Vocab['idx_to_token'][st]
    return target_encoded

In [122]:
def Acc(y_pred,y_test):
    y_pred_char = characterizer(y_pred)
    y_test_char = characterizer(y_test)
    acc = (((y_test_char == y_pred_char).sum(axis = 1) - 6) >= 0).sum() / y_test.shape[0]
    return acc