## Libraries

In [23]:
import re
import json
import gensim
#import gensim.downloader as api
import warnings
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GlobalMaxPool1D, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model

plt.style.use('seaborn')
#warnings.filterwarnings('ignore')
#v2v = api.load('word2vec-google-news-300')

## Global arguments

In [24]:
EMBED_TYPE = 'word2vec'
MODEL_TYPE = 'bilstm'
DIM_HIDDEN_1 = 150
DIM_HIDDEN_2 = 150
DROPOUT_RATE = 0.1
SEQ_LEN = 220
EMBED_DIM = 300
VOCAB_SIZE = 30_000
LR = 1e-3
NUM_EPOCHS = 10
BATCH_SIZE = 16

STAMP = f'{EMBED_TYPE}-{MODEL_TYPE}-{DIM_HIDDEN_1}-{DIM_HIDDEN_2}-{DROPOUT_RATE}_le_{SEQ_LEN}_em_{EMBED_DIM}_lr_{LR}_ep_{NUM_EPOCHS}_ba_{BATCH_SIZE}'

## Load and split data

In [25]:
TEXT_COLS = ['name', 'description', 'space', 'house_rules', 'access', 'interaction', 'neighborhood_overview', 'notes', 'transit']
INPUT_COLS = ['description']

df_train = pd.read_csv('./input/df_pre_train_deep.csv')
df_test = pd.read_csv('./input/df_pre_test_deep.csv')

print(df_train.shape)
print(df_test.shape)

(10400, 81)
(2600, 81)


## Preprocess sequences

In [26]:
def tokenize_string(string):
    '''Clean the text and return list of tokens'''  
    if pd.isna(string):
        return ''
    string = str(string)#.lower()
    string = re.sub("&", " and ", string)
    string = re.sub("\.\.\.", ".", string)
    string = re.sub("\.\.", ".", string)
    string = re.sub("-", " ", string)
    string = re.sub(r"\\", " ", string)
    string = re.sub("/", " ", string)
    string = re.sub("\’", "\'", string)
    string = re.sub("what's", "what is", string)
    string = re.sub("What's", "What is", string)
    string = re.sub("it\'s", "it is", string)
    string = re.sub("It\'s", "It is", string)
    string = re.sub("\'ve", " have", string)
    string = re.sub("can\'t", "can not", string)
    string = re.sub("Can\'t", "Can not", string)
    string = re.sub("n\'t", " not", string)
    string = re.sub("I\'m", "I am", string)
    string = re.sub("\'re", " are", string)
    string = re.sub("\'d", " would", string)
    string = re.sub("\'ll", " will", string)
    string = re.sub("he\'s", "he is", string)
    string = re.sub("He\'s", "He is", string)
    string = re.sub("she\'s", "she is", string)
    string = re.sub("She\'s", "She is", string)
    string = re.sub("that\'s", "that is", string)
    string = re.sub("That\'s", "That is", string)
    string = re.sub("what\'s", "what is", string)
    string = re.sub("What\'s", "What is", string)
    string = re.sub("where\'s", "where is", string)
    string = re.sub("Where\'s", "Where is", string)
    string = re.sub("how\'s", "how is", string)
    string = re.sub("How\'s", "How is", string)
    string = re.sub("who\'s", "who is", string)
    string = re.sub("Who\'s", "Who is", string)
    string = re.sub("won\'t", "will not", string)
    string = re.sub("Won\'t", "Will not", string)
    string = re.sub("n\'t", " not", string)
    string = re.sub("n\'", "ng", string)
    string = re.sub("\'bout", "about", string)
    string = re.sub("\'til", "until", string)
    string = re.sub("[^A-Za-z\s]", "", string)
    text = string.split(" ")    
    return text

def process_df(df):
    '''Tokenize a whole dataframe'''
    res = []
    for i, row in df.iterrows():
        seq = tokenize_string(row[INPUT_COLS].values)
        seq = [token for token in seq if token != '']
        res.append(seq) 
    return res

In [27]:
x_train = process_df(df_train)
x_test = process_df(df_test)

y_train = np.array(df_train['log_price'])
y_test = np.array(df_test['log_price'])

In [28]:
print(x_train[1])

['Fully', 'furnished', 'one', 'bedroom', 'apartment', 'situated', 'on', 'the', 'rd', 'floor', 'in', 'a', 'well', 'maintained', 'building', 'on', 'Na', 'Kozace', 'street', 'in', 'Prague', 'Vinohrady', 'The', 'flat', 'has', 'an', 'entrance', 'hall', 'living', 'area', 'with', 'fully', 'equipped', 'kitchen', 'corner', 'bedroom', 'with', 'the', 'window', 'facing', 'into', 'quiet', 'court', 'and', 'shower', 'bathroom', 'washing', 'machine', 'and', 'dryer', 'Building', 'is', 'equipped', 'with', 'an', 'elevator', 'Close', 'to', 'tram', 'bus', 'and', 'metro', 'A', 'Nmst', 'Mru', 'Fully', 'furnished', 'one', 'bedroom', 'apartment', 'situated', 'on', 'the', 'rd', 'floor', 'in', 'a', 'well', 'maintained', 'building', 'on', 'Na', 'Kozace', 'street', 'in', 'Prague', 'Vinohrady', 'The', 'flat', 'has', 'an', 'entrance', 'hall', 'living', 'area', 'with', 'fully', 'equipped', 'kitchen', 'corner', 'bedroom', 'with', 'the', 'window', 'facing', 'into', 'quiet', 'court', 'and', 'shower', 'bathroom', 'washin

## Create vocabulary

In [29]:
# --------- Create vocabulary index based on word frequency ---------
tokenizer = Tokenizer(num_words = VOCAB_SIZE, filters = "", lower = False)
tokenizer.fit_on_texts(x_train) 

# --------- Transform each text to a vector of integers ---------
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

word_index = tokenizer.word_index

print(f"Found {len(word_index)} unique tokens in data")

Found 35464 unique tokens in data


In [30]:
print(x_train[0])

[1326, 75, 150, 4603, 130, 62, 2, 338, 10, 11, 21, 3, 175, 6, 75, 603, 924, 8, 45, 908, 29, 13, 452, 12, 3, 339, 8, 23, 422, 507, 195, 24, 67, 107, 14, 595, 837, 2, 114, 567, 75, 150, 415, 124, 17334, 4603, 130, 32, 4, 30, 123, 320, 8, 1039, 22, 20, 1, 127, 44, 6, 3, 94, 58, 55, 362, 6702, 5021, 1, 28, 2, 1, 17, 14, 1, 537, 6, 1, 11, 10, 28, 4, 39, 8, 3, 88, 3, 842, 3, 241, 2, 2467, 5, 7291, 584, 10, 17, 4, 39, 8, 3, 636, 656, 1049, 3, 201, 3, 295, 3, 376, 3, 639, 279, 3102, 324, 1083, 191, 18, 340, 40, 324, 12543, 511, 340, 3, 1155, 136, 534, 10, 371, 6, 1, 58, 4, 30, 123, 30, 532, 55, 117, 192, 1492, 5, 1, 44, 72, 1311, 55, 743, 4, 87, 56, 17335, 56, 5841, 6, 1, 551, 2, 1, 165, 242, 7, 1, 116, 406]


In [31]:
print('0: [PAD]')
for word, i in word_index.items(): 
    if (i <= 10) or (len(word_index)-11 < i < len(word_index)):
        print(f"{i}: {word}")

0: [PAD]
1: the
2: and
3: a
4: is
5: to
6: of
7: in
8: with
9: you
10: The
35454: welches
35455: letzten
35456: Jahren
35457: Szeneviertel
35458: Kreative
35459: Welt
35460: entwickelte
35461: gelangt
35462: Fuss
35463: bringen


In [32]:
x_train = pad_sequences(x_train, maxlen = SEQ_LEN)
x_test = pad_sequences(x_test, maxlen = SEQ_LEN)

In [33]:
print(x_train[0])

[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0  1326    75   150  4603   130
    62     2   338    10    11    21     3   175     6    75   603   924
     8    45   908    29    13   452    12     3   339     8    23   422
   507   195    24    67   107    14   595   837     2   114   567    75
   150   415   124 17334  4603   130    32     4    30   123   320     8
  1039    22    20     1   127    44     6     3    94    58    55   362
  6702  5021     1    28     2     1    17    14     1   537     6     1
    11    10    28     4    39     8     3    88     3   842     3   241
     2  2467     5  7291   584    10    17     4    39     8     3   636
   656  1049     3   201     3   295     3   376   

## Construct embedding matrix

In [34]:
embeddings_dict = {}

if EMBED_TYPE == 'glove':
    with open('./input/glove.840B.300d.txt', encoding = 'utf8') as file:
        for line in file:
            parts = line.split()
            try:
                word = parts[0]
                coefs = np.asarray(parts[1:], dtype = 'float32')        
            except:
                pass
            if coefs.shape == (EMBED_DIM,):
                embeddings_dict[word] = coefs   
elif EMBED_TYPE == 'word2vec':
    v2v = gensim.models.KeyedVectors.load_word2vec_format('./input/GoogleNews-vectors-negative300.bin', binary = True)
    for token in v2v.vocab.keys():
        coefs = np.asarray(v2v[token], dtype = 'float32')
        if coefs.shape == (EMBED_DIM,):
            embeddings_dict[token] = coefs   

print(f"Found {len(embeddings_dict.values())} valid word vectors")

Found 3000000 valid word vectors


In [35]:
# Update vocab size from maximum to actual tokens found (+ pad token)
VOCAB_SIZE = min(VOCAB_SIZE, len(word_index)) + 1

embed_matrix = np.zeros((VOCAB_SIZE, EMBED_DIM))

print(f"Embedding matrix shape: {embed_matrix.shape}")

Embedding matrix shape: (30001, 300)


In [36]:
# --------- Search for embeddings from GloVe ---------
num_found = 0
for word, i in word_index.items(): # iterate over words found in data
    if i >= VOCAB_SIZE: 
        continue # if words found in data > vocab size: skip iteration
    embed_vector = embeddings_dict.get(word) # search for embedding from dict
    if embed_vector is not None:
        embed_matrix[i] = embed_vector # save vector to embedding matrix
        num_found += 1
        
print(f"Found embedding for {num_found} words")

Found embedding for 21066 words


## Specify model

In [37]:
#K.clear_session()

In [38]:
x = Input(shape = (SEQ_LEN,), dtype = 'int32')
h = Embedding(VOCAB_SIZE, EMBED_DIM, weights = [embed_matrix], input_length = SEQ_LEN, trainable = False)(x)
h = Bidirectional(LSTM(DIM_HIDDEN_1, return_sequences = False, dropout = DROPOUT_RATE, recurrent_dropout = DROPOUT_RATE), merge_mode = 'concat')(h)
#h = GlobalMaxPool1D()(h)
h = Dropout(DROPOUT_RATE)(h)
h = Dense(DIM_HIDDEN_2, activation = 'relu')(h)
h = Dropout(DROPOUT_RATE)(h)
y = Dense(1)(h)

model = Model(inputs = x, outputs = y)
model.compile(loss = 'mean_squared_error', optimizer = Adam(lr = LR)) 

model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 220)]             0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 220, 300)          9000300   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 300)               541200    
_________________________________________________________________
dropout_2 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 150)               45150     
_________________________________________________________________
dropout_3 (Dropout)          (None, 150)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 151 

In [39]:
model.layers[2].weights

[<tf.Variable 'bidirectional_1/forward_lstm_1/kernel:0' shape=(300, 600) dtype=float32, numpy=
 array([[ 0.01636492, -0.02177608,  0.07274732, ...,  0.04867616,
         -0.00235229,  0.03806137],
        [-0.03361943,  0.06085576,  0.01252838, ..., -0.00084101,
          0.03197478,  0.06178802],
        [ 0.05570008,  0.04452485,  0.01821449, ...,  0.03646465,
         -0.06384901, -0.06608415],
        ...,
        [ 0.04917453, -0.01154429, -0.06693958, ...,  0.07245913,
         -0.02078933,  0.04338737],
        [ 0.07734028, -0.0148826 ,  0.01956581, ...,  0.04428703,
         -0.04972262,  0.03288473],
        [ 0.05735902, -0.05284134, -0.08056441, ..., -0.03044537,
          0.0580876 , -0.0694908 ]], dtype=float32)>,
 <tf.Variable 'bidirectional_1/forward_lstm_1/recurrent_kernel:0' shape=(150, 600) dtype=float32, numpy=
 array([[-0.05960584, -0.04756306, -0.00860993, ..., -0.03443159,
         -0.03795656,  0.01080187],
        [ 0.02192086,  0.03145051, -0.02476786, ..., -0

## Fit model

In [18]:
print(STAMP, '\n')

log = model.fit(
    x_train,
    y_train,
    epochs = NUM_EPOCHS,
    batch_size = BATCH_SIZE,
    shuffle = True,
    validation_split = 0.20
)

glove-bilstm-150-150-0.1_le_220_em_300_lr_0.001_ep_10_ba_16 

Train on 8320 samples, validate on 2080 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Model evaluation

### Train set

In [19]:
y_pred = model.predict(x_train, batch_size = BATCH_SIZE)

np.round_(mean_squared_error(y_train, y_pred), 4)

0.2501

### Test set

In [20]:
y_pred = model.predict(x_test, batch_size = BATCH_SIZE)

MSE = np.round_(mean_squared_error(y_test, y_pred), 4)
MSE

0.4651

## Save training logs

In [21]:
log_df = pd.DataFrame(log.history)

log_df.to_csv(f'./output/log_train_{STAMP}-mse-{MSE}.csv', index = False)

## Save model and tokenizer

In [22]:
tokenizer_json = tokenizer.to_json()
with open(f'./input/tokenizer_{STAMP}-mse-{MSE}.json', 'w', encoding = 'utf-8') as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii = False))

model.save(f'./input/model_{STAMP}-mse-{MSE}.h5') 

## Prediction for main model

In [None]:
df_train = pd.read_csv('./input/df_pre_train.csv')
df_test = pd.read_csv('./input/df_pre_test.csv')

input_train = process_df(df_train)
input_test = process_df(df_test)

In [None]:
input_train = tokenizer.texts_to_sequences(input_train)
input_train = pad_sequences(input_train, maxlen = SEQ_LEN)

input_test = tokenizer.texts_to_sequences(input_test)
input_test = pad_sequences(input_test, maxlen = SEQ_LEN)

In [None]:
df_train['text_feature'] = model.predict(input_train, batch_size = BATCH_SIZE)
df_test['text_feature'] = model.predict(input_test, batch_size = BATCH_SIZE)

df_train = df_train.drop(TEXT_COLS, axis = 1)
df_test = df_test.drop(TEXT_COLS, axis = 1)

df_train.to_csv(f'./input/df_final_train_{EMBED_TYPE}-{MODEL_TYPE}-mse-{MSE}.csv', index = False)
df_test.to_csv(f'./input/df_final_test_{EMBED_TYPE}-{MODEL_TYPE}-mse-{MSE}.csv', index = False)