# Libraries

In [None]:
# Main
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Sklearn
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Keras
import tensorflow as tf
from tensorflow.keras import regularizers, Model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, Input, concatenate
from tensorflow.keras.optimizers import Adam, Adamax, Nadam
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

# Other
import os

# Setup

In [None]:
# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.options.display.max_columns = None
# Turn off pandas' chained-assignment (SettingWithCopy) warning.
# NOTE(review): this silences the warning notebook-wide, including for genuine
# chained-assignment mistakes - confirm it is still wanted.
pd.options.mode.chained_assignment = None

In [None]:
# Apply matplotlib's 'fivethirtyeight' style sheet to the figures drawn below.
plt.style.use('fivethirtyeight')

In [None]:
# Folder holding the competition CSVs, relative to this notebook.
path = "../CSV Files"

# Both files are read the same way: the "id" column becomes the index.
train, test = (
    pd.read_csv(f"{path}/{split}.csv", index_col="id")
    for split in ("train", "test")
)

# Functions

In [None]:
def wrangle(X):
    """Clean a raw listings frame and return a new frame ready for modelling.

    The function
      * encodes the 't'/'f' flag columns as 1.0/0.0 floats,
      * collapses rare ``property_type`` values into ``'Other'``,
      * adds ``price = exp(log_price)`` when ``log_price`` is present, and
      * drops ``log_price`` plus the unusable / sparse / high-cardinality
        columns listed in the body.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw listings. Must contain the three flag columns, ``property_type``
        and every column named in the drop lists below. ``log_price`` is
        optional so that a frame without the target can be cleaned as well.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy; the frame passed in is left untouched.

    Raises
    ------
    KeyError
        If a required column is missing.
    ValueError
        If a flag column holds a string other than 't' / 'f'.
    """
    # make a copy
    X = X.copy()

    # encode "t" and "f" as 1's and 0's
    # Whole-column assignment replaces the former chained indexing
    # (X[col][mask] = value), which pandas cannot guarantee writes back to X.
    # Values other than 't'/'f' (e.g. NaN) pass through unchanged, and all three
    # flags are now cast to float - previously 'instant_bookable' was not.
    flag_codes = {'t': 1, 'f': 0}
    flag_columns = ['host_has_profile_pic', 'host_identity_verified',
                    'instant_bookable']
    for col in flag_columns:
        X[col] = X[col].map(lambda value: flag_codes.get(value, value)).astype(float)

    # Group some of the many property types together
    rare_property_types = ['Boat','Tent','Castle','Yurt', 'Hut', 'Treehouse',
                           'Chalet','Earth House','Tipi','Cave',
                           'Train','Parking Space','Island','Casa particular',
                           'Lighthouse', 'Vacation home', 'Serviced apartment']
    X.loc[X['property_type'].isin(rare_property_types), 'property_type'] = 'Other'

    # columns with unusable variance
    unusable_variance = ['zipcode']

    # columns with high percentage of missing values
    high_nans = ['first_review','host_response_rate','last_review',
                 'review_scores_rating','thumbnail_url']

    # categorical variables with high cardinality
    # 'neighborhood' has 620 and 'thumbnail_url' has many thousands
    high_card = ['neighbourhood','thumbnail_url','name','amenities',
                 'description', 'host_since']

    columns_to_drop = unusable_variance + high_nans + high_card

    # Get the price and drop the log of price (only when the target is present)
    if 'log_price' in X.columns:
        X['price'] = np.exp(X['log_price'])
        columns_to_drop = ['log_price'] + columns_to_drop

    # 'thumbnail_url' appears in two of the lists; de-duplicate, keeping order.
    X = X.drop(columns=list(dict.fromkeys(columns_to_drop)))

    return X

In [None]:
def prepare_inputs(X_train, X_test):
    """Label-encode every column, learning the codes from the training data only.

    Each column gets its own ``LabelEncoder`` fitted on ``X_train``; the same
    fitted encoder is then applied to ``X_test`` so a given value always maps
    to the same integer in both splits. (Re-fitting on the test column, as was
    done before, numbers the test values independently of the training ones.)

    Test values that never occur in the training column cannot be encoded by
    the fitted encoder. They are assigned the code of the most frequent
    training value and a warning is emitted, which keeps every test code inside
    the range of codes seen in training.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Frames with the same columns in the same order.

    Returns
    -------
    (list, list)
        Two lists holding one 1-D integer array per column, for the training
        and the test frame respectively.
    """
    import warnings  # local import: only needed for the unseen-value notice

    X_train_enc, X_test_enc = list(), list()
    # label encode each column
    for i in range(X_train.shape[1]):
        train_values = X_train.iloc[:, i].values
        test_values = X_test.iloc[:, i].values

        le = LabelEncoder()
        # learn the mapping on the training column only ...
        train_enc = le.fit_transform(train_values)
        try:
            # ... and reuse it unchanged for the test column
            test_enc = le.transform(test_values)
        except ValueError:
            # A value's code is its position in classes_; get_indexer returns
            # that position, or -1 for values absent from training.
            test_enc = pd.Index(le.classes_).get_indexer(test_values)
            unseen = test_enc == -1
            if not unseen.any():
                # The failure had another cause - do not hide it.
                raise
            warnings.warn(
                f"column {X_train.columns[i]!r}: {int(unseen.sum())} test value(s) "
                "not seen in training were encoded as the most frequent training class",
                stacklevel=2,
            )
            test_enc[unseen] = np.bincount(train_enc).argmax()
        # store
        X_train_enc.append(train_enc)
        X_test_enc.append(test_enc)
    return X_train_enc, X_test_enc

In [None]:
def scale_inputs(X_train, X_test):
    """Fit a ``StandardScaler`` and return both inputs.

    NOTE(review): the array produced by ``fit_transform`` is not assigned to
    anything, so ``X_train`` and ``X_test`` are returned exactly as they were
    passed in. ``X_test`` is also given as the second positional argument of
    ``fit_transform`` (its ``y`` slot) rather than being transformed.
    This looks unintended - presumably the goal was to fit on ``X_train`` and
    transform both splits. Confirm before changing it: the embedding inputs
    built later in this notebook are fed the values returned here and need
    them to stay integer codes.
    """
    ss = StandardScaler()
    # Result discarded; see the note in the docstring.
    ss.fit_transform(X_train, X_test)
    return X_train, X_test

In [None]:
def model_creation_emb():
    """Build and compile the regression head that sits on the embeddings.

    Takes no arguments: it reads the module-level ``emb`` tensor (input of the
    hidden layer) and ``in_layers`` (inputs of the model), so both must exist
    before this function is called.

    Returns
    -------
    A compiled Keras ``Model``: one L2-regularised ReLU layer followed by a
    single linear output unit, trained with Adam on mean squared error and
    reporting mean absolute error.
    """
    learning_rate = .001
    optimizer = Adam(learning_rate)
    l2_penalty = 0.02

    hidden_units = 10
    output_units = 1

    # Hidden layer on top of the concatenated embeddings.
    hidden_layer = Dense(hidden_units,
                         activation='relu',
                         kernel_regularizer=regularizers.l2(l2_penalty))
    hidden = hidden_layer(emb)

    # Single linear unit: the predicted price.
    prediction = Dense(output_units, activation='linear')(hidden)

    network = Model(inputs=in_layers, outputs=prediction)
    network.compile(optimizer=optimizer,
                    loss='mean_squared_error',
                    metrics=['mean_absolute_error'])

    return network

In [None]:
def model_creation():
    """Build and compile a plain fully connected regressor.

    Architecture: five ReLU layers of 512 units, each L2-regularised and each
    followed by dropout, then one linear output unit. The first layer expects
    16 input features.

    Returns
    -------
    A compiled Keras ``Sequential`` model (Adam, mean squared error loss,
    mean absolute error metric).
    """
    learning_rate = .001
    optimizer = Adam(learning_rate)
    n_features = 16
    dropout_rate = .50
    l2_penalty = 0.02

    # Input layer followed by four hidden layers, all the same width.
    layer_widths = [64*8] * 5
    output_units = 1

    stack = []
    for position, width in enumerate(layer_widths):
        # Only the first layer declares the input size.
        first_layer_kwargs = {'input_dim': n_features} if position == 0 else {}
        stack.append(Dense(width,
                           activation='relu',
                           kernel_regularizer=regularizers.l2(l2_penalty),
                           **first_layer_kwargs))
        stack.append(Dropout(dropout_rate))
    stack.append(Dense(output_units, activation='linear'))

    network = Sequential(stack)
    network.compile(optimizer=optimizer,
                    loss='mean_squared_error',
                    metrics=['mean_absolute_error'])

    return network

# EDA

In [None]:
# Size of the raw training frame, then a preview of its first rows.
print(train.shape)
train.head()

In [None]:
# Column dtypes and non-null counts of the raw data.
train.info()

In [None]:
# Clean the training data with wrangle().
# NOTE(review): this rebinds `train` to the cleaned frame. wrangle() drops columns
# that it also requires as input, so re-running this cell without reloading the
# CSV raises a KeyError.
train = wrangle(train)

In [None]:
# Preview the cleaned frame.
train.head()

In [None]:
# Dtypes and non-null counts after cleaning.
train.info()

# Split

In [None]:
# Separate the label (price) from the feature columns.
target = 'price'
y = train[target]
X = train.drop(columns=[target])

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable.
# Note: X_test / y_test come from the training file, not from the separate `test` frame.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=.20,
                                                    random_state=0)

In [None]:
# Check the sizes of the two splits.
print(X_train.shape)
print(X_test.shape)

# Baseline

In [None]:
# Reference score: the error made by always predicting the mean training price.
baseline = mean_absolute_error(y_train, np.full(len(y_train), y_train.mean()))
print('Baseline MAE:', round(baseline, 2))

# Pre-fitting

In [None]:
# Label-encode every column. From here on X_train / X_test are no longer DataFrames
# but lists holding one encoded array per column.
X_train, X_test = prepare_inputs(X_train, X_test)

In [None]:
# NOTE(review): scale_inputs() returns its arguments unchanged (it discards the
# scaler's output), so this cell leaves X_train / X_test as they were.
X_train, X_test = scale_inputs(X_train, X_test)

In [None]:
# X_train = [list(X_train[i]) for i in range(len(X_train))]
# X_train = np.asarray(X_train).T

# X_test = [list(X_test[i]) for i in range(len(X_test))]
# X_test = np.asarray(X_test).T

In [None]:
# One Input + Embedding pair per encoded column; `in_layers` and `emb` are read
# by model_creation_emb() when the model is built.
in_layers, em_layers = list(), list()

for i in range(len(X_train)):
    # Embedding's input_dim must be the largest integer index it will be fed, plus
    # one. Derive it from the codes in both splits rather than from the number of
    # distinct training codes, which is only the same thing when the codes are
    # dense and the validation codes never exceed the training ones.
    n_labels = int(max(np.max(X_train[i]), np.max(X_test[i]))) + 1
    in_layer = Input(shape=(1,))
    # every column is embedded into 10 dimensions
    em_layer = Embedding(n_labels, 10)(in_layer)
    in_layers.append(in_layer)
    em_layers.append(em_layer)

# Join the per-column embeddings into the single tensor the dense head consumes.
emb = concatenate(em_layers)

In [None]:
# print(X_train.shape)
# print(X_test.shape)

# Model

In [None]:
# Wrap the embedding-model builder in KerasRegressor; the wrapper is what gets
# fitted below and later handed to GridSearchCV.
model = KerasRegressor(model_creation_emb, verbose=1)

In [None]:
!rm -rf ./logs/

In [None]:
%load_ext tensorboard

# TensorBoard event files for this run go to logs/EarlyStopping.
logdir = os.path.join("logs", "EarlyStopping")

board_callback = TensorBoard(logdir, histogram_freq=1)

# Stop when validation MAE has not improved by at least 1 for 3 consecutive epochs.
# (Passed to fit() below together with the other two callbacks.)
stop = EarlyStopping(monitor="val_mean_absolute_error",
                     min_delta=1,
                     patience=3)

# Keep only the weights of the epoch with the lowest validation MAE. Without
# save_best_only the file was overwritten after every epoch, so "weights_best.h5"
# simply held the weights of the last epoch.
checkpoint = ModelCheckpoint("weights_best.h5",
                             monitor="val_mean_absolute_error",
                             mode="min",
                             save_best_only=True,
                             save_weights_only=True)

In [None]:
# Train on the training split and validate on the held-out split after each epoch.
# `result` is what fit() returns; later cells read result.history and result.model.
# NOTE(review): with epochs=1 the early-stopping callback (patience=3) can never trigger.
result = model.fit(X_train, y_train,
                   epochs=1, batch_size=32,
                   validation_data=(X_test, y_test),
                   callbacks=[board_callback, stop, checkpoint])

In [None]:
# Layer-by-layer overview of the fitted network.
result.model.summary()

In [None]:
# Loss (MSE) and MAE on the training split.
result.model.evaluate(X_train, y_train)

In [None]:
# Loss (MSE) and MAE on the held-out split.
result.model.evaluate(X_test, y_test)

In [None]:
%tensorboard --logdir logs

In [None]:
# Learning curves: one row of training history per epoch, plotted against the epoch index.
df = pd.DataFrame.from_records(result.history)
df['epoch'] = range(len(df))

fig, ax = plt.subplots()
ax.plot(df['epoch'], df['val_mean_absolute_error'], label="VMAE")
ax.plot(df['epoch'], df['mean_absolute_error'], label="MAE")
ax.set_ylabel("Error")
ax.set_xlabel("Epochs")
ax.legend()
plt.show()

In [None]:
# Persist the fitted model to keras_model.h5 in the working directory.
result.model.save('keras_model.h5')

# Tune

In [None]:
# Hyper-parameters to search. Every entry needs at least one candidate value:
# scikit-learn rejects a grid that contains an empty list, which is what the
# former 'opt': [] and 'lr': [] entries were.
# TODO: to tune the optimizer / learning rate, give model_creation_emb matching
# keyword arguments (it currently takes none) and add non-empty candidate lists.
params = {
    'batch_size': [32, 64, 512],
    'epochs': [32, 64, 512],
}

# NOTE(review): X_train is a list with one array per column at this point.
# GridSearchCV checks that X and y have the same length, so it may need a single
# 2-D array instead - confirm.
# NOTE(review): n_jobs=-1 ships the estimator to worker processes; the model
# builder relies on module-level Keras tensors - confirm this works in parallel.
grid = GridSearchCV(model,
                    params,
                    cv=2,
                    n_jobs=-1,
                    verbose=1)

grid_result = grid.fit(X_train, y_train)

In [None]:
# Best cross-validated score with its parameter combination, then the mean score
# of every combination that was tried.
print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")
print(f"Mean test score: {grid_result.cv_results_['mean_test_score']}")