In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import keras_tuner
import keras
import tensorflow as tf

In [2]:
# Show every column when a DataFrame is rendered (the frame is wide).
pd.set_option('display.max_columns', None)

# Load the preprocessed Airbnb listings table produced by the earlier milestone.
csv_file_path = './airbnb_dataset/milestone3.csv'
df = pd.read_csv(filepath_or_buffer=csv_file_path)

# Preview the first rows to sanity-check the load.
df.head(5)

Unnamed: 0,log_price,accommodates,bathrooms,host_response_rate,bedrooms,beds,cleaning_fee_True,host_has_profile_pic_t,host_identity_verified_t,instant_bookable_t,property_type_encoded,room_type_encoded,bed_type_encoded,cancellation_policy_encoded,city_encoded,des_sentiment_analysis,name_sentiment_analysis
0,5.010635,-0.072621,-0.404046,0.296013,-0.312048,-0.566461,True,True,True,False,-0.221989,0.879947,0.206922,1.031946,-0.417068,0.216242,-0.435248
1,5.129899,1.783653,-0.404046,0.296013,2.034955,1.027816,True,True,False,True,-0.248121,0.883912,0.178912,1.034251,-0.425829,0.837002,-1.788095
2,4.976734,0.855516,-0.404046,0.296013,-0.312048,1.027816,True,True,True,True,-0.252342,0.869698,0.126796,-0.607644,-0.445747,0.808508,-0.133695
3,6.620073,0.391448,-0.404046,0.296013,0.861454,0.230678,True,True,True,False,0.187295,0.883912,0.178912,-0.976832,2.696289,0.213627,-0.926212
4,4.744932,-0.536689,-0.404046,0.296013,-1.485549,-0.566461,True,True,True,True,-0.242856,0.875005,0.145414,-0.634193,1.380088,-0.101872,-0.095776


In [3]:
# Column names used below.
# 'Unnamed: 0' is the name pandas assigns to the CSV's unlabeled leading
# column; in the preview above it just counts rows, so it appears to be a
# saved row index rather than a predictor. Feeding it to the network would add
# an unscaled identifier (values in the tens of thousands) next to
# standardized features.
TARGET_COLUMN = 'log_price'
ROW_ID_COLUMN = 'Unnamed: 0'

# Numeric (int64/float64) predictor names, excluding the target and the row id.
features = [
    col for col in df.columns
    if col not in (TARGET_COLUMN, ROW_ID_COLUMN)
    and df[col].dtype in ['int64', 'float64']
]
features.remove('name_sentiment_analysis')
# NOTE(review): `features` is not used to build X below, so
# 'name_sentiment_analysis' is still a model input. Confirm whether it was
# meant to be excluded; if so, select X from `features + bool_features`.

# Cast boolean flag columns to 0/1 integers so the whole matrix is numeric.
bool_features = [col for col in df.columns if df[col].dtype == 'bool']
df[bool_features] = df[bool_features].astype(int)

# Predictors: everything except the target and the row-id column.
# errors='ignore' keeps this cell working if the CSV is later read with
# index_col=0 (in which case 'Unnamed: 0' no longer exists as a column).
X = df.drop(columns=[TARGET_COLUMN, ROW_ID_COLUMN], errors='ignore')
y = df[TARGET_COLUMN]

In [4]:
# Hold out 20% of the rows as the test set.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Carve a validation set out of the remainder: 25% of the remaining 80%
# is 20% of all rows, giving a 60/20/20 train/validation/test split.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.25,
    random_state=42,
)

In [5]:
# Peek at the first five training rows.
X_train.head(5)

Unnamed: 0,accommodates,bathrooms,host_response_rate,bedrooms,beds,cleaning_fee_True,host_has_profile_pic_t,host_identity_verified_t,instant_bookable_t,property_type_encoded,room_type_encoded,bed_type_encoded,cancellation_policy_encoded,city_encoded,des_sentiment_analysis,name_sentiment_analysis
42191,0.391448,-0.404046,0.296013,-0.312048,-0.566461,1,1,1,0,-0.248121,0.883912,0.178912,1.034251,-0.434422,-0.724731,2.953063
37702,-0.536689,-0.404046,-0.398157,-0.312048,-0.566461,1,1,0,0,-0.221989,-1.032336,0.206922,1.031946,-0.417068,-0.111844,4.544577
32194,1.319585,-0.404046,0.296013,0.861454,1.027816,1,1,0,0,-0.221989,0.879947,0.206922,1.031946,-0.39655,1.378005,6.106757
946,-0.072621,-0.404046,0.157179,-0.312048,-0.566461,1,1,1,0,-0.248121,0.883912,0.178912,1.034251,1.479132,1.08132,0.714817
73606,-0.072621,-0.404046,0.296013,-0.312048,0.230678,1,1,1,0,-0.242856,0.875005,0.145414,1.033033,-0.45052,-0.248251,-0.897739


### Train a Neural Network (NN)

In [6]:
# Uncomment below if tensorflow is not installed already
#%pip install tensorflow
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,BatchNormalization,Dropout

In [19]:
# Build Neural Network
# Seed Python, NumPy and TensorFlow so the weight initialisation (and hence
# the reported metrics) can be reproduced after Restart & Run All.
tf.keras.utils.set_random_seed(42)

# Fully connected regressor: 128 -> 64 -> 32 -> 16 ReLU units, then a single
# linear output for the predicted log_price.
# The input width is declared with an explicit Input layer; passing
# `input_dim` to the first Dense layer is deprecated in current Keras.
nn_model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1)
])

# MSE loss matches the metric reported on the test set below.
nn_model.compile(optimizer='adam', loss='mean_squared_error')

In [20]:
# Train the model
# `history` keeps the per-epoch training/validation loss returned by fit().
history = nn_model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [21]:
# Evaluate the model
# The model was compiled with MSE as its only loss and no extra metrics,
# so evaluate() yields the test-set mean squared error.
mse = nn_model.evaluate(x=X_test, y=y_test)
print(f"Mean Squared Error on Test Set: {mse}")

Mean Squared Error on Test Set: 0.1891198307275772


### Hyperparameter Tuning

In [None]:
# NOTE(review): this list is not referenced by build_hp_model below, which
# passes its own metric name to compile() - confirm whether it is still needed.
metrics = [keras.metrics.MeanSquaredError(name="mean_squared_error", dtype=None)]


def build_hp_model(hp):
    """Build and compile a dense regression model from sampled hyperparameters.

    Args:
        hp: Hyperparameter sampler exposing ``Int`` and ``Float`` (presumably
            the ``keras_tuner.HyperParameters`` object the tuner passes in).

    Returns:
        A compiled ``Sequential`` model with 3-6 hidden ``leaky_relu`` Dense
        layers (32-256 units each, in steps of 32), one linear output unit,
        and an Adam optimizer whose learning rate is sampled on a log scale
        between 1e-4 and 1e-2. Loss and tracked metric are both MSE.
    """
    model = Sequential()

    # Hidden stack: depth and the width of every layer are tuned separately.
    depth = hp.Int('num_layers', 3, 6)
    for layer_idx in range(depth):
        width = hp.Int(f'units_{layer_idx}', min_value=32, max_value=256, step=32)
        model.add(Dense(units=width, activation='leaky_relu'))

    # Single linear unit for the regression target.
    model.add(Dense(1))

    learning_rate = hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='LOG')
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(
        optimizer=optimizer,
        loss='mean_squared_error',
        metrics=['mean_squared_error'],
    )
    return model

In [None]:
# Stop a run once validation loss has failed to improve by at least 0.001 for
# 5 consecutive epochs, and roll the model back to its best-epoch weights.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0.001,
    patience=5,
    mode='min',
    restore_best_weights=True,
)

# Persist the full model (not just weights) whenever validation loss improves.
# The path must carry a `.keras` suffix: current Keras raises a ValueError for
# a whole-model checkpoint path without a recognised extension, which the
# previous bare 'checkpoints' path did not have.
model_checkpoint = keras.callbacks.ModelCheckpoint(
    filepath='checkpoints/best_model.keras',
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=False,
    mode='min'
)

In [None]:
# Random search over the space defined in build_hp_model:
# 10 sampled configurations, each trained 3 times, ranked by validation loss.
tuner = keras_tuner.RandomSearch(
    hypermodel=build_hp_model,
    objective='val_loss',
    # Search budget.
    max_trials=10,
    executions_per_trial=3,
    seed=10,
    # Where trial results are written; overwrite=True discards earlier results
    # stored under the same directory/project name.
    directory='tuner_results',
    project_name='keras_tuner_demo',
    overwrite=True,
)

In [None]:
# Start searching
# Each trial trains for up to 20 epochs with the callbacks defined earlier
# and is scored on the validation split.
# NOTE(review): search() is not documented to return a value, so
# `tuner_search` is presumably None - confirm before relying on it.
tuner_search = tuner.search(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping, model_checkpoint],
    epochs=20,
)

In [None]:
# View summary of searching
# tuner.results_summary()

In [None]:
# View best hyperparameters
# Ask the tuner for its single top-ranked hyperparameter set.
top_hyperparameter_sets = tuner.get_best_hyperparameters(num_trials=1)
best_hyperparameters = top_hyperparameter_sets[0]
print("Best hyperparameters found were: ", best_hyperparameters.values)

In [None]:
# Retrieve the top-ranked model from the search and score it on the test set.
best_model = tuner.get_best_models(num_models=1)[0]

# The tuned models are compiled with an extra metric, so evaluate() returns a
# sequence whose first entry is the loss (MSE).
test_scores = best_model.evaluate(X_test, y_test)
mse = test_scores[0]
print(f"Mean Squared Error on Test Set: {mse}")

In [None]:
# View best model
# best_model = tuner.get_best_models()[0]
# best_model.summary()


In [None]:
# Evaluate the best model
# loss, acc = best_model.evaluate(X_test, y_test)
# print('ACC: ', acc)
# print('LOSS: ', loss)