In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import keras_tuner
import keras

In [2]:
csv_file_path = './airbnb_dataset/milestone3.csv'
df = pd.read_csv(csv_file_path)
pd.set_option('display.max_columns', None)
df.head()

Unnamed: 0,log_price,accommodates,bathrooms,host_response_rate,bedrooms,beds,cleaning_fee_True,host_has_profile_pic_t,host_identity_verified_t,instant_bookable_t,property_type_encoded,room_type_encoded,bed_type_encoded,cancellation_policy_encoded,city_encoded,des_sentiment_analysis,name_sentiment_analysis
0,5.010635,-0.072621,-0.404046,0.296013,-0.312048,-0.566461,1,1,1,0,-0.221989,0.879947,0.206922,1.031946,-0.417068,0.216242,-0.435248
1,5.129899,1.783653,-0.404046,0.296013,2.034955,1.027816,1,1,0,1,-0.248121,0.883912,0.178912,1.034251,-0.425829,0.837002,-1.788095
2,4.976734,0.855516,-0.404046,0.296013,-0.312048,1.027816,1,1,1,1,-0.252342,0.869698,0.126796,-0.607644,-0.445747,0.808508,-0.133695
3,6.620073,0.391448,-0.404046,0.296013,0.861454,0.230678,1,1,1,0,0.187295,0.883912,0.178912,-0.976832,2.696289,0.213627,-0.926212
4,4.744932,-0.536689,-0.404046,0.296013,-1.485549,-0.566461,1,1,1,1,-0.242856,0.875005,0.145414,-0.634193,1.380088,-0.101872,-0.095776


In [3]:
features = [col for col in df.columns if col != 'log_price' and df[col].dtype in ['int64', 'float64']]
features.remove('name_sentiment_analysis')
X = df[features]
y = df['log_price']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
X_train[:5]

Unnamed: 0,accommodates,bathrooms,host_response_rate,bedrooms,beds,cleaning_fee_True,host_has_profile_pic_t,host_identity_verified_t,instant_bookable_t,property_type_encoded,room_type_encoded,bed_type_encoded,cancellation_policy_encoded,city_encoded,des_sentiment_analysis
41089,-0.072621,-0.404046,0.296013,-1.485549,0.230678,1,1,0,1,-0.221989,0.879947,0.206922,-0.53435,-0.39655,0.388023
16409,-0.536689,-0.404046,0.296013,-0.312048,-0.566461,1,1,0,1,-2.296676,-1.037693,0.126796,-0.607644,-0.44743,0.128472
8929,-0.536689,-0.404046,0.296013,-0.312048,-0.566461,0,1,0,0,0.197792,-1.032359,0.181735,0.999809,1.463817,-1.062835
71528,0.855516,-0.404046,-0.120489,-0.312048,1.027816,1,1,0,0,-0.248121,0.883912,0.178912,-0.619269,-0.434422,-0.006515
51597,0.391448,-0.404046,0.296013,-0.312048,0.230678,1,1,1,0,2.303805,-2.087807,-6.368695,-0.571735,-0.400043,-1.203702


### Train with NN

In [6]:
# Uncomment below if tensorflow is not installed already
#%pip install tensorflow
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [7]:
# Build Neural Network
nn_model = Sequential([
    Dense(32, activation='relu',input_dim=X_train.shape[1]),
    Dense(16, activation='relu'),
    Dense(8, activation='relu'),
    Dense(1)
])

nn_model.compile(optimizer='adam', loss='mean_squared_error')

In [8]:
# Train the model
history = nn_model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
 260/1849 [===>..........................] - ETA: 0s - loss: 0.1989

KeyboardInterrupt: 

In [None]:
# Evaluate the model
mse = nn_model.evaluate(X_test, y_test)
print(f"Mean Squared Error on Test Set: {mse}")

Mean Squared Error on Test Set: 0.19317281246185303


### Hyper Tuning

In [None]:
metrics = [keras.metrics.MeanSquaredError(name="mean_squared_error", dtype=None)]
def build_hp_model(hp):
    model = Sequential()
    for i in range(hp.Int('num_layers', 3, 6)):
        model.add(Dense(units=hp.Int('units_' + str(i), min_value=32, max_value=256, step=32),
                        activation='leaky_relu'))
    model.add(Dense(1))
    learning_rate = hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='LOG')
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='mean_squared_error',
                  metrics=['mean_squared_error'])
    return model

In [None]:
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0.001,
    patience=5,
    mode='auto',
    baseline=None,
    restore_best_weights=False,
    start_from_epoch=0
)

model_checkpoint = keras.callbacks.ModelCheckpoint(
    filepath='checkpoints',
    monitor='val_loss',
    save_best_only=True,
    save_weights_only= False,
    mode='auto'
)

In [None]:
tuner = keras_tuner.RandomSearch(
    hypermodel=build_hp_model,
    objective='val_loss',
    max_trials=5,
    seed=10,
    executions_per_trial=3,
    overwrite=True,
    tune_new_entries=True,
    allow_new_entries=True,
    max_consecutive_failed_trials=1
)



In [None]:
# Start searching
tuner_search = tuner.search(X_train, y_train, epochs=20, callbacks=[early_stopping, model_checkpoint], validation_split=0.2,batch_size=64)

Trial 5 Complete [00h 00m 53s]
val_loss: 0.18549620111783346

Best val_loss So Far: 0.18549620111783346
Total elapsed time: 00h 04m 57s


In [None]:
# View summary of searching
# tuner.results_summary()

In [None]:
# View best hyperparameters
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]
print("Best hyperparameters found were: ", best_hyperparameters.values)

Best hyperparameters found were:  {'num_layers': 5, 'units_0': 64, 'units_1': 128, 'units_2': 128, 'learning_rate': 0.0019004375238737127, 'units_3': 32, 'units_4': 128, 'units_5': 96}


In [None]:
best_model = tuner.get_best_models(num_models=1)[0]
mse = best_model.evaluate(X_test, y_test)
print(f"Mean Squared Error on Test Set: {mse}")



Mean Squared Error on Test Set: [0.18968026340007782, 0.18968026340007782]


In [None]:
# View best model
# best_model = tuner.get_best_models()[0]
# best_model.summary()


In [None]:
# Evaluate the best model
# loss, acc = best_model.evaluate(X_test, y_test)
# print('ACC: ', acc)
# print('LOSS: ', loss)