In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [2]:
# Load the dataset from CSV; the path is relative to the notebook's working directory
DATA_PATH = "datas/df_1122.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# The 'price' column is the target; every remaining column is a feature
y = df['price']
X = df.drop(columns=['price'])

# Hold out 20% of the rows for testing. Stratifying on y keeps the share of
# each target value roughly equal in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions so no test information leaks in
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [4]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [5]:
# Define the neural network architecture
def build_model(input_dim):
    """Build the baseline feed-forward regression network (not compiled).

    Architecture: Dense(128, relu) -> Dropout(0.3) -> Dense(64, relu) ->
    Dropout(0.2) -> Dense(32, relu) -> Dense(1, linear).

    Args:
        input_dim: Number of input features per sample.

    Returns:
        The uncompiled Sequential model.
    """
    model = Sequential()
    # Declare the input shape with an explicit Input object instead of passing
    # `input_dim` to the first Dense layer; Keras warns about the keyword form.
    model.add(tf.keras.Input(shape=(input_dim,)))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='linear'))  # Single linear unit for regression
    return model

# Seed Python, NumPy and TensorFlow so that weight initialization and dropout
# are repeatable when the notebook is re-run from a fresh kernel
tf.keras.utils.set_random_seed(42)

# Get the number of input features (columns of the scaled training matrix)
input_dim = X_train_scaled.shape[1]

# Build the model
model = build_model(input_dim)

# Compile the model; MSE serves as both the training loss and the reported metric
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mean_squared_error']
)

# Display the model architecture
model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [6]:
# Halt training once validation loss has gone 50 epochs without improving,
# and roll the model back to the weights of its best epoch
early_stop = EarlyStopping(
    monitor='val_loss',
    restore_best_weights=True,
    patience=50,
    verbose=1,
)

# Fit the baseline network. The epoch count is only an upper bound because the
# early-stopping callback ends training; 20% of the training data is held out
# for validation.
history = model.fit(
    X_train_scaled,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=1000,
    callbacks=[early_stop],
    verbose=2,
)


Epoch 1/1000
314/314 - 1s - 4ms/step - loss: 1.9719 - mean_squared_error: 1.9719 - val_loss: 1.0813 - val_mean_squared_error: 1.0813
Epoch 2/1000
314/314 - 1s - 2ms/step - loss: 1.1648 - mean_squared_error: 1.1648 - val_loss: 1.1072 - val_mean_squared_error: 1.1072
Epoch 3/1000
314/314 - 1s - 2ms/step - loss: 1.0292 - mean_squared_error: 1.0292 - val_loss: 1.0433 - val_mean_squared_error: 1.0433
Epoch 4/1000
314/314 - 1s - 2ms/step - loss: 0.9045 - mean_squared_error: 0.9045 - val_loss: 0.9772 - val_mean_squared_error: 0.9772
Epoch 5/1000
314/314 - 1s - 2ms/step - loss: 0.8424 - mean_squared_error: 0.8424 - val_loss: 0.9297 - val_mean_squared_error: 0.9297
Epoch 6/1000
314/314 - 0s - 2ms/step - loss: 0.8109 - mean_squared_error: 0.8109 - val_loss: 0.8697 - val_mean_squared_error: 0.8697
Epoch 7/1000
314/314 - 1s - 2ms/step - loss: 0.7584 - mean_squared_error: 0.7584 - val_loss: 1.0142 - val_mean_squared_error: 1.0142
Epoch 8/1000
314/314 - 1s - 2ms/step - loss: 0.7106 - mean_squared_er

In [7]:
# Score the held-out test rows and collapse the prediction array to 1-D
y_pred_continuous = model.predict(X_test_scaled).ravel()

# Snap each prediction to the nearest integer and bound it to the range [0, 5].
# Note: despite the printed label, this is plain nearest-integer rounding.
y_pred_rounded = np.rint(y_pred_continuous).clip(0, 5).astype(int)

# Root-mean-squared error of the rounded predictions against the true targets
test_mse = mean_squared_error(y_test, y_pred_rounded)
final_rmse = np.sqrt(test_mse)
print(f"Final RMSE (with optimized rounding): {final_rmse:.4f}")

[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 590us/step
Final RMSE (with optimized rounding): 0.8783


In [8]:
import keras_tuner as kt

def model_builder(hp):
    """Build and compile a tunable two-hidden-layer regression network.

    The input width is read from the notebook-level variable `input_dim`,
    which must already be defined when the tuner calls this function.

    Search space declared on `hp`:
        units1:        64 to 256 in steps of 32
        dropout1:      0.1 to 0.5 in steps of 0.1
        units2:        32 to 128 in steps of 32
        dropout2:      0.1 to 0.5 in steps of 0.1
        learning_rate: one of 1e-2, 1e-3, 1e-4

    Args:
        hp: KerasTuner hyperparameters object used to declare and sample
            the values above.

    Returns:
        A Sequential model compiled with Adam and mean squared error as both
        loss and metric.
    """
    model = Sequential()
    # Declare the input shape with an explicit Input object instead of passing
    # `input_dim` to the first Dense layer; Keras warns about the keyword form.
    model.add(tf.keras.Input(shape=(input_dim,)))

    # Tune the number of units in the first Dense layer
    hp_units = hp.Int('units1', min_value=64, max_value=256, step=32)
    model.add(Dense(units=hp_units, activation='relu'))

    # Tune the dropout rate
    hp_dropout = hp.Float('dropout1', min_value=0.1, max_value=0.5, step=0.1)
    model.add(Dropout(rate=hp_dropout))

    # Add a second Dense layer
    hp_units2 = hp.Int('units2', min_value=32, max_value=128, step=32)
    model.add(Dense(units=hp_units2, activation='relu'))

    # Tune the second dropout rate
    hp_dropout2 = hp.Float('dropout2', min_value=0.1, max_value=0.5, step=0.1)
    model.add(Dropout(rate=hp_dropout2))

    # Output layer: single linear unit for regression
    model.add(Dense(1, activation='linear'))

    # Tune the learning rate for the optimizer
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=hp_learning_rate),
        loss='mean_squared_error',
        metrics=['mean_squared_error']
    )

    return model

# Initialize the tuner: random search over the space declared in model_builder,
# 20 sampled configurations, each trained twice, ranked by validation MSE.
# A fixed seed makes the sampled configurations repeatable across runs.
# NOTE: `overwrite` is left at its default, so if `keras_tuner_dir` already
# holds results for this project, KerasTuner resumes from them rather than
# searching again; delete the directory (or pass overwrite=True) for a clean run.
tuner = kt.RandomSearch(
    model_builder,
    objective='val_mean_squared_error',
    max_trials=20,
    executions_per_trial=2,
    seed=42,
    directory='keras_tuner_dir',
    project_name='airbnb_price_prediction'
)

# Display search space summary
tuner.search_space_summary()

# Run the hyperparameter search. Every trial is fitted with the same settings:
# up to 1000 epochs, batches of 32, 20% of the training data held out for
# validation, and the early-stopping callback to end each fit.
tuner.search(
    X_train_scaled,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=1000,
    callbacks=[early_stop],
    verbose=1,
)

# Ask the tuner for its single best hyperparameter set and report each value
top_hyperparameter_sets = tuner.get_best_hyperparameters(num_trials=1)
best_hps = top_hyperparameter_sets[0]
print(f"""
The hyperparameter search is complete. The optimal number of units in the first layer is {best_hps.get('units1')},
the dropout rate is {best_hps.get('dropout1')}, the number of units in the second layer is {best_hps.get('units2')},
the second dropout rate is {best_hps.get('dropout2')}, and the optimal learning rate for the optimizer is {best_hps.get('learning_rate')}.
""")

# Construct a fresh model from the winning hyperparameters and train it with
# the same fit settings that were used during the search
optimized_model = tuner.hypermodel.build(best_hps)

history = optimized_model.fit(
    X_train_scaled,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=1000,
    callbacks=[early_stop],
    verbose=2,
)

# Score the test rows with the tuned model, then round each prediction to the
# nearest integer and bound it to the range [0, 5]
y_pred_continuous_opt = optimized_model.predict(X_test_scaled).ravel()
y_pred_rounded_opt = np.rint(y_pred_continuous_opt).clip(0, 5).astype(int)

# Root-mean-squared error of the tuned model's rounded predictions
test_mse_opt = mean_squared_error(y_test, y_pred_rounded_opt)
final_rmse_opt = np.sqrt(test_mse_opt)
print(f"Final RMSE after hyperparameter tuning: {final_rmse_opt:.4f}")


Trial 20 Complete [00h 00m 27s]
val_mean_squared_error: 1.2619692087173462

Best val_mean_squared_error So Far: 0.7056520283222198
Total elapsed time: 00h 16m 02s

The hyperparameter search is complete. The optimal number of units in the first layer is 160,
the dropout rate is 0.4, the number of units in the second layer is 32,
the second dropout rate is 0.4, and the optimal learning rate for the optimizer is 0.001.

Epoch 1/1000
314/314 - 1s - 3ms/step - loss: 3.6200 - mean_squared_error: 3.6200 - val_loss: 1.5969 - val_mean_squared_error: 1.5969
Epoch 2/1000
314/314 - 0s - 1ms/step - loss: 1.8474 - mean_squared_error: 1.8474 - val_loss: 1.2091 - val_mean_squared_error: 1.2091
Epoch 3/1000
314/314 - 0s - 1ms/step - loss: 1.5218 - mean_squared_error: 1.5218 - val_loss: 1.0825 - val_mean_squared_error: 1.0825
Epoch 4/1000
314/314 - 0s - 1ms/step - loss: 1.3711 - mean_squared_error: 1.3711 - val_loss: 1.1135 - val_mean_squared_error: 1.1135
Epoch 5/1000
314/314 - 0s - 1ms/step - loss: 1.

In [10]:
# Display every feature column name as a single comma-separated string
", ".join(X.columns.tolist())

"latitude, longitude, host_since, host_response_rate, host_acceptance_rate, host_is_superhost, host_listings_count, host_total_listings_count, host_has_profile_pic, host_identity_verified, calculated_host_listings_count, calculated_host_listings_count_entire_homes, calculated_host_listings_count_private_rooms, calculated_host_listings_count_shared_rooms, accommodates, bathrooms, bedrooms, beds, availability_30, availability_60, availability_90, availability_365, instant_bookable, minimum_nights, maximum_nights, number_of_reviews, number_of_reviews_ltm, number_of_reviews_l30d, first_review, last_review, review_scores_rating, review_scores_accuracy, review_scores_cleanliness, review_scores_checkin, review_scores_communication, review_scores_location, review_scores_value, reviews_per_month, shared_bathrooms, has_washer, has_dryer, has_dishwasher, has_freezer, has_bbq_grill, has_hot_tub, has_pool, has_gym, has_balcony, has_backyard, amenities_count, description_length, name_length, neighbo

In [13]:
def build_enhanced_model(input_dim):
    """Build and compile the deeper regression network.

    Four hidden stages of 512, 256, 128 and 64 units are stacked. Each stage
    is Dense -> LeakyReLU (slope 0.1) -> BatchNormalization -> Dropout, with
    dropout rates of 0.5, 0.4, 0.3 and 0.2 respectively. A single linear unit
    produces the regression output.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A Sequential model compiled with Adam (learning rate 0.001) and mean
        squared error as both loss and metric.
    """
    # Resolve these layers through `tf.keras.layers`: the notebook's import
    # cell only brings in Dense and Dropout by name, so the bare names would
    # raise NameError on a fresh kernel.
    BatchNormalization = tf.keras.layers.BatchNormalization
    LeakyReLU = tf.keras.layers.LeakyReLU

    # (units, dropout rate) for each hidden stage, in order
    hidden_stages = ((512, 0.5), (256, 0.4), (128, 0.3), (64, 0.2))

    model = Sequential()
    # Declare the input shape with an explicit Input object instead of passing
    # `input_dim` to the first Dense layer; Keras warns about the keyword form.
    model.add(tf.keras.Input(shape=(input_dim,)))

    for units, dropout_rate in hidden_stages:
        model.add(Dense(units))
        # The slope is passed positionally because its keyword differs between
        # Keras versions (`alpha` in older releases, `negative_slope` in Keras 3).
        model.add(LeakyReLU(0.1))
        model.add(BatchNormalization())
        model.add(Dropout(dropout_rate))

    # Output Layer
    model.add(Dense(1, activation='linear'))  # For regression

    # Compile the model
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='mean_squared_error',
        metrics=['mean_squared_error']
    )

    return model

# Input width = number of feature columns in the scaled training matrix
input_dim = X_train_scaled.shape[-1]

# Instantiate the enhanced network (it is compiled inside the builder)
# and print its layer-by-layer summary
model = build_enhanced_model(input_dim)
model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [14]:
# Halt training once validation loss has gone 30 epochs without improving,
# and roll the model back to the weights of its best epoch
early_stop = EarlyStopping(
    monitor='val_loss',
    restore_best_weights=True,
    patience=30,
    verbose=1,
)

# Fit the enhanced network: at most 500 epochs (early stopping ends training),
# batches of 64, and 20% of the training data held out for validation
history = model.fit(
    X_train_scaled,
    y_train,
    validation_split=0.2,
    batch_size=64,
    epochs=500,
    callbacks=[early_stop],
    verbose=2,
)


Epoch 1/500
157/157 - 2s - 15ms/step - loss: 5.7749 - mean_squared_error: 5.7749 - val_loss: 2.0345 - val_mean_squared_error: 2.0345
Epoch 2/500
157/157 - 1s - 7ms/step - loss: 2.1549 - mean_squared_error: 2.1549 - val_loss: 1.0199 - val_mean_squared_error: 1.0199
Epoch 3/500
157/157 - 1s - 9ms/step - loss: 1.5960 - mean_squared_error: 1.5960 - val_loss: 0.9217 - val_mean_squared_error: 0.9217
Epoch 4/500
157/157 - 1s - 7ms/step - loss: 1.3585 - mean_squared_error: 1.3585 - val_loss: 0.9215 - val_mean_squared_error: 0.9215
Epoch 5/500
157/157 - 1s - 7ms/step - loss: 1.2338 - mean_squared_error: 1.2338 - val_loss: 0.8903 - val_mean_squared_error: 0.8903
Epoch 6/500
157/157 - 1s - 6ms/step - loss: 1.1335 - mean_squared_error: 1.1335 - val_loss: 0.8876 - val_mean_squared_error: 0.8876
Epoch 7/500
157/157 - 1s - 7ms/step - loss: 1.0725 - mean_squared_error: 1.0725 - val_loss: 0.8665 - val_mean_squared_error: 0.8665
Epoch 8/500
157/157 - 1s - 8ms/step - loss: 1.0162 - mean_squared_error: 1.

In [16]:
# Score the held-out test rows and collapse the prediction array to 1-D
y_pred_continuous = model.predict(X_test_scaled).ravel()

# Snap each prediction to the nearest integer and bound it to the range [0, 5].
# Note: despite the printed label, this is plain nearest-integer rounding.
y_pred_rounded = np.rint(y_pred_continuous).clip(0, 5).astype(int)

# Root-mean-squared error of the rounded predictions against the true targets
test_mse = mean_squared_error(y_test, y_pred_rounded)
final_rmse = np.sqrt(test_mse)
print(f"Final RMSE (with optimized rounding): {final_rmse:.4f}")

[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Final RMSE (with optimized rounding): 0.8682
