In [2]:
# import libraries and convert to dataframe
import pandas as pd

df_bcn = pd.read_csv('Datasets/Final_cleaned_dataset/labled_features_bcn.csv', on_bad_lines='skip')
df_mad = pd.read_csv('Datasets/Final_cleaned_dataset/labled_features_mad.csv', on_bad_lines='skip')

In [3]:
from scipy import stats
import numpy as np
from sklearn.preprocessing import StandardScaler

# Calculate z-scores for price
z_scores_bcn = np.abs(stats.zscore(df_bcn['price']))
z_scores_mad = np.abs(stats.zscore(df_mad['price']))

# Define a threshold
threshold = 3

# Remove outliers
df_bcn = df_bcn[(z_scores_bcn < threshold)]
df_mad = df_mad[(z_scores_mad < threshold)]

# Find and remove rows where 'neighbourhood' is "Sant Andreu" or "Gracias" and 'distance to city center' is less than 1 km
df_bcn = df_bcn[~((df_bcn['neighbourhood_group_cleansed'] == 'Sant Andreu') & (df_bcn['distance_from_city_center'] == '<1 km'))]
df_bcn = df_bcn[~((df_bcn['neighbourhood_group_cleansed'] == 'Gràcia') & (df_bcn['distance_from_city_center'] == '<1 km'))]

from sklearn.preprocessing import StandardScaler

# Select only the continuous numerical columns to normalize
continuous_columns_bcn = ['price', 'distance_from_city_center', 'number_of_reviews', 'review_scores_rating']
continuous_columns_mad = ['price', 'distance_from_city_center', 'number_of_reviews', 'review_scores_rating']

# Initialize the scaler
scaler = StandardScaler()

# Normalize only the continuous columns
df_bcn[continuous_columns_bcn] = scaler.fit_transform(df_bcn[continuous_columns_bcn])
df_mad[continuous_columns_mad] = scaler.fit_transform(df_mad[continuous_columns_mad])

# Leave the binary and ordinal encoded variables as they are
binary_columns = ['host_is_superhost', 'kitchen', 'patio or balcony', 'elevator', 'air conditioning']
ordinal_columns = ['bedrooms_encoded', 'room_type_encoded']

# No need to scale these columns
df_bcn[binary_columns + ordinal_columns] = df_bcn[binary_columns + ordinal_columns]
df_mad[binary_columns + ordinal_columns] = df_mad[binary_columns + ordinal_columns]


In [9]:

df_bcn['host_is_superhost'] = df_bcn['host_is_superhost'].map({'f': 0, 't': 1})
df_mad['host_is_superhost'] = df_mad['host_is_superhost'].map({'f': 0, 't': 1})


In [11]:
print(df_bcn.columns.tolist())

# print(df_bcn['host_is_superhost'].unique())
# print(df_mad['host_is_superhost'].unique())

['id', 'neighbourhood_group_cleansed', 'name', 'host_id', 'host_name', 'host_is_superhost', 'calculated_host_listings_count', 'latitude', 'longitude', 'kitchen', 'patio or balcony', 'elevator', 'air conditioning', 'number_of_reviews', 'review_scores_rating', 'room_type_encoded', 'bedrooms_encoded', 'price', 'price_category', 'distance_from_city_center', 'distance_category']


### Random Forest Regressor

In [21]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Example with Barcelona data
X_bcn = df_bcn[['distance_from_city_center', 'host_is_superhost', 'kitchen', 'patio or balcony',
                'elevator', 'air conditioning', 'room_type_encoded', 'bedrooms_encoded',
                'number_of_reviews', 'review_scores_rating']]
y_bcn = df_bcn['price']

# Example with Madrid data
X_mad = df_mad[['distance_from_city_center', 'host_is_superhost', 'kitchen', 'patio or balcony',
                'elevator', 'air conditioning', 'room_type_encoded', 'bedrooms_encoded',
                'number_of_reviews', 'review_scores_rating']]
y_mad = df_mad['price']


In [22]:
# Initialize the Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)


In [17]:
from sklearn.model_selection import train_test_split

# Split for Barcelona
X_bcn_train, X_bcn_test, y_bcn_train, y_bcn_test = train_test_split(X_bcn, y_bcn, test_size=0.2, random_state=42)

# Split for Madrid
X_mad_train, X_mad_test, y_mad_train, y_mad_test = train_test_split(X_mad, y_mad, test_size=0.2, random_state=42)


In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Initialize the RandomForestRegressor model
model_bcn = RandomForestRegressor(n_estimators=100, random_state=42)
model_mad = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the RandomForestRegressor model for Barcelona
model_bcn.fit(X_bcn_train, y_bcn_train)

# Fit the RandomForestRegressor model for Madrid
model_mad.fit(X_mad_train, y_mad_train)


In [25]:
# Predictions
y_bcn_train_pred = model_bcn.predict(X_bcn_train)
y_bcn_test_pred = model_bcn.predict(X_bcn_test)
y_mad_train_pred = model_mad.predict(X_mad_train)
y_mad_test_pred = model_mad.predict(X_mad_test)


In [24]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Evaluation for Barcelona
mae_rf_bcn = mean_absolute_error(y_bcn_test, y_bcn_test_pred)
mse_rf_bcn = mean_squared_error(y_bcn_test, y_bcn_test_pred)
rmse_rf_bcn = np.sqrt(mse_rf_bcn)
r2_rf_bcn = r2_score(y_bcn_test, y_bcn_test_pred)

print("Random Forest - Barcelona:")
print(f"MAE: {mae_rf_bcn:.2f}")
print(f"MSE: {mse_rf_bcn:.2f}")
print(f"RMSE: {rmse_rf_bcn:.2f}")
print(f"R²: {r2_rf_bcn:.2f}")

# Evaluation for Madrid
mae_rf_mad = mean_absolute_error(y_mad_test, y_mad_test_pred)
mse_rf_mad = mean_squared_error(y_mad_test, y_mad_test_pred)
rmse_rf_mad = np.sqrt(mse_rf_mad)
r2_rf_mad = r2_score(y_mad_test, y_mad_test_pred)

print("\nRandom Forest - Madrid:")
print(f"MAE: {mae_rf_mad:.2f}")
print(f"MSE: {mse_rf_mad:.2f}")
print(f"RMSE: {rmse_rf_mad:.2f}")
print(f"R²: {r2_rf_mad:.2f}")

Random Forest - Barcelona:
MAE: 0.57
MSE: 0.75
RMSE: 0.86
R²: 0.34

Random Forest - Madrid:
MAE: 0.54
MSE: 0.73
RMSE: 0.86
R²: 0.28


In [26]:
# Calculate R^2 for training and testing data
r2_bcn_train = r2_score(y_bcn_train, y_bcn_train_pred)
r2_bcn_test = r2_score(y_bcn_test, y_bcn_test_pred)
r2_mad_train = r2_score(y_mad_train, y_mad_train_pred)
r2_mad_test = r2_score(y_mad_test, y_mad_test_pred)

# Calculate MSE for testing data
mse_bcn = mean_squared_error(y_bcn_test, y_bcn_test_pred)
mse_mad = mean_squared_error(y_mad_test, y_mad_test_pred)

# Print results
print(f'Barcelona Model R^2 (Training): {r2_bcn_train}')
print(f'Barcelona Model R^2 (Testing): {r2_bcn_test}')
print(f'Madrid Model R^2 (Training): {r2_mad_train}')
print(f'Madrid Model R^2 (Testing): {r2_mad_test}')
print(f'Barcelona Model MSE (Testing): {mse_bcn}')
print(f'Madrid Model MSE (Testing): {mse_mad}')

Barcelona Model R^2 (Training): 0.899487501323769
Barcelona Model R^2 (Testing): 0.33854677554886936
Madrid Model R^2 (Training): 0.9017054317905236
Madrid Model R^2 (Testing): 0.276685047490854
Barcelona Model MSE (Testing): 0.7476108231960358
Madrid Model MSE (Testing): 0.7348134442805478


In [27]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Grid search for Barcelona
grid_search_bcn = GridSearchCV(estimator=model_bcn, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search_bcn.fit(X_bcn_train, y_bcn_train)

print(f"Best Parameters for Barcelona: {grid_search_bcn.best_params_}")

# Grid search for Madrid
grid_search_mad = GridSearchCV(estimator=model_mad, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search_mad.fit(X_mad_train, y_mad_train)

print(f"Best Parameters for Madrid: {grid_search_mad.best_params_}")


Fitting 3 folds for each of 216 candidates, totalling 648 fits
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.2s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.1s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.2s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   2.3s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   2.3s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   2.4s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   1.2s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   1.2s
[



[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   2.9s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   6.5s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=  11.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   6.5s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=  11.2s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   2.6s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   2.6s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   9.2s
[CV] END bootstrap=True, max_depth=None, min_

In [29]:
# Re-initialize with best parameters
best_rf_model_bcn = RandomForestRegressor(**grid_search_bcn.best_params_)
best_rf_model_bcn.fit(X_bcn_train, y_bcn_train)
y_pred_best_rf_bcn = best_rf_model_bcn.predict(X_bcn_test)

best_rf_model_mad = RandomForestRegressor(**grid_search_mad.best_params_)
best_rf_model_mad.fit(X_mad_train, y_mad_train)
y_pred_best_rf_mad = best_rf_model_mad.predict(X_mad_test)

# Evaluate the tuned model
mae_best_rf_bcn = mean_absolute_error(y_bcn_test, y_pred_best_rf_bcn)
r2_best_rf_bcn = r2_score(y_bcn_test, y_pred_best_rf_bcn)

mae_best_rf_mad = mean_absolute_error(y_mad_test, y_pred_best_rf_mad)
r2_best_rf_mad = r2_score(y_mad_test, y_pred_best_rf_mad)

print(f"Tuned Random Forest - Barcelona: MAE: {mae_best_rf_bcn:.2f}, R²: {r2_best_rf_bcn:.2f}")
print(f"Tuned Random Forest - Madrid: MAE: {mae_best_rf_mad:.2f}, R²: {r2_best_rf_mad:.2f}")

Tuned Random Forest - Barcelona: MAE: 0.57, R²: 0.37
Tuned Random Forest - Madrid: MAE: 0.53, R²: 0.33
