# Predicting the price of the places.

In [1]:
import pandas as pd

df = pd.read_csv('/workspaces/Coworking/src/results/MergedPlacesScore.csv')

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 18 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   name                                              19 non-null     object 
 1   City                                              19 non-null     object 
 2   Country                                           19 non-null     object 
 3   Neighborhood                                      19 non-null     object 
 4   Population                                        19 non-null     int64  
 5   Median Household Income                           19 non-null     float64
 6   Percentage of population between 16 and 64 years  19 non-null     float64
 7   Transport                                         19 non-null     int64  
 8   Day Pass                                          19 non-null     float64
 9   Month Pass             

In [3]:
# Drop the Address column as it doesn't add predictive value
df = df.drop(columns=["Address", 'Latitude', 'Longitude'])

In [4]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode categorical features like City and Neighborhood
city_encoder = OneHotEncoder(sparse_output=False)
city_encoded = city_encoder.fit_transform(df[['City']])

neighborhood_encoder = OneHotEncoder(sparse_output=False)
neighborhood_encoded = neighborhood_encoder.fit_transform(df[['Neighborhood']])

# Create DataFrames from the one-hot encoded features
city_encoded_df = pd.DataFrame(city_encoded, columns=city_encoder.categories_[0])
neighborhood_encoded_df = pd.DataFrame(neighborhood_encoded, columns=neighborhood_encoder.categories_[0])

# Combine the original dataframe with the encoded columns
df_encoded = pd.concat([df, city_encoded_df, neighborhood_encoded_df], axis=1)

In [5]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 34 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   name                                              19 non-null     object 
 1   City                                              19 non-null     object 
 2   Country                                           19 non-null     object 
 3   Neighborhood                                      19 non-null     object 
 4   Population                                        19 non-null     int64  
 5   Median Household Income                           19 non-null     float64
 6   Percentage of population between 16 and 64 years  19 non-null     float64
 7   Transport                                         19 non-null     int64  
 8   Day Pass                                          19 non-null     float64
 9   Month Pass             

In [6]:
import numpy as np

# Load data (assumed you have this dataset loaded into 'df_encoded')
# Add log transformations for skewed variables
df_encoded['log_population'] = np.log(df_encoded['Population'] + 1)
df_encoded['log_income'] = np.log(df_encoded['Median Household Income'] + 1)
df_encoded['log_distance'] = np.log(df_encoded['distance_from_center'] + 1)
df_encoded['income_per_capita'] = df_encoded['Median Household Income'] / df_encoded['Population']

# Drop the original columns
df_encoded = df_encoded.drop(columns=['Population', 'Median Household Income', 'distance_from_center'])

In [7]:
df_encoded.columns

Index(['name', 'City', 'Country', 'Neighborhood',
       'Percentage of population between 16 and 64 years', 'Transport',
       'Day Pass', 'Month Pass', 'Rating', 'User Rating Count',
       'Weighted Rating', 'Score', 'Barcelona', 'Madrid', 'New York', 'Tokyo',
       'Adelfas', 'Akasaka', 'Ciutat Vella', 'Cortes', 'Delicias', 'Gràcia',
       'Higashishinjuku', 'Midtown', 'Nishiogi', 'Opañel', 'Sants-Montjuic',
       'Shibuya', 'Shinbashi', 'SoHo', 'Williansburg', 'log_population',
       'log_income', 'log_distance', 'income_per_capita'],
      dtype='object')

In [8]:
df_encoded.head()

Unnamed: 0,name,City,Country,Neighborhood,Percentage of population between 16 and 64 years,Transport,Day Pass,Month Pass,Rating,User Rating Count,...,Opañel,Sants-Montjuic,Shibuya,Shinbashi,SoHo,Williansburg,log_population,log_income,log_distance,income_per_capita
0,"WORKVILLE - Flexible Office Space, Conference ...",New York,USA,Midtown,0.7,6,49.0,300.0,5.0,349,...,0.0,0.0,0.0,0.0,0.0,0.0,10.981727,12.167062,1.761427,3.27182
1,Rise New York,New York,USA,Midtown,0.7,6,0.0,250.0,4.7,258,...,0.0,0.0,0.0,0.0,0.0,0.0,10.981727,12.167062,1.50945,3.27182
2,The Farm SoHo NYC - Coworking Office Space and...,New York,USA,SoHo,0.7,4,29.0,179.0,4.5,257,...,0.0,0.0,0.0,0.0,1.0,0.0,9.365291,12.073741,0.665836,15.007195
3,OASIS by Workville - Conference Center & Corpo...,New York,USA,Midtown,0.7,6,49.0,350.0,5.0,162,...,0.0,0.0,0.0,0.0,0.0,0.0,10.981727,12.167062,1.726793,3.27182
4,The New Work Project,New York,USA,Williansburg,0.6,2,40.0,370.0,5.0,126,...,0.0,0.0,0.0,0.0,0.0,1.0,11.842093,11.48692,1.638945,0.701051


## Predict the day pass.

In [10]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Load the numerical data

# Define features (X) and target variable (y)
X = df_encoded[['log_population', 'log_income', 'log_distance', 'income_per_capita'] + 
       list(city_encoded_df.columns) + list(neighborhood_encoded_df.columns)]
y = df_encoded["Day Pass"]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Ridge Regression model
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)
ridge_preds = ridge_model.predict(X_test)

# Train Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)

# Evaluate models
ridge_mae = mean_absolute_error(y_test, ridge_preds)
ridge_rmse = np.sqrt(mean_squared_error(y_test, ridge_preds))

rf_mae = mean_absolute_error(y_test, rf_preds)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_preds))

print(f'RidgeMAE=',ridge_mae, 'RidgeRMSE=',ridge_rmse, 'RandonForestMAE=',rf_mae, 'RandonForestRMSE=',rf_rmse)


RidgeMAE= 18.108500910764786 RidgeRMSE= 24.13010686858773 RandonForestMAE= 16.619300000000013 RandonForestRMSE= 23.3260422524911


In [17]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Train the Gradient Boosting model
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)

# Predict the Day Pass values using the Gradient Boosting model
gb_preds = gb_model.predict(X_test)

# Evaluate the Gradient Boosting model
gb_mae = mean_absolute_error(y_test, gb_preds)
gb_rmse = np.sqrt(mean_squared_error(y_test, gb_preds))

print(f'Gradient Boosting MAE: {gb_mae}')
print(f'Gradient Boosting RMSE: {gb_rmse}')


Gradient Boosting MAE: 15.61639462795057
Gradient Boosting RMSE: 24.978091959114323


In [21]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Convert data to DMatrix (XGBoost format)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set parameters for XGBoost
params = {
    'objective': 'reg:squarederror',   # regression task
    'max_depth': 6,                    # depth of trees
    'learning_rate': 0.1,              # step size
    'n_estimators': 100,               # number of boosting rounds
    'eval_metric': 'rmse'              # evaluation metric
}

# Train XGBoost model
xgboost_model = xgb.train(params, dtrain, num_boost_round=100)

# Make predictions on the test set
xgboost_preds = xgboost_model.predict(dtest)

# Evaluate the model
xgboost_mae = mean_absolute_error(y_test, xgboost_preds)
xgboost_rmse = np.sqrt(mean_squared_error(y_test, xgboost_preds))

print(f'XGBoost MAE: {xgboost_mae}')


XGBoost MAE: 17.80531078338623


Parameters: { "n_estimators" } are not used.



In [11]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for Ridge Regression
ridge_param_grid = {
    'alpha': [0.1, 1.0, 10.0, 100.0]
}

# Define the parameter grid for Random Forest Regressor
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30]
}

# Initialize GridSearchCV for Ridge Regression
ridge_grid_search = GridSearchCV(Ridge(), ridge_param_grid, cv=5, scoring='neg_mean_absolute_error')
ridge_grid_search.fit(X_train, y_train)

# Initialize GridSearchCV for Random Forest Regressor
rf_grid_search = GridSearchCV(RandomForestRegressor(random_state=42), rf_param_grid, cv=5, scoring='neg_mean_absolute_error')
rf_grid_search.fit(X_train, y_train)

# Get the best parameters and best score for Ridge Regression
best_ridge_params = ridge_grid_search.best_params_
best_ridge_score = -ridge_grid_search.best_score_

# Get the best parameters and best score for Random Forest Regressor
best_rf_params = rf_grid_search.best_params_
best_rf_score = -rf_grid_search.best_score_

print(f'Best Ridge Params: {best_ridge_params}, Best Ridge MAE: {best_ridge_score}')
print(f'Best RF Params: {best_rf_params}, Best RF MAE: {best_rf_score}')

Best Ridge Params: {'alpha': 10.0}, Best Ridge MAE: 5.860511494517161
Best RF Params: {'max_depth': None, 'n_estimators': 50}, Best RF MAE: 4.713026666666664


In [12]:
# Train the best Random Forest model using the best parameters
best_rf_model = RandomForestRegressor(**best_rf_params, random_state=42)
best_rf_model.fit(X_train, y_train)

# Predict the Day Pass values using the best model
best_rf_preds = best_rf_model.predict(X_test)

# Evaluate the best model
best_rf_mae = mean_absolute_error(y_test, best_rf_preds)
best_rf_rmse = np.sqrt(mean_squared_error(y_test, best_rf_preds))

print(f'Best Random Forest MAE: {best_rf_mae}')
print(f'Best Random Forest RMSE: {best_rf_rmse}')

Best Random Forest MAE: 16.8203
Best Random Forest RMSE: 23.63340225909084


In [13]:
df_encoded["Predicted_Price_RF"] = best_rf_model.predict(X) 

In [14]:
import joblib

# Save the model
joblib.dump(best_rf_model, "/workspaces/Coworking/src/results/random_forest_model.pkl")

['/workspaces/Coworking/src/results/random_forest_model.pkl']

In [None]:
import numpy as np

# Define a function to make predictions
def predict_price(city, transport_access):
    # Load the model
    model = joblib.load("/workspaces/Coworking/src/results/random_forest_model.pkl")
    
    # Create a DataFrame with the input data
    data = {
        'City': [city],
        'Transport Access': [transport_access]
    }
    df = pd.DataFrame(data)
    
    # One-hot encode the categorical features
    city_encoded = city_encoder.transform(df[['City']])
    
    # Combine the original dataframe with the encoded columns
    df_encoded = pd.concat([df, city_encoded_df], axis=1)
    
    # Make a prediction
    prediction = model.predict(df_encoded)
    
    return prediction[0]


In [24]:
#Test the function
predict_price('Madrid', 5)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- City
- Transport Access
Feature names seen at fit time, yet now missing:
- Adelfas
- Akasaka
- Ciutat Vella
- Cortes
- Delicias
- ...


In [None]:
user_max_price = 100  # Example value
user_min_rating = 4.0  # Example value
user_max_distance = 10  # Example in km

filtered_spaces = df[
    (df["Predicted_Price_RF"] <= user_max_price) &
    (df["Rating"] >= user_min_rating) &
    (df["log_distance"] <= user_max_distance)
]

filtered_spaces = filtered_spaces.sort_values(by="Predicted_Price_RF")

filtered_spaces.head()


Unnamed: 0,name,Percentage of population between 16 and 64 years,Transport,Day Pass,Month Pass,Latitude,Longitude,Rating,User Rating Count,Weighted Rating,...,Neightboorhood_Sants-Montjuic,Neightboorhood_Shibuya,Neightboorhood_Shinbashi,Neightboorhood_SoHo,Neightboorhood_Williansburg,log_population,log_income,log_distance,income_per_capita,Predicted_Price_RF
18,【法人登記可】コワーキングスペースfactoria nishiogi,0.65,3,20.67,310.05,35.704759,139.60086,4.9,47,230.3,...,False,False,False,False,False,11.289794,10.678385,2.74465,0.54258,22.241
12,EslabON Coworking,0.7,4,21.74,259.42,40.38691,-3.723964,4.9,55,269.5,...,False,False,False,False,False,10.430698,10.708075,1.554868,1.319673,23.2414
15,Pasela Co-work Higashishinjuku,0.65,3,20.67,310.05,35.698054,139.709368,4.2,248,1041.6,...,False,False,False,False,False,10.126671,10.747376,1.764334,1.860273,23.3578
14,Basis Point Shinbashi,0.65,4,25.84,413.39,35.666838,139.759638,4.3,328,1410.4,...,False,False,True,False,False,11.512935,11.115094,1.0208,0.671765,26.7836
13,Area Coworking,0.7,4,28.99,347.83,40.397134,-3.691715,4.9,37,181.3,...,False,False,False,False,False,10.260322,11.09417,1.227493,2.302205,28.5108


In [None]:
filtered_spaces.columns

Index(['name', 'Percentage of population between 16 and 64 years', 'Transport',
       'Day Pass', 'Month Pass', 'Latitude', 'Longitude', 'Rating',
       'User Rating Count', 'Weighted Rating', 'Score', 'Country_Spain',
       'Country_USA', 'Neightboorhood_Akasaka', 'Neightboorhood_Ciutat Vella',
       'Neightboorhood_Cortes', 'Neightboorhood_Delicias',
       'Neightboorhood_Gràcia', 'Neightboorhood_Higashishinjuku',
       'Neightboorhood_Midtown', 'Neightboorhood_Nishiogi',
       'Neightboorhood_Opañel', 'Neightboorhood_Sants-Montjuic',
       'Neightboorhood_Shibuya', 'Neightboorhood_Shinbashi',
       'Neightboorhood_SoHo', 'Neightboorhood_Williansburg', 'log_population',
       'log_income', 'log_distance', 'income_per_capita', 'Predicted_Price_RF',
       'Predicted Score'],
      dtype='object')

In [None]:
normalized_price = (user_max_price - filtered_spaces["Predicted_Price_RF"]) / user_max_price
normalized_rating = (filtered_spaces["Rating"] - user_min_rating) / (5 - user_min_rating)
normalized_distance = (user_max_distance - filtered_spaces["log_distance"]) / user_max_distance
normalize_score = (normalized_price + normalized_rating + normalized_distance) / 3
normalize_user_count = (filtered_spaces["User Rating Count"] - filtered_spaces["User Rating Count"].min()) / (filtered_spaces["User Rating Count"].max() - filtered_spaces["User Rating Count"].min())

In [None]:
w_transport = 0.1  # Example weight for transport quality

normalized_transport = filtered_spaces["Transport"] / filtered_spaces["Transport"].max()

predicted_score = 0.4 * normalized_price + 0.4 * normalized_rating + 0.2 * normalized_distance
predicted_score += w_transport * normalized_transport

filtered_spaces["Predicted Score"] = predicted_score

filtered_spaces = filtered_spaces.sort_values(by="Predicted Score", ascending=False)

In [None]:
filtered_spaces.head()

Unnamed: 0,name,Percentage of population between 16 and 64 years,Transport,Day Pass,Month Pass,Latitude,Longitude,Rating,User Rating Count,Weighted Rating,...,Neightboorhood_Shibuya,Neightboorhood_Shinbashi,Neightboorhood_SoHo,Neightboorhood_Williansburg,log_population,log_income,log_distance,income_per_capita,Predicted_Price_RF,Predicted Score
12,EslabON Coworking,0.7,4,21.74,259.42,40.38691,-3.723964,4.9,55,269.5,...,False,False,False,False,10.430698,10.708075,1.554868,1.319673,23.2414,0.875937
13,Area Coworking,0.7,4,28.99,347.83,40.397134,-3.691715,4.9,37,181.3,...,False,False,False,False,10.260322,11.09417,1.227493,2.302205,28.5108,0.861407
0,"WORKVILLE - Flexible Office Space, Conference ...",0.7,6,49.0,300.0,40.753615,-73.986561,5.0,349,1745.0,...,False,False,False,False,10.981727,12.167062,1.761427,3.27182,44.2046,0.847953
18,【法人登記可】コワーキングスペースfactoria nishiogi,0.65,3,20.67,310.05,35.704759,139.60086,4.9,47,230.3,...,False,False,False,False,11.289794,10.678385,2.74465,0.54258,22.241,0.846143
3,OASIS by Workville - Conference Center & Corpo...,0.7,6,49.0,350.0,40.753345,-73.993604,5.0,162,810.0,...,False,False,False,False,10.981727,12.167062,1.726793,3.27182,44.8816,0.845938


In [None]:
filtered_spaces.to_csv('/workspaces/Coworking/src/results/RecommendedSpaces.csv', index=False)