# Predicting the price of the places.

In [1]:
import pandas as pd

df = pd.read_csv('/workspaces/Coworking/src/results/MergedPlacesScore.csv')

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 18 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   name                                              19 non-null     object 
 1   City                                              19 non-null     object 
 2   Country                                           19 non-null     object 
 3   Neighborhood                                      19 non-null     object 
 4   Population                                        19 non-null     int64  
 5   Median Household Income                           19 non-null     float64
 6   Percentage of population between 16 and 64 years  19 non-null     float64
 7   Transport                                         19 non-null     int64  
 8   Day Pass                                          19 non-null     float64
 9   Month Pass             

In [3]:
# Drop the Address column as it doesn't add predictive value
df = df.drop(columns=["Address", 'Latitude', 'Longitude'])

In [4]:
from sklearn.preprocessing import OneHotEncoder
import joblib

# One-hot encode categorical features like City and Neighborhood
city_encoder = OneHotEncoder(sparse_output=False)
city_encoded = city_encoder.fit_transform(df[['City']])

neighborhood_encoder = OneHotEncoder(sparse_output=False)
neighborhood_encoded = neighborhood_encoder.fit_transform(df[['Neighborhood']])

# Create DataFrames from the one-hot encoded features
city_encoded_df = pd.DataFrame(city_encoded, columns=city_encoder.categories_[0])
neighborhood_encoded_df = pd.DataFrame(neighborhood_encoded, columns=neighborhood_encoder.categories_[0])

# Save the encoders
joblib.dump(city_encoder, "/workspaces/Coworking/src/results/city_encoder.pkl")
joblib.dump(neighborhood_encoder, "/workspaces/Coworking/src/results/neighborhood_encoder.pkl")

# Combine the original dataframe with the encoded columns
df_encoded = pd.concat([df, city_encoded_df, neighborhood_encoded_df], axis=1)

In [5]:
city_encoded_df.head()

Unnamed: 0,Barcelona,Madrid,New York,Tokyo
0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0


In [6]:
city_encoded_df.columns

Index(['Barcelona', 'Madrid', 'New York', 'Tokyo'], dtype='object')

In [7]:
neighborhood_encoded_df.columns

Index(['Adelfas', 'Akasaka', 'Ciutat Vella', 'Cortes', 'Delicias', 'Gràcia',
       'Higashishinjuku', 'Midtown', 'Nishiogi', 'Opañel', 'Sants-Montjuic',
       'Shibuya', 'Shinbashi', 'SoHo', 'Williansburg'],
      dtype='object')

In [8]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 34 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   name                                              19 non-null     object 
 1   City                                              19 non-null     object 
 2   Country                                           19 non-null     object 
 3   Neighborhood                                      19 non-null     object 
 4   Population                                        19 non-null     int64  
 5   Median Household Income                           19 non-null     float64
 6   Percentage of population between 16 and 64 years  19 non-null     float64
 7   Transport                                         19 non-null     int64  
 8   Day Pass                                          19 non-null     float64
 9   Month Pass             

In [9]:
import numpy as np

# Add log transformations for skewed variables
df_encoded['log_population'] = np.log(df_encoded['Population'] + 1)
df_encoded['log_income'] = np.log(df_encoded['Median Household Income'] + 1)
df_encoded['log_distance'] = np.log(df_encoded['distance_from_center'] + 1)
df_encoded['income_per_capita'] = df_encoded['Median Household Income'] / df_encoded['Population']

# Drop the original columns
df_encoded = df_encoded.drop(columns=['Population', 'Median Household Income', 'distance_from_center'])

In [10]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 35 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   name                                              19 non-null     object 
 1   City                                              19 non-null     object 
 2   Country                                           19 non-null     object 
 3   Neighborhood                                      19 non-null     object 
 4   Percentage of population between 16 and 64 years  19 non-null     float64
 5   Transport                                         19 non-null     int64  
 6   Day Pass                                          19 non-null     float64
 7   Month Pass                                        19 non-null     float64
 8   Rating                                            19 non-null     float64
 9   User Rating Count      

In [11]:
from sklearn.preprocessing import MinMaxScaler

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Numerical columns
numerical_columns = ['log_population', 'log_income', 'log_distance', 'income_per_capita']

# Convert string columns to float
df_encoded[numerical_columns] = df_encoded[numerical_columns].astype(float)

# Apply the scaler to the numerical columns of the dataframe
df_encoded_scaled = pd.DataFrame(scaler.fit_transform(df_encoded[numerical_columns]), columns=numerical_columns)

# Save the scaler for future use
joblib.dump(scaler, "/workspaces/Coworking/src/results/minmax_scaler.pkl")

df_encoded_scaled.head()


Unnamed: 0,log_population,log_income,log_distance,income_per_capita
0,0.553769,1.0,0.600036,0.204923
1,0.553769,1.0,0.497535,0.204923
2,0.024998,0.960482,0.154363,1.0
3,0.553769,1.0,0.585948,0.204923
4,0.835214,0.711988,0.550212,0.030753


In [12]:
# Concatenate the scaled numerical columns to the original encoded dataframe
df_final = pd.concat([df_encoded.drop(columns=numerical_columns), df_encoded_scaled], axis=1)

df_final.head()

Unnamed: 0,name,City,Country,Neighborhood,Percentage of population between 16 and 64 years,Transport,Day Pass,Month Pass,Rating,User Rating Count,...,Opañel,Sants-Montjuic,Shibuya,Shinbashi,SoHo,Williansburg,log_population,log_income,log_distance,income_per_capita
0,"WORKVILLE - Flexible Office Space, Conference ...",New York,USA,Midtown,0.7,6,49.0,300.0,5.0,349,...,0.0,0.0,0.0,0.0,0.0,0.0,0.553769,1.0,0.600036,0.204923
1,Rise New York,New York,USA,Midtown,0.7,6,0.0,250.0,4.7,258,...,0.0,0.0,0.0,0.0,0.0,0.0,0.553769,1.0,0.497535,0.204923
2,The Farm SoHo NYC - Coworking Office Space and...,New York,USA,SoHo,0.7,4,29.0,179.0,4.5,257,...,0.0,0.0,0.0,0.0,1.0,0.0,0.024998,0.960482,0.154363,1.0
3,OASIS by Workville - Conference Center & Corpo...,New York,USA,Midtown,0.7,6,49.0,350.0,5.0,162,...,0.0,0.0,0.0,0.0,0.0,0.0,0.553769,1.0,0.585948,0.204923
4,The New Work Project,New York,USA,Williansburg,0.6,2,40.0,370.0,5.0,126,...,0.0,0.0,0.0,0.0,0.0,1.0,0.835214,0.711988,0.550212,0.030753


In [13]:
df_final.columns

Index(['name', 'City', 'Country', 'Neighborhood',
       'Percentage of population between 16 and 64 years', 'Transport',
       'Day Pass', 'Month Pass', 'Rating', 'User Rating Count',
       'Weighted Rating', 'Score', 'Barcelona', 'Madrid', 'New York', 'Tokyo',
       'Adelfas', 'Akasaka', 'Ciutat Vella', 'Cortes', 'Delicias', 'Gràcia',
       'Higashishinjuku', 'Midtown', 'Nishiogi', 'Opañel', 'Sants-Montjuic',
       'Shibuya', 'Shinbashi', 'SoHo', 'Williansburg', 'log_population',
       'log_income', 'log_distance', 'income_per_capita'],
      dtype='object')

In [14]:
import pandas as pd

df_final.to_csv('/workspaces/Coworking/src/results/PreprocessedData.csv', index=False)

## Predict the day pass.

In [15]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Load the numerical data

# Define features (X) and target variable (y)
X = df_final.drop(columns=["Day Pass", 'name', 'City', 'Neighborhood', 'Country'])
y = df_final["Day Pass"]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Ridge Regression model
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)
ridge_preds = ridge_model.predict(X_test)

# Train Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)

# Evaluate models
ridge_mae = mean_absolute_error(y_test, ridge_preds)
ridge_rmse = np.sqrt(mean_squared_error(y_test, ridge_preds))

rf_mae = mean_absolute_error(y_test, rf_preds)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_preds))

print(f'RidgeMAE=',ridge_mae, 'RidgeRMSE=',ridge_rmse, 'RandonForestMAE=',rf_mae, 'RandonForestRMSE=',rf_rmse)


RidgeMAE= 18.450434011375556 RidgeRMSE= 22.791071848246606 RandonForestMAE= 14.639150000000004 RandonForestRMSE= 19.55457988771428


In [16]:
from sklearn.model_selection import cross_val_score

# Define the model
ridge_model = Ridge(alpha=1.0)

# Perform k-fold cross-validation
k = 5  # Number of folds
scores = cross_val_score(ridge_model, X, y, cv=k, scoring='neg_mean_absolute_error')

# Convert scores to positive MAE
mae_scores = -scores
print(f"MAE for each fold: {mae_scores}")
print(f"Average MAE: {mae_scores.mean()}")

MAE for each fold: [13.42677994 12.0620836   6.7893682  26.49668476 14.79779606]
Average MAE: 14.714542511425702


In [32]:
from sklearn.model_selection import KFold

# Initialize KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Loop through folds and inspect Fold 4
for fold, (train_idx, test_idx) in enumerate(kf.split(X)):
    if fold == 3:  # Fold 4 (0-indexed)
        print("Fold 4 Test Indices:", test_idx)
        print("Fold 4 Test Data:")
        print(df_final.iloc[test_idx])

Fold 4 Test Indices: [ 4  7 12 18]
Fold 4 Test Data:
                                        name       City Country  Neighborhood  \
4                       The New Work Project   New York     USA  Williansburg   
7   OneCoWork Catedral | Barcelona Coworking  Barcelona   Spain  Ciutat Vella   
12                         EslabON Coworking     Madrid   Spain        Opañel   
18        【法人登記可】コワーキングスペースfactoria nishiogi      Tokyo   Japan      Nishiogi   

    Percentage of population between 16 and 64 years  Transport  Day Pass  \
4                                               0.60          2     40.00   
7                                               0.75          5     36.23   
12                                              0.70          4     21.74   
18                                              0.65          3     20.67   

    Month Pass  Rating  User Rating Count  ...  Opañel  Sants-Montjuic  \
4       370.00     5.0                126  ...     0.0             0.0   
7      

In [17]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for Ridge Regression
ridge_param_grid = {'alpha': [0.1, 1.0, 10.0, 100.0]}

# Initialize GridSearchCV with k-fold cross-validation
ridge_grid_search = GridSearchCV(Ridge(), ridge_param_grid, cv=5, scoring='neg_mean_absolute_error')
ridge_grid_search.fit(X, y)

# Get the best parameters
best_ridge_params = ridge_grid_search.best_params_
print(f"Best Ridge Params: {best_ridge_params}")

Best Ridge Params: {'alpha': 100.0}


In [18]:
# Train the best Ridge Regression model using the best parameters
best_ridge_model = Ridge(**best_ridge_params)
best_ridge_model.fit(X_train, y_train)

# Predict the Month Pass values using the best model
best_ridge_preds = best_ridge_model.predict(X_test)

# Evaluate the best model
best_ridge_mae = mean_absolute_error(y_test, best_ridge_preds)
best_ridge_rmse = np.sqrt(mean_squared_error(y_test, best_ridge_preds))

print(f'Best Ridge MAE: {best_ridge_mae}')
print(f'Best Ridge RMSE: {best_ridge_rmse}')

Best Ridge MAE: 17.8252204741847
Best Ridge RMSE: 21.066337044130783


In [19]:
import joblib

# Save the model
joblib.dump(best_ridge_model, "/workspaces/Coworking/src/results/day_ridge_model.pkl")

['/workspaces/Coworking/src/results/day_ridge_model.pkl']

In [20]:
import joblib

# Load the trained model
model_path = "/workspaces/Coworking/src/results/day_ridge_model.pkl"
day_pass_model = joblib.load(model_path)

# Extract feature names used during training
trained_features = day_pass_model.feature_names_in_
print("Model trained with features:", trained_features)


Model trained with features: ['Percentage of population between 16 and 64 years' 'Transport'
 'Month Pass' 'Rating' 'User Rating Count' 'Weighted Rating' 'Score'
 'Barcelona' 'Madrid' 'New York' 'Tokyo' 'Adelfas' 'Akasaka'
 'Ciutat Vella' 'Cortes' 'Delicias' 'Gràcia' 'Higashishinjuku' 'Midtown'
 'Nishiogi' 'Opañel' 'Sants-Montjuic' 'Shibuya' 'Shinbashi' 'SoHo'
 'Williansburg' 'log_population' 'log_income' 'log_distance'
 'income_per_capita']


In [21]:
def predict_day_pass(city, neighborhood, df_final, model):
    # Get the trained features from the model
    trained_features = model.feature_names_in_

    # Create an input DataFrame with the same structure as df_final
    input_data = pd.DataFrame(columns=trained_features)
    input_data.loc[0] = 0  # Initialize all values to zero

    # Fill in numerical features using dataset mean
    num_features = ['log_population', 'log_income', 'log_distance', 'income_per_capita']
    for feature in num_features:
        if feature in trained_features and feature in df_final.columns:
            input_data.loc[0, feature] = df_final[feature].mean()

    # Fill in one-hot encoded city and neighborhood directly from df_final
    if city in df_final.columns and city in trained_features:
        input_data.loc[0, city] = 1
    if neighborhood in df_final.columns and neighborhood in trained_features:
        input_data.loc[0, neighborhood] = 1

    # Convert to float and match trained features
    input_data = input_data.astype(float)
    input_data = input_data[trained_features]  # Ensure correct feature order

    # Predict the price
    prediction = model.predict(input_data)
    
    return prediction[0]


In [22]:
# Example: Predict the price for a coworking space in Barcelona, Ciutat Vella
predicted_price = predict_day_pass(city="Barcelona", neighborhood="Ciutat Vella", df_final=df_final, model=day_pass_model)

print(f"Predicted Day Pass Price: ${predicted_price:.2f}")


Predicted Day Pass Price: $14.03


  input_data.loc[0, feature] = df_final[feature].mean()
  input_data.loc[0, feature] = df_final[feature].mean()
  input_data.loc[0, feature] = df_final[feature].mean()
  input_data.loc[0, feature] = df_final[feature].mean()


In [23]:
predicted_price = predict_day_pass(city='Madrid', neighborhood='Delicias', df_final=df_final, model=day_pass_model)

print(f"Predicted Day Pass Price: ${predicted_price:.2f}")

Predicted Day Pass Price: $14.13


  input_data.loc[0, feature] = df_final[feature].mean()
  input_data.loc[0, feature] = df_final[feature].mean()
  input_data.loc[0, feature] = df_final[feature].mean()
  input_data.loc[0, feature] = df_final[feature].mean()


## Predict the Month Pass

In [24]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Load the numerical data

# Define features (X) and target variable (y)
X = df_final.drop(columns=['name', 'City', 'Neighborhood', 'Country', 'Month Pass'])
y = df_final["Month Pass"]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Ridge Regression model
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)
ridge_preds = ridge_model.predict(X_test)

# Train Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)

# Evaluate models
ridge_mae = mean_absolute_error(y_test, ridge_preds)
ridge_rmse = np.sqrt(mean_squared_error(y_test, ridge_preds))

rf_mae = mean_absolute_error(y_test, rf_preds)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_preds))

print(f'RidgeMAE=',ridge_mae, 'RidgeRMSE=',ridge_rmse, 'RandonForestMAE=',rf_mae, 'RandonForestRMSE=',rf_rmse)


RidgeMAE= 87.6087183664276 RidgeRMSE= 123.14144090932305 RandonForestMAE= 72.71329999999995 RandonForestRMSE= 85.15230542615978


In [25]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for Ridge Regression
ridge_param_grid = {
    'alpha': [0.1, 1.0, 10.0, 100.0]
}

# Define the parameter grid for Random Forest Regressor
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30]
}

# Initialize GridSearchCV for Ridge Regression
ridge_grid_search = GridSearchCV(Ridge(), ridge_param_grid, cv=5, scoring='neg_mean_absolute_error')
ridge_grid_search.fit(X_train, y_train)

# Initialize GridSearchCV for Random Forest Regressor
rf_grid_search = GridSearchCV(RandomForestRegressor(random_state=42), rf_param_grid, cv=5, scoring='neg_mean_absolute_error')
rf_grid_search.fit(X_train, y_train)

# Get the best parameters and best score for Ridge Regression
best_ridge_params = ridge_grid_search.best_params_
best_ridge_score = -ridge_grid_search.best_score_

# Get the best parameters and best score for Random Forest Regressor
best_rf_params = rf_grid_search.best_params_
best_rf_score = -rf_grid_search.best_score_

print(f'Best Ridge Params: {best_ridge_params}, Best Ridge MAE: {best_ridge_score}')
print(f'Best RF Params: {best_rf_params}, Best RF MAE: {best_rf_score}')

Best Ridge Params: {'alpha': 1.0}, Best Ridge MAE: 59.160034902410004
Best RF Params: {'max_depth': None, 'n_estimators': 200}, Best RF MAE: 66.35418333333334


In [26]:
# Train the best Ridge Regression model using the best parameters
best_ridge_model = Ridge(**best_ridge_params)
best_ridge_model.fit(X_train, y_train)

# Predict the Month Pass values using the best model
best_ridge_preds = best_ridge_model.predict(X_test)

# Evaluate the best model
best_ridge_mae = mean_absolute_error(y_test, best_ridge_preds)
best_ridge_rmse = np.sqrt(mean_squared_error(y_test, best_ridge_preds))

print(f'Best Ridge MAE: {best_ridge_mae}')
print(f'Best Ridge RMSE: {best_ridge_rmse}')

Best Ridge MAE: 87.6087183664276
Best Ridge RMSE: 123.14144090932305


In [27]:
import joblib

# Save the model
joblib.dump(best_ridge_model, "/workspaces/Coworking/src/results/month_ridge_model.pkl")

['/workspaces/Coworking/src/results/month_ridge_model.pkl']

In [28]:
import joblib

# Load the trained model
model_path = "/workspaces/Coworking/src/results/month_ridge_model.pkl"
month_pass_model = joblib.load(model_path)

# Extract feature names used during training
trained_features = month_pass_model.feature_names_in_
print("Model trained with features:", trained_features)


Model trained with features: ['Percentage of population between 16 and 64 years' 'Transport' 'Day Pass'
 'Rating' 'User Rating Count' 'Weighted Rating' 'Score' 'Barcelona'
 'Madrid' 'New York' 'Tokyo' 'Adelfas' 'Akasaka' 'Ciutat Vella' 'Cortes'
 'Delicias' 'Gràcia' 'Higashishinjuku' 'Midtown' 'Nishiogi' 'Opañel'
 'Sants-Montjuic' 'Shibuya' 'Shinbashi' 'SoHo' 'Williansburg'
 'log_population' 'log_income' 'log_distance' 'income_per_capita']


In [29]:
def predict_month_pass(city, neighborhood, df_final, model):
    # Get the trained features from the model
    trained_features = model.feature_names_in_

    # Create an input DataFrame with the same structure as df_final
    input_data = pd.DataFrame(columns=trained_features)
    input_data.loc[0] = 0  # Initialize all values to zero

    # Fill in numerical features using dataset mean
    num_features = ['log_population', 'log_income', 'log_distance', 'income_per_capita']
    for feature in num_features:
        if feature in trained_features and feature in df_final.columns:
            input_data.loc[0, feature] = df_final[feature].mean()

    # Fill in one-hot encoded city and neighborhood directly from df_final
    if city in df_final.columns and city in trained_features:
        input_data.loc[0, city] = 1
    if neighborhood in df_final.columns and neighborhood in trained_features:
        input_data.loc[0, neighborhood] = 1

    # Convert to float and match trained features
    input_data = input_data.astype(float)
    input_data = input_data[trained_features]  # Ensure correct feature order

    # Debugging: Check the final input data
    print("Final Input Data:")
    print(input_data)

    # Make a prediction
    prediction = model.predict(input_data)
    
    # Debugging: Check the prediction
    print("Prediction:", prediction)

    return prediction[0]


In [30]:
predicted_price = predict_day_pass(city="Barcelona", neighborhood="Ciutat Vella", df_final=df_final, model=month_pass_model)

print(f"Predicted Month Pass Price: ${predicted_price:.2f}")


Predicted Month Pass Price: $151.74


  input_data.loc[0, feature] = df_final[feature].mean()
  input_data.loc[0, feature] = df_final[feature].mean()
  input_data.loc[0, feature] = df_final[feature].mean()
  input_data.loc[0, feature] = df_final[feature].mean()


In [31]:
predicted_price = predict_day_pass(city='Madrid', neighborhood='Delicias', df_final=df_final, model=month_pass_model)

print(f"Predicted Month Pass Price: ${predicted_price:.2f}")

Predicted Month Pass Price: $155.92


  input_data.loc[0, feature] = df_final[feature].mean()
  input_data.loc[0, feature] = df_final[feature].mean()
  input_data.loc[0, feature] = df_final[feature].mean()
  input_data.loc[0, feature] = df_final[feature].mean()
