In [2]:
# import libraries and convert to dataframe

import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Read the Barcelona and Madrid feature CSVs into DataFrames.
# Both reads share the same options: rows pandas cannot parse are skipped
# instead of raising.
csv_options = {'on_bad_lines': 'skip'}

df_bcn = pd.read_csv('Datasets/Final_cleaned_dataset/labled_features_bcn.csv', **csv_options)
df_mad = pd.read_csv('Datasets/Final_cleaned_dataset/labled_features_mad.csv', **csv_options)

In [3]:
from scipy import stats
import numpy as np
from sklearn.preprocessing import StandardScaler

# --- Price outlier removal ---------------------------------------------------
# Absolute z-score of every listing's price, computed separately per city.
z_scores_bcn = np.abs(stats.zscore(df_bcn['price']))
z_scores_mad = np.abs(stats.zscore(df_mad['price']))

# Listings whose price is 3 or more standard deviations from the city mean
# are treated as outliers and dropped.
threshold = 3

# .copy() gives each filtered frame its own data, so the column assignments
# further down write to a real DataFrame instead of a slice of the original
# (which is what triggers pandas' SettingWithCopyWarning).
df_bcn = df_bcn[z_scores_bcn < threshold].copy()
df_mad = df_mad[z_scores_mad < threshold].copy()

# --- Inconsistent Barcelona rows ---------------------------------------------
# Drop rows whose neighbourhood is 'Sant Andreu' or 'Gràcia' while their
# distance to the city center is recorded as '<1 km'.
# NOTE(review): the comparison below is against the *string* '<1 km', yet the
# same column is passed to StandardScaler further down, which needs numeric
# data. If the column is numeric this filter matches nothing. Confirm the
# column's dtype/units and, if appropriate, switch to a numeric comparison.
suspect_neighbourhoods = ['Sant Andreu', 'Gràcia']
is_suspect_row = (
    df_bcn['neighbourhood_group_cleansed'].isin(suspect_neighbourhoods)
    & (df_bcn['distance_from_city_center'] == '<1 km')
)
df_bcn = df_bcn[~is_suspect_row].copy()

# --- Standardization of continuous columns -----------------------------------
# Only these columns are standardized (zero mean, unit variance per city).
# 'price' is among them, so every downstream error metric on price is in
# standardized units, not in currency.
continuous_columns_bcn = ['price', 'distance_from_city_center', 'number_of_reviews', 'review_scores_rating']
continuous_columns_mad = ['price', 'distance_from_city_center', 'number_of_reviews', 'review_scores_rating']

# One scaler object, re-fitted for each city; after this cell it holds the
# Madrid statistics only.
scaler = StandardScaler()
df_bcn[continuous_columns_bcn] = scaler.fit_transform(df_bcn[continuous_columns_bcn])
df_mad[continuous_columns_mad] = scaler.fit_transform(df_mad[continuous_columns_mad])

# Binary and ordinal-encoded columns are deliberately left unscaled; the lists
# are kept for reference (the former self-assignments of these columns did
# nothing and were removed).
binary_columns = ['host_is_superhost', 'kitchen', 'patio or balcony', 'elevator', 'air conditioning']
ordinal_columns = ['bedrooms_encoded', 'room_type_encoded']

# NOTE: this cell overwrites df_bcn / df_mad, so running it again re-applies
# the outlier filter to the already-filtered data. Re-run the loading cell first.

In [4]:
# Encode the superhost flag as 1/0 (int64): 't' -> 1, anything else
# (including missing values) -> 0.
# Already-encoded values (1 / True) are accepted as well, so re-running this
# cell keeps the column intact instead of silently turning every row into 0.
superhost_true_values = ['t', 1, True]
df_bcn['host_is_superhost'] = df_bcn['host_is_superhost'].isin(superhost_true_values).astype('int64')
df_mad['host_is_superhost'] = df_mad['host_is_superhost'].isin(superhost_true_values).astype('int64')

### Update Feature and Target Variables

In [5]:
# Predictor columns used for both clustering and regression; the same set is
# used for Barcelona and Madrid. 'price' is the regression target.
feature_columns = [
    'distance_from_city_center', 'host_is_superhost', 'kitchen', 'patio or balcony',
    'elevator', 'air conditioning', 'room_type_encoded', 'bedrooms_encoded',
    'number_of_reviews', 'review_scores_rating',
]

# Independent copies, so the 'cluster' column added at the end of this cell
# never touches df_bcn / df_mad.
X_bcn = df_bcn[feature_columns].copy()
y_bcn = df_bcn['price']

X_mad = df_mad[feature_columns].copy()
y_mad = df_mad['price']

# DBSCAN is distance based, so all features (including the binary and
# ordinal ones) are standardized before clustering. The scaler is re-fitted
# per city and ends up holding the Madrid statistics.
scaler = StandardScaler()
X_bcn_scaled = scaler.fit_transform(X_bcn)
X_mad_scaled = scaler.fit_transform(X_mad)

# Density-based clustering, one model per city.
#   eps=0.5        neighbourhood radius in standardized feature space
#   min_samples=5  neighbours required for a point to count as a core point
# fit_predict returns one integer label per row; -1 marks noise points.
# NOTE(review): these are scikit-learn's default values and do not appear to
# have been tuned for this data - confirm they give meaningful clusters.
dbscan_bcn = DBSCAN(eps=0.5, min_samples=5)
clusters_bcn = dbscan_bcn.fit_predict(X_bcn_scaled)

dbscan_mad = DBSCAN(eps=0.5, min_samples=5)
clusters_mad = dbscan_mad.fit_predict(X_mad_scaled)

# Attach the labels to the (unscaled) feature frames as an extra column.
# NOTE(review): the label is later fed to LinearRegression as a plain number,
# although cluster ids are arbitrary identifiers - consider whether that is
# intended.
X_bcn['cluster'] = clusters_bcn
X_mad['cluster'] = clusters_mad



In [6]:
# Hold out 30% of each city's listings for testing. The fixed random_state
# makes the split reproducible across runs.
split_options = {'test_size': 0.3, 'random_state': 42}

X_bcn_train, X_bcn_test, y_bcn_train, y_bcn_test = train_test_split(X_bcn, y_bcn, **split_options)
X_mad_train, X_mad_test, y_mad_train, y_mad_test = train_test_split(X_mad, y_mad, **split_options)


In [7]:
# Barcelona: fit on the training split, predict the held-out listings and
# score the predictions with mean squared error.
model_bcn = LinearRegression().fit(X_bcn_train, y_bcn_train)
y_bcn_pred = model_bcn.predict(X_bcn_test)
mse_bcn = mean_squared_error(y_true=y_bcn_test, y_pred=y_bcn_pred)

# Madrid: same procedure with its own, independent model.
model_mad = LinearRegression().fit(X_mad_train, y_mad_train)
y_mad_pred = model_mad.predict(X_mad_test)
mse_mad = mean_squared_error(y_true=y_mad_test, y_pred=y_mad_pred)

# 'price' was standardized in an earlier cell, so these errors are expressed
# in standardized units rather than in currency.
print(f'MSE Barcelona: {mse_bcn}')
print(f'MSE Madrid: {mse_mad}')

MSE Barcelona: 0.8051334764651235
MSE Madrid: 0.7359847049981425


In [8]:
from sklearn.metrics import r2_score

# Coefficient of determination (R^2) on the held-out listings of each city,
# using the test-set predictions from the previous cell.
r2_bcn = r2_score(y_true=y_bcn_test, y_pred=y_bcn_pred)
r2_mad = r2_score(y_true=y_mad_test, y_pred=y_mad_pred)

print(f'R^2 for Barcelona model: {r2_bcn}')
print(f'R^2 for Madrid model: {r2_mad}')



R^2 for Barcelona model: 0.24201038930024665
R^2 for Madrid model: 0.25734659653293146


In [9]:
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Compare training and testing fit for both cities.
# Requires the train/test splits from the earlier cell:
#   X_bcn_train, X_bcn_test, y_bcn_train, y_bcn_test
#   X_mad_train, X_mad_test, y_mad_train, y_mad_test

# --- Barcelona ---
model_bcn = LinearRegression().fit(X_bcn_train, y_bcn_train)
y_bcn_train_pred = model_bcn.predict(X_bcn_train)
y_bcn_test_pred = model_bcn.predict(X_bcn_test)
r2_bcn_train = r2_score(y_true=y_bcn_train, y_pred=y_bcn_train_pred)
r2_bcn_test = r2_score(y_true=y_bcn_test, y_pred=y_bcn_test_pred)
mse_bcn = mean_squared_error(y_true=y_bcn_test, y_pred=y_bcn_test_pred)

# --- Madrid ---
model_mad = LinearRegression().fit(X_mad_train, y_mad_train)
y_mad_train_pred = model_mad.predict(X_mad_train)
y_mad_test_pred = model_mad.predict(X_mad_test)
r2_mad_train = r2_score(y_true=y_mad_train, y_pred=y_mad_train_pred)
r2_mad_test = r2_score(y_true=y_mad_test, y_pred=y_mad_test_pred)
mse_mad = mean_squared_error(y_true=y_mad_test, y_pred=y_mad_test_pred)

# Report R^2 for both splits first; a large train/test gap would point to
# over-fitting. MSE is reported for the test split only.
print(f'Barcelona Model R^2 (Training): {r2_bcn_train}')
print(f'Barcelona Model R^2 (Testing): {r2_bcn_test}')
print(f'Madrid Model R^2 (Training): {r2_mad_train}')
print(f'Madrid Model R^2 (Testing): {r2_mad_test}')

print(f'Barcelona Model MSE (Testing): {mse_bcn}')
print(f'Madrid Model MSE (Testing): {mse_mad}')


Barcelona Model R^2 (Training): 0.2271760435415715
Barcelona Model R^2 (Testing): 0.24201038930024665
Madrid Model R^2 (Training): 0.26803739599890863
Madrid Model R^2 (Testing): 0.25734659653293146
Barcelona Model MSE (Testing): 0.8051334764651235
Madrid Model MSE (Testing): 0.7359847049981425


In [10]:
def _print_linear_model(title, feature_names, weights, bias):
    """Print a heading, one 'feature: weight' line per feature, then the intercept.

    Names and weights are paired positionally with zip; all numbers are shown
    with two decimals.
    """
    print(title)
    for feature, coef in zip(feature_names, weights):
        print(f"{feature}: {coef:.2f}")
    print(f"Intercept: {bias:.2f}")


# Pull the fitted parameters out of both models.
coef_bcn, intercept_bcn = model_bcn.coef_, model_bcn.intercept_
coef_mad, intercept_mad = model_mad.coef_, model_mad.intercept_

# Feature names come from the full feature frames (which include 'cluster').
_print_linear_model("Barcelona Model Coefficients:", X_bcn.columns, coef_bcn, intercept_bcn)
_print_linear_model("\nMadrid Model Coefficients:", X_mad.columns, coef_mad, intercept_mad)

Barcelona Model Coefficients:
distance_from_city_center: -0.17
host_is_superhost: -0.00
kitchen: -0.22
patio or balcony: -0.02
elevator: 0.20
air conditioning: 0.21
room_type_encoded: 0.27
bedrooms_encoded: 0.49
number_of_reviews: 0.01
review_scores_rating: 0.01
cluster: -0.00
Intercept: -0.66

Madrid Model Coefficients:
distance_from_city_center: -0.17
host_is_superhost: 0.06
kitchen: -0.23
patio or balcony: 0.13
elevator: 0.17
air conditioning: 0.19
room_type_encoded: 0.39
bedrooms_encoded: 0.69
number_of_reviews: -0.08
review_scores_rating: -0.03
cluster: -0.00
Intercept: -0.85
