In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [2]:
# Load the datasets
barcelona_df = pd.read_csv('datasets/Final_cleaned_dataset/bcn_final_cleaned_data_csv.csv', on_bad_lines='skip', sep=";")
madrid_df = pd.read_csv('datasets/Final_cleaned_dataset/mad_final_cleaned_date.csv',on_bad_lines='skip', sep=";" ) 

#Delete any missing data
barcelona_df = barcelona_df.dropna()
madrid_df = madrid_df.dropna()


In [3]:
#Chose all the relevant features that should influence in the price 
features = ['latitude', 'longitude', 'calculated_host_listings_count', 'host_is_superhost', 
            'kitchen', 'patio or balcony', 'elevator', 'air conditioning', 
            'long_term', 'short_term', 'number_of_reviews', 'review_scores_rating', 
            'room_type_encoded', 'bedrooms_encoded']

# Prepare the feature matrix X and target variable y
X_bcn = barcelona_df[features]
y_bcn = barcelona_df['price']

X_mad = madrid_df[features]
y_mad = madrid_df['price']


In [6]:
#Encode categorical host_is_superhost
# Encode host_is_superhost for Barcelona
barcelona_df['host_is_superhost_encoded'] = barcelona_df['host_is_superhost'].map({'t': 1, 'f': 0})

# Encode host_is_superhost for Madrid
madrid_df['host_is_superhost_encoded'] = madrid_df['host_is_superhost'].map({'t': 1, 'f': 0})

#Verify the encoding
print(barcelona_df[['host_is_superhost', 'host_is_superhost_encoded']].head())
print(madrid_df[['host_is_superhost', 'host_is_superhost_encoded']].head())



  host_is_superhost  host_is_superhost_encoded
0                 f                          0
1                 f                          0
2                 f                          0
3                 t                          1
4                 f                          0
  host_is_superhost  host_is_superhost_encoded
0                 f                          0
1                 f                          0
2                 f                          0
3                 f                          0
4                 f                          0


In [7]:
#Split the data into training and testing sets
X_train_bcn, X_test_bcn, y_train_bcn, y_test_bcn = train_test_split(X_bcn, y_bcn, test_size=0.3, random_state=42)
X_train_mad, X_test_mad, y_train_mad, y_test_mad = train_test_split(X_mad, y_mad, test_size=0.3, random_state=42)


In [8]:
#Train the regresion model
# Initialize the model
model_bcn = LinearRegression()
model_mad = LinearRegression()

# Train the model
model_bcn.fit(X_train_bcn, y_train_bcn)
model_mad.fit(X_train_mad, y_train_mad)


ValueError: could not convert string to float: 'f'

In [None]:
# Make predictions
y_pred_bcn = model_bcn.predict(X_test_bcn)
y_pred_mad = model_mad.predict(X_test_mad)

# Calculate metrics
mse_bcn = mean_squared_error(y_test_bcn, y_pred_bcn)
r2_bcn = r2_score(y_test_bcn, y_pred_bcn)

mse_mad = mean_squared_error(y_test_mad, y_pred_mad)
r2_mad = r2_score(y_test_mad, y_pred_mad)

print(f"Barcelona - Mean Squared Error: {mse_bcn}, R^2: {r2_bcn}")
print(f"Madrid - Mean Squared Error: {mse_mad}, R^2: {r2_mad}")
