In [1]:
import ppscore as pps
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import trim_mean, mode, skew, gaussian_kde, pearsonr, spearmanr
from folium import Choropleth, Circle, Marker, Icon, Map
from folium.plugins import HeatMap, MarkerCluster
from sklearn.model_selection import train_test_split


In [2]:
pd.set_option("display.max_columns", 60)
db= pd.read_csv("databases/houses_Madrid_2.csv")
db=db.drop(['Unnamed: 0'],axis=1)
db.head()

Unnamed: 0,id,title,neighborhood,sq_mt_built,n_rooms,n_bathrooms,n_floors,floor,neighborhood_id,operation,rent_price,buy_price,house_type_id,is_renewal_needed,is_new_development,built_year,has_central_heating,has_individual_heating,has_ac,has_fitted_wardrobes,has_lift,is_exterior,has_garden,has_pool,has_terrace,has_balcony,has_storage_room,has_green_zones,energy_certificate,has_parking,is_parking_included_in_price,district,price_sqm,year_profit
0,21742,"Piso en venta en calle de Godella, 64","San Cristóbal, Madrid",64.0,2.0,1.0,1.0,3,135,sale,471.0,85000.0,HouseType 1: Pisos,False,False,1960.0,not_specified,not_specified,True,False,False,True,False,False,False,False,False,False,D,False,False,Villaverde,1328.125,6.649412
1,21741,Piso en venta en calle de la del Manojo de Rosas,"Los Ángeles, Madrid",70.0,3.0,1.0,1.0,4,132,sale,666.0,129900.0,HouseType 1: Pisos,True,False,not_specified,not_specified,not_specified,False,True,True,True,False,False,True,False,False,False,en trámite,False,False,Villaverde,1855.714286,6.152425
2,21740,"Piso en venta en calle del Talco, 68","San Andrés, Madrid",94.0,2.0,2.0,1.0,1,134,sale,722.0,144247.0,HouseType 1: Pisos,False,False,not_specified,False,True,False,True,True,True,False,False,False,False,True,False,no indicado,False,False,Villaverde,1534.542553,6.006364
3,21739,Piso en venta en calle Pedro Jiménez,"San Andrés, Madrid",64.0,2.0,1.0,1.0,Bajo,134,sale,583.0,109900.0,HouseType 1: Pisos,False,False,1955.0,not_specified,not_specified,False,False,True,True,False,False,False,False,True,False,en trámite,False,False,Villaverde,1717.1875,6.365787
4,21738,Piso en venta en carretera de Villaverde a Val...,"Los Rosales, Madrid",108.0,2.0,2.0,1.0,4,133,sale,1094.0,260000.0,HouseType 1: Pisos,False,False,2003.0,not_specified,not_specified,True,True,True,True,False,True,False,False,True,True,en trámite,True,True,Villaverde,2407.407407,5.049231


In [3]:
db.district.unique()

array(['Villaverde', 'Vicalvaro', 'Villa de Vallecas', 'Usera', 'Tetuan',
       'Retiro', 'Puente de Vallecas', 'Moncloa-Aravaca', 'Moratalaz',
       'Latina', 'Fuencarral-Pardo', 'Hortaleza', 'Chamberi',
       'Ciudad Lineal', 'Chamartin', 'Centro', 'Carabanchel', 'Salamanca',
       'Arganzuela', 'Barajas'], dtype=object)

### Creating a model

In [None]:
#This is another interesting part of the project
#We create a supervised model which allows us to see, district by district, which prices are overvalued and which are undervalued, as well as to calculate their average price error
#This gives us a huge competitive advantage because without having to fully analyze each district's housing stock, we find great opportunities

In [4]:
dbModel= db[db.district.isin(['Carabanchel'])]


In [5]:
features_cols = ['sq_mt_built','n_rooms','n_bathrooms','has_lift','rent_price']
X = dbModel[features_cols]
y = dbModel['buy_price']


In [6]:
X.dtypes

sq_mt_built    float64
n_rooms        float64
n_bathrooms    float64
has_lift          bool
rent_price     float64
dtype: object

In [7]:
X_train_full, X_test, y_train_full, y_test = train_test_split(X,y, test_size=0.2, random_state=123)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=123)


In [8]:
from numpy import mean
from numpy import std

from sklearn.ensemble import GradientBoostingRegressor


from matplotlib import pyplot

modelV1 = GradientBoostingRegressor()
modelV1.fit(X_train,y_train)


GradientBoostingRegressor()

In [9]:
#Evaluacion modelo
pred = modelV1.predict(X_val)

In [10]:
import sklearn
import math

mse = sklearn.metrics.mean_squared_error(y_val, pred)

rmse = math.sqrt(mse)

print(rmse)

6947.938122664811


In [11]:
from sklearn import linear_model

modelV2 = linear_model.BayesianRidge()
modelV2.fit(X_train,y_train)

BayesianRidge()

In [12]:
#Evaluacion modelo
predV2 = modelV2.predict(X_val)

In [13]:
import sklearn
import math

mse = sklearn.metrics.mean_squared_error(y_val, predV2)

rmse = math.sqrt(mse)

print(rmse)

12194.827737208228


In [14]:
ModelFinal =  linear_model.BayesianRidge()
ModelFinal.fit(X_train_full,y_train_full)

BayesianRidge()

In [15]:
FinalPreds = ModelFinal.predict(X_test)

In [16]:
dbAnalysis = pd.DataFrame(data = {'Real':y_test,'Pred':FinalPreds})

In [17]:
dbAnalysis['Diff'] = dbAnalysis['Real'] - dbAnalysis['Pred']

In [18]:
dbAnalysis

Unnamed: 0,Real,Pred,Diff
17191,110000.0,103688.692856,6311.307144
15825,143000.0,145012.345634,-2012.345634
16321,145000.0,147963.703591,-2963.703591
15803,295000.0,295760.933177,-760.933177
16743,122500.0,118775.644206,3724.355794
...,...,...,...
17269,134000.0,136001.140755,-2001.140755
16653,198000.0,213279.190995,-15279.190995
16033,332000.0,334464.418898,-2464.418898
16084,244000.0,257540.482407,-13540.482407


In [19]:
dbAnalysis.Diff.max()

49681.81078100903

In [20]:
dbAnalysis.Diff.min() #Here we have the example... we find a house undervalued which represent a huge buy opportunity 

-24018.606168293394

In [21]:
dbAnalysis.Diff.mean()

-251.39102919963196