### Packages and utils

In [1]:
import os
from urllib.parse import quote_plus

import geopandas as gpd
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

from gaia.ml import SpatialModel
%matplotlib inline

### Load data

In [2]:
# Connection settings are read from the standard libpq environment variables
# (PGHOST, PGPORT, PGUSER, PGPASSWORD, PGDATABASE) so that real credentials do
# not have to live in the notebook. The fallbacks are the values this cell
# used to hardcode, so an unconfigured environment behaves exactly as before.
conn_params = {
    "host": os.environ.get("PGHOST", "127.0.0.1"),
    "port": int(os.environ.get("PGPORT", "5432")),
    "user": os.environ.get("PGUSER", "focalai"),
    "password": os.environ.get("PGPASSWORD", "ai"),
    "database": os.environ.get("PGDATABASE", "focalai"),
}
uri_template = "postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}"

# Percent-encode user and password: characters such as '@', ':' or '/' would
# otherwise be parsed as URL structure and point the engine at the wrong host.
uri = uri_template.format(
    **{
        **conn_params,
        "user": quote_plus(conn_params["user"]),
        "password": quote_plus(conn_params["password"]),
    }
)

# Show which database is targeted, but mask the password so the secret is not
# persisted in the saved cell output.
print(uri_template.format(**{**conn_params, "password": "***"}))
engine = create_engine(uri)

postgresql+psycopg2://focalai:ai@127.0.0.1:5432/focalai


In [3]:
# Centre of the spatial filter, as longitude/latitude; the query declares the
# point with SRID 4326.
center = {"longitude": 2.33, "latitude": 48.86}

# Most recent rows (by date_mutation) lying within the distance threshold of
# the centre point, capped at 100000 rows.
# NOTE(review): ST_DistanceSphere is documented by PostGIS as returning metres,
# which would make 1000000 a 1,000 km radius — confirm that is intended.
query_template = """
        SELECT * FROM dvf
        WHERE ST_DistanceSphere(geom, ST_GeomFromText('POINT({longitude} {latitude})', 4326))<1000000
        ORDER BY date_mutation DESC
        LIMIT 100000
        """
dvf_query = query_template.format(
    longitude=center["longitude"],
    latitude=center["latitude"],
)

# Run the query through the SQLAlchemy engine and load the result as a
# GeoDataFrame, then preview the first rows.
dvf_df = gpd.GeoDataFrame.from_postgis(dvf_query, engine)
dvf_df.head()

Unnamed: 0,date_mutation,id_mutation,valeur_fonciere,surface_totale,prix_m2,wkt,geom
0,2019-12-31,2019-920640,61000.0,50.9,1198.428291,POINT (6.446713 48.173733),POINT (6.44671 48.17373)
1,2019-12-31,2019-925242,149000.0,128.67,1158.001088,POINT (6.59617 48.015631),POINT (6.59617 48.01563)
2,2019-12-31,2019-941011,130000.0,60.83,2137.103403,POINT (2.533537 48.67825),POINT (2.53354 48.67825)
3,2019-12-31,2019-947089,160000.0,60.1,2662.229617,POINT (2.262482 48.735289),POINT (2.26248 48.73529)
4,2019-12-31,2019-947133,153000.0,52.92,2891.156463,POINT (2.383276 48.695156),POINT (2.38328 48.69516)


In [4]:
#dvf_df.plot(figsize=(25, 25), column='prix_m2', s=5)

### SpatialModel

In [5]:
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel

# KMeans initialisation and random-forest bootstrapping are both stochastic.
# Without a fixed seed the fitted model, and therefore the reported error,
# changes on every run; pin both so Restart & Run All reproduces the numbers.
# The value matches the random_state used for the train/test split.
MODEL_SEED = 3105

# SpatialModel (from gaia.ml) is configured with a clustering step and a
# regressor: 50 KMeans clusters and a 25-tree random forest using all cores.
spatial_estimator = SpatialModel(
    cluster=KMeans(n_clusters=50, random_state=MODEL_SEED),
    estimator=RandomForestRegressor(
        n_estimators=25,
        n_jobs=-1,
        random_state=MODEL_SEED,
    ),
)

In [24]:
from sklearn.model_selection import train_test_split


def build_model_inputs(frame):
    """Turn a slice of the DVF GeoDataFrame into (X, Z, y) model inputs.

    Parameters
    ----------
    frame : GeoDataFrame
        Must provide the columns ``geom`` (point geometries exposing ``.x``
        and ``.y``), ``surface_totale`` and ``prix_m2``.

    Returns
    -------
    features : pandas.DataFrame
        ``surface_totale`` plus the two coordinate columns.
    coords : pandas.DataFrame
        ``longitude`` (geom.x) and ``latitude`` (geom.y).
    target : pandas.Series
        ``prix_m2``.

    All three outputs share the same fresh 0..n-1 index, so row i of each
    refers to the same transaction whether a consumer aligns by position or
    by label.
    """
    # String column names throughout: mixing a string name with integer names
    # (0, 1) is rejected by recent scikit-learn versions when validating
    # feature names.
    coords = pd.DataFrame(
        {
            "longitude": frame["geom"].x.to_numpy(),
            "latitude": frame["geom"].y.to_numpy(),
        }
    )
    features = pd.concat(
        [frame[["surface_totale"]].reset_index(drop=True), coords],
        axis=1,
    )
    # Drop the original row labels here too; previously the target kept the
    # source frame's index while features/coords were renumbered from 0.
    target = frame["prix_m2"].reset_index(drop=True)
    return features, coords, target


# Hold out 33% of the rows for evaluation; the seed keeps the split stable.
train, test = train_test_split(dvf_df, test_size=0.33, random_state=3105)

X_train, Z_train, y_train = build_model_inputs(train)
X_test, Z_test, y_test = build_model_inputs(test)

In [25]:
# Fit the spatial model on the training split. Arguments are positional:
# the feature frame, the coordinate frame built from geom.x / geom.y, and the
# prix_m2 target.
# NOTE(review): how SpatialModel uses the coordinate frame is defined in
# gaia.ml, not in this notebook — confirm there.
spatial_estimator.fit(X_train, Z_train, y_train)

In [26]:
# Predict prix_m2 for the held-out rows, passing the test features and the
# matching test coordinates in the same positional order used for fit().
y_pred = spatial_estimator.predict(X_test, Z_test)

In [27]:
# Pair each held-out target with its prediction, row by row.
# Column 0 holds the actual value (y_test), column 1 the predicted one (y_pred).
actual_vs_predicted = list(zip(y_test, y_pred))
errors = pd.DataFrame(actual_vs_predicted)

In [28]:
from sklearn.metrics import mean_absolute_error

In [29]:
# Mean absolute error between the held-out prix_m2 values and the predictions;
# the result is in the same unit as prix_m2.
mean_absolute_error(y_true=y_test, y_pred=y_pred)

743.1402101882552

In [31]:
# Display the actual-vs-predicted table built from zip(y_test, y_pred):
# column 0 is the actual prix_m2, column 1 the model's prediction.
errors

Unnamed: 0,0,1
0,2822.426330,3235.057168
1,5643.474790,5909.651260
2,4253.882512,5608.248403
3,1988.834613,2434.017425
4,1516.493248,1714.306126
...,...,...
32995,5387.071030,5935.413417
32996,4608.294931,4299.454486
32997,3873.331589,3029.554445
32998,3325.942350,3795.401590
