In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import sklearn.model_selection as skmod
import sklearn.neighbors as nnb
from geopy import distance as geod
import shapely as shp
import matplotlib.pyplot as plt
import seaborn as sns
% matplotlib inline

In [2]:
# Load the predictive sales table; its 'coords' column is used as the geometry.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
SECONDS_PER_DAY = 24*3600
sales = gpd.GeoDataFrame(
    pd.read_pickle('2014-01-01_2018-07-31_koebenhavn_predictive.pkl'),
    geometry='coords',
)
sales.crs = {'init': 'epsg:2077'}

# Expose the point coordinates as plain numeric columns.
point_geom = sales['coords']
sales['easting'] = point_geom.x
sales['northing'] = point_geom.y

# Sale date as a day count: POSIX timestamp in seconds, floor-divided by 86400.
sale_seconds = sales['sale_date'].apply(lambda d : d.timestamp())
sales['sale_date_epoch'] = sale_seconds // SECONDS_PER_DAY

In [3]:
# Radius search on the coordinate columns: for every sale, collect all other
# sales within a radius of 500 (in the units of easting/northing).
# Querying without an argument returns neighbours of the fitted points themselves.
xy = sales[['easting','northing']]
find_nn_cands = nnb.NearestNeighbors(radius=500).fit(xy)
nn_cands_dists, nn_cands = find_nn_cands.radius_neighbors()

# Number of neighbour candidates found for each sale.
n_nns = list(map(len, nn_cands))

In [4]:
def bordermatch(i, frame=None, neighbours=None) :
    """Flag which neighbour candidates of sale ``i`` mirror its school pairing.

    A candidate is flagged when all three hold:
      * its ``skolenavn`` equals sale ``i``'s ``nn_skolenavn``,
      * its ``nn_skolenavn`` equals sale ``i``'s ``skolenavn``,
      * its ``west_of_harbour`` value equals that of sale ``i``.

    Parameters
    ----------
    i : int
        Positional row number of the sale (0-based), matching the positions
        stored in ``neighbours``.
    frame : DataFrame, optional
        Table with columns ``skolenavn``, ``nn_skolenavn`` and
        ``west_of_harbour``. Defaults to the notebook-level ``sales``.
    neighbours : sequence of integer arrays, optional
        ``neighbours[i]`` lists the positional row numbers of the candidates
        of sale ``i``. Defaults to the notebook-level ``nn_cands``.

    Returns
    -------
    Series of bool, one entry per candidate in ``neighbours[i]`` (empty when
    the sale has no candidates).
    """
    # Fall back to the notebook globals so existing bordermatch(i) calls keep working.
    if frame is None :
        frame = sales
    if neighbours is None :
        neighbours = nn_cands

    nbrs = neighbours[i]
    school = frame['skolenavn']
    nn_school = frame['nn_skolenavn']
    west = frame['west_of_harbour']

    # The neighbour search yields row positions, not index labels, so select
    # with .iloc; .loc is only equivalent for a default 0..n-1 index.
    cond1 = school.iloc[nbrs] == nn_school.iloc[i]
    cond2 = nn_school.iloc[nbrs] == school.iloc[i]
    # Same side of the harbour; for boolean columns this equals the original
    # "use the column if True, else its negation" branch.
    cond3 = west.iloc[nbrs] == west.iloc[i]
    return cond1 & cond2 & cond3

In [5]:
# For every sale, the difference in sale day (neighbour minus the sale itself)
# to each of its neighbour candidates.
# nn_cands holds row positions from the neighbour search, so select with .iloc;
# label-based .loc only agrees with it when the index is the default 0..n-1.
sale_epoch = sales['sale_date_epoch']
timediffs = [sale_epoch.iloc[nbrs] - sale_epoch.iloc[pos] for pos, nbrs in enumerate(nn_cands)]

In [6]:
# Long table with one row per (sale, neighbour candidate) pair.
# Both index levels carry index labels of `sales`: the first repeats each sale's
# label once per candidate, the second maps the candidates' row positions
# through sales.index (identical to the raw positions for a default 0..n-1 index).
flat_nn = np.concatenate(nn_cands, axis=0)
df_nn_cands = pd.DataFrame({'dist' : np.concatenate(nn_cands_dists, axis=0),
                            'time' : np.concatenate(timediffs, axis=0)},
                           index=[np.repeat(sales.index, n_nns), sales.index[flat_nn]])

In [7]:
# Name the two index levels: s1 is the sale, s2 its neighbour candidate.
pair_level_names = ['s1', 's2']
df_nn_cands.index.names = pair_level_names

In [8]:
# Persist the neighbour-pair table built from the predictive sample.
predictive_out = 'nn_frame_predictive.pkl'
df_nn_cands.to_pickle(predictive_out)

In [9]:
# Load the causal sales table (replaces `sales`); 'coords' is used as the geometry.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
SECONDS_PER_DAY = 24*3600
sales = gpd.GeoDataFrame(
    pd.read_pickle('2014-01-01_2018-07-31_koebenhavn_causal.pkl'),
    geometry='coords',
)
sales.crs = {'init': 'epsg:2077'}

# Expose the point coordinates as plain numeric columns.
point_geom = sales['coords']
sales['easting'] = point_geom.x
sales['northing'] = point_geom.y

# Sale date as a day count: POSIX timestamp in seconds, floor-divided by 86400.
sale_seconds = sales['sale_date'].apply(lambda d : d.timestamp())
sales['sale_date_epoch'] = sale_seconds // SECONDS_PER_DAY

In [10]:
# Radius search on the coordinate columns: for every sale, collect all other
# sales within a radius of 500 (in the units of easting/northing).
# Querying without an argument returns neighbours of the fitted points themselves.
xy = sales[['easting','northing']]
find_nn_cands = nnb.NearestNeighbors(radius=500).fit(xy)
nn_cands_dists, nn_cands = find_nn_cands.radius_neighbors()

# Number of neighbour candidates found for each sale.
n_nns = list(map(len, nn_cands))

In [None]:
# For every sale, the difference in sale day (neighbour minus the sale itself)
# to each of its neighbour candidates.
# nn_cands holds row positions from the neighbour search, so select with .iloc;
# label-based .loc only agrees with it when the index is the default 0..n-1.
sale_epoch = sales['sale_date_epoch']
timediffs = [sale_epoch.iloc[nbrs] - sale_epoch.iloc[pos] for pos, nbrs in enumerate(nn_cands)]

# Per-sale boolean flags from bordermatch, one entry per neighbour candidate.
bordermatches = [bordermatch(i) for i in range(len(nn_cands))]

In [None]:
# Long table with one row per (sale, neighbour candidate) pair, including the
# bordermatch flag.
# Both index levels carry index labels of `sales`: the first repeats each sale's
# label once per candidate, the second maps the candidates' row positions
# through sales.index (identical to the raw positions for a default 0..n-1 index).
flat_nn = np.concatenate(nn_cands, axis=0)
df_nn_cands = pd.DataFrame({'dist' : np.concatenate(nn_cands_dists, axis=0),
                            'time' : np.concatenate(timediffs, axis=0),
                            'bordermatch' : np.concatenate(bordermatches, axis=0)},
                           index=[np.repeat(sales.index, n_nns), sales.index[flat_nn]])

In [None]:
# Name the two index levels: s1 is the sale, s2 its neighbour candidate.
pair_level_names = ['s1', 's2']
df_nn_cands.index.names = pair_level_names

In [None]:
# Persist the neighbour-pair table built from the causal sample.
causal_out = 'nn_frame_causal.pkl'
df_nn_cands.to_pickle(causal_out)