## Setup

Load required packages

In [1]:
import pandas as pd
import io
import requests
import numpy as np
import random
import libpysal
import pysal

  from .sqlite import head_to_sql, start_sql


Read data

In [2]:
# Option A: fetch the CSV straight from GitHub (does not work for private repos)
# url = "https://github.com/konstantinklemmer/spacegan/raw/master/data/synth_data.csv"
# s = requests.get(url).content
# data = pd.read_csv(io.StringIO(s.decode('utf-8')))

# Option B (active): load the copy stored relative to the notebook
local_csv = "./raw_data/synth_data_ex1.csv"
data = pd.read_csv(local_csv)

In [3]:
# Preview the first five rows (five is the default for DataFrame.head)
data.head()

Unnamed: 0,id,y,z,latitude,longitude
0,1,-0.844504,1.370958,2.5,2.5
1,2,-0.832568,-0.564698,2.5,7.5
2,3,-0.793995,0.363128,2.5,12.5
3,4,-0.73034,0.632863,2.5,17.5
4,5,-0.641493,0.404268,2.5,22.5


Create a pairwise distance matrix (Euclidean) between the points

In [4]:
# Collect the (longitude, latitude) pair of every point into one 2-column array,
# then compute the full pairwise distance matrix between all points.
coords = np.array(data[["longitude", "latitude"]])
dist = pysal.lib.cg.distance_matrix(coords)

Get the `k` smallest unique distances (taken over the whole matrix, since the points are evenly spaced)

In [5]:
# Number of smallest distinct distances to keep.
k = 10

# np.unique flattens its input and returns the distinct values in ascending
# order, so no additional flatten()/sort pass is required afterwards.
u_dist = np.unique(dist)

# The k smallest distinct distances, in ascending order.
k_min_dist = u_dist[:k]

In [6]:
# Display the k smallest distinct pairwise distances selected above
k_min_dist

array([ 0.        ,  5.        ,  7.07106781, 10.        , 11.18033989,
       14.14213562, 15.        , 15.8113883 , 18.02775638, 20.        ])

Create a spatial points object ([KDTree](https://pysal.readthedocs.io/en/dev/library/cg/kdtree.html)).

In [7]:
import pysal.lib

# Build a KD-tree over the (longitude, latitude) coordinates of every point
point_coords = np.array(data[["longitude", "latitude"]])
kd = pysal.lib.cg.kdtree.KDTree(point_coords)

In [8]:
# Display the KD-tree object built in the previous cell
kd

<scipy.spatial.kdtree.KDTree at 0x2cbdb4d32e8>

Compute spatial neighbourhoods weight matrix by distance threshold ("radius")


In [9]:
# Alternative neighbourhood definitions (enable one of these instead of the active call):
# wnn = pysal.lib.weights.KNN(kd, 8, ids=data["id"])  # KNN based weights
# wdist = pysal.lib.weights.distance.DistanceBand(kd, threshold=k_min_dist[1], binary=False, p=2)  # Rook
# wdist = pysal.lib.weights.distance.DistanceBand(kd, threshold=k_min_dist[4], binary=True, p=2)  # Queen 2nd degree

# Active choice: Queen neighbourhood. The radius is k_min_dist[2], i.e. the
# second-smallest non-zero distance, so diagonal grid neighbours are included.
queen_radius = k_min_dist[2]
wdist = pysal.lib.weights.distance.DistanceBand(
    kd,
    threshold=queen_radius,
    binary=True,
    p=2,
)

## Spatial CV: Lat/Lon slicing

Create labels based on latitude / longitude binning and add the labels to the original data

In [10]:
# Cut each coordinate axis into five equal-width bins labelled 1..5.
# Longitude is handled first so that "lon_group" is created before "lat_group".
bin_labels = [1, 2, 3, 4, 5]
for axis_name, group_col in (("longitude", "lon_group"), ("latitude", "lat_group")):
    data[group_col] = pd.cut(data[axis_name], bins=5, labels=bin_labels)

This method loops over our lat/lon groups, keeping each group as test data and the rest as train data. However, as we want to do spatial cross-validation, we remove the neighbors of the test set from the training data. This can help to prevent model overfitting. Here, we remove 1st and 2nd degree neighbors, but the method can be adapted as needed. We create 10 folds (5 lon, 5 lat slicing) and save these in the columns `lon_group[1-5]` and `lat_group[1-5]`. For the values in each of these columns, `1` indicates testing data, `2` training data and `0` indicates data to be removed.

In [11]:
# Build one fold column per (slicing column, label) pair. In each fold column:
#   1 = test row, 2 = train row, 0 = row removed (buffer around the test set).
#
# The slicing columns are named explicitly instead of being taken by position,
# so extra columns or a re-run cannot silently change which columns are sliced.
# Longitude comes first, which fixes the order of the generated fold columns.
slice_columns = ["lon_group", "lat_group"]

for slice_col in slice_columns:
    data["s_id"] = data[slice_col]  # Label column used for the current slicing

    for label in np.unique(data["s_id"]):  # One fold per unique label
        fold_col = slice_col + str(label)
        data[fold_col] = 0  # Default: row is removed from this fold

        # Test rows are those carrying the current label.
        is_test = data["s_id"] == label
        data.loc[is_test, fold_col] = 1

        # First-degree neighbours of every test row. wdist.neighbors is looked
        # up with the DataFrame index labels.
        # NOTE(review): assumes the weights ids coincide with data.index
        # (default 0..n-1 on both sides) -- confirm if either is customised.
        first_degree = set()
        for row_id in data.index[is_test.values]:
            first_degree.update(wdist.neighbors[row_id])

        # Second-degree neighbours: neighbours of the first-degree neighbours.
        # To add a third degree, repeat this step over a snapshot of buffer_ids.
        buffer_ids = set(first_degree)
        for neighbor_id in first_degree:
            buffer_ids.update(wdist.neighbors[neighbor_id])

        # Train rows: everything that is neither a test row nor inside the buffer.
        in_buffer = data.index.isin(list(buffer_ids))
        is_train = ~is_test & ~in_buffer
        data.loc[is_train, fold_col] = 2

#Drop helper columns
data = data.drop(columns=["lon_group", "lat_group", "s_id"])

Rename columns to represent cross-validation folds

In [12]:
# The fold columns are the ones whose name still contains "_group".
group_cols = data.filter(regex='_group').columns

# Number the folds 1..n in column order. The names are derived from the actual
# number of fold columns, so changing the number of bins cannot leave some
# columns silently unrenamed (a fixed list of ten names would).
fold_names = {col: "fold" + str(pos) for pos, col in enumerate(group_cols, start=1)}

# Reassign instead of renaming in place.
data = data.rename(columns=fold_names)

In [13]:
# Display the data with the fold1..fold10 assignment columns
data

Unnamed: 0,id,y,z,latitude,longitude,fold1,fold2,fold3,fold4,fold5,fold6,fold7,fold8,fold9,fold10
0,1,-0.844504,1.370958,2.5,2.5,1,2,2,2,2,1,2,2,2,2
1,2,-0.832568,-0.564698,2.5,7.5,1,2,2,2,2,1,2,2,2,2
2,3,-0.793995,0.363128,2.5,12.5,1,0,2,2,2,1,2,2,2,2
3,4,-0.730340,0.632863,2.5,17.5,1,0,2,2,2,1,2,2,2,2
4,5,-0.641493,0.404268,2.5,22.5,0,1,2,2,2,1,2,2,2,2
5,6,-0.527385,-0.106125,2.5,27.5,0,1,2,2,2,1,2,2,2,2
6,7,-0.387048,1.511522,2.5,32.5,2,1,0,2,2,1,2,2,2,2
7,8,-0.222415,-0.094659,2.5,37.5,2,1,0,2,2,1,2,2,2,2
8,9,-0.031306,2.018424,2.5,42.5,2,0,1,2,2,1,2,2,2,2
9,10,0.184221,-0.062714,2.5,47.5,2,0,1,2,2,1,2,2,2,2


We can now save the data:

In [14]:
# Write the augmented table (including the fold columns) to disk, without the index
output_csv = "grid_aug_ex1.csv"
data.to_csv(output_csv, index=False)