In [1]:
import pandas as pd
import numpy as np
import math
from operator import itemgetter

In [6]:
###################################### FUNCTIONS ############################################
##Function to caclulate eucilidean distances in tuples
def get_distance(data1, data2):
    points = zip(data1, data2)
    diffs_squared_distance = [pow(a - b, 2) for (a, b) in points]
    return math.sqrt(sum(diffs_squared_distance))

##Function to calculate Distance from one point to set of points
def _get_tuple_distance(training_instance, test_instance):
    return (training_instance, get_distance(test_instance, training_instance))

##Function to determine Geomean for one sample point
def get_geomean_pt(training_set, test_instance, funct, k):
    tuples = [tuple(x) for x in training_set.to_numpy()]
    distances = [_get_tuple_distance(training_instance, test_instance) for training_instance in tuples]
 
    # index 1 is the calculated distance between training_instance and test_instance
    sorted_distances = sorted(distances, key=itemgetter(1))
 
    # extract only training instances
    sorted_training_instances = [tuple for tuple in sorted_distances]
 
    # select first k elements
    resp = pd.DataFrame(sorted_training_instances[:k])
    max_resp = resp.iloc[:,1].max()
    if funct == 'bisquared':
        resp['Weight'] =  (1-((resp.iloc[:,1]/max_resp)**2))**2 #Bi-squared function
    else:
        resp['Weight'] =  np.exp(-0.5*((resp.iloc[:,1]/(max_resp*0.9))**2)) #Guassian function
    max_weight = resp['Weight'].sum()
    resp['StdWeight'] = resp['Weight']/max_weight
    resp[['Easting', 'Northing', 'SalePrice']] = pd.DataFrame(resp[0].tolist(), index=resp.index) 
    resp['WeightPrice'] = resp['StdWeight'] * resp['SalePrice'] 
    wp = resp['WeightPrice'] .sum()
    return  wp

##Function to caclulate Geomeans for whole sample grid
# data_points is a dataframe containing projected coordinates and variable of interest
# samp_points are the points to return a geomean value, best to be a grid of points
# varbs are the column names for 1. Easting, 2. Northing, 3. Variable of Interest.n.b. have to be in that order!
# funct is decay function ("bisquared","guassian") - guassian is default
# k is number of points taken for mean
def get_geomean_data(data_points, samp_points, varbs, funct,  k):
 
    xsp_nm = data_points.columns
    varsp_idx = [data_points.columns.get_loc(col) for col in varbs]
    x_coord = data_points.iloc[:,varsp_idx]
    
    results = []

    East_dist = samp_points['Easting'].max() - samp_points['Easting'].min()
    North_dist = samp_points['Northing'].max() - samp_points['Northing'].min()

    East_dist_sel = East_dist/10
    North_dist_sel = North_dist/10

    for index, row in samp_points.iterrows():
        sample = x_coord.loc[(x_coord['Easting'] >= (row['Easting']- East_dist_sel)) & (x_coord['Easting'] <= (row['Easting'] + East_dist_sel)) & (x_coord['Northing'] >= (row['Northing']- North_dist_sel)) & (x_coord['Northing'] <= (row['Northing'] + North_dist_sel)),]

        if sample.shape[0] >= k:
            a = get_geomean_pt(sample,row,funct,k)
            results.append(a)
        else:
            a = get_geomean_pt(x_coord,row,funct,k)
            results.append(a)       

    results_pt = samp_points
    results_pt['Geomean'] = results
    return results_pt

In [4]:
df = pd.read_csv("C:\MyWork\Python Scripts\LandReg.csv")
gd = pd.read_csv("C:\MyWork\Python Scripts\grid.csv") 

In [7]:
#Example 
%timeit  df_results = get_geomean_data(data_points = df,samp_points = gd,varbs = ("Easting","Northing","SalePrice") ,funct ='guassian',k=16 )

5min 13s ± 24.4 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [37]:
df_results.to_csv("C:\MyWork\Python Scripts\geomeans_gua.csv")