In [85]:
import numpy as np
import pandas as pd
import os
import json
from geopy import distance

In [166]:
# Calculate the average distance (in miles) from one coordinate to a collection of GeoJSON coordinates
def avgDistance(coordinates, lat, long):
    '''
    Average geopy distance, in miles, between one policy location and every
    position in a collection of GeoJSON-ordered coordinates.

    @coordinates: iterable of positions, each ordered (long, lat); any extra
                  components (e.g. an altitude) are ignored
    @lat: lat of a policy
    @long: long of a policy
    @return: mean distance in miles; NaN when `coordinates` is empty or when
             `lat` / `long` is missing (None or NaN)
    '''
    # A policy without a location has no meaningful distance, so report NaN
    # rather than handing a missing value to geopy.
    if pd.isna(lat) or pd.isna(long):
        return np.nan

    distances = []
    for position in coordinates:
        # Moove's coordinate is (long, lat, ...); geopy expects (lat, long).
        mooveLong, mooveLat, *_ = position
        distances.append(
            distance.distance((lat, long), (mooveLat, mooveLong)).miles
        )

    # np.mean([]) would emit a RuntimeWarning, so the empty case is explicit.
    if not distances:
        return np.nan
    return np.mean(distances)


In [200]:
# paths
pathMoove = 'cap_moove_0114_part1.csv'
pathCap = 'CAPPolicyAddress_geocoded_sample.csv'

# columns
columnsMooveKeys = [
    'features', 'functional_class', 'pol_number'
]
columnsMoove = [
    'confidence_score', 'score', 'curvature', 'paved', 'railway_crossing',
    'ramp', 'road_roughness', 'slope', 'speed', 'traffic_signs_count', 'urban'
]
columnsCap = [
    'pol_number', 'lat', 'lon'
]
polkey = 'pol_number'

# load data
moove = pd.read_csv(pathMoove)[columnsMooveKeys + columnsMoove]
cap = pd.read_csv(pathCap)[columnsCap]

# merge data by pol_number
# The left join keeps every Moove row; rows whose pol_number is absent from
# the CAP file end up with NaN lat / lon.
data = pd.merge(moove, cap, how='left', on=polkey)

# new columns
# 'features' is parsed as JSON after swapping single quotes for double quotes,
# then only geometry.coordinates is kept.
# NOTE(review): the quote swap breaks on values that contain an apostrophe or
# Python-only literals (None / True / False) -- confirm the export format.
data['coordinates'] = (
    data['features']
    .str.replace('\'', '\"', regex=False)
    .apply(lambda feature: json.loads(feature)['geometry']['coordinates'])
)
# One average distance (policy location -> feature coordinates) per row.
data['distance'] = [
    avgDistance(coordinates, lat, lon)
    for coordinates, lat, lon in zip(data['coordinates'], data['lat'], data['lon'])
]
# Inverse-distance weight: the closer a feature is to the policy, the more it
# counts. A distance of exactly 0 yields an infinite weight.
data['distanceReverse'] = 1 / data['distance']

# Method 1 - simple average
# Boolean and Tri-value variables are averaged as well to represent the percentage of the conditions.
resultSimpleAverage = (
    data
    .groupby(polkey)[columnsMoove]
    .mean()
    .add_suffix('_SimpleAvg')
)

# Method 2 - distance based weighted average
# Only the averaged columns and the weights are handed to apply(), so the
# grouping column is not part of each group's frame.
resultDistanceAverage = (
    data
    .groupby(polkey)[columnsMoove + ['distanceReverse']]
    .apply(lambda group: pd.Series(
        np.average(group[columnsMoove], weights=group['distanceReverse'], axis=0),
        index=columnsMoove
    ))
    .add_suffix('_DistanceAvg')
)
