In [1]:
import itertools
import os
from pathlib import Path
import pickle
import sys

from arcgis.features import GeoAccessor
from arcgis.geometry import Geometry
import pandas as pd

sys.path.append('../../src')
from geoai_retail.analysis import get_add_new_closest_dataframe

%load_ext autoreload
%autoreload 2

In [2]:
data = Path(os.path.abspath('../../data'))
interim_dir = data/'interim'
interim_gdb = interim_dir/'interim.gdb'
raw_dir = data/'raw'
raw_gdb = raw_dir/'raw.gdb'

origin_fc = str(raw_gdb/'sea_block_group')
origin_id_fld = 'ID'

dest_fc = str(raw_gdb/'sea_ace')
dest_id_fld = 'LOCNUM'

comp_fc = raw_gdb/'sea_ace_comp'
comp_id_fld = 'LOCNUM'

real_estate_fc = raw_gdb/'real_estate_hex'
real_estate_id_fld = 'GRID_ID'

closest_brand = interim_dir/'closest_store.csv'
closest_comp = interim_dir/'closest_competition.csv'
origin_demographics = interim_dir/'origin_enrich_all.csv'
inrix_trips = interim_dir/'cust_count_inrix.csv'
training_data = interim_dir/'training_data.csv'

In [3]:
origin_df = GeoAccessor.from_featureclass(str(origin_fc))
origin_df.head()

Unnamed: 0,OBJECTID,ID,NAME,SHAPE
0,1,530530701003,530530701.003,"{""rings"": [[[-122.06627999999995, 47.076520000..."
1,2,530530714071,530530714.071,"{""rings"": [[[-122.34031999999996, 47.071510000..."
2,3,530530714072,530530714.072,"{""rings"": [[[-122.35767999999996, 47.067370000..."
3,4,530530714073,530530714.073,"{""rings"": [[[-122.36847999999998, 47.067630000..."
4,5,530530714112,530530714.112,"{""rings"": [[[-122.41108999999994, 47.071690000..."


In [4]:
real_estate_df = GeoAccessor.from_featureclass(str(real_estate_fc), where_clause="road_reachable = 1")
real_estate_df = real_estate_df[[real_estate_id_fld, 'SHAPE']].copy()
real_estate_df.head()

Unnamed: 0,GRID_ID,SHAPE
0,28,"{""rings"": [[[-121.98783159799996, 46.740620218..."
1,30,"{""rings"": [[[-121.92611773799996, 46.740620218..."
2,31,"{""rings"": [[[-121.89526080699994, 46.728410000..."
3,32,"{""rings"": [[[-121.86440387799996, 46.740620218..."
4,33,"{""rings"": [[[-121.83354694699995, 46.728410000..."


In [5]:
xgr = pickle.load(open('../../models/sea_ace_xgr_stage01.pkl', 'rb'))
xgr



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=nan, n_estimators=1000,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [6]:
dest_df = GeoAccessor.from_featureclass(str(dest_fc))

In [7]:
master_df = pd.read_csv(training_data)
master_df[origin_id_fld] = master_df[origin_id_fld].astype('int64')

In [8]:
str_cols = master_df.select_dtypes('object').columns

factor_col_lst = [c for c in master_df.columns if
                  not c.startswith('destination_id') and
                  not c == origin_id_fld and
                  not c.endswith('_FY') and
                  not c.startswith('trip_count') and
                  not c in str_cols]
label_col = 'trip_count_01'

In [9]:
from sklearn.preprocessing import StandardScaler
std_sclr = StandardScaler()

In [10]:
row = real_estate_df.iloc[0]
id_fld = real_estate_id_fld

new_location_id = int(row[id_fld])
cnt_coords = row['SHAPE'].centroid
cnt_geom = Geometry({'x': cnt_coords[0], 'y': cnt_coords[1], 'spatialReference': real_estate_df.spatial.sr})
cnt_geom, new_location_id

({'x': -122.00840288499995,
  'y': 46.740618988666725,
  'spatialReference': {'latestWkid': 4326, 'wkid': 4326}},
 28)

In [62]:
# get a dataframe of all potential rebalanced areas
rebal_df = get_add_new_closest_dataframe(
        origins=origin_df,
        origin_id_field=origin_id_fld,
        destinations=dest_df,
        destination_id_field=dest_id_fld,
        closest_table=closest_brand,
        new_destination=cnt_geom
    )

rebal_df.head()

Unnamed: 0,origin_id,destination_id_01,proximity_traveltime_01,proximity_kilometers_01,destination_id_02,proximity_traveltime_02,proximity_kilometers_02,destination_id_03,proximity_traveltime_03,proximity_kilometers_03,destination_id_04,proximity_traveltime_04,proximity_kilometers_04,destination_id_05,proximity_traveltime_05,proximity_kilometers_05,destination_id_06,proximity_traveltime_06,proximity_kilometers_06
0,530530701003,677129595,43.576384,38.513638,371889957,46.482533,42.526899,427271369,52.472263,44.462895,421027779,55.156955,44.74881,721714069,61.428125,50.456415,216082099,58.905164,54.642485
1,530530714071,371889957,9.40584,6.295963,216082099,13.296163,10.944308,421027779,16.006815,12.366455,721714069,20.814301,15.290477,677129595,29.198017,18.11639,460556608,35.087418,25.237232
2,530530714072,371889957,10.003444,6.125873,216082099,11.53648,8.195827,421027779,18.268181,13.177555,721714069,22.034957,15.623036,677129595,29.20366,20.609116,460556608,34.472733,25.286682
3,530530714073,216082099,12.308627,7.540147,371889957,13.269904,8.468519,421027779,20.223521,13.544703,721714069,23.035677,15.164948,677129595,31.159,20.976263,460556608,35.244881,24.631002
4,530530714112,216082099,5.292562,3.556479,371889957,12.28519,9.615196,721714069,16.478653,11.395147,421027779,19.885017,15.617594,460556608,28.228815,20.647334,677129595,30.820496,23.049155


In [63]:
# while we soved way broader than possilby needed, we check and only keep changed rows
dest_id_cols = [col for col in rebal_df.columns if col.startswith('destination_id')]
dest_id_arr = dest_df[dest_id_fld].unique()
new_rebal_origin_ids = pd.concat([rebal_df[~rebal_df[dest_col].isin(dest_id_arr)]['origin_id'] for dest_id in dest_id_cols]).unique()
rebal_df = rebal_df[rebal_df['origin_id'].isin(new_rebal_origin_ids)]

In [64]:
rebal_df['origin_id'] = rebal_df['origin_id'].astype('int64')

affected_master = master_df[master_df['ID'].isin(rebal_df['origin_id'].unique())].copy()

keep_cols = [col for col in master_df.columns if not col in rebal_df]
affected_keep = affected_master[keep_cols].copy()

affected_df = affected_keep.join(rebal_df.set_index('origin_id'), on='ID')

X = std_sclr.fit_transform(affected_df[factor_col_lst])
affected_df['predict_new_01'] = xgr.predict(X)

affected_df['predict_new_01'] = affected_df['predict_new_01'].apply(lambda val: 0.0 if val < 0 else val)

affected_df['delta_01'] = affected_df['predict_new_01'] - affected_df['trip_count_01']

affected_df.insert(0, 'origin_id', affected_df[origin_id_fld])
affected_df.drop('ID', axis=1, inplace=True)

affected_df.insert(1, 'new_destination_id', int(new_location_id))

out_cols = [col for col in affected_df.columns 
            if '_id' in col or
            col.startswith('proxmity_') or
            col.startswith('predict_') or 
            col.startswith('delta_') or
            col.startswith('trip_count')
           ]

out_df = affected_df[out_cols].copy()

out_df.head()

  updated_mean = (last_sum + new_sum) / updated_sample_count
  result = op(x, *args, **kwargs)


Unnamed: 0,origin_id,new_destination_id,trip_count_01,trip_count_02,trip_count_03,trip_count_04,trip_count_05,trip_count_06,destination_id_01,destination_id_02,destination_id_03,destination_id_04,destination_id_05,destination_id_06,predict_new_01,delta_01
34,530530732003,28,0.0,1.0,0.0,0.0,0.0,0.0,1,371889957,216082099,721714069,421027779,677129595,2.146481,2.146481


In [21]:
def get_new_tuple(row, real_estate_id_fld):
    new_location_id = int(row[id_fld])
    cnt_coords = row['SHAPE'].centroid
    cnt_geom = Geometry({'x': cnt_coords[0], 'y': cnt_coords[1], 'spatialReference': real_estate_df.spatial.sr})
    return cnt_geom, new_location_id

def get_affected_df(input_tuple):
    
    cnt_geom, new_location_id = input_tuple
    
    # get a dataframe of all potential rebalanced areas
    rebal_df = get_add_new_closest_dataframe(
        origins=origin_df,
        origin_id_field=origin_id_fld,
        destinations=dest_df,
        destination_id_field=dest_id_fld,
        closest_table=closest_brand,
        new_destination=cnt_geom
    )
    
    # while we soved way broader than possilby needed, we check and only keep changed rows
    dest_id_cols = [col for col in rebal_df.columns if col.startswith('destination_id')]
    dest_id_arr = dest_df[dest_id_fld].unique()
    new_rebal_origin_ids = pd.concat([rebal_df[~rebal_df[dest_col].isin(dest_id_arr)]['origin_id'] for dest_col in dest_id_cols]).unique()
    rebal_df = rebal_df[rebal_df['origin_id'].isin(new_rebal_origin_ids)]

    # cast the origin id to integer to avoid data type issues
    rebal_df['origin_id'] = rebal_df['origin_id'].astype('int64')

    # get the affected areas in the master dataframe, the one with all the factors
    affected_master = master_df[master_df['ID'].isin(rebal_df['origin_id'].unique())].copy()

    # if the columns are not in the new rebalanced dataframe, create a dataframe of values to keep
    keep_cols = [col for col in master_df.columns if not col in rebal_df]
    affected_keep = affected_master[keep_cols].copy()

    # now, combine the factors data with the rebalanced data
    affected_df = affected_keep.join(rebal_df.set_index('origin_id'), on='ID')

    # infer new values for these changed areas
    X = std_sclr.fit_transform(affected_df[factor_col_lst])
    affected_df['predict_new_01'] = xgr.predict(X)

    # now, if any values are below zero, zero them out
    affected_df['predict_new_01'] = affected_df['predict_new_01'].apply(lambda val: 0.0 if val < 0 else val)

    # calcuate a change column
    affected_df['delta_01'] = affected_df['predict_new_01'] - affected_df['trip_count_01']

    # rename the id column to be somwhat more consistent
    affected_df.insert(0, 'origin_id', affected_df[origin_id_fld])
    affected_df.drop('ID', axis=1, inplace=True)

    # also, put the new destination unique identifier in, so results can be filtered
    affected_df.insert(1, 'new_destination_id', int(new_location_id))
    
    # trim down the output dataframe to just a few columns
    out_cols = [col for col in affected_df.columns 
                if '_id' in col or
                col.startswith('proxmity_') or
                col.startswith('predict_') or 
                col.startswith('delta_') or
                col.startswith('trip_count')
               ]
    out_df = affected_df[out_cols].copy()
    
    return out_df

In [31]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
fail_oid_lst = []
for idx, row in real_estate_df.iterrows():
    oid = row[0]
    try:
        if idx == 0:
            affected_df = get_affected_df(get_new_tuple(row, real_estate_id_fld))
        else:
            affected_df = affected_df.append(get_affected_df(get_new_tuple(row, real_estate_id_fld)))
        print(f'success - {oid}')
    except:
        fail_oid_lst.append(oid)
        print(f'failed - {oid}')

In [34]:
affected_df.to_csv(interim_dir/'ace_affected_df.csv')

In [39]:
fail_df = pd.DataFrame(fail_oid_lst, columns=['fail_oid'])
fail_df.to_csv(interim_dir/'ace_fail_oid.csv')

In [45]:
delta_srs = affected_df.groupby('new_destination_id').sum()['delta_01']
delta_srs.head()

new_destination_id
28    2.146481
30    2.146481
31    2.146481
32    2.146481
33    2.146481
Name: delta_01, dtype: float64

In [49]:
real_estate_df.GRID_ID = real_estate_df.GRID_ID.astype('int64')
real_estate_delta_df = real_estate_df.join(delta_srs, on='GRID_ID')
real_estate_delta_df.head()

Unnamed: 0,GRID_ID,SHAPE,delta_01
0,28,"{""rings"": [[[-121.98783159799996, 46.740620218...",2.146481
1,30,"{""rings"": [[[-121.92611773799996, 46.740620218...",2.146481
2,31,"{""rings"": [[[-121.89526080699994, 46.728410000...",2.146481
3,32,"{""rings"": [[[-121.86440387799996, 46.740620218...",2.146481
4,33,"{""rings"": [[[-121.83354694699995, 46.728410000...",2.146481


In [51]:
real_estate_delta_df.spatial.to_featureclass(str(interim_gdb/'real_estate01'))

'D:\\projects\\geoai-retail\\data\\interim\\interim.gdb\\real_estate01'

In [None]:
fail_oid_lst = []
for idx, row in real_estate_df[real_estate_delta_df.delta_01.isna()].iterrows():
    oid = row[0]
    try:
        if idx == 0:
            affected_df = get_affected_df(get_new_tuple(row, real_estate_id_fld))
        else:
            affected_df = affected_df.append(get_affected_df(get_new_tuple(row, real_estate_id_fld)))
        print(f'success - {oid}')
    except:
        fail_oid_lst.append(oid)
        print(f'failed - {oid}')