In [1]:
import os
import os.path

from glob import glob
import random
import shutil
import pathlib
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras


## Feature column crossing 

Process adapted from the TensorFlow article "Introducing TensorFlow Feature Columns": https://developers.googleblog.com/2017/11/introducing-tensorflow-feature-columns.html


In [2]:
# Project root: two directory levels above the directory the notebook is run from.
notebook_dir = pathlib.Path.cwd()
top_dir = notebook_dir.parent.parent.resolve()

In [3]:
# Read the tab-separated county gazetteer file that lives under the project root.
gazetteer_path = top_dir.joinpath('experiments', 'noah', '2021_Gaz_counties_national.txt')
counties = pd.read_csv(gazetteer_path, delimiter='\t')

# Strip every space out of the header names so the columns can be selected by their bare names.
counties.columns = counties.columns.str.replace(" ", "")

# Keep only the county id and its internal-point coordinates, under friendlier names.
# rename() returns a new frame, so `counties` itself is left untouched.
cdf = counties[['GEOID', 'INTPTLAT', 'INTPTLONG']].rename(
    columns={'INTPTLAT': 'latitude', 'INTPTLONG': 'longitude'}
)

print(f"max latitude = {cdf.latitude.max()}")
print(f"max longitude = {cdf.longitude.max()}")

# Collects the feature columns that are later handed to the DenseFeatures layer.
feature_columns = []

max latitude = 69.449343
max longitude = 178.338813


# Dropping Alaska and Hawaii

To keep Alaska and Hawaii instead, skip the next cell and set `noak = cdf`.

In [4]:
# County GEOIDs strictly between 2000-3000 (Alaska) and 15000-16000 (Hawaii) are excluded.
geoid = cdf['GEOID']
is_alaska = geoid.gt(2000) & geoid.lt(3000)
is_hawaii = geoid.gt(15000) & geoid.lt(16000)

# Build the filtered frame in one step; the original row index labels are preserved.
noak = cdf.loc[~(is_alaska | is_hawaii)].copy()

In [5]:
# Longitude extent of the remaining counties.
long_max, long_min = noak.longitude.max(), noak.longitude.min()
print(f"long max = {long_max}, long min = {long_min}")

long max = -65.307769, long min = -124.210929


In [6]:
# Latitude extent of the remaining counties.
lat_max, lat_min = noak.latitude.max(), noak.latitude.min()
print(f"lat max = {lat_max}, lat min = {lat_min}")

lat max = 48.842653, lat min = 17.948052


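After dropping Alaska and Hawaii (note: the minimum latitude of about 17.9° and maximum longitude of about -65.3° suggest that Puerto Rico is still included, so this is not strictly the continental US):

Latitude is in the range (17, 50) degrees.

Longitude is in the range (-125, -65) degrees.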

In [7]:
# Total extent (max - min) covered by the remaining counties along each axis.
lat_span = noak.latitude.max() - noak.latitude.min()
long_span = noak.longitude.max() - noak.longitude.min()
print(f'latitude diff = {lat_span}')
print(f'longitude diff = {long_span}')

latitude diff = 30.894600999999998
longitude diff = 58.90316


#### How many partitions do we need, using the average distance between county centroids?

In [13]:
# Estimate a grid resolution from the data itself.
# For each axis: sort the coordinates and take the mean gap between values that are
# ten positions apart in sorted order (diff(10)); that mean gap is the candidate cell size.
longList = noak.longitude.sort_values()
meanLong = longList.diff(10).mean()
latList = noak.latitude.sort_values()
meanLat = latList.diff(10).mean()

# Full extent of each axis; extent / mean gap = number of cells needed to span it.
longSize = noak.longitude.max() - noak.longitude.min()
latSize = noak.latitude.max() - noak.latitude.min()

# Printed in (longitude, latitude) order.
print(longSize / meanLong, latSize / meanLat)

# Adjacent string literals are concatenated with no separator, so the first one must
# end with a space; without it the message prints "ten.Might".
print("This might be too many. I still used it. For hash bucket size, I multiplied these values and divided by ten. "
      "Might still be too granular.")

321.2296677905454 318.4777667803588
This might be too many. I still used it. For hash bucket size, I multiplied these values and divided by ten.Might still be too granular.


### Bucketizing

In [9]:

# Number of bucket boundaries per axis, derived from the partition estimate computed above
# (axis extent / mean gap, rounded up) instead of hard-coded literals.
# The literals used previously (322 for latitude, 319 for longitude) were swapped relative
# to the estimate, which yields ~318.5 for latitude and ~321.2 for longitude.
n_lat_boundaries = int(np.ceil(latSize / meanLat))
n_long_boundaries = int(np.ceil(longSize / meanLong))

# Evenly spaced boundaries from the minimum to the maximum observed coordinate (both inclusive).
lat_buckets = list(np.linspace(noak.latitude.min(), noak.latitude.max(), n_lat_boundaries))
long_buckets = list(np.linspace(noak.longitude.min(), noak.longitude.max(), n_long_boundaries))


In [10]:
# Wrap each raw coordinate in a numeric column, then discretize it with the boundaries built above.
latitude_numeric = tf.feature_column.numeric_column('latitude')
longitude_numeric = tf.feature_column.numeric_column('longitude')

lat_fc = tf.feature_column.bucketized_column(latitude_numeric, lat_buckets)
long_fc = tf.feature_column.bucketized_column(longitude_numeric, long_buckets)

In [11]:
# Cross the bucketized latitude and longitude columns into a single hashed feature.
# There is no precise rule for hash_bucket_size. This notebook uses the product of the two
# boundary counts divided by ten (322 * 319 / 10, i.e. 10272), derived here from the bucket
# lists so it stays in sync with them. It may still be too granular; a smaller value
# (e.g. ~1000) is worth trying.
cross_hash_bucket_size = round(len(lat_buckets) * len(long_buckets) / 10)
cross_coordinate_fc = tf.feature_column.crossed_column(
    keys=[lat_fc, long_fc],
    hash_bucket_size=cross_hash_bucket_size,
)

### Turning the feature column into a dense layer to be added to the model

In [12]:
# A crossed column is categorical; wrap it in an indicator column so a dense layer can consume it.
crossed_feature = tf.feature_column.indicator_column(cross_coordinate_fc)

# Register it in the shared list of feature columns.
# NOTE: this appends on every run of the cell, so re-running it adds the column again.
feature_columns.extend([crossed_feature])

# Keras layer that turns the registered feature columns into a dense input tensor.
geo_crossed_layer = keras.layers.DenseFeatures(feature_columns)

# Final notes

The article says it is best to feed the individual `lat_fc` and `long_fc` columns into the network along with the crossed column.