In [None]:
# Note: this notebook depends on gps_grid.csv, a CSV file produced in R.

In [None]:
import os, json, requests # for downloading power data with `dl_power_data()`

import pandas as pd

import folium
from folium.plugins import MarkerCluster

from tqdm import tqdm
import time # for sleep

from dataG2F.qol import *

# Set to True to rebuild the stacked NASA POWER arrays and save them as .npy files.
writeout_power_npys = False
# Set to True to write the county-annotated grid to latlon_to_county.csv.
writeout_gps_grid = False

In [None]:
# Root directory for everything this notebook caches (downloads and derived files).
cache_path = '../nbs_artifacts/06_gps_grid_nasa_power/'
# presumably creates the directory when it is missing (helper is not defined in this notebook) — verify
ensure_dir_path_exists(dir_path = cache_path)

In [None]:
# TODO change gps_grid import to a location in data_ext; Do this after moving the R project into data_ext
# Read the grid first, then discard the unnamed leading column that pandas labels 'Unnamed: 0'.
gps_grid = pd.read_csv('../data_ext/'+'gps_grid.csv')
gps_grid = gps_grid.drop(columns = ['Unnamed: 0'])

In [None]:
# Optional visual check: drop one marker per grid point onto an interactive map.
# Disabled by default; flip the flag to True to draw it.
show_grid_map = False

my_map = None
if show_grid_map:
    # Create the map
    my_map = folium.Map(location = [38.928745, -92.352163], # LatLon
                        zoom_start = 4)

    grid_points = gps_grid.loc[:, ['state', 'lat', 'lon']].itertuples(index = False, name = None)
    for state, lat, lon in grid_points:
        # check for nas
        if pd.isna(state) or pd.isna(lat) or pd.isna(lon):
            # str() keeps the message printable even when the state itself is the missing value
            print(str(state)+' contains missing values!')
        else:
            folium.Marker((lat, lon),
                          popup = state,
                         ).add_to(my_map)

# Only a top-level last expression is rendered by the notebook; None renders nothing.
my_map

## Download and Prep NASA POWER Data

In [None]:
# Copied from 01.02_g2fc_imputation.ipynb
def dl_power_data(
    latitude = 32.929, 
    longitude = -95.770,
    start_YYYYMMDD = 20150101,
    end_YYYYMMDD = 20150305
):
    # Modified from
    # https://power.larc.nasa.gov/docs/tutorials/service-data-request/api/
    '''
    Download daily NASA POWER data for a single point and return it as one data frame.

    *Version: 2.0 Published: 2021/03/09* Source: [NASA POWER](https://power.larc.nasa.gov/)

    Parameters
    ----------
    latitude, longitude : point to request; also copied into the output columns.
    start_YYYYMMDD, end_YYYYMMDD : first and last day requested, formatted as YYYYMMDD.

    Returns
    -------
    pandas.DataFrame with columns `Latitude`, `Longitude`, `Date`, followed by one
    column per parameter present in the response. `Date` holds the keys of the
    per-parameter dictionaries in the response.

    Raises
    ------
    requests.HTTPError : the service answered with an HTTP error status.
    KeyError           : the response has no `properties` / `parameter` section.
    ValueError         : the response contains no parameters at all.
    '''

    base_url = r"https://power.larc.nasa.gov/api/temporal/daily/point?parameters=QV2M,T2MDEW,PS,RH2M,WS2M,GWETTOP,ALLSKY_SFC_SW_DWN,ALLSKY_SFC_PAR_TOT,T2M_MAX,T2M_MIN,T2MWET,GWETROOT,T2M,GWETPROF,ALLSKY_SFC_SW_DNI,PRECTOTCORR&community=RE&longitude={longitude}&latitude={latitude}&start={start_YYYYMMDD}&end={end_YYYYMMDD}&format=JSON"

    api_request_url = base_url.format(
        longitude=longitude, 
        latitude=latitude,
        start_YYYYMMDD=start_YYYYMMDD, 
        end_YYYYMMDD=end_YYYYMMDD)

    response = requests.get(url=api_request_url, verify=True, timeout=30.00)
    # Fail here on an HTTP error rather than while parsing (or caching) an error body.
    response.raise_for_status()

    content = json.loads(response.content.decode('utf-8'))

    try:
        parameter_values = content['properties']['parameter']
    except KeyError as err:
        raise KeyError(
            f'NASA POWER response for {api_request_url} has no properties/parameter '
            f'section: {str(content)[:500]}'
        ) from err

    if not parameter_values:
        raise ValueError(f'NASA POWER returned no parameters for {api_request_url}')

    # Repackage content as data frame: one (Date, <parameter>) frame per parameter,
    # joined on Date so every parameter ends up as its own column.
    out = None
    for name, value_by_date in parameter_values.items():
        one_parameter = pd.DataFrame({
            'Date': list(value_by_date.keys()),
            name: list(value_by_date.values())
        })
        if out is None:
            out = one_parameter
        else:
            out = out.merge(one_parameter, on = 'Date')

    out['Latitude'] = latitude
    out['Longitude'] = longitude
    first_cols = ['Latitude', 'Longitude', 'Date']
    out = out.loc[:, first_cols+[e for e in list(out) if e not in first_cols]]
    return(out)

In [None]:
# Preview the grid points loaded above.
gps_grid

Unnamed: 0,lon,lat,state
0,-80.408115,25.418081,Florida
1,-80.905151,25.418081,Florida
2,-80.465913,25.869427,Florida
3,-80.964813,25.869427,Florida
4,-81.463714,25.869427,Florida
...,...,...,...
3117,-119.494095,48.844786,Washington
3118,-120.175271,48.844786,Washington
3119,-120.856447,48.844786,Washington
3120,-121.537623,48.844786,Washington


In [None]:
# Fetch NASA POWER data for each grid point and pickle it, skipping points whose
# file is already present in the cache directory.
sleep_for = 1
start_date = 19810101
end_date = 20221231

ensure_dir_path_exists(dir_path = cache_path+'power_data/')
cached_files = os.listdir(cache_path+'power_data/')
for i in tqdm(gps_grid.index):
    lon, lat = gps_grid.loc[i, ['lon', 'lat']]
    save_name = '_'.join([str(lon), str(lat), str(start_date), str(end_date)])+'.pkl'

    # Already on disk from an earlier run: nothing to do for this point.
    if save_name in cached_files:
        continue

    # Pause before each request (not applied when the row label is 0).
    if i != 0:
        time.sleep(sleep_for)

    res = dl_power_data(
        latitude = lat,
        longitude = lon,
        start_YYYYMMDD = start_date,
        end_YYYYMMDD = end_date
    )
    put_cached_result(cache_path+'power_data/'+save_name, res)

100%|██████████| 3122/3122 [00:00<00:00, 5362.03it/s]


In [None]:
# load cached files and save into database
cached_files = os.listdir(cache_path+'power_data/')
cached_files[0:10]

['-106.692895889282_43.8968753814697_19810101_20221231.pkl',
 '-70.6439685821533_44.3468856811523_19810101_20221231.pkl',
 '-111.489644050598_34.8897933959961_19810101_20221231.pkl',
 '-104.400300979614_40.2957248687744_19810101_20221231.pkl',
 '-80.8516311645508_37.5933933258057_19810101_20221231.pkl',
 '-77.9374122619629_40.2957248687744_19810101_20221231.pkl',
 '-109.246292114258_35.7911491394043_19810101_20221231.pkl',
 '-111.143846511841_37.1428489685059_19810101_20221231.pkl',
 '-121.814107894897_42.0966339111328_19810101_20221231.pkl',
 '-98.8756561279297_29.0279769897461_19810101_20221231.pkl']

In [None]:
import re

# Keep only files that look like pickled NASA POWER entries, i.e.
# <lon>_<lat>_<start>_<end>.pkl, for example
# '-106.692895889282_43.8968753814697_19810101_20221231.pkl'.
# The pattern is a raw string so that '\d' and '\.' reach the regex engine as
# written instead of being invalid string escapes, and the sign is an explicit
# optional '-' rather than "any character".
power_file_pattern = re.compile(r'^-?\d+\.\d+_-?\d+\.\d+_\d+_\d+\.pkl$')
cached_files = [e for e in cached_files if power_file_pattern.match(e)]


In [None]:
import numpy as np

class prep_power():
    '''
    Stack cached NASA POWER data frames into one array.

    Each file in `power_files` is read with `pd.read_pickle` from `power_path`.
    The first file is the reference: it fixes the date order, the list of
    parameter columns and the expected frame shape.

    Attributes
    ----------
    date : list of the reference file's `Date` values, sorted.
    keys : parameter column names (every column except Latitude, Longitude, Date).
    data : array shaped (file, parameter, date); a file whose shape differs from
           the reference is reported and stored as NaN.
    lats, lons : one value per file, in the same order as axis 0 of `data`.

    Raises
    ------
    ValueError : `power_files` is empty.
    '''
    def __init__(self,
                 power_path,
                 power_files
                 ):
        super().__init__()

        if len(power_files) == 0:
            raise ValueError('power_files is empty; there is nothing to load.')

        self.lats = []
        self.lons = []

        # NOTE(review): read_pickle can execute code stored in the file; only use on trusted caches.
        res = pd.read_pickle(power_path+power_files[0])
        res = res.sort_values('Date')
        self.date = res.Date.to_list()
        self.keys = [e for e in list(res) if e not in ['Latitude', 'Longitude', 'Date']]

        expected_shape = res.shape

        # shape               entry,            channel,        date
        self.data = np.zeros([len(power_files), len(self.keys), expected_shape[0]])

        for i in tqdm(range(len(power_files))):
            e = power_files[i]
            res = pd.read_pickle(power_path+e)

            # Record exactly one coordinate pair per file (including problem files)
            # so lats/lons stay aligned with axis 0 of self.data.
            self.lats.append(self._first_value(res, 'Latitude'))
            self.lons.append(self._first_value(res, 'Longitude'))

            if res.shape != expected_shape:
                print(f'Problem with {e}')
                self.data[i, :, :] = np.nan
            else:
                res = res.sort_values('Date')
                self.data[i, :, :] = res.loc[:, self.keys].to_numpy().transpose()

    @staticmethod
    def _first_value(df, column):
        '''Return the first value of `column`, or NaN when the column or rows are missing.'''
        if (column in df.columns) and (len(df) > 0):
            return df[column].iloc[0]
        return np.nan
                

In [None]:
# Optionally rebuild the stacked arrays from the cached pickles and save each one
# as <name>.npy next to the cached downloads.
if writeout_power_npys:
    power = prep_power(
        power_path  = cache_path+'power_data/',
        power_files = cached_files
        )

    # write out numpy files
    arrays_by_name = {
        'power_lats': np.array(power.lats),
        'power_lons': np.array(power.lons),
        'power_date': np.array(power.date),
        'power_keys': np.array(power.keys),
        'power_data': power.data,
    }
    for array_name, array in arrays_by_name.items():
        np.save(cache_path+f'power_data/{array_name}.npy', array)

100%|██████████| 3122/3122 [00:55<00:00, 56.24it/s]


## Get information to link GPS to County

In [None]:
def dl_geocoder_data(
    # note google maps uses lat/lon
    longitude = -92.4562972,
    latitude  = 38.9057937,
    benchmark = 'Public_AR_Current',
    vintage   = 'Current_Current'
):
    '''
    Query the US Census geocoder for the geographies containing one coordinate.

    Parameters
    ----------
    longitude, latitude : sent as the `x` and `y` query values.
    benchmark, vintage  : only 'Public_AR_Current' / 'Current_Current' have a
        mapping here (both are sent as 4). Any other value prints a warning and
        is passed through unchanged.

    Returns
    -------
    The decoded JSON response.

    Raises
    ------
    requests.HTTPError : the service answered with an HTTP error status.
    '''
    # https://geocoding.geo.census.gov/geocoder/Geocoding_Services_API.html
    # https://geocoding.geo.census.gov/geocoder/geographies/coordinates
    if benchmark != 'Public_AR_Current':
        print(f'Mapping for {benchmark} is not defined!')
    else:
        benchmark = 4
    if vintage  != 'Current_Current':
        print(f'Mapping for {vintage} is not defined!')
    else:
        vintage  = 4

    api_request_url = ''.join([
        'https://geocoding.geo.census.gov/geocoder/geographies/coordinates?',
        f'x={longitude}&y={latitude}',
        f'&benchmark={benchmark}',
        f'&vintage={vintage}',
        '&format=json'
    ])

    response = requests.get(url=api_request_url, verify=True, timeout=30.00)
    # Raise on an HTTP error so callers never store an error body as a result.
    response.raise_for_status()

    content = json.loads(response.content.decode('utf-8'))
    return(content)


In [None]:
class geocoder_data():
    '''
    JSON-backed cache of geocoder responses keyed by (longitude, latitude).

    `self.data` has two parts:
      - 'lookup': parallel lists 'longitude', 'latitude' and 'idx'
      - 'data'  : the downloaded responses, one per lookup entry, in the same order

    `next_idx` is the idx that the next newly added entry will receive.
    '''
    def __init__(self,
                 json_path
                 ):
        super().__init__()

        # Resume from a previous save when one exists, otherwise start empty.
        if os.path.exists(json_path):
            with open(json_path, 'r') as fp:
                data = json.load(fp)
        else: 
            data = {
                'lookup': {'longitude': [],
                        'latitude': [],
                        'idx': [],
                        },
                'data':[]
                }
            
        self.data = data
        self.json_path = json_path
        known_idx = self.data['lookup']['idx']
        self.next_idx = 1+max(known_idx) if known_idx else 0

    def entry_exists(self, longitude, latitude, return_idx = False):
        '''
        Look for an entry with exactly this longitude and latitude.

        Returns True when found (or its position in the lookup lists when
        `return_idx` is True). Returns None when there is no such entry,
        regardless of `return_idx`.
        '''
        lookup = self.data['lookup']
        for i, e in enumerate(zip(lookup['longitude'], lookup['latitude'])):
            if e == (longitude, latitude):
                return i if return_idx else True
        return None
        
    def add_entry(self, longitude, latitude, overwrite = False):
        '''
        Download and store the geocoder response for one coordinate.

        An existing entry is left alone unless `overwrite` is True, in which case
        its stored response is replaced in place (no new lookup row is added and
        `next_idx` does not change). A new coordinate is appended and advances
        `next_idx` by one.
        '''
        position = self.entry_exists(longitude = longitude, latitude = latitude, return_idx = True)
        if (position is not None) and (not overwrite):
            return

        res = dl_geocoder_data(
            # note google maps uses lat/lon
            longitude = longitude,
            latitude  = latitude,
            benchmark = 'Public_AR_Current',
            vintage   = 'Current_Current'
        )

        if position is not None:
            # overwrite: refresh the stored response instead of adding a duplicate
            self.data['data'][position] = res
            return

        self.data['lookup']['longitude'] += [longitude] 
        self.data['lookup']['latitude']  += [latitude]
        self.data['lookup']['idx']       += [self.next_idx]

        self.data['data'] += [res]

        self.next_idx += 1

    def add_entries(self, longitude_list, latitude_list, overwrite = False):
        '''Call `add_entry` for each (longitude, latitude) pair, in order.'''
        for lon, lat in zip(longitude_list, latitude_list):
            self.add_entry(longitude = lon, latitude = lat, overwrite = overwrite)

    def save(self):
        '''Write `self.data` to `self.json_path` as indented JSON.'''
        # Write to a sibling file first and swap it in, so an interrupted save
        # cannot leave a truncated cache behind.
        tmp_path = self.json_path+'.tmp'
        with open(tmp_path, 'w') as f:
            json.dump(self.data, f, ensure_ascii=False, indent=4)
        os.replace(tmp_path, self.json_path)


In [None]:
# presumably creates the geocoder cache directory when it is missing (helper is not defined in this notebook) — verify
ensure_dir_path_exists(dir_path = cache_path+'geocoder_data/')    

# Resumes from gps2geocoder.json when that file exists; otherwise starts with an empty cache.
gps_grid_geocoder = geocoder_data(json_path = cache_path+'geocoder_data/'+'gps2geocoder.json')

In [None]:
# Geocode every grid point, pausing after each real download and saving the
# cache to disk at regular intervals.
save_every = 100
sleep_for  = 2
for i in tqdm(gps_grid.index):
    lon = gps_grid.loc[i, 'lon']
    lat = gps_grid.loc[i, 'lat']
    idx_before = gps_grid_geocoder.next_idx

    gps_grid_geocoder.add_entry(longitude = lon, latitude = lat)

    # next_idx moves only when a new response was stored, so cached points skip the pause.
    if gps_grid_geocoder.next_idx != idx_before:
        time.sleep(sleep_for)

    # Periodic checkpoint; set save_every to None to turn it off.
    if (save_every is not None) and (i % save_every == 0):
        gps_grid_geocoder.save()

# Final save so entries added since the last checkpoint are written too.
gps_grid_geocoder.save()

100%|██████████| 3122/3122 [1:26:18<00:00,  1.66s/it]


In [None]:
# confirm that the order is as expected: the row label of every grid point must
# equal the position of its coordinates in the geocoder cache.
first_mismatch = None
for i in tqdm(gps_grid.index):
    lon = gps_grid.loc[i, 'lon']
    lat = gps_grid.loc[i, 'lat']
    j = gps_grid_geocoder.entry_exists(longitude= lon, latitude= lat, return_idx = True)
    if i != j:
        first_mismatch = (i, j)
        break

# Fail loudly: a silent break would leave a mismatch unnoticed.
assert first_mismatch is None, (
    f'gps_grid row {first_mismatch[0]} maps to geocoder position {first_mismatch[1]}'
)

100%|██████████| 3122/3122 [00:00<00:00, 9093.63it/s] 


In [None]:
# should be able to go off of gps_grid.index but check against the lon/lat just in case
def search_geocoder(geocoder = gps_grid_geocoder,
                    longitude = gps_grid.lon.to_list()[0],
                    latitude = gps_grid.lat.to_list()[0]
                    ):
    '''
    Return the stored `idx` of the first lookup entry in `geocoder` whose
    longitude and latitude both equal the given values, or None if there is none.
    '''
    lookup = geocoder.data['lookup']
    candidates = zip(lookup['longitude'], lookup['latitude'], lookup['idx'])
    for known_lon, known_lat, known_idx in candidates:
        if longitude == known_lon and latitude == known_lat:
            return known_idx
    return None


# Annotate each grid point with the state and county reported by the geocoder.
# Columns start empty; points with no usable geocoder result keep ''.
gps_grid.loc[:, 'State'] = ''
gps_grid.loc[:, 'StateAbr'] = ''
gps_grid.loc[:, 'Counties'] = ''


for i in gps_grid.index:
    mask = (gps_grid.index == i)
    lon = gps_grid.loc[mask, 'lon'].to_list()[0]
    lat = gps_grid.loc[mask, 'lat'].to_list()[0]

    # check for a matching index and then fill values
    idx = search_geocoder(geocoder = gps_grid_geocoder, longitude = lon, latitude = lat)
    if idx is None:
        continue

    # A response may lack the States / Counties lists; treat that as "no match"
    # for the affected columns instead of raising.
    geographies = gps_grid_geocoder.data['data'][idx].get('result', {}).get('geographies', {})
    states = geographies.get('States') or []
    counties = geographies.get('Counties') or []

    if states:
        gps_grid.loc[mask, 'State'] = [states[0]['BASENAME']]
        gps_grid.loc[mask, 'StateAbr'] = [states[0]['STUSAB']]
    if counties:
        gps_grid.loc[mask, 'Counties'] = [counties[0]['NAME']]

In [None]:
# Keep only rows whose county was filled in, then normalise the names:
# upper case throughout, with the ' COUNTY' text removed from county names.
gps_grid = (
    gps_grid[gps_grid['Counties'] != '']
    .reset_index(drop = True)
    .copy()
)
gps_grid['Counties'] = gps_grid['Counties'].str.upper().str.replace(' COUNTY', '', regex = False)
gps_grid['State'] = gps_grid['State'].str.upper()

# Optionally save the annotated grid for later use.
if writeout_gps_grid:
    gps_grid.to_csv(cache_path+'latlon_to_county.csv')
gps_grid

Unnamed: 0,lon,lat,state,State,StateAbr,Counties
0,-80.408115,25.418081,Florida,FLORIDA,FL,MIAMI-DADE
1,-80.905151,25.418081,Florida,FLORIDA,FL,MONROE
2,-80.465913,25.869427,Florida,FLORIDA,FL,MIAMI-DADE
3,-80.964813,25.869427,Florida,FLORIDA,FL,COLLIER
4,-81.463714,25.869427,Florida,FLORIDA,FL,COLLIER
...,...,...,...,...,...,...
3117,-119.494095,48.844786,Washington,WASHINGTON,WA,OKANOGAN
3118,-120.175271,48.844786,Washington,WASHINGTON,WA,OKANOGAN
3119,-120.856447,48.844786,Washington,WASHINGTON,WA,WHATCOM
3120,-121.537623,48.844786,Washington,WASHINGTON,WA,WHATCOM


In [None]:
# cache_path+'power_data/'+cached_files[0]

In [None]:
# import sqlite3

# for e in tqdm(cached_files):
#     res = pd.read_pickle(cache_path+'power_data/'+e)
#     with sqlite3.connect(cache_path+"/power_gps_grid.sqlite") as con:
#         res.to_sql('data', con, if_exists='replace')

In [None]:
# res = pd.read_pickle(cache_path+'power_data/'+cached_files[0])
# res

In [None]:
# import sqlite3

# with sqlite3.connect(cache_path+"/power_gps_grid.sqlite") as con:
#     res.to_sql('data', con, if_exists='replace')

In [None]:
# res = get_cached_result(cache_path+'power_data/'+'-80.9051513671875_25.4180812835693_19810101_20221231.pkl')

In [None]:
# import sqlite3

In [None]:
# with sqlite3.connect(cache_path+"/power_gps_grid.sqlite") as con:
#     res.to_sql('data', con, if_exists='replace')

In [None]:
# with sqlite3.connect(cache_path+"/g2f_comp.sqlite") as con:
#     phno.to_sql('phno', con, if_exists='replace')
#     meta.to_sql('meta', con, if_exists='replace')
#     soil.to_sql('soil', con, if_exists='replace')
#     wthr.to_sql('wthr', con, if_exists='replace')
#     cgmv.to_sql('cgmv', con, if_exists='replace')
#     cmnt.to_sql('cmnt', con, if_exists='replace')