## This script appends weather data to the raw harvest

In [None]:
import requests
import pandas as pd
import numpy as np
import os
import json
import geopandas as gpd
import pyproj
from tqdm import tqdm
from datetime import datetime
import time
import matplotlib.pyplot as plt
%matplotlib inline

### How this script works
- Given a coordinate and a time frame, we fetch all weather data for that coordinate over that period and then store it in the DataFrame.
- Note: coordinates must be given in WGS84 (EPSG:4326).

In [None]:
def fetch_weather_data(lat, lon, time_start, time_end, verbose=False,radius=5000):
    """
        Request weather metrics around one coordinate from the weather API.

        lat, lon in standard CRS epsg:4326
        time_start, time_end has format 'yyyy-mm-dd hh:mm'
        verbose: print the response object when True
        radius: search radius forwarded to the API (presumably metres -- TODO confirm)

        The bearer token is read from the environment variable
        WEATHER_API_TOKEN so that no credential is stored in the notebook.

        Returns the raw requests.Response; the HTTP status is not checked here.
        Raises RuntimeError when WEATHER_API_TOKEN is not set, and whatever
        requests raises on transport errors or after the 60 s timeout.
    """
    # Fail before touching the network when no credential is configured.
    token = os.environ.get('WEATHER_API_TOKEN')
    if not token:
        raise RuntimeError(
            "Set the WEATHER_API_TOKEN environment variable to the API bearer token"
        )

    url = "https://weather.niftitech.com/api/metrics"

    # A list (not a set) keeps the order of the repeated 'metrics[]' query
    # parameter identical from run to run.
    payload = {
        'lat': f'{lat}',
        'lng': f'{lon}',
        'radius': f'{radius}',
        'start': f'{time_start}',
        'end': f'{time_end}',
        'bucket_size': '24',  # presumably hours per bucket -- confirm against the API docs
        'metrics[]': [
            'temp_c',
            'wind_speed_max_ms',
            'rain_1h_mm',
            'humidity_percent',
            'wind_speed_ms',
            'wind_deg',
            'watt_per_m2',
        ],
    }
    headers = {
        'Authorization': f'Bearer {token}',
    }

    # The parameters travel in the query string of a POST, as before.
    # requests never times out on its own, so bound the wait explicitly.
    response = requests.request("POST", url, headers=headers, params=payload, timeout=60)
    if verbose:
        print(response)
    return response

# Test with an arbitrary coordinate over a sample period (2017-04-03 to 2017-07-30) to see what the data looks like

In [None]:
# One-off probe: query the API for a single hand-picked coordinate and preview
# the first three buckets of the decoded answer.
test_lat, test_lon = 56.11592351932487, 12.936500221055022
test_radius = 7000
time_start, time_end = '2017-04-03 00:01', '2017-07-30 23:59'
response_text = fetch_weather_data(
    test_lat,
    test_lon,
    time_start,
    time_end,
    verbose=False,
    radius=test_radius,
).text
json.loads(response_text)['buckets'][:3]

# Test with 1 coordinate from our dataset

In [105]:
# Years to process; each lookup table below is keyed by these values.
SELECTED_YEARS = [2017,2018,2019,2020]

input_hostvete_paths = {}   # year -> harvest file to read
output_hostvete_paths = {}  # year -> weather-augmented file to write
acusition_intervals = {}    # year -> [first day, last day] as 'yyyy-mm-dd'

# A plain loop is used on purpose: it leaves `year` bound at module level
# (to the last entry of SELECTED_YEARS) once it has finished.
for year in SELECTED_YEARS:
    input_hostvete_paths[year] = os.path.join(
        '/mimer/NOBACKUP/groups/snic2022-23-428/shared_oliver_christoffer/hostvete',
        f'gridify_{year}_hv_mv.json',
    )
    output_hostvete_paths[year] = os.path.join(
        '/mimer/NOBACKUP/groups/snic2022-23-428/shared_oliver_christoffer/weather_data',
        f'hostvete_weather_augmented_{year}.json',
    )
    acusition_intervals[year] = [f'{year}-04-01', f'{year}-07-31']

## Fetch all data and augment dfs
 - We only need one coordinate per field, since the whole field will most likely have the same weather data
 - Construct a dataframe with columns [parcel_id, sample_coord_lat, sample_coord_lon, weather data ...]

In [None]:
def valid_response(response_text):
    """
        Tell whether a decoded API answer can be aggregated.

        response_text: the decoded JSON body, a dict with a 'buckets' list.

        Returns False when the bucket list is empty or when any bucket holds
        None for one of the five metrics checked below; True otherwise.
        A value of 0 counts as valid data.
        Raises KeyError when 'buckets' or one of the metric keys is absent.
    """
    required_metrics = (
        'watt_per_m2_sum',
        'rain_sum',
        'humidity_percent_avg',
        'wind_speed_ms_avg',
        'temp_c_avg',
    )
    buckets = response_text['buckets']
    if not buckets:
        return False
    # Identity test against None so that legitimate zeros are not rejected.
    return all(
        bucket[metric] is not None
        for bucket in buckets
        for metric in required_metrics
    )
        
    
#Assume lat, lon in WGS84: epsg='4326'
def _iso_week_period(year, first_week, last_week):
    """Return (Monday 00:01 of ISO week first_week, Sunday 23:59 of ISO week last_week)."""
    # %G/%V/%u are the ISO-8601 year/week/weekday directives. They match the
    # isocalendar() week numbers used when the answer is bucketed, whereas
    # %W/%w are shifted by one week in years such as 2019 and 2020.
    start = datetime.strptime(f'{year}-{first_week}-1 00:01', '%G-%V-%u %H:%M')
    end = datetime.strptime(f'{year}-{last_week}-7 23:59', '%G-%V-%u %H:%M')
    return start, end


def _aggregate_weekly(buckets, first_week, last_week):
    """Fold the API buckets into per-ISO-week sums (sun, rain) and means (humidity, wind, temp)."""
    weekly = {week: {   'acc_sun_wpm2': 0,
                        'acc_rain_sum': 0,
                        'avg_percent_humidity': [],
                        'avg_wind_speed': [],
                        'avg_temp': []  }
                               for week in range(first_week, last_week+1)}

    # Sun and rain can be accumulated directly; the averaged metrics are
    # collected per week first and reduced to a mean afterwards.
    for fetched_bucket in buckets:
        week = datetime.strptime(fetched_bucket['bucket'], "%Y-%m-%d %H:%M:%S").isocalendar()[1]
        if week not in weekly:
            # Outside the requested weeks (on either side): ignore instead of
            # failing with a KeyError.
            continue
        stats = weekly[week]
        sun = fetched_bucket['watt_per_m2_sum']
        rain = fetched_bucket['rain_sum']
        stats['acc_sun_wpm2'] += float(sun) if sun is not None else 0
        stats['acc_rain_sum'] += float(rain) if rain is not None else 0
        stats['avg_percent_humidity'].append(float(fetched_bucket['humidity_percent_avg']))
        stats['avg_wind_speed'].append(float(fetched_bucket['wind_speed_ms_avg']))
        stats['avg_temp'].append(float(fetched_bucket['temp_c_avg']))

    for stats in weekly.values():
        for prop in ('avg_percent_humidity', 'avg_wind_speed', 'avg_temp'):
            samples = stats[prop]
            # A week without samples becomes NaN without numpy's empty-slice warning.
            stats[prop] = np.nanmean(samples) if samples else np.nan
    return weekly


def fetch_and_construct_row(lat, lon, first_week = 14, last_week = 30, radius=10000, verbose=False, year=None):
    """
        Fetch the weather for one coordinate and flatten it into one feature row.

        lat, lon: coordinate in WGS84 (epsg:4326)
        first_week, last_week: inclusive ISO week numbers of the period
        radius: initial search radius; raised by 1000 after every answer that
                valid_response rejects
        verbose: print progress and retry information
        year: calendar year of the period; when omitted, the module-level
              name `year` is used, as in earlier versions of this function

        Returns a list with, for every week in order, acc_sun_wpm2,
        acc_rain_sum, avg_percent_humidity, avg_wind_speed and avg_temp,
        followed by the radius that finally produced a valid answer.

        Raises NameError when no year is given and no module-level `year`
        exists. The fetch loop has no upper bound: it keeps retrying until
        an answer passes valid_response.
    """
    if year is None:
        try:
            year = globals()['year']
        except KeyError:
            raise NameError("name 'year' is not defined; pass year=... explicitly") from None

    #Construct time period
    start_date, end_date = _iso_week_period(year, first_week, last_week)
    period_start = start_date.strftime("%Y-%m-%d %H:%M")
    period_end = end_date.strftime("%Y-%m-%d %H:%M")

    #Fetch data over full time period:
    fetch_complete = False
    if verbose:
        print("Fetching data")
    while not fetch_complete:
        try:
            response = fetch_weather_data(lat, lon, period_start, period_end, verbose=False, radius=radius)
            response_text = json.loads( response.text )
            if valid_response(response_text):
                fetch_complete = True
            else:
                radius += 1000
                if verbose:
                    print(f"Increasing search radius, current: {radius}")
        except (requests.exceptions.RequestException, ValueError, KeyError, TypeError) as inst:
            # Transport failures, undecodable bodies and unexpectedly shaped
            # answers are retried after a pause; anything else is a
            # programming or configuration error and is allowed to surface.
            if verbose:
                print(inst)
            time.sleep(10)
    if verbose:
        print("Fetching complete")

    sample_buckets = _aggregate_weekly(response_text['buckets'], first_week, last_week)

    #Construct and return row (dataframe object)
    row = [sample_buckets[week][prop] for week in range(first_week,last_week+1) for prop in sample_buckets[week]] + [radius]

    return row

In [None]:
def fetch_all_weather_data(first_week = 14, last_week=30, radius=10000):
    """
        Build and store one weather table per year in SELECTED_YEARS.

        first_week, last_week: inclusive week numbers forwarded to
                               fetch_and_construct_row
        radius: initial search radius forwarded to fetch_and_construct_row

        For every year whose 'weather_{year}.feather' does not exist yet, the
        harvest file is read, reprojected to epsg:4326, reduced to one mean
        coordinate per blockid, and one weather row is fetched for each of
        those coordinates. The table is written to 'weather_{year}.feather'
        in the working directory. Returns None.
    """
    # fetch_and_construct_row takes its year from the module-level name `year`.
    # Declaring the loop variable global makes every request use the year that
    # is being processed here instead of a value left over from another cell.
    global year

    weekly_props = ('acc_sun_wpm2', 'acc_rain_sum', 'avg_percent_humidity', 'avg_wind_speed', 'avg_temp')
    col_names = [f'week_{week}_{prop}' for week in range(first_week, last_week+1) for prop in weekly_props] + ['radius']

    for year in SELECTED_YEARS:
        feather_path = f'weather_{year}.feather'
        if os.path.exists(feather_path):
            continue
        harvest_df = gpd.read_file(input_hostvete_paths[year]).to_crs(epsg='4326')

        # One representative coordinate per parcel: the mean of its points.
        # sort=False keeps the parcels in order of first appearance.
        lat_by_parcel = harvest_df.geometry.y.groupby(harvest_df.blockid, sort=False).mean()
        lon_by_parcel = harvest_df.geometry.x.groupby(harvest_df.blockid, sort=False).mean()
        weather_df = pd.DataFrame({
            'parcel_id': lat_by_parcel.index.values.astype(int),
            'sample_coord_lat': lat_by_parcel.values,
            'sample_coord_lon': lon_by_parcel.values,
        })

        # The week range must be forwarded, otherwise the rows would not match
        # col_names for anything but the default weeks.
        coords = zip(weather_df.sample_coord_lat, weather_df.sample_coord_lon)
        rows = [
            fetch_and_construct_row(lat, lon, first_week=first_week, last_week=last_week, radius=radius)
            for lat, lon in tqdm(coords, total=len(weather_df))
        ]
        weather_df = pd.concat([weather_df, pd.DataFrame(rows, columns=col_names)], axis=1)
        weather_df.to_feather(feather_path)
fetch_all_weather_data(radius=1000)

## Append weather data to raw harvest df

In [106]:
# Attach the per-parcel weather features to every harvest row and write one
# augmented GeoJSON file per year.
for year in SELECTED_YEARS:
    weather_df = pd.read_feather(f"weather_{year}.feather")
    harvest_df = gpd.read_file(input_hostvete_paths[year])

    # Keep the parcel key plus the weather columns; if a parcel id occurs more
    # than once, the first row wins.
    weather_features = (
        weather_df
        .drop(["sample_coord_lat" , "sample_coord_lon"], axis=1)
        .drop_duplicates(subset="parcel_id")
    )

    # Merging from the GeoDataFrame keeps the geometry column and its CRS,
    # which are lost when a GeoDataFrame is rebuilt from plain row lists.
    new_df = harvest_df.merge(
        weather_features,
        how="left",
        left_on="blockid",
        right_on="parcel_id",
        indicator=True,
    )
    unmatched = new_df.loc[new_df["_merge"] == "left_only", "blockid"].unique()
    if len(unmatched) > 0:
        raise IndexError(f"No weather row in weather_{year}.feather for parcel ids: {list(unmatched)}")

    new_df = new_df.drop(["parcel_id", "_merge"], axis=1)
    new_df.to_file(output_hostvete_paths[year],driver='GeoJSON')

In [74]:
# Display the table currently bound to `weather_df` as the cell output.
weather_df

Unnamed: 0,parcel_id,sample_coord_lat,sample_coord_lon,week_14_acc_sun_wpm2,week_14_acc_rain_sum,week_14_avg_percent_humidity,week_14_avg_wind_speed,week_14_avg_temp,week_15_acc_sun_wpm2,week_15_acc_rain_sum,...,week_29_acc_rain_sum,week_29_avg_percent_humidity,week_29_avg_wind_speed,week_29_avg_temp,week_30_acc_sun_wpm2,week_30_acc_rain_sum,week_30_avg_percent_humidity,week_30_avg_wind_speed,week_30_avg_temp,radius
0,645,56.116658,12.937426,29001.9,2.0,80.342857,3.714286,7.7,29012.6,42.2,...,23.7,77.242857,3.5,16.485714,34918.7,21.0,85.228571,2.414286,16.414286,12000
1,602,56.102227,12.944871,29001.9,2.0,80.342857,3.714286,7.7,29012.6,42.2,...,23.7,77.242857,3.5,16.485714,34918.7,21.0,85.228571,2.414286,16.414286,11000
2,638,56.085491,12.960148,29001.9,1.0,88.171429,4.057143,7.285714,29012.6,29.4,...,10.4,86.028571,3.642857,16.128571,34918.7,0.6,91.871429,2.985714,15.9,8000
3,5993,55.779163,13.191371,29216.4,0.6,78.685714,3.3,8.257143,26263.6,25.2,...,30.1,78.942857,2.828571,16.614286,37025.7,17.4,83.657143,2.314286,16.857143,5000
