In [1]:
!pip install openmeteo-requests
!pip install requests-cache retry-requests numpy pandas



In [2]:
import openmeteo_requests # GATHER API DATA WITH A NEW API (THIS API WAS USED TO GATHER EVERYTHING EXCEPT NDVI VALUES)
import requests_cache
import pandas as pd
from retry_requests import retry
from collections import Counter

# Build the Open-Meteo client on top of a cached HTTP session that retries failed requests.
# The cache is stored under the name '.cache' with expire_after=-1.
cache_session = requests_cache.CachedSession(
    '.cache',
    expire_after=-1,
)
# Wrap the cached session so a failing request is retried up to 5 times with backoff.
retry_session = retry(
    cache_session,
    retries=5,
    backoff_factor=0.2,
)
# Every gather_data() call goes through this shared client.
openmeteo = openmeteo_requests.Client(session=retry_session)

def gather_data(lat, long, date): # date string such as "2010-01-01"
    """Fetch one day of hourly archive weather for a coordinate and reduce it to daily features.

    A single request is sent to the Open-Meteo archive endpoint with ``date`` used as
    both start and end date, and temperatures requested in Fahrenheit. Each hourly
    series is then collapsed to one number.

    Parameters:
        lat: latitude passed to the API as ``latitude``.
        long: longitude passed to the API as ``longitude``.
        date: day to query, passed as ``start_date`` and ``end_date``.

    Returns:
        A 15-element list, in this order: min temperature, max temperature, mean
        relative humidity, mean 10 m wind speed, mean 100 m wind speed, mean
        precipitation, mean apparent temperature, mean low / high / mid cloud cover,
        mean total cloud cover, most frequent weather code, mean soil temperature
        (0-7 cm), mean soil moisture (0-7 cm), and the elevation reported by the
        response.
    """
    # Single source of truth for the requested hourly variables. The API returns the
    # series in the order they were requested, so positions are derived from this tuple.
    hourly_variables = (
        "temperature_2m",
        "relative_humidity_2m",
        "wind_speed_10m",
        "wind_speed_100m",
        "precipitation",
        "apparent_temperature",
        "cloud_cover_low",
        "cloud_cover_high",
        "cloud_cover_mid",
        "cloud_cover",
        "weather_code",
        "soil_temperature_0_to_7cm",
        "soil_moisture_0_to_7cm",
    )

    request_params = {
        "latitude": lat,
        "longitude": long,
        "start_date": date,
        "end_date": date,
        "temperature_unit": "fahrenheit",
        "hourly": list(hourly_variables),
        # Elevation is not an hourly variable; it is read from the response itself below
        # (see https://open-meteo.com/en/docs/historical-weather-api).
        "elevation": None,
    }
    api_responses = openmeteo.weather_api(
        "https://archive-api.open-meteo.com/v1/archive",
        params=request_params,
    )

    # Only one location is requested, so only the first response is used.
    response = api_responses[0]
    hourly = response.Hourly()

    # Map each variable name to its hourly numpy array.
    series = {
        name: hourly.Variables(position).ValuesAsNumpy()
        for position, name in enumerate(hourly_variables)
    }

    # Weather codes are categorical, so they are summarised by the most frequent
    # value of the day rather than by an average.
    dominant_weather_code = Counter(series["weather_code"]).most_common()[0][0]

    return [
        series["temperature_2m"].min(),
        series["temperature_2m"].max(),
        series["relative_humidity_2m"].mean(),
        series["wind_speed_10m"].mean(),
        series["wind_speed_100m"].mean(),
        series["precipitation"].mean(),
        series["apparent_temperature"].mean(),
        series["cloud_cover_low"].mean(),
        series["cloud_cover_high"].mean(),
        series["cloud_cover_mid"].mean(),
        series["cloud_cover"].mean(),
        dominant_weather_code,
        series["soil_temperature_0_to_7cm"].mean(),
        series["soil_moisture_0_to_7cm"].mean(),
        response.Elevation(),
    ]

    # ALTERNATIVE (not used): return a labelled pd.Series so callers can fill several
    # columns with one DataFrame.apply call, e.g.
    # return pd.Series({'TEMP_MAX': series["temperature_2m"].max(), 'TEMP_MIN': series["temperature_2m"].min(), 'AVG_WIND': series["wind_speed_10m"].mean(), 'AVG_RELATIVE_HUMIDITY': series["relative_humidity_2m"].mean()})

# Smoke test: fetch the daily aggregates for one coordinate and one date.
# The cell output is the 15-element list returned by gather_data().
gather_data(37.36379623413086, -121.91233825683594, "2010-01-01")

[47.8733,
 60.293297,
 75.40211,
 4.9450903,
 6.041948,
 0.0,
 50.873962,
 15.791667,
 83.75,
 47.75,
 90.25,
 3.0,
 50.400803,
 0.39425,
 18.0]

In [None]:
# IMPORT WILDFIRE DATA FROM API (fire column values are set to 1)

import pandas as pd
import numpy as np
from datetime import datetime
from time import sleep
from random import randint
from tqdm import tqdm

# Positional (half-open) range of all_fires.csv rows handled by this run.
# CHANGE THESE 2 LINES ACCORDINGLY.
START_ROW = 17000
END_ROW = 20000

# .copy() so the columns added below go to an independent frame, not a view of the slice.
top_20000 = pd.read_csv('./all_fires.csv', index_col=0).iloc[START_ROW:END_ROW].copy()
# Label rows by their position in the full file. The length comes from the actual slice
# so a CSV with fewer than END_ROW rows no longer fails with a length mismatch.
top_20000.index = np.arange(START_ROW, START_ROW + len(top_20000), 1)

# A one-line DataFrame.apply(gather_data, ...) would also work, but the explicit row loop
# below is kept so progress can be tracked and periodically checkpointed to CSV.

# Weather features, in exactly the order gather_data() returns them.
columns = ['TEMP_MIN', 'TEMP_MAX', 'AVG_RELATIVE_HUMIDITY', 'AVG_10M_WIND', 'AVG_100M_WIND', 'AVG_PRECIPITATION', 'APPARENT_TEMP', 'CLOUD_LOW', 'CLOUD_HIGH',
           'CLOUD_MID', 'CLOUD', 'AVG_WEATHER_CONDITION', 'AVG_SOIL_TEMP', 'AVG_SOIL_MOISTURE', 'ELEVATION']

# Pre-create the feature columns as floats.
# NOTE(review): rows that are skipped below keep this 0.0 placeholder in the saved CSV,
# which is indistinguishable from a real zero reading - consider NaN if downstream allows.
for column in columns:
    top_20000[column] = 0.0

# Keep only the columns that go into the output file, in the output order.
top_20000 = top_20000[['FIRENAME', 'DISCOVERYDATETIME', 'STATCAUSE', 'LATDD83', 'LONGDD83', 'YEAR', 'MONTH', 'DAY', 'zip_code', 'fire', 'TEMP_MAX', 'TEMP_MIN',
                       'AVG_RELATIVE_HUMIDITY', 'AVG_10M_WIND', 'AVG_100M_WIND', 'AVG_PRECIPITATION', 'APPARENT_TEMP', 'CLOUD_LOW', 'CLOUD_HIGH', 'CLOUD_MID', 'CLOUD',
                       'AVG_WEATHER_CONDITION', 'AVG_SOIL_TEMP', 'AVG_SOIL_MOISTURE', 'ELEVATION']]

for idx, row in tqdm(top_20000.iterrows(), total=len(top_20000)):

    cur_lat = row.LATDD83
    cur_long = row.LONGDD83
    # The date is the first whitespace-separated token of DISCOVERYDATETIME.
    target_date = row.DISCOVERYDATETIME.split()[0]

    # Skip coordinates outside the valid ranges. Written as a positive range test so
    # NaN coordinates are skipped too instead of being sent to the API.
    if not (-90 <= cur_lat <= 90 and -180 <= cur_long <= 180):
        continue

    # Throttle only when a request is actually about to be made.
    sleep(0.1)
    results = gather_data(cur_lat, cur_long, target_date)

    if not results:
        print('no data')
        continue

    # results is ordered like `columns`, so pair them up positionally.
    for column, value in zip(columns, results):
        top_20000.loc[idx, column] = value

    # Checkpoint the whole frame every 1000 index labels (never on the first row).
    if idx % 1000 == 0 and idx != top_20000.index[0]:
        top_20000.to_csv(f'./{top_20000.index[0]}_TO_{idx}.csv')

In [None]:
# Final save of the current `top_20000` frame. The file name is built from the first
# index label and one past the last index label.
top_20000.to_csv(f'./{top_20000.index[0]}_TO_{top_20000.index[-1] + 1}.csv')

In [None]:
# IMPORT NOWILDFIRE DATA FROM API (fire column values are set to 0) (get data from same stations that experienced fires, but take data from a random month/day that doesn't conflict with the month/day the fire happened)
import pandas as pd
import numpy as np
from datetime import datetime
from time import sleep
from random import randint
from tqdm import tqdm

# Positional (half-open) range of all_fires.csv rows handled by this run.
# CHANGE THESE 2 LINES ACCORDINGLY.
START_ROW = 19000
END_ROW = 20000

# The full fire table is kept so that every known fire (not only those inside the
# current slice) can be excluded when a non-fire date is drawn.
fire_records = pd.read_csv('./all_fires.csv', index_col=0)

# .copy() so the columns added below go to an independent frame, not a view of the slice.
top_20000 = fire_records.iloc[START_ROW:END_ROW].copy()
# Length comes from the actual slice so a short CSV no longer causes a length mismatch.
top_20000.index = np.arange(START_ROW, START_ROW + len(top_20000), 1)

# A one-line DataFrame.apply(gather_data, ...) would also work, but the explicit row loop
# below is kept so progress can be tracked and periodically checkpointed to CSV.

# Weather features, in exactly the order gather_data() returns them.
columns = ['TEMP_MIN', 'TEMP_MAX', 'AVG_RELATIVE_HUMIDITY', 'AVG_10M_WIND', 'AVG_100M_WIND', 'AVG_PRECIPITATION', 'APPARENT_TEMP', 'CLOUD_LOW', 'CLOUD_HIGH',
           'CLOUD_MID', 'CLOUD', 'AVG_WEATHER_CONDITION', 'AVG_SOIL_TEMP', 'AVG_SOIL_MOISTURE', 'ELEVATION']

# Pre-create the feature columns as floats.
# NOTE(review): rows that are skipped below keep this 0.0 placeholder in the saved CSV.
for column in columns:
    top_20000[column] = 0.0

# Keep only the columns that go into the output file, in the output order.
top_20000 = top_20000[['FIRENAME', 'DISCOVERYDATETIME', 'STATCAUSE', 'LATDD83', 'LONGDD83', 'YEAR', 'MONTH', 'DAY', 'zip_code', 'fire', 'TEMP_MAX', 'TEMP_MIN',
                       'AVG_RELATIVE_HUMIDITY', 'AVG_10M_WIND', 'AVG_100M_WIND', 'AVG_PRECIPITATION', 'APPARENT_TEMP', 'CLOUD_LOW', 'CLOUD_HIGH', 'CLOUD_MID', 'CLOUD',
                       'AVG_WEATHER_CONDITION', 'AVG_SOIL_TEMP', 'AVG_SOIL_MOISTURE', 'ELEVATION']]

# (lat rounded to 0.1, long rounded to 0.1, year, month, day) combinations that are
# already taken: every recorded fire, plus each non-fire date drawn in this run.
tracker = {
    (round(float(lat), ndigits=1), round(float(lon), ndigits=1), year, month, day)
    for lat, lon, year, month, day in zip(
        fire_records.LATDD83, fire_records.LONGDD83,
        fire_records.YEAR, fire_records.MONTH, fire_records.DAY,
    )
}

for idx, row in tqdm(top_20000.iterrows(), total=len(top_20000)):

    cur_lat = row.LATDD83
    cur_long = row.LONGDD83

    # Skip coordinates outside the valid ranges (a NaN coordinate fails this test too).
    if not (-90 <= cur_lat <= 90 and -180 <= cur_long <= 180):
        continue

    year = int(row.YEAR)
    lat_key = round(float(cur_lat), ndigits=1)
    long_key = round(float(cur_long), ndigits=1)

    # Enumerate every (month, day) still free for this location and year. Days are
    # limited to 1-28 so the date is valid in every month. Drawing from the free list
    # replaces the previous nested retry loops, which raised once a list ran empty.
    free_dates = [
        (month, day)
        for month in range(1, 13)
        for day in range(1, 29)
        if (lat_key, long_key, year, month, day) not in tracker
    ]
    if not free_dates:
        continue

    new_month, new_day = free_dates[np.random.choice(len(free_dates))]

    tracker.add((lat_key, long_key, year, new_month, new_day))
    top_20000.loc[idx, 'MONTH'] = new_month
    top_20000.loc[idx, 'DAY'] = new_day
    # This row now describes a day without a recorded fire, so label it as such
    # (the rows are read from the fire table and would otherwise keep its label).
    # NOTE(review): FIRENAME / DISCOVERYDATETIME / STATCAUSE still hold the original
    # fire's values for this row.
    top_20000.loc[idx, 'fire'] = 0

    # Month and day must be two digits in the API date string.
    target_date = f'{year}-{new_month:02d}-{new_day:02d}'

    sleep(0.2) # avoid minutely request limit
    results = gather_data(cur_lat, cur_long, target_date)

    if not results:
        print('no data')
        continue

    # results is ordered like `columns`, so pair them up positionally.
    for column, value in zip(columns, results):
        top_20000.loc[idx, column] = value

    # Checkpoint the whole frame every 1000 index labels (never on the first row).
    if idx % 1000 == 0 and idx != top_20000.index[0]:
        top_20000.to_csv(f'./{top_20000.index[0]}_TO_{idx}.csv')

In [None]:
# Final save of the current `top_20000` frame. The end label is derived from the index
# (one past the last label) instead of being hard-coded, so the file name stays correct
# when the processed row range is changed.
top_20000.to_csv(f'./{top_20000.index[0]}_TO_{top_20000.index[-1] + 1}.csv')

In [None]:
# IMPORT NOWILDFIRE DATA FROM API (fire column values are set to 0) (get data from RANDOMIZED LAT/LONG PAIRS)
import pandas as pd
import numpy as np
from datetime import datetime
from time import sleep
from random import randint
from tqdm import tqdm

# Positional (half-open) range of NEW_LAT_LONG_PAIRS.csv rows handled by this run.
# CHANGE THESE 2 LINES ACCORDINGLY (goes to 26252).
START_ROW = 23000
END_ROW = 26001

top_20000 = pd.read_csv('./NEW_LAT_LONG_PAIRS.csv', index_col=0).iloc[START_ROW:END_ROW]
# Length comes from the actual slice so a short CSV no longer causes a length mismatch.
top_20000.index = np.arange(START_ROW, START_ROW + len(top_20000), 1)

# A one-line DataFrame.apply(gather_data, ...) would also work, but the explicit row loop
# below is kept so progress can be tracked and periodically checkpointed to CSV.

# The source file stores longitude as 'x' and latitude as 'y'. Select the two
# coordinate columns by name (rather than by position) and drop everything else,
# including 'geometry'. .copy() gives an independent frame to add columns to.
top_20000 = top_20000.rename(columns={'x':'LONGDD83', 'y':'LATDD83'})
top_20000 = top_20000[['LATDD83', 'LONGDD83']].copy()

# Date and label columns are filled in per row below; every sample here is a non-fire one.
for column in ['YEAR', 'MONTH', 'DAY', 'fire']:
    top_20000[column] = 0

# Weather features, in exactly the order gather_data() returns them.
columns = ['TEMP_MIN', 'TEMP_MAX', 'AVG_RELATIVE_HUMIDITY', 'AVG_10M_WIND', 'AVG_100M_WIND', 'AVG_PRECIPITATION', 'APPARENT_TEMP', 'CLOUD_LOW', 'CLOUD_HIGH',
           'CLOUD_MID', 'CLOUD', 'AVG_WEATHER_CONDITION', 'AVG_SOIL_TEMP', 'AVG_SOIL_MOISTURE', 'ELEVATION']

# Pre-create the feature columns as floats.
# NOTE(review): rows that are skipped below keep this 0.0 placeholder in the saved CSV.
for column in columns:
    top_20000[column] = 0.0

# Output column order.
top_20000 = top_20000[['LATDD83', 'LONGDD83', 'YEAR', 'MONTH', 'DAY', 'fire', 'TEMP_MAX', 'TEMP_MIN',
                       'AVG_RELATIVE_HUMIDITY', 'AVG_10M_WIND', 'AVG_100M_WIND', 'AVG_PRECIPITATION', 'APPARENT_TEMP', 'CLOUD_LOW', 'CLOUD_HIGH', 'CLOUD_MID', 'CLOUD',
                       'AVG_WEATHER_CONDITION', 'AVG_SOIL_TEMP', 'AVG_SOIL_MOISTURE', 'ELEVATION']]

# (lat rounded to 0.1, long rounded to 0.1, year, month, day) combinations that are
# already taken: every recorded fire, plus each non-fire date drawn in this run.
all_fires = pd.read_csv('./all_fires.csv')
tracker = {
    (round(float(lat), ndigits=1), round(float(lon), ndigits=1), fire_year, fire_month, fire_day)
    for lat, lon, fire_year, fire_month, fire_day in zip(
        all_fires.LATDD83, all_fires.LONGDD83,
        all_fires.YEAR, all_fires.MONTH, all_fires.DAY,
    )
}

# To also exclude previously generated non-fire samples, add their keys to `tracker`
# the same way from './all_no_fires.csv' (currently not done).

for idx, row in tqdm(top_20000.iterrows(), total=len(top_20000)):

    cur_lat = row.LATDD83
    cur_long = row.LONGDD83

    # Skip coordinates outside the valid ranges (a NaN coordinate fails this test too).
    if not (-90 <= cur_lat <= 90 and -180 <= cur_long <= 180):
        continue

    year = int(np.random.choice([2021, 2022, 2023]))
    lat_key = round(float(cur_lat), ndigits=1)
    long_key = round(float(cur_long), ndigits=1)

    # Enumerate every (month, day) still free for this location and year. Days are
    # limited to 1-28 so the date is valid in every month. Drawing from the free list
    # replaces the previous nested retry loops, which raised once a list ran empty.
    free_dates = [
        (month, day)
        for month in range(1, 13)
        for day in range(1, 29)
        if (lat_key, long_key, year, month, day) not in tracker
    ]
    if not free_dates:
        continue

    new_month, new_day = free_dates[np.random.choice(len(free_dates))]

    tracker.add((lat_key, long_key, year, new_month, new_day))
    top_20000.loc[idx, 'YEAR'] = year
    top_20000.loc[idx, 'MONTH'] = new_month
    top_20000.loc[idx, 'DAY'] = new_day

    # Month and day must be two digits in the API date string.
    target_date = f'{year}-{new_month:02d}-{new_day:02d}'

    sleep(0.2) # avoid minutely request limit
    results = gather_data(cur_lat, cur_long, target_date)

    if not results:
        print('no data')
        continue

    # results is ordered like `columns`, so pair them up positionally.
    for column, value in zip(columns, results):
        top_20000.loc[idx, column] = value

    # Checkpoint the whole frame every 1000 index labels (never on the first row).
    if idx % 1000 == 0 and idx != top_20000.index[0]:
        top_20000.to_csv(f'./{top_20000.index[0]}_TO_{idx}.csv')

In [None]:
# Final save of the current `top_20000` frame. The file name is built from the first
# index label and one past the last index label.
# NOTE(review): this writes under ./data/ while the in-loop checkpoints write to the
# working directory; presumably ./data/ must already exist - confirm.
top_20000.to_csv(f'./data/{top_20000.index[0]}_TO_{top_20000.index[-1] + 1}.csv')

In [None]:
# DISCLAIMER: THE OPEN-METEO API CLIENT CODE USED ABOVE WAS TAKEN FROM https://open-meteo.com/en/docs/historical-weather-api