# Libraries

In [6]:
from netCDF4 import Dataset
from urllib import request
from datetime import date
import pandas as pd
import numpy as np
import rasterio
import zipfile
import os

# Root directory

In [7]:
# Folder that receives every downloaded and derived dataset of this notebook.
root_directory_path = "./datasets"

# exist_ok=True keeps the cell idempotent and avoids the check-then-create race.
os.makedirs(root_directory_path, exist_ok=True)

# Forest Fire Data

Near real-time (NRT) Moderate Resolution Imaging Spectroradiometer (MODIS) Thermal Anomalies / Fire locations - Collection 61 processed by NASA's Land, Atmosphere Near real-time Capability for EO (LANCE) Fire Information for Resource Management System (FIRMS), using swath products (MOD14/MYD14) rather than the tiled MOD14A1 and MYD14A1 products. The thermal anomalies / active fire represent the center of a 1 km pixel that is flagged by the MODIS MOD14/MYD14 Fire and Thermal Anomalies algorithm (Giglio 2003) as containing one or more fires within the pixel. This is the most basic fire product in which active fires and other thermal anomalies, such as volcanoes, are identified.

For more information, see [here](https://www.earthdata.nasa.gov/learn/find-data/near-real-time/firms/mcd14dl-nrt#ed-firms-attributes).

Download the dataset [here](https://firms.modaps.eosdis.nasa.gov/data/download/DL_FIRE_SV-C2_438634.zip).

In [None]:
# Data from 2002 through 2011.
local_csv_2002_2011 = f"{root_directory_path}/forest_fire_Colombia_2002_2011.csv"
local_file = f"{root_directory_path}/forest_fire_Colombia_2002_2011.zip"
remote_url = "https://firms.modaps.eosdis.nasa.gov/data/download/DL_FIRE_M-C61_438863.zip"

# Fetch the zip and unpack every member into the dataset folder.
request.urlretrieve(remote_url, local_file)
with zipfile.ZipFile(local_file, 'r') as fire_archive:
  fire_archive.extractall(root_directory_path)

# Keep only the archive CSV under its final name; the zip and the Readme are deleted.
for leftover in (local_file, f"{root_directory_path}/Readme.txt"):
  os.remove(leftover)
os.rename(f"{root_directory_path}/fire_archive_M-C61_438863.csv", local_csv_2002_2011)

In [None]:
# Data from 2012 through 2022.
local_csv_2012_2022 = f"{root_directory_path}/forest_fire_Colombia_2012_2022.csv"
local_file = f"{root_directory_path}/forest_fire_Colombia_2012_2022.zip"
remote_url = "https://firms.modaps.eosdis.nasa.gov/data/download/DL_FIRE_SV-C2_438634.zip"

# Fetch the zip and unpack every member into the dataset folder.
request.urlretrieve(remote_url, local_file)
with zipfile.ZipFile(local_file, 'r') as fire_archive:
  fire_archive.extractall(root_directory_path)

# Only the archive CSV is kept: the zip, the Readme and the NRT CSV are deleted.
for leftover in (
    local_file,
    f"{root_directory_path}/Readme.txt",
    f"{root_directory_path}/fire_nrt_SV-C2_438634.csv",
):
  os.remove(leftover)
os.rename(f"{root_directory_path}/fire_archive_SV-C2_438634.csv", local_csv_2012_2022)

In [None]:
# Data from 2019 through 2024.
local_csv_2019_2024 = f"{root_directory_path}/forest_fire_Colombia_2019_2024.csv"
local_file = f"{root_directory_path}/forest_fire_Colombia_2019_2024.zip"
remote_url = "https://firms.modaps.eosdis.nasa.gov/data/download/DL_FIRE_J1V-C2_438632.zip"

# Fetch the zip and unpack every member into the dataset folder.
request.urlretrieve(remote_url, local_file)
with zipfile.ZipFile(local_file, 'r') as fire_archive:
  fire_archive.extractall(root_directory_path)

# Here the NRT CSV is the one kept under its final name; the zip and the Readme are deleted.
for leftover in (local_file, f"{root_directory_path}/Readme.txt"):
  os.remove(leftover)
os.rename(f"{root_directory_path}/fire_nrt_J1V-C2_438632.csv", local_csv_2019_2024)

In [None]:
# Load the three per-period CSV files produced by the download cells.
df_csv_2002_2011 = pd.read_csv(local_csv_2002_2011)
df_csv_2012_2022 = pd.read_csv(local_csv_2012_2022)
df_csv_2019_2024 = pd.read_csv(local_csv_2019_2024)

# Parse the acquisition dates so they can be compared and sorted chronologically.
df_csv_2002_2011['acq_date'] = pd.to_datetime(df_csv_2002_2011['acq_date'])
df_csv_2012_2022['acq_date'] = pd.to_datetime(df_csv_2012_2022['acq_date'])
df_csv_2019_2024['acq_date'] = pd.to_datetime(df_csv_2019_2024['acq_date'])

# The 2012-2022 and 2019-2024 files overlap in time: keep from the older file only
# the rows dated strictly before the first date of the newer one.
date_min_2019_2024 = df_csv_2019_2024['acq_date'].min()
df_csv_2012_2022 = df_csv_2012_2022[df_csv_2012_2022['acq_date'] < date_min_2019_2024]

forest_fire_path = f"{root_directory_path}/forest_fire_Colombia.csv"
df_forest_fire = pd.concat([df_csv_2002_2011, df_csv_2012_2022, df_csv_2019_2024])
# sort_values returns a new frame; its result has to be assigned for the sort to
# take effect. A stable sort keeps same-day rows in their concatenation order.
df_forest_fire = (
    df_forest_fire
    .sort_values(by="acq_date", kind="stable")
    .reset_index(drop=True)
)
df_forest_fire.to_csv(forest_fire_path, index=False)

In [None]:
# Delete the three per-period CSV files from disk.
for per_period_csv in (local_csv_2002_2011, local_csv_2012_2022, local_csv_2019_2024):
  os.remove(per_period_csv)

# NDVI Data

This dataset contains dekadal NDVI indicators computed from NASA's Moderate Resolution Imaging Spectroradiometer (MODIS) collection 6.1 from the Aqua and Terra satellites, aggregated by sub-national administrative units.

Included indicators are (for each dekad):

- 10 day NDVI (vim)
- NDVI long term average (vim_lta)
- 10 day NDVI anomaly [%] (viq)

The administrative units used for aggregation are based on WFP data and contain a Pcode reference attributed to each unit. The number of input pixels used to create the aggregates is provided in the n_pixels column.

More information [here](https://data.humdata.org/dataset/col-ndvi-subnational)

In [None]:
# Download the NDVI CSV into the dataset folder.
local_file = f"{root_directory_path}/ndvi_Colombia.csv"
remote_url = (
    "https://data.humdata.org/dataset/7f2ba5ba-8df1-41cf-ab18-fc1da928a1e5"
    "/resource/c06298d9-0d4d-4e40-aecc-abc1da75dc4d"
    "/download/col-ndvi-adm2-full.csv"
)
request.urlretrieve(remote_url, local_file)

('./datasets/ndvi_Colombia.csv', <http.client.HTTPMessage at 0x7d0c596c2290>)

# Global Climate Data

TerraClimate is a dataset of monthly climate and climatic water balance for global terrestrial surfaces from 1958-2019. These data provide important inputs for ecological and hydrological studies at global scales that require high spatial resolution and time-varying data. All data have monthly temporal resolution and a ~4-km (1/24th degree) spatial resolution. The data cover the period from 1958-2020. We plan to update these data periodically (annually).

More information [here](https://www.climatologylab.org/terraclimate.html)

In [None]:
def check_latlon_bounds(lat, lon, lat_index, lon_index, lat_target, lon_target):
    """Adjust a pair of grid indices by at most one step around a target point.

    For each axis independently: if the coordinate at the index is greater than
    the target, the index is decremented (unless it is already 0); then, if the
    coordinate at the (possibly updated) index is smaller than the target, the
    index is incremented (unless it is already the last valid index).

    Args:
        lat: indexable sequence of latitude coordinates.
        lon: indexable sequence of longitude coordinates.
        lat_index: starting index into ``lat``.
        lon_index: starting index into ``lon``.
        lat_target: latitude the index should bracket.
        lon_target: longitude the index should bracket.

    Returns:
        ``[lat_index, lon_index]`` after adjustment; both stay within
        ``0 .. len(axis) - 1``.
    """
    def _adjust(axis, index, target):
        # The last valid position is len(axis) - 1; comparing against len(axis)
        # would let the index step one past the end of the axis.
        last = len(axis) - 1
        if axis[index] > target and index != 0:
            index = index - 1
        if axis[index] < target and index != last:
            index = index + 1
        return index

    return [_adjust(lat, lat_index, lat_target), _adjust(lon, lon_index, lon_target)]

def get_data_country(df_modis, varnames, last_year=2023):
  """Sample TerraClimate variables at the place and date of every fire record.

  For each calendar year covered by ``df_modis`` (capped at ``last_year``), every
  requested variable file is opened from the remote THREDDS server, subset to the
  bounding box of that year's records, and sampled at the grid cell and time step
  nearest to each record.

  Args:
    df_modis: DataFrame with ``acq_date``, ``latitude`` and ``longitude`` columns.
      The caller's frame is left untouched.
    varnames: list of TerraClimate variable names; each becomes a key of the result.
    last_year: last year to request. Defaults to 2023, the bound the year loop
      used before it became a parameter. Records dated after it are not sampled.

  Returns:
    dict mapping ``"date"``, ``"lat"``, ``"lon"`` and every name in ``varnames``
    to lists of equal length, one entry per sampled record, in date order.
  """
  values = {varname: [] for varname in ["date", "lat", "lon"] + varnames}
  # assign() builds a new frame, so the caller's acq_date column is not overwritten.
  df_modis = df_modis.assign(acq_date=pd.to_datetime(df_modis["acq_date"]))
  df_modis = df_modis.sort_values(by="acq_date")
  year_min, year_max = df_modis['acq_date'].min().year, df_modis['acq_date'].max().year
  # The code treats the files' time axis as days since this date.
  epoch = pd.to_datetime("1900-01-01")

  # Never iterate past the last year that actually has records.
  for year in range(year_min, min(year_max, last_year) + 1):
    in_year = df_modis["acq_date"].between(
        pd.to_datetime(f"{year}-01-01"), pd.to_datetime(f"{year}-12-31")
    )
    df = df_modis[in_year]
    if df.empty:
      # Nothing to sample: skip the downloads for this year altogether.
      continue

    print(f"Descargando datos del año {year}: [ ", end="")
    date_values, lat_values, lon_values = df['acq_date'], df['latitude'], df['longitude']
    lat_min, lon_min = lat_values.min(), lon_values.min()
    lat_max, lon_max = lat_values.max(), lon_values.max()

    values['date'] += [str(date_.date()) for date_ in date_values]
    values['lat'] += list(lat_values.values)
    values['lon'] += list(lon_values.values)
    date_values = (date_values - epoch).dt.days

    for varname in varnames:
      print(f"{varname} ", end="")
      pathname = f"http://thredds.northwestknowledge.net:8080/thredds/dodsC/TERRACLIMATE_ALL/data/TerraClimate_{varname}_{year}.nc"
      # The context manager closes the remote handle even if sampling fails.
      with Dataset(pathname, 'r', format="NETCDF4") as filehandle:
        # subset in space (lat/lon)
        lat = filehandle.variables['lat'][:]
        lon = filehandle.variables['lon'][:]

        # indices of the grid cells nearest to the corners of the bounding box
        lat_index_min = (np.abs(lat-lat_min)).argmin()
        lat_index_max = (np.abs(lat-lat_max)).argmin()
        lon_index_min = (np.abs(lon-lon_min)).argmin()
        lon_index_max = (np.abs(lon-lon_max)).argmin()

        [lat_index_min, lon_index_min] = check_latlon_bounds(lat, lon, lat_index_min, lon_index_min, lat_min, lon_min)
        # The latitude target of the upper corner is lat_max (not lon_max).
        [lat_index_max, lon_index_max] = check_latlon_bounds(lat, lon, lat_index_max, lon_index_max, lat_max, lon_max)

        # The axes may be stored in either direction, so order each pair of indices.
        lat_index_range = range(min(lat_index_min, lat_index_max), max(lat_index_min, lat_index_max) + 1)
        lon_index_range = range(min(lon_index_min, lon_index_max), max(lon_index_min, lon_index_max) + 1)

        lat = lat[lat_index_range]
        lon = lon[lon_index_range]

        # subset in time
        timehandle = filehandle.variables['time']
        time = timehandle[:]
        time_min = (date(year,1,1)-date(1900,1,1)).days
        time_max = (date(year,12,31)-date(1900,1,1)).days
        time_index_min = (np.abs(time-time_min)).argmin()
        time_index_max = (np.abs(time-time_max)).argmin()
        time_index_range = range(time_index_min, time_index_max+1)
        time = timehandle[time_index_range]

        # subset data
        datahandle = filehandle.variables[varname]
        data = datahandle[time_index_range, lat_index_range, lon_index_range]

      # Nearest time step / cell for every record (one column per record).
      # NOTE(review): "nearest" can select the following time step for records late
      # in a period - confirm against how the files stamp their time axis.
      time_indexes = np.abs(time[:, np.newaxis] - date_values.to_numpy()).argmin(axis=0)
      lat_indexes = np.abs(lat[:, np.newaxis] - lat_values.to_numpy()).argmin(axis=0)
      lon_indexes = np.abs(lon[:, np.newaxis] - lon_values.to_numpy()).argmin(axis=0)
      values[varname] += list(data[time_indexes, lat_indexes, lon_indexes])
    print("]")
  return values

In [None]:
# Variables requested from get_data_country for every fire record.
varnames = [
    "ws", "vpd", "vap", "tmin", "tmax", "swe", "srad",
    "soil", "q", "ppt", "pet", "def", "aet", "PDSI",
]
values = get_data_country(df_forest_fire, varnames)

# Turn the returned dict of lists into a table and write it without the index column.
global_climate_path = f"{root_directory_path}/global_climate_Colombia.csv"
df_global_climate = pd.DataFrame(data=values)
df_global_climate.to_csv(global_climate_path, index=False)

Descargando datos del año 2002: [ ws vpd vap tmin tmax swe srad soil q ppt pet def aet PDSI ]
Descargando datos del año 2003: [ ws vpd vap tmin tmax swe srad soil q ppt pet def aet PDSI ]
Descargando datos del año 2004: [ ws vpd vap tmin tmax swe srad soil q ppt pet def aet PDSI ]
Descargando datos del año 2005: [ ws vpd vap tmin tmax swe srad soil q ppt pet def aet PDSI ]
Descargando datos del año 2006: [ ws vpd vap tmin tmax swe srad soil q ppt pet def aet PDSI ]
Descargando datos del año 2007: [ ws vpd vap tmin tmax swe srad soil q ppt pet def aet PDSI ]
Descargando datos del año 2008: [ ws vpd vap tmin tmax swe srad soil q ppt pet def aet PDSI ]
Descargando datos del año 2009: [ ws vpd vap tmin tmax swe srad soil q ppt pet def aet PDSI ]
Descargando datos del año 2010: [ ws vpd vap tmin tmax swe srad soil q ppt pet def aet PDSI ]
Descargando datos del año 2011: [ ws vpd vap tmin tmax swe srad soil q ppt pet def aet PDSI ]
Descargando datos del año 2012: [ ws vpd vap tmin tmax swe s

# Population Density Data

Estimated population density per grid-cell. The dataset is available to download in Geotiff and ASCII XYZ format at a resolution of 30 arc-seconds (approximately 1km at the equator). The projection is Geographic Coordinate System, WGS84. The units are number of people per square kilometre based on country totals adjusted to match the corresponding official United Nations population estimates that have been prepared by the Population Division of the Department of Economic and Social Affairs of the United Nations Secretariat (2019 Revision of World Population Prospects). The mapping approach is Random Forest-based dasymetric redistribution.

More information [here](https://hub.worldpop.org/geodata/summary?id=45716)

In [None]:
# Download and unpack one population-density archive per year, 2002 through 2020.
for year in range(2002, 2021):
  remote_url = f"https://data.worldpop.org/GIS/Population_Density/Global_2000_2020_1km/{year}/COL/col_pd_{year}_1km_ASCII_XYZ.zip"
  local_file = f"{root_directory_path}/population_density_Colombia_{year}.zip"
  request.urlretrieve(remote_url, local_file)

  with zipfile.ZipFile(local_file, 'r') as zip_ref:
    zip_ref.extractall(root_directory_path)
  os.remove(local_file)

# Years 2021-2023 are not downloaded; they reuse the 2020 table under their own
# file names. The range starts at 2021 so the 2020 file is not rewritten onto itself.
local_file = f"{root_directory_path}/col_pd_2020_1km_ASCII_XYZ.csv"
df_temp = pd.read_csv(local_file)
for year in range(2021, 2024):
  df_temp.to_csv(f"{root_directory_path}/col_pd_{year}_1km_ASCII_XYZ.csv", index=False)

In [None]:
# Bounding box of all fire records; the population grid is cropped to it below.
lat_values, lon_values = df_forest_fire['latitude'], df_forest_fire['longitude']
lat_min, lat_max = lat_values.min(), lat_values.max()
lon_min, lon_max = lon_values.min(), lon_values.max()
pd_values = {"lat": lat_values.to_numpy(), "lon": lon_values.to_numpy()}
df_population_density = pd.DataFrame({'longitude': [], 'latitude': []})

# Join the yearly tables on their coordinates: one pd_<year> column per year.
for year in range(2002, 2024):
  df_pd = pd.read_csv(f"{root_directory_path}/col_pd_{year}_1km_ASCII_XYZ.csv")
  df_pd = df_pd.rename(columns={'X': 'longitude', 'Y': 'latitude', 'Z': f'pd_{year}'})
  df_population_density = pd.merge(df_population_density, df_pd, on=['longitude', 'latitude'], how='outer')

# Keep only the cells inside the bounding box (both ends inclusive).
# No sort is needed for boolean filtering; the earlier sort_values calls discarded
# their result and therefore only cost time without reordering anything.
inside_lat = df_population_density['latitude'].between(lat_min, lat_max)
inside_lon = df_population_density['longitude'].between(lon_min, lon_max)
df_population_density = df_population_density[inside_lat & inside_lon]
df_population_density.to_csv(f"{root_directory_path}/population_density_Colombia.csv", index=False)

# Delete the yearly source tables.
for year in range(2002, 2024):
  os.remove(f"{root_directory_path}/col_pd_{year}_1km_ASCII_XYZ.csv")

# Land cover data

The Intergovernmental Panel on Climate Change (IPCC) provides guidance on reporting areal extent and change of land cover and land use, requiring the use of estimators that neither over- nor underestimate dynamics to the degree possible, and that have known uncertainties. The maps provided by GLAD do not have these properties. However, the maps can be leveraged to facilitate appropriate probability-based statistical methods in deriving statistically valid areas of forest extent and change. Specifically, the maps may be used as a stratifier in targeting forest extent and/or change by a probability sample. The team at GLAD has demonstrated such approaches using the GLAD forest loss data in sample-based area estimation (Tyukavina et al., ERL, 2018, Turubanova et al., ERL, 2019, and Potapov et al., RSE, 2019, among others).

More information [here](https://storage.googleapis.com/earthenginepartners-hansen/GLCLU2000-2020/v2/download.html)

Legend [here](https://storage.googleapis.com/earthenginepartners-hansen/GLCLU2000-2020/legend.xlsx)

In [None]:
# (N, W) labels of the tiles to download; '00' is a string so the label keeps its leading zero.
range_values = [(20, 80), (10, 80), (10, 70), ('00', 80)]

# One (year, tile label) pair per file, years first and tiles in range_values order.
tile_jobs = [
    (year, f"{north}N_0{west}W")
    for year in range(2000, 2021, 5)
    for north, west in range_values
]
for year, tile_label in tile_jobs:
  remote_url = f"https://storage.googleapis.com/earthenginepartners-hansen/GLCLU2000-2020/v2/{year}/{tile_label}.tif"
  land_cover_path = f"{root_directory_path}/land_cover_Colombia_{year}_{tile_label}.tif"
  request.urlretrieve(remote_url, land_cover_path)
  print(f"Descargado: {land_cover_path}")

Descargado: ./datasets/land_cover_Colombia_2000_20N_080W.tif
Descargado: ./datasets/land_cover_Colombia_2000_10N_080W.tif
Descargado: ./datasets/land_cover_Colombia_2000_10N_070W.tif
Descargado: ./datasets/land_cover_Colombia_2000_00N_080W.tif
Descargado: ./datasets/land_cover_Colombia_2005_20N_080W.tif
Descargado: ./datasets/land_cover_Colombia_2005_10N_080W.tif
Descargado: ./datasets/land_cover_Colombia_2005_10N_070W.tif
Descargado: ./datasets/land_cover_Colombia_2005_00N_080W.tif
Descargado: ./datasets/land_cover_Colombia_2010_20N_080W.tif
Descargado: ./datasets/land_cover_Colombia_2010_10N_080W.tif
Descargado: ./datasets/land_cover_Colombia_2010_10N_070W.tif
Descargado: ./datasets/land_cover_Colombia_2010_00N_080W.tif
Descargado: ./datasets/land_cover_Colombia_2015_20N_080W.tif
Descargado: ./datasets/land_cover_Colombia_2015_10N_080W.tif
Descargado: ./datasets/land_cover_Colombia_2015_10N_070W.tif
Descargado: ./datasets/land_cover_Colombia_2015_00N_080W.tif
Descargado: ./datasets/l

In [None]:
def get_land_cover(lat, lon, year):
  """Return the land-cover value of the pixel containing a point.

  The tiles listed in ``range_values`` are tried in order for the given map
  year; the value comes from the first tile whose pixel grid contains the point.

  Args:
    lat: latitude of the point, in the coordinates of the tiles.
    lon: longitude of the point, in the coordinates of the tiles.
    year: map year used to build the tile file names.

  Returns:
    The band-1 value of the matching pixel, or ``None`` when no tile contains
    the point.
  """
  for N, W in range_values:
    land_cover_path = f"{root_directory_path}/land_cover_Colombia_{year}_{N}N_0{W}W.tif"

    with rasterio.open(land_cover_path) as src:
      # index() takes (x, y) = (lon, lat) and returns (row, col), in that order.
      row, col = src.index(lon, lat)

      # Points outside this tile yield out-of-range indices: try the next tile
      # instead of reading a window that does not belong to the point.
      if not (0 <= row < src.height and 0 <= col < src.width):
        continue

      # A window is ((row_start, row_stop), (col_start, col_stop)): one pixel here.
      land_cover = src.read(1, window=((row, row + 1), (col, col + 1)))
      if land_cover.size > 0:
        return land_cover[0][0]

  return None

In [None]:
# Each map year serves the five calendar years starting with it.
for year in range(2000, 2021, 5):
  land_cover_csv_path = f"{root_directory_path}/land_cover_Colombia_{year}_{year + 4}.csv"
  min_date, max_date = pd.to_datetime(f"{year}-01-01"), pd.to_datetime(f"{year + 4}-12-31")

  # Fire records whose acquisition date falls inside the window (both ends inclusive).
  acq_dates = pd.to_datetime(df_forest_fire['acq_date'])
  df_tff = df_forest_fire[acq_dates.between(min_date, max_date)]

  # Look up the land-cover value of every selected record, one point at a time.
  lat_values, lon_values = df_tff['latitude'].to_numpy(), df_tff['longitude'].to_numpy()
  land_cover_values = [
      get_land_cover(point_lat, point_lon, year)
      for point_lat, point_lon in zip(lat_values, lon_values)
  ]

  df_lc_csv = pd.DataFrame({'latitude': lat_values, 'longitude': lon_values, 'land_cover': land_cover_values})
  df_lc_csv.to_csv(land_cover_csv_path, index=False)
  print(f"Datos desde {str(min_date.date())} hasta {str(max_date.date())} descargados")

Datos desde 2000-01-01 hasta 2004-12-31 descargados
Datos desde 2005-01-01 hasta 2009-12-31 descargados
Datos desde 2010-01-01 hasta 2014-12-31 descargados
Datos desde 2015-01-01 hasta 2019-12-31 descargados
Datos desde 2020-01-01 hasta 2024-12-31 descargados


In [None]:
# Read the five per-period tables and stack them with a single concat, rather
# than growing a frame inside the loop from an empty seed.
df_land_cover = pd.concat(
    [
        pd.read_csv(f"{root_directory_path}/land_cover_Colombia_{year}_{year + 4}.csv")
        for year in range(2000, 2021, 5)
    ]
)

df_land_cover.to_csv(f"{root_directory_path}/land_cover_Colombia.csv", index=False)

# Delete the per-period CSV files and the downloaded tiles.
for year in range(2000, 2021, 5):
  os.remove(f"{root_directory_path}/land_cover_Colombia_{year}_{year + 4}.csv")

  for N, W in range_values:
    os.remove(f"{root_directory_path}/land_cover_Colombia_{year}_{N}N_0{W}W.tif")

In [12]:
# Download the legend spreadsheet into the dataset folder.
land_cover_legend_path = f"{root_directory_path}/land_cover_legend_Colombia.xlsx"
remote_url = "https://storage.googleapis.com/earthenginepartners-hansen/GLCLU2000-2020/legend.xlsx"
request.urlretrieve(remote_url, land_cover_legend_path)

('./datasets/land_cover_legend_Colombia.xlsx',
 <http.client.HTTPMessage at 0x285bd58d150>)

In [28]:
# Load the legend spreadsheet; its unnamed third column is used as the 'class' column.
df_land_cover_legend = pd.read_excel(land_cover_legend_path)
df_land_cover_legend = df_land_cover_legend.rename(columns={'Unnamed: 2': 'class'})

def set_values(column1, column2, indexes, nan_indexes=None):
    """Fill row ranges of the legend table with the value found at each range start.

    Works on the module-level ``df_land_cover_legend`` frame, modifying it in place.

    Args:
        column1: column that receives the values.
        column2: column the fill value is read from, at the first row of each range.
        indexes: iterable of ``(start_index, end_index)`` row labels, both inclusive.
        nan_indexes: optional iterable of row labels whose ``column1`` cell is
            set to NaN after the ranges have been filled.
    """
    for start_index, end_index in indexes:
        # Read the fill value before writing: column1 and column2 may be the same column.
        fill_value = df_land_cover_legend.at[start_index, column2]
        # Integer labels address the rows directly (no float linspace round-trip).
        row_labels = list(range(start_index, end_index + 1))
        df_land_cover_legend.loc[row_labels, column1] = fill_value

    # np.nan is the supported spelling; the np.NAN alias was removed in NumPy 2.0.
    for nan_index in nan_indexes or ():
        df_land_cover_legend.at[nan_index, column1] = np.nan

# Same column
set_values('General class', 'General class', [(0, 96), (100, 196), (200, 207)], [97, 197, 208, 242, 245, 251, 255])
set_values('class', 'class', [(0, 1), (2, 18), (19, 24), (25, 48), (100, 101), (102, 118), (119, 124), (125, 148)], [49, 149])

# Other column
set_values('class', 'General class', [(200, 207), (241, 241), (244, 244), (250, 250), (254, 254)])
set_values('Sub-class', 'General class', [(241, 241), (244, 244), (250, 250), (254, 254)])

# Save the normalised legend as CSV and delete the downloaded spreadsheet.
df_land_cover_legend.to_csv(f"{root_directory_path}/land_cover_legend_Colombia.csv", index=False)
os.remove(land_cover_legend_path)