In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
from lonlat_to_country import get_country_vectorized_name , get_country_vectorized_ISO, get_country
import h5py
from shapely import vectorized
from tqdm import tqdm
import shapely.vectorized
import os

Index(['FIPS', 'ISO2', 'ISO3', 'UN', 'NAME', 'AREA', 'POP2005', 'REGION',
       'SUBREGION', 'LON', 'LAT', 'geometry'],
      dtype='object')


In [2]:
# Load the world-borders shapefile into a GeoDataFrame.
# get_country_vectorized_ISO_2 iterates over its rows, reading each row's
# geometry and 'ISO3' value.
_SHAPEFILE = gpd.read_file("TM_WORLD_BORDERS-0.3.shp")

In [3]:
# --- Configuration: file names and coordinate column names ---
input_file = "forest_exp_v3.h5" # file with forest data converted from nc to h5
output_file = "forest_exp_region_final_v2.h5" # first output for region_id assignment
colum_latitude = 'latitude'
column_longitude = 'longitude'

# --- Load the input table ---
df = pd.read_hdf(input_file)

# Coerce the coordinate columns to numeric; anything that cannot be parsed
# (e.g. a malformed string) becomes NaN instead of raising.
df[colum_latitude] = pd.to_numeric(df[colum_latitude], errors='coerce')
df[column_longitude] = pd.to_numeric(df[column_longitude], errors='coerce')

# Drop rows with a missing coordinate BEFORE building geometries, so that no
# Point object is constructed for a row that would be discarded anyway.
df_valid = df.dropna(subset=[column_longitude, colum_latitude])

# Build the GeoDataFrame: one Point(longitude, latitude) per remaining row.
gdf = gpd.GeoDataFrame(
    df_valid,
    geometry=gpd.points_from_xy(df_valid[column_longitude], df_valid[colum_latitude]),
)



In [4]:
# Display the GeoDataFrame that remains after rows with NaN coordinates were dropped
gdf

Unnamed: 0,latitude,longitude,value,geometry
4763222,78.804167,11.723611,1.0,POINT (11.72361 78.80417)
4802069,78.720833,10.806944,1.0,POINT (10.80694 78.72083)
4892960,78.526389,15.556944,1.0,POINT (15.55694 78.52639)
5009602,78.276389,15.612500,1.0,POINT (15.61250 78.27639)
5048732,78.193056,22.556944,1.0,POINT (22.55694 78.19306)
...,...,...,...,...
67603419,-55.890278,-67.248611,1.0,POINT (-67.24861 -55.89028)
67603420,-55.890278,-67.220833,1.0,POINT (-67.22083 -55.89028)
67603424,-55.890278,-67.109722,1.0,POINT (-67.10972 -55.89028)
67629338,-55.945833,-67.276389,1.0,POINT (-67.27639 -55.94583)


## Main code to run: assign an ISO3 `region_id` to every point

In [5]:
def get_country_vectorized_ISO_2(coordinates):
    """Assign an ISO3 country code to every point in ``coordinates``.

    Every row of the module-level ``_SHAPEFILE`` is tested against all points
    with ``shapely.vectorized.contains``; points inside a row's geometry
    receive that row's ``'ISO3'`` value. If a point is contained in more than
    one geometry, the row iterated last overwrites earlier assignments.

    Parameters
    ----------
    coordinates : sized iterable of point objects exposing ``.x`` and ``.y``
        For example the ``geometry`` column of a GeoDataFrame of points.

    Returns
    -------
    list
        One entry per input point, in input order: the matching ISO3 code, or
        ``None`` for points not contained in any geometry.
    """
    n_points = len(coordinates)

    # Object array so it can hold strings; NumPy initialises object arrays
    # created with np.empty to None, which marks "no country matched".
    iso3_codes = np.empty(n_points, dtype=object)

    # The coordinate arrays do not depend on the country, so extract them ONCE.
    # Building them inside the loop repeated a Python-level pass over every
    # point for each of the shapefile's rows and dominated the runtime.
    xs = np.fromiter((point.x for point in coordinates), dtype=float, count=n_points)
    ys = np.fromiter((point.y for point in coordinates), dtype=float, count=n_points)

    for idx, country in tqdm(_SHAPEFILE.iterrows(), total=len(_SHAPEFILE), desc='Processing countries'):
        # Boolean mask: True where the point lies inside this country's geometry.
        inside = shapely.vectorized.contains(country.geometry, xs, ys)
        iso3_codes[inside] = country['ISO3']  # 'ISO3' is the shapefile column holding the ISO3 code

    return iso3_codes.tolist()


# Look up an ISO3 code for every point in the GeoDataFrame and store the
# result in a new 'region_id' column (one lookup over all rows at once).
gdf['region_id'] = get_country_vectorized_ISO_2(gdf.geometry) 

Processing countries: 100%|██████████| 246/246 [4:10:19<00:00, 61.06s/it]  


In [6]:
# Inspect the result; rows whose point matched no country geometry have no region_id
gdf

Unnamed: 0,latitude,longitude,value,geometry,region_id
4763222,78.804167,11.723611,1.0,POINT (11.72361 78.80417),
4802069,78.720833,10.806944,1.0,POINT (10.80694 78.72083),SJM
4892960,78.526389,15.556944,1.0,POINT (15.55694 78.52639),SJM
5009602,78.276389,15.612500,1.0,POINT (15.61250 78.27639),SJM
5048732,78.193056,22.556944,1.0,POINT (22.55694 78.19306),SJM
...,...,...,...,...,...
67603419,-55.890278,-67.248611,1.0,POINT (-67.24861 -55.89028),CHL
67603420,-55.890278,-67.220833,1.0,POINT (-67.22083 -55.89028),CHL
67603424,-55.890278,-67.109722,1.0,POINT (-67.10972 -55.89028),CHL
67629338,-55.945833,-67.276389,1.0,POINT (-67.27639 -55.94583),


#### save first draft file

In [7]:
# Drop the 'geometry' column and convert back to a plain pandas DataFrame
df = pd.DataFrame(gdf.drop(columns='geometry'))
df["value"] = 1  # set every value to 1; per the original note this is not strictly necessary because the value is changed again in step 5
df

Unnamed: 0,latitude,longitude,value,region_id
4763222,78.804167,11.723611,1,
4802069,78.720833,10.806944,1,SJM
4892960,78.526389,15.556944,1,SJM
5009602,78.276389,15.612500,1,SJM
5048732,78.193056,22.556944,1,SJM
...,...,...,...,...
67603419,-55.890278,-67.248611,1,CHL
67603420,-55.890278,-67.220833,1,CHL
67603424,-55.890278,-67.109722,1,CHL
67629338,-55.945833,-67.276389,1,


### save file

In [8]:
# Save the DataFrame (geometry column already dropped) to the first-draft HDF5 file,
# overwriting any existing file (mode='w').
# pandas warns here that PyTables will pickle the object-typed 'region_id' column.
df.to_hdf(output_file, key="data", mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['region_id'], dtype='object')]

  df.to_hdf(output_file, key="data", mode='w')


### Save corrected file (rows with missing values removed)

In [18]:
# Reload the first-draft file (the same file name that `output_file` is set to above)
data = pd.read_hdf("forest_exp_region_final_v2.h5")

In [19]:
# Display the reloaded table
data

Unnamed: 0,latitude,longitude,value,region_id
4763222,78.804167,11.723611,1,
4802069,78.720833,10.806944,1,SJM
4892960,78.526389,15.556944,1,SJM
5009602,78.276389,15.612500,1,SJM
5048732,78.193056,22.556944,1,SJM
...,...,...,...,...
67603419,-55.890278,-67.248611,1,CHL
67603420,-55.890278,-67.220833,1,CHL
67603424,-55.890278,-67.109722,1,CHL
67629338,-55.945833,-67.276389,1,


In [20]:
# Drop every row that contains a NaN/None in any column
# (this removes the rows that received no region_id).
data = data.dropna() # drop nan values
data

Unnamed: 0,latitude,longitude,value,region_id
4802069,78.720833,10.806944,1,SJM
4892960,78.526389,15.556944,1,SJM
5009602,78.276389,15.612500,1,SJM
5048732,78.193056,22.556944,1,SJM
5100313,78.081944,15.362500,1,SJM
...,...,...,...,...
67590463,-55.862500,-67.137500,1,CHL
67590464,-55.862500,-67.109722,1,CHL
67603419,-55.890278,-67.248611,1,CHL
67603420,-55.890278,-67.220833,1,CHL


In [26]:
# Write the cleaned table to the final HDF5 file, overwriting any existing file (mode='w')
data.to_hdf("forest_exp_region_final_v3.h5", key="data", mode='w')

In [27]:
# Read the final file back and display it to confirm what was written
ds = pd.read_hdf("forest_exp_region_final_v3.h5") # final file to use for step 4
ds

Unnamed: 0,latitude,longitude,value,region_id
4802069,78.720833,10.806944,1,SJM
4892960,78.526389,15.556944,1,SJM
5009602,78.276389,15.612500,1,SJM
5048732,78.193056,22.556944,1,SJM
5100313,78.081944,15.362500,1,SJM
...,...,...,...,...
67590463,-55.862500,-67.137500,1,CHL
67590464,-55.862500,-67.109722,1,CHL
67603419,-55.890278,-67.248611,1,CHL
67603420,-55.890278,-67.220833,1,CHL
