In [2]:
import pandas as pd
import geopandas as gpd
import numpy as np
from lonlat_to_country import get_country_vectorized_name , get_country_vectorized_ISO, get_country
import h5py
from shapely import vectorized
from tqdm import tqdm
import shapely.vectorized
import os

Index(['FIPS', 'ISO2', 'ISO3', 'UN', 'NAME', 'AREA', 'POP2005', 'REGION',
       'SUBREGION', 'LON', 'LAT', 'geometry'],
      dtype='object')


In [2]:
# Load the world-borders shapefile. get_country_vectorized_ISO_2 (defined further
# down) iterates its rows and reads each row's `geometry` and 'ISO3' values.
_SHAPEFILE = gpd.read_file("TM_WORLD_BORDERS-0.3.shp")

In [3]:
# Input / output HDF5 file names and the names of the two coordinate columns.
input_file, output_file = "forest_exp_v2.h5", "forest_exp_country_v3.h5"
colum_latitude, column_longitude = 'latitude', 'longitude'

df = pd.read_hdf(input_file)

# Coerce both coordinate columns to numbers; entries that cannot be parsed become NaN.
for coord_column in (colum_latitude, column_longitude):
    df[coord_column] = pd.to_numeric(df[coord_column], errors='coerce')

# Build one point per row (x = longitude, y = latitude) and attach it as the geometry.
point_geometries = gpd.points_from_xy(df[column_longitude], df[colum_latitude])
gdf = gpd.GeoDataFrame(df, geometry=point_geometries)

# Keep only the rows that have both a longitude and a latitude.
gdf = gdf.dropna(subset=[column_longitude, colum_latitude])



In [4]:
# Display the GeoDataFrame built above (input columns plus the point geometry).
gdf

Unnamed: 0,latitude,longitude,value,geometry
4763222,78.804167,11.723611,1.0,POINT (11.72361 78.80417)
4802069,78.720833,10.806944,1.0,POINT (10.80694 78.72083)
4892960,78.526389,15.556944,1.0,POINT (15.55694 78.52639)
5009602,78.276389,15.612500,1.0,POINT (15.61250 78.27639)
5048732,78.193056,22.556944,1.0,POINT (22.55694 78.19306)
...,...,...,...,...
67603419,-55.890278,-67.248611,1.0,POINT (-67.24861 -55.89028)
67603420,-55.890278,-67.220833,1.0,POINT (-67.22083 -55.89028)
67603424,-55.890278,-67.109722,1.0,POINT (-67.10972 -55.89028)
67629338,-55.945833,-67.276389,1.0,POINT (-67.27639 -55.94583)


## Check with a 100-point sample

In [24]:
# Take rows 200-299 (by position) as a 100-row sample for a quick test run.
# .copy() makes the sample an independent frame, so adding the 'region_id'
# column to it in test mode does not write to a slice of `gdf` (which would
# trigger pandas' SettingWithCopyWarning).
gdf_2 = gdf.iloc[200:300].copy()

In [25]:
# Inspect the 100-row test sample.
gdf_2

Unnamed: 0,latitude,longitude,value,geometry
7345388,73.276389,98.556944,1.0,POINT (98.55694 73.27639)
7345530,73.276389,102.501389,1.0,POINT (102.50139 73.27639)
7345755,73.276389,108.751389,1.0,POINT (108.75139 73.27639)
7345967,73.276389,114.640278,1.0,POINT (114.64028 73.27639)
7346065,73.276389,117.362500,1.0,POINT (117.36250 73.27639)
...,...,...,...,...
7423643,73.109722,112.306944,1.0,POINT (112.30694 73.10972)
7423644,73.109722,112.334722,1.0,POINT (112.33472 73.10972)
7423649,73.109722,112.473611,1.0,POINT (112.47361 73.10972)
7423651,73.109722,112.529167,1.0,POINT (112.52917 73.10972)


## Main code to run (for testing, change `gdf` to `gdf_2` on the last line of the cell)

In [5]:
def get_country_vectorized_ISO_2(coordinates):
    """Label each point with the ISO3 code of the country polygon that contains it.

    Parameters
    ----------
    coordinates : sized iterable of point objects exposing ``.x`` and ``.y``
        For example the ``geometry`` column of a GeoDataFrame.

    Returns
    -------
    list
        One entry per input point, in input order: the ``'ISO3'`` value of the
        ``_SHAPEFILE`` row whose geometry contains the point, or ``None`` when no
        geometry contains it. If several geometries contain the same point, the
        one that comes last in ``_SHAPEFILE`` wins.
    """
    n_points = len(coordinates)
    # Object arrays from np.empty start out filled with None, which is the
    # "no country found" marker in the result.
    iso3_codes = np.empty(n_points, dtype=object)
    if n_points == 0:
        return iso3_codes.tolist()

    # The coordinate arrays are the same for every country, so build them once
    # here instead of once per country inside the loop.
    xs = np.array([point.x for point in coordinates])
    ys = np.array([point.y for point in coordinates])

    for _, country in tqdm(_SHAPEFILE.iterrows(), total=len(_SHAPEFILE), desc='Processing countries'):
        inside = shapely.vectorized.contains(country.geometry, xs, ys)
        iso3_codes[inside] = country['ISO3']  # 'ISO3' is the ISO3-code column of _SHAPEFILE
    return iso3_codes.tolist()


# Label every point in the full GeoDataFrame with its country's ISO3 code.
# For a quick test, replace `gdf` with `gdf_2` on both sides of the assignment.
gdf['region_id'] = get_country_vectorized_ISO_2(gdf.geometry)

Processing countries: 100%|███████████████████████████████████████████████████████| 246/246 [7:41:41<00:00, 112.61s/it]


In [27]:
# Inspect the test sample; the recorded output shows the 'region_id' column from a test run.
gdf_2

Unnamed: 0,latitude,longitude,value,geometry,region_id
7345388,73.276389,98.556944,1.0,POINT (98.55694 73.27639),RUS
7345530,73.276389,102.501389,1.0,POINT (102.50139 73.27639),RUS
7345755,73.276389,108.751389,1.0,POINT (108.75139 73.27639),RUS
7345967,73.276389,114.640278,1.0,POINT (114.64028 73.27639),RUS
7346065,73.276389,117.362500,1.0,POINT (117.36250 73.27639),RUS
...,...,...,...,...,...
7423643,73.109722,112.306944,1.0,POINT (112.30694 73.10972),RUS
7423644,73.109722,112.334722,1.0,POINT (112.33472 73.10972),RUS
7423649,73.109722,112.473611,1.0,POINT (112.47361 73.10972),RUS
7423651,73.109722,112.529167,1.0,POINT (112.52917 73.10972),RUS


#### save first draft file

In [6]:
# First draft: strip the geometry, set 'value' to 1 everywhere, and write to HDF5.
attributes_only = gdf.drop(columns='geometry')  # every column except the point geometry
df1 = pd.DataFrame(attributes_only)  # plain pandas DataFrame
df1["value"] = 1  # every row gets the constant value 1
df1.to_hdf(output_file, mode='w', key="data")  # mode='w' replaces an existing file

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['region_id'], dtype='object')]

  df1.to_hdf(output_file, key="data", mode='w')


# Check the saved file and remove nulls

In [1]:
import h5py

In [3]:
# Reload the first-draft file written earlier and display it.
# Note: this rebinds `df`, which previously held the raw input frame.
df = pd.read_hdf("forest_exp_country_v3.h5")
df

Unnamed: 0,latitude,longitude,value,region_id
4763222,78.804167,11.723611,1,
4802069,78.720833,10.806944,1,SJM
4892960,78.526389,15.556944,1,SJM
5009602,78.276389,15.612500,1,SJM
5048732,78.193056,22.556944,1,SJM
...,...,...,...,...
67603419,-55.890278,-67.248611,1,CHL
67603420,-55.890278,-67.220833,1,CHL
67603424,-55.890278,-67.109722,1,CHL
67629338,-55.945833,-67.276389,1,


In [19]:
# Count the rows whose 'region_id' is null.
df["region_id"].isnull().sum()

16853

In [20]:
# Drop the rows that have no 'region_id'. Restricting dropna to that column
# keeps the filter from also discarding rows with a null in any other column.
df_2 = df.dropna(subset=["region_id"])

In [22]:
# Display the frame after the null rows have been dropped.
df_2

Unnamed: 0,latitude,longitude,value,region_id
4802069,78.720833,10.806944,1,SJM
4892960,78.526389,15.556944,1,SJM
5009602,78.276389,15.612500,1,SJM
5048732,78.193056,22.556944,1,SJM
5100313,78.081944,15.362500,1,SJM
...,...,...,...,...
67590463,-55.862500,-67.137500,1,CHL
67590464,-55.862500,-67.109722,1,CHL
67603419,-55.890278,-67.248611,1,CHL
67603420,-55.890278,-67.220833,1,CHL


## Save final file 

In [23]:
# Write the null-free frame to the final HDF5 file under key "data" (mode='w' replaces an existing file).
df_2.to_hdf("forest_exp_region_final.h5", key="data", mode='w')