# Join all shapefiles into one GeoDataFrame

In [1]:
import glob
import geopandas as gpd
import pandas as pd

# Collect every shapefile in the working directory. glob makes no ordering
# guarantee, so the paths are sorted to keep the combined row order (and the
# regenerated index) reproducible across runs and file systems.
shapefiles = sorted(glob.glob('*.shp'))

# Fail early with a clear message; pd.concat on an empty list would otherwise
# raise an opaque "No objects to concatenate" error.
if not shapefiles:
    raise FileNotFoundError("No '*.shp' files found in the current working directory")

# Read each shapefile into its own GeoDataFrame
gdfs = [gpd.read_file(file) for file in shapefiles]

# Stack all GeoDataFrames into one, discarding the per-file indexes
gdf = pd.concat(gdfs, ignore_index=True)

# Check if it worked
#gdf.info()

In [None]:
# Preview the first rows of the combined GeoDataFrame
gdf.head()

In [3]:
# Summarise the combined GeoDataFrame: row count, columns, non-null counts, dtypes
gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 271741 entries, 0 to 271740
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype   
---  ------    --------------   -----   
 0   id        271741 non-null  int64   
 1   geometry  271741 non-null  geometry
dtypes: geometry(1), int64(1)
memory usage: 4.1 MB


In [4]:
# Rename the polygon id column to 'WDPAID' (the key used for the later merge)
gdf = gdf.rename(columns={'id': 'WDPAID'})

## Read the CSV produced by `point_data_merge.ipynb`

In [5]:
# Load the attribute table; low_memory=False parses the file in one pass so
# each column gets a single inferred dtype
nopoly_csv = 'nopoly.csv'
df = pd.read_csv(nopoly_csv, low_memory=False)

In [None]:
# Inspect the columns, non-null counts and dtypes of the loaded CSV table
df.info()

In [7]:
# Drop the 'geometry' column read from the CSV, since it is empty anyway.
# errors='ignore' keeps this cell re-runnable: executing it a second time no
# longer raises KeyError once the column is already gone.
df = df.drop(columns='geometry', errors='ignore')

## Merge gdf with df

In [None]:
# Left join: keep every CSV record and attach the polygon geometry where the
# WDPAID has a match; records without a match get a null geometry
poly_df = pd.merge(df, gdf, how='left', on='WDPAID')
poly_df.info()

In [9]:
# Flag each column that holds at least one missing value, then list the flagged names
na_flags = poly_df.isna().any()
missing_value_columns = [column for column, has_na in na_flags.items() if has_na]
print("Columns with missing values:", missing_value_columns)

Columns with missing values: ['geometry']


Because the request failed to return geometries for some WDPAIDs, missing values in the `geometry` column are expected. Two separate DataFrames are therefore exported: one containing the rows with geometries and one containing the rows without.

In [None]:
# Flag every row whose WDPAID appears more than once in the merged table
# (keep=False marks all occurrences, not only the repeats)
wdpaid_repeated = poly_df['WDPAID'].duplicated(keep=False)
poly_df2 = poly_df[wdpaid_repeated]

# Show the rows that share a WDPAID with at least one other row
print(poly_df2)

In [12]:
# Keep only the first occurrence of each fully identical row (all columns compared)
exact_repeat = poly_df.duplicated(keep='first')
poly_df_nd = poly_df[~exact_repeat]

In [None]:
# Inspect the table after dropping duplicate rows
poly_df_nd.info()

In [14]:
# Partition the de-duplicated rows by whether the 'geometry' value is missing
no_geometry = poly_df_nd['geometry'].isna()
df1 = poly_df_nd[no_geometry]   # rows without geometry: 4315 entries
df2 = poly_df_nd[~no_geometry]  # rows with geometry: 277 195 entries

In [26]:
# Export 
#df2.to_csv('Poly_final.csv', index=False)

# If 'geometry' column is present, convert df2 to a GeoDataFrame
#gdf2 = gpd.GeoDataFrame(df2, geometry='geometry')

#gdf2['geometry'].iloc[272879]

# Export to Shapefile (SHP)
#gdf2.to_file('Poly_final.shp')

In [21]:
# Export
#df1.to_csv('Nogeom_final.csv', index=False)
# Wrap df1 in a GeoDataFrame with 'geometry' as its geometry column
gdf1 = gpd.GeoDataFrame(data=df1, geometry='geometry')
#gdf1.to_file('Nogeom_final.shp')