# Subset Water Survey of Canada 2022 shapefiles
The source data contains 7000+ stations. We only want to retain those that are part of the Reference Hydrometric Basin Network (RHBN).

In [13]:
import os
import sys
import shutil
import zipfile
import pandas as pd
from pathlib import Path
sys.path.append(str(Path().absolute().parent))
import python_cs_functions as cs

### Config handling

In [5]:
# Relative path to the config file that the following cells read their settings from
config_file = "../0_config/config.txt"

In [9]:
# Look up the three config entries this notebook uses, in this order:
# 'data_path', 'ref_shps_path' and 'can_rhbn_meta_url'
data_path, shps_path, rhbn_meta = (
    cs.read_from_config(config_file, key)
    for key in ('data_path', 'ref_shps_path', 'can_rhbn_meta_url')
)

### Define file folder 

In [7]:
# Folder holding the WSC 2022 data: <data_path>/<shps_path>/RHBN-CAN/WSC2022/temp
data_folder = Path(data_path, shps_path, 'RHBN-CAN', 'WSC2022', 'temp')

### Define where the metadata file is

In [10]:
# Folder that holds the RHBN metadata file: <data_path>/<shps_path>/RHBN-CAN
meta_folder = Path(data_path).joinpath(shps_path, 'RHBN-CAN')

In [11]:
# Derive the metadata file name from the URL:
#   1. keep only the text after the final '/'
#   2. trim leading/trailing whitespace
#   3. swap 'xlsx' for 'csv' (note: this replaces every occurrence of 'xlsx'
#      in the name, not only the extension)
url_tail = rhbn_meta.rsplit('/', 1)[-1]
meta_name = url_tail.strip().replace('xlsx', 'csv')

### Subset WSC 2022 shapes to RHBN 2020 basins

In [15]:
# Load the RHBN 2020 station list from the CSV in the metadata folder
df = pd.read_csv(str(meta_folder.joinpath(meta_name)))

In [17]:
# Each sub-directory of the data folder is taken to be one station with a
# WSC 2022 shapefile; collect the directory names
with os.scandir(data_folder) as entries:
    stations = [entry.name for entry in entries if entry.is_dir()]

In [46]:
# Remove any stations not in the list
# Build the lookup once so every membership test is a set lookup instead of a
# fresh scan over the whole STATION_NUMBER column
rhbn_stations = set(df.STATION_NUMBER.values)

for station in stations:
    if station in rhbn_stations:
        continue  # part of the RHBN list: keep its files

    # Delete the station's folder and everything in it
    shutil.rmtree(data_folder / station)

    # Delete the matching '<station>.qgz' file that sits next to the folder.
    # If that file is absent there is nothing to clean up; do not let it abort
    # the loop halfway through and leave the remaining stations unprocessed.
    try:
        os.remove(data_folder / (station + '.qgz'))
    except FileNotFoundError:
        pass