# RGI-07: Region 15

F. Maussion & S. Galos, June 2021

In [1]:
import geopandas as gpd
import pandas as pd
import os
import matplotlib.pyplot as plt
import subprocess
from utils import mkdir
import os

### specify RGI-region and storage paths

In [2]:
# RGI first-order region handled by this notebook
reg = 15

# Root of the data tree, relative to rgi7_scripts/workflow
data_dir = '../../rgi7_data/'

# Input: level 2 GLIMS selections (one tarball per region is read from here)
l2_dir = os.path.join(data_dir, 'l2_sel_reg_tars')

# Input: RGI v6 outlines of the same region, only used for the comparison at the end
rgi6_reg_file = os.path.join(data_dir, 'l0_RGIv6', '15_rgi60_SouthAsiaEast.zip')

# Outputs: shapefile directory and the directory receiving the tarball.
# The value returned by mkdir is used as the directory path further down.
output_dir = mkdir(os.path.join(data_dir, 'l3_rgi7a'))
output_dir_tar = mkdir(os.path.join(data_dir, 'l3_rgi7a_tar'))

In [3]:
# Shapefile with the RGI first-order region outlines; used to clip GAMDAM to this region
reg_file = os.path.join(data_dir, 'l0_regions', '00_rgi70_regions', '00_rgi70_O1Regions.shp')

# This region is based on GAMDAM, so the original GAMDAM files serve as reference
support_dir = os.path.join(data_dir, 'l0_support_data')
gamdam_dir = os.path.join(support_dir, 'gamdam')

### Load the input data

In [4]:
# Open the level 2 shapefile straight from the regional tarball (no extraction needed)
rgi_name = f'RGI{reg:02d}'
l2_uri = 'tar://' + l2_dir + f'/{rgi_name}.tar.gz/{rgi_name}/{rgi_name}.shp'
shp = gpd.read_file(l2_uri)

### Apply selection criteria to create the RGI-07 data subset

In [5]:
# Keep only the outlines attributed to the analyst 'Sakai, Akiko'
is_selected = shp['analysts'] == 'Sakai, Akiko'
RGI_ss = shp.loc[is_selected]

## Write out and tar 

In [6]:
# Per-region output directory.
# NOTE(review): reset=True presumably empties an existing directory first - confirm in utils.mkdir
dd = mkdir(f'{output_dir}/RGI{reg:02d}/', reset=True)

print('Writing...')
# os.path.join gives the same path whether or not `dd` ends with a separator,
# so the shapefile always lands inside the directory that gets archived below
RGI_ss.to_file(os.path.join(dd, f'RGI{reg:02d}.shp'))

print('Taring...')
# check=True raises CalledProcessError when tar exits with a non-zero status,
# instead of continuing silently with a missing or partial archive
print(subprocess.run(['tar', '-zcvf', f'{output_dir_tar}/RGI{reg:02d}.tar.gz', '-C', output_dir, f'RGI{reg:02d}'],
                     check=True))

Writing...
Taring...
CompletedProcess(args=['tar', '-zcvf', '../../rgi7_data/l3_rgi7a_tar/RGI15.tar.gz', '-C', '../../rgi7_data/l3_rgi7a', 'RGI15'], returncode=0)


## New RGI-file created - Check result!

### load reference data (here GAMDAM original) to enable comparison

In [7]:
# Load every GAMDAM archive and stack the outlines into one reference table
import zipfile
import glob

# Sorted so that the concatenation order (and therefore the index created by
# reset_index below) is the same on every run and platform
gam_files = sorted(glob.glob(gamdam_dir + '/*.zip'))
if not gam_files:
    raise FileNotFoundError(f'No GAMDAM zip archives found in {gamdam_dir}')

ref_parts = []
for gf in gam_files:
    # Look up the shapefile name inside the archive. endswith() avoids matching
    # sidecar files such as '*.shp.xml', which a plain substring test would accept.
    with zipfile.ZipFile(gf, "r") as z:
        shp_names = [member.filename for member in z.filelist if member.filename.endswith('.shp')]
    if not shp_names:
        # Fail loudly instead of silently re-using the name found in a previous archive
        raise FileNotFoundError(f'No .shp file found in {gf}')
    # As before, the last matching entry wins if there are several
    fname = shp_names[-1]
    ref_parts.append(gpd.read_file('zip://' + gf + '/' + fname))

# reset_index() gives every outline a unique label; the per-file index is kept as column 'index'
df_ref = pd.concat(ref_parts).reset_index()

In [8]:
# One representative point per reference outline
ref_points = df_ref.representative_point()

# Wrap the points in a frame and carry the label of the originating row along,
# so that the full polygons can be looked up again after the overlay
ref_rp = ref_points.to_frame('geometry')
ref_rp['orig_index'] = df_ref.index

In [9]:
# Load the outlines of the RGI first-order regions
reg_f = gpd.read_file(filename=reg_file)

In [10]:
# Outline of the region of interest (RGI_CODE is compared against the zero-padded region number)
region_outline = reg_f.loc[reg_f.RGI_CODE == f'{reg:02d}']

# Keep the representative points that fall inside that outline
ref_intersect = gpd.overlay(ref_rp, region_outline, how='intersection')

In [11]:
# Go back from the retained points to the polygons they were derived from
kept_labels = ref_intersect['orig_index'].values
ref_odf = df_ref.loc[kept_labels]

## Compare new RGI-file and reference data set 

### Number of elements (differences do not necessarily depict major problems)

In [12]:
# Outline counts in the new subset and in the reference selection
n_rgi = len(RGI_ss)
n_ref = len(ref_odf)
print('Number of glaciers in new RGI subset:', n_rgi)
print('Number of glaciers in reference data:', n_ref)
print('Difference:', n_rgi - n_ref)

Number of glaciers in new RGI subset: 18542
Number of glaciers in reference data: 18588
Difference: -46


### Total area

In [13]:
# Compute polygon areas in a cylindrical equal-area projection and store them
# as a new 'area' column on the selected GAMDAM table
ref_equal_area = ref_odf.to_crs({'proj':'cea'})
ref_odf['area'] = ref_equal_area.area

In [14]:
# Total areas, scaled by 1e-6 for the km² labels below (column presumably holds m² - confirm)
Area_RGI = RGI_ss['area'].sum() * 1e-6
Area_ref = ref_odf['area'].sum() * 1e-6
d = Area_RGI - Area_ref

print('Area RGI [km²]:', Area_RGI)
print('Area Ref [km²]:', Area_ref)
print('Area difference [km²]:', d)

Area RGI [km²]: 16044.535795981632
Area Ref [km²]: 16044.87806719377
Area difference [km²]: -0.3422712121373479


**We believe that remaining errors are of the same type as Region 01: multipolygons that weren't properly ingested in GLIMS**

### Comparison with RGI6 

In [15]:
# Find the name of the shapefile stored inside the RGI6 zip archive
import zipfile
with zipfile.ZipFile(rgi6_reg_file, "r") as z:
    # endswith() avoids matching sidecar files such as '*.shp.xml'
    shp_names = [member.filename for member in z.filelist if member.filename.endswith('.shp')]
if not shp_names:
    # Fail loudly instead of silently re-using a `fname` left over from an earlier cell
    raise FileNotFoundError(f'No .shp file found in {rgi6_reg_file}')
# As before, the last matching entry wins if there are several
fname = shp_names[-1]

# load the RGI6 outlines
rgi6_odf = gpd.read_file('zip://' + rgi6_reg_file + '/' + fname)

In [16]:
# Outline counts in the new subset and in RGI6
n_rgi = len(RGI_ss)
n_rgi6 = len(rgi6_odf)
print('Number of glaciers in new RGI subset:', n_rgi)
print('Number of glaciers in RGI6 data:', n_rgi6)
print('Difference:', n_rgi - n_rgi6)

Number of glaciers in new RGI subset: 18542
Number of glaciers in RGI6 data: 13119
Difference: 5423


In [17]:
# Same equal-area computation as for the reference data, stored in a new 'area' column
rgi6_equal_area = rgi6_odf.to_crs({'proj':'cea'})
rgi6_odf['area'] = rgi6_equal_area.area

In [18]:
# Total areas of the new subset and of RGI6, scaled by 1e-6 for the km² labels below
Area_RGI = RGI_ss['area'].sum() * 1e-6
Area_ref = rgi6_odf['area'].sum() * 1e-6
d = Area_RGI - Area_ref

print('Area RGI7 [km²]:', Area_RGI)
print('Area RGI6 [km²]:', Area_ref)
print('Area difference [km²]:', d)

Area RGI7 [km²]: 16044.535795981632
Area RGI6 [km²]: 14734.176870010966
Area difference [km²]: 1310.358925970666


# End of revised notebook

## Optional: identify potentially problematic cases and write them to a shapefile

In [None]:
# Parts of the new RGI outlines that are not covered by the reference outlines
# (this overlay takes a while)
delta_shape = gpd.overlay(df1=RGI_ss, df2=ref_odf, how='difference')

In [None]:
# Area of every leftover geometry, computed in a cylindrical equal-area projection
delta_equal_area = delta_shape.to_crs({'proj':'cea'})
delta_shape['area'] = delta_equal_area.area

In [None]:
# Keep only the leftover geometries whose area exceeds the threshold.
# For orientation: 10000 m² = 0.01 km² is the common minimum size for glaciers
# to be included in inventories; the value used here (3000) is lower than that.
thr = 3000
exceeds_thr = delta_shape['area'] > thr
df_d = delta_shape.loc[exceeds_thr]

In [None]:
# Select the RGI outlines whose glac_id appears among the problematic difference geometries.
# The mask is built on RGI_ss itself: the rows of df_d come out of gpd.overlay and carry
# the overlay's own index, which cannot be used as a label into RGI_ss.
is_problematic = RGI_ss.glac_id.isin(df_d.glac_id)
prob_IDs = RGI_ss.loc[is_problematic]

# and write it to shapefile
# NOTE(review): `testout` is not defined anywhere in this notebook - point it to an
# existing output directory before running this optional section
td = f'{testout}/RGI_R{reg:02d}probs.shp'
prob_IDs.to_file(td)

### Do the same for the reference data set

In [None]:
# Intersect the representative points of the reference data with the problematic
# RGI outlines to find the matching reference glaciers
df_d_ref_p = gpd.overlay(df1=ref_rp, df2=prob_IDs, how='intersection')

In [None]:
 # Now select the entries which intersect from the original shape file (-> extract the polygons) 
ref_problem_labels = df_d_ref_p['orig_index']
df_d_ref = df_ref.loc[ref_problem_labels]

In [None]:
# Area of the selected reference polygons, computed in a cylindrical equal-area projection
df_d_ref_equal_area = df_d_ref.to_crs({'proj':'cea'})
df_d_ref['area'] = df_d_ref_equal_area.area

# Write the selection to a shapefile
# NOTE(review): `testout` is not defined anywhere in this notebook - confirm where it should point
td = f'{testout}/R{reg:02d}refprobs.shp'
df_d_ref.to_file(td)