# RGI-07: Region 17 (Southern Andes)
##### F. Roura November 2021

Goal: compare L2 GLIMS files to original inventory to check possible errors in GLIMS ingestion.

In [None]:
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import subprocess
import os
from utils import mkdir

## Files and storage paths

In [None]:
# Region of interest (RGI region number)
reg = 17

# Data root: go down from rgi7_scripts/workflow
data_dir = '../../rgi7_data/'

# Level 2 GLIMS files
l2_dir = os.path.join(data_dir, 'l2_sel_reg_tars')

# Output directories: one for the written files, one for the tarball
output_dir, output_dir_tar = (
    mkdir(os.path.join(data_dir, folder))
    for folder in ('l3_rgi7a', 'l3_rgi7a_tar')
)

# Original inventory used as reference for the GLIMS check
ref_reg_file = os.path.join(data_dir, 'l0_support_data', 'Shape_Inventario_de_Glaciares.zip')

### Load the input data

In [None]:
# Read the regional L2 shapefile from GLIMS straight out of its tarball
l2_shp_path = f'tar://{l2_dir}/RGI{reg:02d}.tar.gz/RGI{reg:02d}/RGI{reg:02d}.shp'
shp = gpd.read_file(l2_shp_path)

## List of submissions in GLIMS L2

In [None]:
# One line per submission: its id, its first 'analysts' entry, and the
# year (first four characters) of every distinct source date.
for sub_id in shp.subm_id.unique():
    sub_rows = shp.loc[shp.subm_id == sub_id]
    fields = [str(sub_rows[col].unique()[0]) for col in ('subm_id', 'analysts')]
    fields.extend(date[:4] for date in sub_rows['src_date'].unique())
    print(''.join(' ' + field for field in fields))

## Apply selection criteria to compare the GLIMS dataset to the original one

### Step 1: extract ingested inventory from GLIMS data and do a check

In [None]:
# Extract the L2 outlines of one GLIMS submission, selected by 'subm_id'
is_selected = shp['subm_id'] == 730  # barcaza
RGI_ss = shp.loc[is_selected].copy()

#### load reference data (here original inventory)

In [None]:
# Find the name of the shapefile inside the zip archive, then load it
import zipfile

with zipfile.ZipFile(ref_reg_file, "r") as z:
    # Match on the suffix so that sidecar files such as '.shp.xml' are
    # skipped wherever they sit in the archive listing.
    shp_names = [n for n in z.namelist() if n.endswith('Inventario_de_Glaciares.shp')]

if not shp_names:
    raise FileNotFoundError(f'No Inventario_de_Glaciares.shp found in {ref_reg_file}')
fname = shp_names[-1]

# load reference data
ref_odf = gpd.read_file('zip://' + ref_reg_file + '/' + fname)

#### Number of elements (differences do not necessarily depict major problems)

In [None]:
# Row counts of the GLIMS subset and of the reference inventory
n_rgi, n_ref = len(RGI_ss), len(ref_odf)
print('Number of glaciers in new RGI subset:', n_rgi)
print('Number of glaciers in reference data:', n_ref)
print('Difference:', n_rgi - n_ref)

#### Check for duplicate glacier IDs: many glaciers share an ID. Let's see how many of them are actually different glaciers

In [None]:
# Distinct OBJECTIDs minus row count: a negative value means some IDs are shared
len(ref_odf['OBJECTID'].unique())-len(ref_odf)

In [None]:
# Number of distinct OBJECTIDs, then total number of rows (one per line)
print(len(ref_odf['OBJECTID'].unique()), len(ref_odf), sep='\n')

In [None]:
## Subset ref_odf to the rows whose OBJECTID is shared by more than one row.
## Grouping once avoids re-filtering the whole table for every ID and growing
## the result with one concat per ID; sort=False keeps first-appearance order.
dup_groups = []
for idi, id_rows in ref_odf.groupby('OBJECTID', sort=False):
    if len(id_rows) > 1:
        dup_groups.append(id_rows)
        print(len(id_rows), idi)
# Fall back to an empty frame with the same columns when no ID is repeated
rep_id = pd.concat(dup_groups) if dup_groups else ref_odf.iloc[0:0]
rep_id['OBJECTID']

In [None]:
## IDs that occur more than once
tmp_ids = rep_id['OBJECTID'].unique()

## For each repeated ID, count the distinct geometries stored under it to see
## whether the rows describe the same glacier or different ones
for shared_id in tmp_ids:
    shared_geoms = rep_id.loc[rep_id['OBJECTID'] == shared_id, 'geometry']
    n_geoms = len(shared_geoms.unique())
    print( 'Repeated Id: ', shared_id, '; glaciers with this id based on "geometry": ', n_geoms)

In [None]:
# Plot all outlines stored under each of these OBJECTIDs
rep_id.loc[rep_id["OBJECTID"]==4201].plot() # --> repeated ID, geometry not repeated (different geometries share the ID)
rep_id.loc[rep_id["OBJECTID"]==4721].plot() # --> repeated ID, geometry not repeated (different geometries share the ID)

In [None]:
# Plot all outlines stored under each of these OBJECTIDs
rep_id.loc[rep_id["OBJECTID"]==330].plot() # --> repeated ID and repeated geometry (glacier with ID=330 is in the dataset twice)
rep_id.loc[rep_id["OBJECTID"]==1364].plot() #--> repeated ID and repeated geometry (glacier with ID=1364 is in the dataset twice)


In [None]:
# Plot all outlines stored under each of these OBJECTIDs
rep_id.loc[rep_id["OBJECTID"]==1996].plot() # --> repeated ID, geometry not repeated (different geometries share the ID)
rep_id.loc[rep_id["OBJECTID"]==6559].plot() # --> repeated ID, geometry not repeated (different geometries share the ID)

In [None]:
# Scratch check: number of attribute values that the first two rows of the
# repeated-ID subset have in common.
# NOTE(review): the row positions 0 and 1 are an assumption — adjust as needed
import numpy as np
sum(np.array(rep_id.iloc[0]) == np.array(rep_id.iloc[1]))

In [None]:
## Compare every pair of entries within the repeated id == 0 subset and report
## pairs whose attributes are all equal.
import numpy as np

rep_id0 = rep_id.loc[rep_id["OBJECTID"] == 0]
n_cols = rep_id0.shape[1]  ## a pair is identical when every column matches
## Convert each row once; positions refer to rep_id0 (not to rep_id)
rows0 = [np.array(rep_id0.iloc[k]) for k in range(len(rep_id0))]
## NOTE(review): float NaN never compares equal to itself, so two rows with a
## missing value in the same column are not reported as identical — confirm
## that this is acceptable for this inventory.
for id0 in range(0,len(rep_id0)): ## loop all entries
    tmp = list(range(0,id0)) + list(range(id0+1,len(rep_id0))) ## all entries minus the current one
    for item in tmp: ## loop all entries except the current one
        alltrue = sum(rows0[item] == rows0[id0])
        if alltrue == n_cols:
            print('current equal glaciers in ids ', id0, ' and ', item)

## If no message is printed, there are no 2 identical entries in the id=0 subset
## --> number of glaciers is the length of the subset:
len(rep_id0)

## NOTE(review): the counts below were written down before this cell compared
## rows of rep_id0 with each other — re-check them after re-running.
## we have 432 glaciers with repeated id=0 (+432), 2 glaciers repeated (identical entries) (-2), 4 repeated ids that represent different polygons (2+1+1+1 = +5) --> total glaciers are "non repeated" + 432+2+5=439

In [None]:
# Is the third attribute (by position) of the entry before `id0` None?
# Uses .iloc for the positional lookup and an identity test against None.
print(rep_id.iloc[id0-1].iloc[2] is None)
#== rep_id.iloc[id0][4]

In [None]:
#rep_id.loc[rep_id[id0][rep_id.iloc[id0]!=rep_id.iloc[id0]]]

In [None]:
# Number of attributes (columns) in one row of the id == 0 subset
len(rep_id0.iloc[1])

In [None]:
# Scratch example: the indices 0..9 with index 5 left out
tmp = [idx for idx in range(10) if idx != 5]

In [None]:
import numpy as np
# Show the second row of the id == 0 subset
rep_id0.iloc[1]

In [None]:
# Column data types of the repeated-ID subset
rep_id.dtypes

In [None]:
# Save the first five rows of the repeated-ID subset as a pickle (relative path)
pd.to_pickle(rep_id[0:5], "test_file.pcl")

In [None]:
# Compare the COD_GLA value of the first row with itself, via a one-row slice
rep_id[0:1]['COD_GLA']==rep_id[0:1]['COD_GLA']

In [None]:
# Same self-comparison, this time on the value taken from the first row directly
rep_id.iloc[0]['COD_GLA']==rep_id.iloc[0]['COD_GLA']



In [None]:
# Row count minus number of distinct IDs, for the original and for GLIMS.
# dropna=False makes nunique() count exactly what len(unique()) counts.
n_dup_ref = len(ref_odf) - ref_odf['OBJECTID'].nunique(dropna=False)
n_dup_rgi = len(RGI_ss) - RGI_ss['glac_id'].nunique(dropna=False)
print('Dublicate IDs in original:', n_dup_ref)
print('Dublicate IDs in GLIMS:', n_dup_rgi)

#### Total area

In [None]:
# Add an 'area' column to both the GLIMS subset and the reference data,
# computed after reprojecting to the 'cea' projection
for gdf in (RGI_ss, ref_odf):
    gdf['area'] = gdf.to_crs({'proj':'cea'}).area

In [None]:
# Total areas of both data sets, divided by 1e6 to match the km² labels
Area_Rep = RGI_ss['area'].sum() / 1e6
print('Area Rep [km²]:', Area_Rep)
Area_RGI6 = ref_odf['area'].sum() / 1e6
print('Area RGI6 [km²]:', Area_RGI6)

# Absolute difference and its share of the GLIMS total
d = Area_Rep - Area_RGI6
d_perc = d / Area_Rep * 100
print('Area difference [km²]:', d, '/', 'percentage:', d_perc)

### result of check (RGI from GLIMS L2 and original inventory):
#### difference in number of glaciers: 438
#### duplicate IDs: 0 in RGI, 438 in the original inventory
#### nominal glaciers: 0
#### area difference: 2.9 km² / 0.13 % (related to edited outlines at a volcano in Ecuador (G281556E00697S, G281572E00688S, G281559E00671S, G281551E00681S))
#### general comment: in general reproduction works...differences need more detailed check

## Write out and tar 

In [None]:
# Write the selected outlines into the per-region folder, then archive it
dd = mkdir(f'{output_dir}/RGI{reg:02d}/', reset=True)

print('Writing...')
# os.path.join works whether or not mkdir returns the path with a trailing slash
RGI_ss.to_file(os.path.join(dd, f'RGI{reg:02d}.shp'))

print('Taring...')
# check=True: a failing tar raises instead of only printing a non-zero return code
print(subprocess.run(['tar', '-zcvf', f'{output_dir_tar}/RGI{reg:02d}.tar.gz', '-C', output_dir, f'RGI{reg:02d}'], check=True))

## Find missing glaciers 

In [None]:
from utils import haversine
import numpy as np
import progressbar

In [None]:
def xy_coord(geom):
    """Return the first (x, y) coordinate pair of *geom* as a tuple.

    ``geom.xy`` must unpack into exactly two sequences: the x values and
    the y values. Used to compute CenLon / CenLat ourselves.
    """
    xs, ys = geom.xy
    first_x = xs[0]
    first_y = ys[0]
    return first_x, first_y

In [None]:
# Work on copies of the reference data and of the GLIMS subset
df_ref, rgi7 = ref_odf.copy(), RGI_ss.copy()

In [None]:
# Remove nominal glaciers: keep only the rows whose Status is not 2
# NOTE(review): assumes the reference inventory has a 'Status' column in which
# 2 marks nominal glaciers — confirm against the inventory's schema
df_ref = df_ref.loc[df_ref.Status != 2].copy()

In [None]:
# Compute CenLon / CenLat ourselves from a representative point per outline
rp = df_ref.representative_point()

coordinates = np.array([xy_coord(point) for point in rp])
df_ref['CenLon'], df_ref['CenLat'] = coordinates[:, 0], coordinates[:, 1]

In [None]:
# Keep an untouched copy; the matching cell below restarts from it on every run
df_ref_orig = df_ref.copy()

In [None]:
# For every RGI7 glacier, look among the 10 closest reference glaciers for one
# with (nearly) the same area. Matched reference rows are removed in batches,
# so df_ref ends up holding the reference glaciers that were never matched.
df_ref = df_ref_orig.copy()
not_found = {}
to_drop = []
rgi7_rows = zip(rgi7['area'].values, rgi7.CenLon.values, rgi7.CenLat.values)
for i, (ref_area, lon, lat) in progressbar.progressbar(enumerate(rgi7_rows), max_value=len(rgi7)):
    # Squared coordinate difference is enough to rank the neighbours
    dist = (lon - df_ref.CenLon.values)**2 + (lat - df_ref.CenLat.values)**2
    nearest = np.argsort(dist)[:10]
    for j in nearest:
        s6 = df_ref.iloc[j]
        if np.allclose(s6['area'], ref_area, rtol=0.001):
            to_drop.append(s6.name)
            break
    else:
        # No candidate has a matching area: keep the neighbours for inspection
        not_found[i] = df_ref.iloc[nearest]
    # Drop matched rows only once more than 1000 have accumulated
    if len(to_drop) > 1000:
        df_ref.drop(labels=to_drop, inplace=True)
        to_drop = []
df_ref.drop(labels=to_drop, inplace=True)

In [None]:
# Number of unmatched RGI7 glaciers, and number of reference glaciers left over
print(len(not_found), len(df_ref))

In [None]:
# Rows of rgi7 without a reference match (the not_found keys are row positions)
pb_rgi7 = rgi7.iloc[list(not_found.keys())]

In [None]:
# Map of the GLIMS outlines that could not be matched
pb_rgi7.plot(edgecolor='k');
plt.title('GLIMS');

In [None]:
# Map of the reference outlines left unmatched (the title string says 'RGI6')
df_ref.plot(edgecolor='k');
plt.title('RGI6');

In [None]:
# Output directories for the problem glaciers (rebinds output_dir and output_dir_tar)
output_dir, output_dir_tar = (
    mkdir(os.path.join(data_dir, folder))
    for folder in ('l3_problem_glaciers', 'l3_problem_glaciers_tar')
)

In [None]:
# Write the unmatched GLIMS and reference outlines into the per-region folder,
# then archive that folder
dd = mkdir(f'{output_dir}/RGI{reg:02d}/', reset=True)

print('Writing...')
# os.path.join works whether or not mkdir returns the path with a trailing slash
pb_rgi7.to_file(os.path.join(dd, f'RGI{reg:02d}_glims.shp'))
df_ref.to_file(os.path.join(dd, f'RGI{reg:02d}_ref.shp'))

print('Taring...')
# check=True: a failing tar raises instead of only printing a non-zero return code
print(subprocess.run(['tar', '-zcvf', f'{output_dir_tar}/RGI{reg:02d}.tar.gz', '-C', output_dir, f'RGI{reg:02d}'], check=True))