# Global overview tables and statistics

This notebook is a bit messy: it is a collection of code snippets producing the maps and statistics for the technical report.

In [None]:
import pandas as pd
import geopandas as gpd
import subprocess
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import numpy as np
import sys, os
import os
import matplotlib.patches as mpatches
from oggm import utils
import csv
import seaborn as sns
import cartopy
import cartopy.crs as ccrs
from matplotlib.image import imread

In [None]:
# Path to the RGI7 data directory, relative to rgi7_scripts/workflow
# (four levels up, then into rgi7_data)
data_dir = '../../../../rgi7_data/'

# Sub-directory from which the final RGI7 files are read below
final_dir = os.path.join(data_dir, 'rgi7_final')

In [None]:
# Path to the user guide directory (relative, four levels up); the CSV tables
# and figures produced in this notebook are written below it
user_guide_dir = '../../../../rgi_user_guide/'

## Read RGI6 and RGI7 attrs 

In [None]:
# Attribute table of the RGI 7.0 "G" product, indexed by its first column.
# o1region is forced to str so that the region codes are not parsed as numbers.
rgi7g_attrs_file = os.path.join(final_dir, 'global_files', 'attributes',
                                'RGI2000-v7.0-G-global-attributes.csv.zip')
df_rgi7g = pd.read_csv(rgi7g_attrs_file, index_col=0, compression='zip', dtype={'o1region': str})

In [None]:
# Download the RGI 6.2 statistics table and read it
rgi6_stats_file = utils.file_downloader('https://cluster.klima.uni-bremen.de/~oggm/rgi/rgi62_stats.h5')
df_rgi6g = pd.read_hdf(rgi6_stats_file)
# Drop the entries with Connect == 2
# NOTE(review): presumably the glaciers strongly connected to the ice sheet - confirm against the RGI6 docs
df_rgi6g = df_rgi6g[df_rgi6g['Connect'] != 2]

In [None]:
# Attribute table of the RGI 7.0 "C" product, read the same way as the "G" product
rgi7c_attrs_file = os.path.join(final_dir, 'global_files', 'attributes',
                                'RGI2000-v7.0-C-global-attributes.csv.zip')
df_rgi7c = pd.read_csv(rgi7c_attrs_file, index_col=0, compression='zip', dtype={'o1region': str})

In [None]:
# Sanity check: the G and C products must add up to the same total area
# (within the default relative tolerance of assert_allclose)
np.testing.assert_allclose(df_rgi7g.area_km2.sum(), df_rgi7c.area_km2.sum())

In [None]:
# Both region shapefiles live in the same zip archive
regions_archive = 'zip://' + final_dir + '/RGI2000-v7.0-regions.zip'
df_reg_o1 = gpd.read_file(regions_archive + '/RGI2000-v7.0-o1regions.shp')
df_reg_o2 = gpd.read_file(regions_archive + '/RGI2000-v7.0-o2regions.shp')

## RGI6 vs RGI7 table for `overview.md`

In [None]:
# Start the comparison table with the RGI6 area summed per first-order region
df = df_rgi6g.groupby('O1Region')[['Area']].sum().rename(columns={'Area': 'Area RGI6'})

In [None]:
# RGI7 area per first-order region, aligned on the region index of the table
df['Area RGI7'] = df_rgi7g.groupby('o1region')['area_km2'].sum()

In [None]:
# Relative area change from RGI6 to RGI7, in percent
area_ratio = df['Area RGI7'] / df['Area RGI6']
df['Diff A (%)'] = (area_ratio - 1) * 100

In [None]:
# Number of entries per first-order region (non-missing area values are counted)
df['N RGI6'] = df_rgi6g.groupby('O1Region')['Area'].count()
df['N RGI7'] = df_rgi7g.groupby('o1region')['area_km2'].count()

# Relative change in the number of entries, in percent
count_ratio = df['N RGI7'] / df['N RGI6']
df['Diff N (%)'] = (count_ratio - 1) * 100

In [None]:
# Set the row for region '20' to zero in all six columns of the table
# NOTE(review): presumably region 20 has no entries in either inventory and is
# therefore missing from the groupby results above - confirm
df.loc['20'] = [0] * 6

In [None]:
# Append a 'Global' row holding the column sums
global_totals = df.sum().rename('Global')
df = pd.concat([df, global_totals.to_frame().T])

# The summed percentages are meaningless: recompute both difference columns
# for all rows (including 'Global') from the areas and counts
for diff_col, rgi7_col, rgi6_col in [('Diff A (%)', 'Area RGI7', 'Area RGI6'),
                                     ('Diff N (%)', 'N RGI7', 'N RGI6')]:
    df[diff_col] = (df[rgi7_col] / df[rgi6_col] - 1) * 100

In [None]:
# Areas are rounded to whole km² and stored as integers
for col in ['Area RGI6', 'Area RGI7']:
    df[col] = df[col].astype(float).round(0).astype(int)

# Counts are stored as integers
for col in ['N RGI6', 'N RGI7']:
    df[col] = df[col].astype(int)

# Differences are rounded to one decimal. replace(-0, 0) is meant to normalise
# negative zeros left by the rounding, and NaN (e.g. 0 / 0 in an all-zero row)
# is shown as 0. np.nan is used because the np.NaN alias no longer exists in
# NumPy >= 2.0.
for col in ['Diff A (%)', 'Diff N (%)']:
    df[col] = df[col].round(1).replace(-0, 0).replace(np.nan, 0)

In [None]:
# Display the RGI6 vs RGI7 comparison table
df

In [None]:
# Headline numbers: how much of RGI7 is flagged as identical to RGI6 (is_rgi6 == 1)
# and how much is new (is_rgi6 == 0), by number of entries and by area
n_total = len(df_rgi7g)
area_total = df_rgi7g['area_km2'].sum()
print(f"N in RGI7: N={n_total}, A={int(area_total)} km²")

s1 = df_rgi7g.loc[df_rgi7g.is_rgi6 == 1]
print(f"Same as RGI6: N={len(s1)}, A={int(s1['area_km2'].sum())} km²")

s2 = df_rgi7g.loc[df_rgi7g.is_rgi6 == 0]
print(f"New  in RGI7: N={len(s2)}, A={int(s2['area_km2'].sum())} km²")

# The labels announce percentages, so scale the fractions by 100 before printing
print(f"New in % area: {s2['area_km2'].sum() / area_total * 100:.1f}")

print(f"New in % number: {len(s2) / n_total * 100:.1f}")

In [None]:
# Name the index so that it gets a header in the exported tables
df.index.name = 'Region'

In [None]:
# Markdown version of the comparison table: every region row is labelled with a
# link to its page (rgiXX.md), the last row stays 'Global'
df_formd = df.copy()
region_links = [f'[](rgi{reg}.md)' for reg in df_formd.index[:-1]]
df_formd.index = region_links + ['Global']
df_formd.columns = [
    'Area<br>RGI 6.0 (km²)',
    'Area<br>RGI 7.0 (km²)',
    'Area<br>Diff. (%)',
    'Count<br>RGI 6.0',
    'Count<br>RGI 7.0',
    'Count<br>Diff. (%)',
]
# One float format per printed column; the first entry is for the index column
md_table = df_formd.to_markdown(floatfmt=(".0f", ".0f", ".0f", ".1f", ".0f", ".0f", ".1f"))
# '**Global**' is as wide as 'Global' plus four spaces, so the table stays aligned
print(md_table.replace('Global    ', '**Global**'))

In [None]:
# Add region names and codes (in backticks) from the O1 region file; the final
# 'Global' row gets empty strings. Values are assigned by position.
# NOTE(review): this assumes the unique() order matches the row order of df - confirm
o1_full_names = list(df_reg_o1.full_name.unique())
o1_long_codes = ['`' + code + '`' for code in df_reg_o1.long_code.unique()]
df['Full name'] = o1_full_names + ['']
df['Code'] = o1_long_codes + ['']

In [None]:
# Display the table again, now with the name and code columns
df

In [None]:
# CSV version of the RGI6 vs RGI7 comparison table, with descriptive headers
comparison_headers = {
    'Area RGI6': 'Area RGI 6.0',
    'Area RGI7': 'Area RGI 7.0',
    'Diff A (%)': 'Diff. Area (%)',
    'N RGI6': 'Count RGI 6.0',
    'N RGI7': 'Count RGI 7.0',
    'Diff N (%)': 'Diff. Count (%)',
}
for_csv = df[['Full name'] + list(comparison_headers)].rename(columns=comparison_headers)
# The region index becomes a regular column; all non-numeric fields are quoted
for_csv.reset_index().to_csv(
    user_guide_dir + '/docs/appendix/RGI2000-v7.0-G-comparison-rgi6.csv',
    index=False,
    quoting=csv.QUOTE_NONNUMERIC,
)

## Tables Appendix (no RGI6) 

### O1 

In [None]:
# RGI7-only summary per first-order region; the markdown backticks around the
# codes are removed again for the CSV export
for_csv = df[['Full name', 'Code', 'N RGI7', 'Area RGI7']].copy()
for_csv['Code'] = for_csv['Code'].str.replace('`', '', regex=False)

In [None]:
# Use the attribute-style column names for the export. A flat list is assigned:
# a nested list would turn the columns into a (one-level) MultiIndex.
for_csv.columns = ['full_name', 'long_code', 'count', 'area_km2']
for_csv.index.name = 'o1region'
# The region index becomes a regular column; all non-numeric fields are quoted
for_csv.reset_index().to_csv(
    user_guide_dir + '/docs/appendix/RGI2000-v7.0-G-o1region-summary.csv',
    index=False,
    quoting=csv.QUOTE_NONNUMERIC,
)

In [None]:
# Markdown version of the RGI7-only summary per first-order region
for_print = df[['Full name', 'Code', 'N RGI7', 'Area RGI7']].rename(
    columns={'N RGI7': 'Count', 'Area RGI7': 'Area (km²)'}
)
o1_md_table = for_print.to_markdown()
# '**Global**' is as wide as 'Global' plus four spaces, so the table stays aligned
print(o1_md_table.replace('Global    ', '**Global**'))

### O2 

In [None]:
# Keep a single row (the first occurrence) per second-order region code
df_reg_o2 = df_reg_o2.drop_duplicates(subset='o2region')
df_reg_o2

In [None]:
# Summary table per second-order region: name, code, number of entries and area
dfsr = df_reg_o2[['o2region', 'full_name', 'long_code']].copy().set_index('o2region')
dfsr.columns = ['Full name', 'Code']

# Codes are shown in backticks in the markdown table
dfsr['Code'] = ['`' + code + '`' for code in dfsr['Code']]

# Regions without any RGI7 entry get NaN here; they are set to 0 below
o2_groups = df_rgi7g.groupby('o2region')
dfsr['Count'] = o2_groups['cenlon'].count()
dfsr['Area (km²)'] = o2_groups['area_km2'].sum()

# Append a 'Global' row. Only the numeric columns are summed: summing the text
# columns would concatenate all names and fail on a missing one.
ss = dfsr[['Count', 'Area (km²)']].sum()
ss.name = 'Global'
dfsr = pd.concat([dfsr, ss.to_frame().T])

# The 'Global' row has no name and no code
dfsr.loc['Global', ['Full name', 'Code']] = ''

# Whole km² and whole counts, stored as integers
dfsr['Area (km²)'] = dfsr['Area (km²)'].astype(float).round(0).fillna(0).astype(int)
dfsr['Count'] = dfsr['Count'].fillna(0).astype(int)

In [None]:
# Print the second-order region summary as a markdown table
print(dfsr.to_markdown())

In [None]:
# For the CSV export, remove the markdown backticks around the codes
dfsr['Code'] = [s.replace("`", "") for s in dfsr['Code']]
# Use the attribute-style column names. A flat list is assigned: a nested list
# would turn the columns into a (one-level) MultiIndex.
dfsr.columns = ['full_name', 'long_code', 'count', 'area_km2']
dfsr.index.name = 'o2region'
# The region index becomes a regular column; all non-numeric fields are quoted
dfsr.reset_index().to_csv(
    user_guide_dir + '/docs/appendix/RGI2000-v7.0-G-o2region-summary.csv',
    index=False,
    quoting=csv.QUOTE_NONNUMERIC,
)

## Global statistics (`06_dataset_summary.md`) 

In [None]:
# Work on copies so that the columns added below do not alter the tables read above
rgi6, rgi7 = df_rgi6g.copy(), df_rgi7g.copy()

### Target year 

In [None]:
# Outline year as an integer: the part of src_date before the first '-' for RGI7,
# the first four characters of BgnDate for RGI6
rgi7['year'] = rgi7['src_date'].str.split('-').str[0].astype(int)
rgi6['year'] = rgi6['BgnDate'].str[:4].astype(int)

In [None]:
# Absolute distance (in years) between the outline year and the year 2000
dy6 = (rgi6['year'] - 2000).abs()
dy7 = (rgi7['year'] - 2000).abs()

In [None]:
# Share of entries (in % of the number of entries) per class of distance to the
# year 2000. Each class is a label plus a function turning the distance into a mask.
year_classes = [
    ('2000 ± 2 years', lambda dy: dy <= 2),
    ('2000 ± 2-5 years', lambda dy: (dy <= 5) & (dy > 2)),
    ('2000 ± 5-10 years', lambda dy: (dy <= 10) & (dy > 5)),
    ('2000 ± > 10 years', lambda dy: dy > 10),
]
ydf = pd.DataFrame(
    [[in_class(dy6).sum() / len(rgi6) * 100, in_class(dy7).sum() / len(rgi7) * 100]
     for _, in_class in year_classes],
    index=[label for label, _ in year_classes],
    columns=['RGI 6.0 (%)', 'RGI 7.0 (%)'],
).round(1)
ydf.index.name = 'Outline year'
ydf

In [None]:
# Print the outline year table as markdown
print(ydf.to_markdown())

Same as above, but as a share of the total area instead of the number of glaciers (less good):

In [None]:
# Share of the total area (in %) per class of distance to the year 2000.
# Each class is a label plus a function turning the distance into a mask.
year_classes = [
    ('2000 ± 2 years', lambda dy: dy <= 2),
    ('2000 ± 2-5 years', lambda dy: (dy <= 5) & (dy > 2)),
    ('2000 ± 5-10 years', lambda dy: (dy <= 10) & (dy > 5)),
    ('2000 ± > 10 years', lambda dy: dy > 10),
]
ydf = pd.DataFrame(
    [[rgi6.loc[in_class(dy6), 'Area'].sum() / rgi6['Area'].sum() * 100,
      rgi7.loc[in_class(dy7), 'area_km2'].sum() / rgi7['area_km2'].sum() * 100]
     for _, in_class in year_classes],
    index=[label for label, _ in year_classes],
    columns=['RGI 6.0 (%)', 'RGI 7.0 (%)'],
).round(1)
ydf.index.name = 'Outline year'
ydf

### More year statistics 

In [None]:
# Earliest and latest outline year in RGI7
rgi7['year'].min(), rgi7['year'].max()

In [None]:
# Fraction (not percent) of the RGI7 entries with an outline year before 1990
(rgi7['year'] < 1990).sum() / len(rgi7)

### Size classes 

In [None]:
# Number and share (in % of the number of entries) of glaciers per size class.
# Each class is a label plus a function turning the areas into a mask; note that
# the last class includes areas of exactly 100 km².
size_classes = [
    ('< 1 km²', lambda area: area < 1),
    ('1-10 km²', lambda area: (area >= 1) & (area < 10)),
    ('10-100 km²', lambda area: (area >= 10) & (area < 100)),
    ('> 100 km²', lambda area: area >= 100),
]
size_rows = []
for _, in_class in size_classes:
    n_rgi6 = in_class(rgi6['Area']).sum()
    n_rgi7 = in_class(rgi7['area_km2']).sum()
    size_rows.append([n_rgi6, n_rgi6 / len(rgi6) * 100, n_rgi7, n_rgi7 / len(rgi7) * 100])
adf = pd.DataFrame(
    size_rows,
    index=[label for label, _ in size_classes],
    columns=['RGI6 (N)', 'RGI6 (%)', 'RGI7 (N)', 'RGI7 (%)'],
    dtype=float,
)
# Column totals in a last row, then integer counts and one-decimal percentages
adf.loc['Total'] = adf.sum()
count_cols, share_cols = ['RGI6 (N)', 'RGI7 (N)'], ['RGI6 (%)', 'RGI7 (%)']
adf[count_cols] = adf[count_cols].astype(int)
adf[share_cols] = adf[share_cols].round(1)
adf.index.name = 'Area'
adf.columns = ['RGI 6.0 (N)', 'RGI 6.0 (%)', 'RGI 7.0 (N)', 'RGI 7.0 (%)']
adf

In [None]:
# Print the size class table as markdown
print(adf.to_markdown())

In [None]:
with sns.plotting_context('talk'), sns.axes_style('ticks'):

    # 100 logarithmically spaced bin edges between 0.01 and 1000 km²
    bins = np.logspace(-2, 3, 100)

    f, ax = plt.subplots(figsize=(10, 6))
    # One line per dataset: number of glaciers per bin, drawn at the left bin edge
    for label, areas in [('RGI 6.0', rgi6['Area']),
                         ('RGI 7.0', rgi7['area_km2']),
                         ('RGI 7.0 C', df_rgi7c['area_km2'])]:
        counts, edges = np.histogram(areas, bins=bins)
        ax.plot(edges[:-1], counts, label=label)

    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_ylim([7, 5e4])
    ax.legend()
    ax.set_title('Number of glaciers per size category (global)')
    ax.set_xlabel('Glacier area (km², logscale)')
    ax.set_ylabel('Glacier number (logscale)')

In [None]:
with sns.plotting_context('talk'), sns.axes_style('ticks'):

    # 100 logarithmically spaced bin edges between 0.01 and 1000 km²
    bins = np.logspace(-2, 3, 100)

    f, ax = plt.subplots(figsize=(12, 7))
    # One line per dataset: number of glaciers per bin, drawn at the left bin edge
    for label, color, areas in [('RGI 6.0', 'C0', rgi6['Area']),
                                ('RGI 7.0', 'C3', rgi7['area_km2'])]:
        counts, edges = np.histogram(areas, bins=bins)
        ax.plot(edges[:-1], counts, color=color, label=label)

    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.legend()
    ax.set_title('Number of glaciers per size category (global)')
    ax.set_xlabel('Glacier area (km², logscale)')
    ax.set_ylabel('Glacier number (logscale)')
    # Written to the user guide image folder
    plt.savefig(user_guide_dir + '/docs/img/global_stats/global_histogram.png', dpi=100, bbox_inches='tight')

### Global attributes statistics 

In [None]:
# Copy the index into a regular column of each table; these columns are the
# ones counted per group in the attribute tables below
rgi7['rgi_id'] = rgi7.index
rgi6['RGIId'] = rgi6.index

#### Terminus 

In [None]:
# Number of entries and total area per terminus type, for RGI7 and RGI6.
# The rows are the terminus type values listed in the index below.
rdf = pd.DataFrame(index=[0, 1, 2, 3, 9])
rdf.index.name = 'Value'
rdf['Terminus type'] = ['Land-terminating', 'Marine-terminating', 'Lake-terminating', 'Shelf-terminating', 'Not assigned']
rdf['RGI7 (N)'] = rgi7.groupby('term_type')['rgi_id'].count()
# NOTE(review): reset_index(drop=True) discards the RGI6 TermType codes, so the
# RGI6 numbers are matched to the rows by position (0, 1, 2, ...) and not by
# code. Any group landing on a position that is not a row label (e.g. 4) is
# dropped, and row 9 stays empty - confirm this is the intended mapping.
rdf['RGI6 (N)'] = rgi6.groupby('TermType')['RGIId'].count().reset_index(drop=True)
rdf['RGI7 (Area)'] = rgi7.groupby('term_type')['area_km2'].sum().round(0).astype(int)
rdf['RGI6 (Area)'] = rgi6.groupby('TermType')['Area'].sum().round(0).reset_index(drop=True)
# Rows without a match are shown as 0. np.nan is used because the np.NaN alias
# no longer exists in NumPy >= 2.0.
rdf = rdf.replace(np.nan, 0)
rdf[['RGI7 (N)', 'RGI6 (N)']] = rdf[['RGI7 (N)', 'RGI6 (N)']].astype(int)
rdf[['RGI7 (Area)', 'RGI6 (Area)']] = rdf[['RGI7 (Area)', 'RGI6 (Area)']].astype(int)
rdf.columns = ['Terminus type', 'RGI 7.0 (N)', 'RGI 6.0 (N)', 'RGI 7.0 (Area)', 'RGI 6.0 (Area)']
rdf

In [None]:
# Print the terminus type table as markdown
print(rdf.to_markdown())

#### Surging 

In [None]:
# Number of entries and total area per surge type, for RGI7 and RGI6.
# The rows are the surge type values listed in the index below; the groupby
# results are aligned on these values.
rdf = pd.DataFrame(index=[0, 1, 2, 3, 9])
rdf.index.name = 'Value'
rdf['Surging'] = ['No evidence', 'Possible', 'Probable', 'Observed', 'Not assigned']
rdf['RGI7 (N)'] = rgi7.groupby('surge_type')['rgi_id'].count()
rdf['RGI6 (N)'] = rgi6.groupby('Surging')['RGIId'].count()
rdf['RGI7 (Area)'] = rgi7.groupby('surge_type')['area_km2'].sum().round(0).astype(int)
rdf['RGI6 (Area)'] = rgi6.groupby('Surging')['Area'].sum().round(0).astype(int)
# Values absent from an inventory are shown as 0. np.nan is used because the
# np.NaN alias no longer exists in NumPy >= 2.0.
rdf = rdf.replace(np.nan, 0)
rdf[['RGI7 (N)', 'RGI6 (N)']] = rdf[['RGI7 (N)', 'RGI6 (N)']].astype(int)
rdf[['RGI7 (Area)', 'RGI6 (Area)']] = rdf[['RGI7 (Area)', 'RGI6 (Area)']].astype(int)
rdf.columns = ['Surge type', 'RGI 7.0 (N)', 'RGI 6.0 (N)', 'RGI 7.0 (Area)', 'RGI 6.0 (Area)']
rdf

In [None]:
# Print the surge type table as markdown
print(rdf.to_markdown())