# Combining EAD Metadata and Scan Statistics

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

repo_name = 'archival-structures'
repo_dir = os.path.split(os.getcwd())[0].split(repo_name)[0] + repo_name
repo_dir = os.path.split(os.getcwd())[0].split(repo_name)[0] + repo_name

print("adding project dir to path:", repo_dir)

if repo_dir not in sys.path:
    sys.path = [repo_dir] + sys.path
else:
    sys.path.remove(repo_dir)
    sys.path = [repo_dir] + sys.path

adding project dir to path: /Users/rikhoekstra/develop/archival-structures


In [2]:
%pip install pandas
%pip install matplotlib
%pip install seaborn

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
import glob
import os

import pandas as pd
import archival_structures.ead_parser as ead_parser

ead_file = '../data/metadata/ead_metadata.tsv'
ead = pd.read_csv(ead_file, sep='\t', dtype={'inventory_num': str})
ead.head(2)

Unnamed: 0,series,subseries_1,subseries_2,subseries_3,subseries_4,filegroup,inventory_range,file,unitdate,inventory_num,mets_file
0,Resoluties van de Staten-Generaal,De ordinaris resoluties.,De minuten van de notulen van de vergaderingen...,,,"Registers van ordinaris minuut-resoluties,",1-257,1576 oktober 1 - 1577 mei 18,1576 oktober 1 - 1577 mei 18,1,https://service.archief.nl/gaf/api/mets/v1/86e...
1,Resoluties van de Staten-Generaal,De ordinaris resoluties.,De minuten van de notulen van de vergaderingen...,,,"Registers van ordinaris minuut-resoluties,",1-257,1577 mei 20 - 1577 november 21,1577 mei 20 - 1577 november 21,2,https://service.archief.nl/gaf/api/mets/v1/8dd...


The EAD metadata contains a row per inventory number (corresponding to a `file` in the EAD XML file), including the `series`, `subseries` and `filegroup` it belongs to, and any date information that is associated with the file. Finally, in the case of the National Archives of the Netherlands, EADs can contain a link to a METS metadata file describing the digitised version of the file, but only in the case it has been digitised. This means that the presence of a METS file is also an indicator whether an inventory number has been digitised. 

In [4]:
ead.shape, ead[ead.mets_file.notna()].shape

((17899, 11), (9832, 11))

Just over half of the entire States General archive has been digitised.

In [5]:
ead[ead.subseries_3.notna()].head(2)

Unnamed: 0,series,subseries_1,subseries_2,subseries_3,subseries_4,filegroup,inventory_range,file,unitdate,inventory_num,mets_file
4805,De bijlagen bij de resoluties van de Staten-Ge...,De ingekomen brieven en stukken en minuten van...,De Liassen Binnenland,1550-1796,1550-1796.,Ingekomen ordinaris brieven en stukken van alg...,"4867-5461, ---",1588,1588,4867,
4806,De bijlagen bij de resoluties van de Staten-Ge...,De ingekomen brieven en stukken en minuten van...,De Liassen Binnenland,1550-1796,1550-1796.,Ingekomen ordinaris brieven en stukken van alg...,"4867-5461, ---",1589 - 1590 juli,1589 - 1590 juli,4868,


In [6]:
scan_stats_dir = '../data/scan_stats-1.01.02/'
scan_stats_files = glob.glob(os.path.join(scan_stats_dir, '*'))
len(scan_stats_files), scan_stats_files[0]

(9122, '../data/scan_stats-1.01.02/scan_stats-inv_9348.tsv.gz')

In [7]:
scan_stats = pd.concat([pd.read_csv(ssf, sep='\t', compression='gzip') for ssf in scan_stats_files])
scan_stats.shape

(2528802, 56)

In [8]:
import re

scan_stats['inventory_id'] = scan_stats.doc_id.apply(lambda x: re.sub(r"^(NL-HaNA_1.01.02_\w+)_\d+\.jpg", r"\1", x))
scan_stats['inventory_num'] = scan_stats.inventory_id.apply(lambda x: x.split('_')[-1])

In [9]:
ead_scans = pd.merge(ead, scan_stats, on='inventory_num', how='right')
ead_scans.head(2)

Unnamed: 0,series,subseries_1,subseries_2,subseries_3,subseries_4,filegroup,inventory_range,file,unitdate,inventory_num,...,line_width_range_300-600,line_width_range_600-900,line_width_range_900-1200,line_width_range_1200-1500,line_width_range_1500-1800,line_width_range_1800-2100,line_width_range_2100-2400,line_width_range_2400-2700,line_width_range_2700-,inventory_id
0,De bijlagen bij de resoluties van de Staten-Ge...,"De ingekomen delen en dossiers, afzonderlijk v...","Verbalen, journalen, memories en rapporten bet...",Verbalen en journalen van zee-officieren betre...,,,,Rapport aan de Staten-Generaal van vice-admira...,1721,9348,...,4,0,0,0,1,1,1,0,0,NL-HaNA_1.01.02_9348
1,De bijlagen bij de resoluties van de Staten-Ge...,"De ingekomen delen en dossiers, afzonderlijk v...","Verbalen, journalen, memories en rapporten bet...",Verbalen en journalen van zee-officieren betre...,,,,Rapport aan de Staten-Generaal van vice-admira...,1721,9348,...,0,0,0,0,0,0,0,0,0,NL-HaNA_1.01.02_9348


## Sanity check

All inventory numbers for which we have scan stats should have series information:

In [10]:
ead_scans[ead_scans.series.isna()]

Unnamed: 0,series,subseries_1,subseries_2,subseries_3,subseries_4,filegroup,inventory_range,file,unitdate,inventory_num,...,line_width_range_300-600,line_width_range_600-900,line_width_range_900-1200,line_width_range_1200-1500,line_width_range_1500-1800,line_width_range_1800-2100,line_width_range_2100-2400,line_width_range_2400-2700,line_width_range_2700-,inventory_id
58582,,,,,,,,,,4848,...,0,0,0,0,0,0,0,0,0,NL-HaNA_1.01.02_4848
58583,,,,,,,,,,4848,...,0,2,0,0,0,0,0,0,0,NL-HaNA_1.01.02_4848
58584,,,,,,,,,,4848,...,0,0,0,0,0,0,0,0,0,NL-HaNA_1.01.02_4848
58585,,,,,,,,,,4848,...,0,0,0,4,0,0,0,0,0,NL-HaNA_1.01.02_4848
58586,,,,,,,,,,4848,...,1,1,2,0,0,0,0,0,0,NL-HaNA_1.01.02_4848
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2201343,,,,,,,,,,deelopname2.jpg,...,47,33,10,28,0,0,0,0,0,NL-HaNA_1.01.02_4781_0190_deelopname2.jpg
2201345,,,,,,,,,,deelopname1.jpg,...,31,27,14,0,0,0,0,0,0,NL-HaNA_1.01.02_4781_0191_deelopname1.jpg
2201346,,,,,,,,,,deelopname2.jpg,...,32,24,7,33,0,0,0,0,0,NL-HaNA_1.01.02_4781_0191_deelopname2.jpg
2201348,,,,,,,,,,deelopname1.jpg,...,16,3,5,17,0,1,10,0,0,NL-HaNA_1.01.02_4781_0192_deelopname1.jpg


TO DO: figure out why there are scans with no series information.

## Analysing Properties of Series

In [11]:
scans_per_series = ead_scans.groupby('series').series.count()
scans_per_series

series
De akten van de Staten-Generaal                                                           62878
De bijlagen bij de resoluties van de Staten-Generaal                                     476325
De depêcheboeken en de brievenboeken van de Staten-Generaal                              593650
Registers van leden en personeel en betreffende de titulatuur van de Staten-Generaal       4693
Resoluties van de Staten-Generaal                                                       1390885
Name: series, dtype: int64

In [12]:
ead_scans.columns

Index(['series', 'subseries_1', 'subseries_2', 'subseries_3', 'subseries_4',
       'filegroup', 'inventory_range', 'file', 'unitdate', 'inventory_num',
       'mets_file', 'doc_id', 'doc_num', 'doc_width', 'doc_height', 'lines',
       'words', 'text_regions', 'columns', 'extra', 'pages', 'num_words',
       'num_alpha_words', 'num_number_words', 'num_title_words',
       'num_non_title_words', 'num_stop_words', 'num_punctuation_words',
       'num_oversized_words', 'words_per_line_0-0', 'words_per_line_1-1',
       'words_per_line_2-2', 'words_per_line_3-3', 'words_per_line_4-5',
       'words_per_line_6-9', 'words_per_line_10-15', 'words_per_line_16-25',
       'words_per_line_26-42', 'words_per_line_43-70', 'words_per_line_71-100',
       'alpha_words_per_line_0-0', 'alpha_words_per_line_1-1',
       'alpha_words_per_line_2-2', 'alpha_words_per_line_3-3',
       'alpha_words_per_line_4-5', 'alpha_words_per_line_6-9',
       'alpha_words_per_line_10-15', 'alpha_words_per_line_16-25'

In [13]:
ead_scans.groupby('series').words.sum().div(scans_per_series)

series
De akten van de Staten-Generaal                                                         415.656096
De bijlagen bij de resoluties van de Staten-Generaal                                    243.825699
De depêcheboeken en de brievenboeken van de Staten-Generaal                             442.210612
Registers van leden en personeel en betreffende de titulatuur van de Staten-Generaal    182.808012
Resoluties van de Staten-Generaal                                                       349.137057
dtype: float64

In [14]:
ead_scans.groupby('series').num_title_words.sum().div(scans_per_series)

series
De akten van de Staten-Generaal                                                         77.504851
De bijlagen bij de resoluties van de Staten-Generaal                                    46.213163
De depêcheboeken en de brievenboeken van de Staten-Generaal                             78.089817
Registers van leden en personeel en betreffende de titulatuur van de Staten-Generaal    84.767313
Resoluties van de Staten-Generaal                                                       66.202636
dtype: float64

In [15]:
ead_scans.groupby('series').num_number_words.sum().div(scans_per_series)

series
De akten van de Staten-Generaal                                                         2.746557
De bijlagen bij de resoluties van de Staten-Generaal                                    4.023007
De depêcheboeken en de brievenboeken van de Staten-Generaal                             4.909445
Registers van leden en personeel en betreffende de titulatuur van de Staten-Generaal    3.213509
Resoluties van de Staten-Generaal                                                       3.745965
dtype: float64

In [33]:
avg_num_words_summary = ead_scans.groupby(['series','subseries_1','subseries_2']).num_number_words.sum() #.div(scans_per_series)
avg_num_words_summary

series                                                       subseries_1                                                                                  subseries_2                                                                                                                                                                                         
De bijlagen bij de resoluties van de Staten-Generaal         De ingekomen brieven en stukken en minuten van uitgaande brieven, verzameld in de "liassen"  1600-1795                                                                                                                                                                                                 13473
                                                                                                                                                          De Liassen Binnenland                                                                                                                

In [17]:
ead_scans.groupby('series').num_punctuation_words.sum().div(scans_per_series)

series
De akten van de Staten-Generaal                                                         1.328716
De bijlagen bij de resoluties van de Staten-Generaal                                    1.114636
De depêcheboeken en de brievenboeken van de Staten-Generaal                             0.982111
Registers van leden en personeel en betreffende de titulatuur van de Staten-Generaal    0.725123
Resoluties van de Staten-Generaal                                                       1.019734
dtype: float64

In [25]:
invnrs_summary = ead_scans.groupby(['series','subseries_1','subseries_2','subseries_3']).inventory_id.count()
invnrs_summary

series                                                       subseries_1                                                                                  subseries_2                                                                                                                             subseries_3                                                                                          
De bijlagen bij de resoluties van de Staten-Generaal         De ingekomen brieven en stukken en minuten van uitgaande brieven, verzameld in de "liassen"  De Liassen Binnenland                                                                                                                   1550-1796                                                                                                   537
                                                                                                                                                                                                              

In [28]:
%pip install plotly
import plotly.express as px

# invnrs_summary is already available from the previous cell.
# Convert the Series to a DataFrame for Plotly
df_for_plot = invnrs_summary.reset_index()

# The value column in df_for_plot is named 'inventory_id' (from .inventory_id.count()).
# Rename it for clarity in the plot.
df_for_plot.rename(columns={'inventory_id': 'inventory_nrs_count'}, inplace=True)

# Define the hierarchy path columns
path_cols = ['series', 'subseries_1', 'subseries_2', 'subseries_3']

# Fill NaN values in path columns with a placeholder string 'N/A'
# as Plotly's treemap path components should ideally be non-null strings.
for col in path_cols:
    df_for_plot[col] = df_for_plot[col].fillna('N/A')

# Create the treemap
# px.Constant("Total Scans") adds a root node.
# 'color="series"' colors the treemap by the top-level series.
# 'hover_data' formats the count display in the tooltip.
fig = px.treemap(df_for_plot,
                 path=[px.Constant("Total Scans")] + path_cols,
                 values='inventory_nrs_count',
                 color='series',
                 title='Hierarchical Distribution of Scans by Series and Subseries',
                 hover_data={'inventory_nrs_count':':,'}) # Format scan_count with comma for thousands

# Optional: Adjust margins for better appearance
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))

fig.show()

Note: you may need to restart the kernel to use updated packages.


In [34]:
# Convert the Series to a DataFrame for Plotly
df_avg_words_plot = avg_num_words_summary.reset_index()

# The value column in df_avg_words_plot is named 'num_number_words'
# Rename it for clarity in the plot.
df_avg_words_plot.rename(columns={'num_number_words': 'avg_num_number_words'}, inplace=True)

# Define the hierarchy path columns for this summary
# avg_num_words_summary was grouped by ['series','subseries_1','subseries_2']
avg_words_path_cols = ['series', 'subseries_1', 'subseries_2']

# Fill NaN values in path columns with a placeholder string 'N/A'
for col in avg_words_path_cols:
    df_avg_words_plot[col] = df_avg_words_plot[col].fillna('N/A')

# Create the treemap
fig_avg_words = px.treemap(df_avg_words_plot,
                           path=[px.Constant("Total Average Number Words")] + avg_words_path_cols,
                           values='avg_num_number_words',
                           color='series',
                           title='Hierarchical Distribution of Average Number Words by Series and Subseries',
                           hover_data={'avg_num_number_words':':.2f'}) # Format avg_num_number_words

# Optional: Adjust margins for better appearance
fig_avg_words.update_layout(margin = dict(t=50, l=25, r=25, b=25))

fig_avg_words.show()