In [7]:
import base64
import io
import os

import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from folium.plugins import MarkerCluster

In [8]:
# Build ../data/archives_selected.json from the full archive dump, but only when
# that dump is available. When archives.json is missing this cell does nothing,
# so "Run All" still works and the next cell loads the previously generated
# archives_selected.json.
ARCHIVES_JSON = "../data/archives.json"

if os.path.exists(ARCHIVES_JSON):
    archives = pd.read_json(ARCHIVES_JSON, encoding="utf-8")
    charters = pd.read_json("../data/df.json", encoding="utf-8")

    # The archive key is the last "/"-separated segment of the atom id.
    archives["supercuration_name"] = archives["atom_id"].apply(lambda x: x.split("/")[-1])

    # Keep only archives that are referenced by at least one charter.
    relevant_archive_names = set(charters.supercuration_name.to_list())
    relevant_archives = (
        archives.loc[
            archives["supercuration_name"].isin(relevant_archive_names),
            ["supercuration_name", "autform", "country", "longitude", "latitude"],
        ]
        # Chained rename returns a new frame instead of mutating a selection in place.
        .rename(columns={"autform": "name"})
    )
    relevant_archives.to_json("../data/archives_selected.json", orient="records", force_ascii=False)
else:
    print(
        f"{ARCHIVES_JSON} not found; skipping archive filtering. "
        "The next cell expects an existing ../data/archives_selected.json."
    )

In [9]:
# Charter table; later cells read its `decade` and `supercuration_name` columns.
df = pd.read_json(path_or_buf="../data/df.json")
# Archive metadata for the archives referenced by the charters
# (the file written by the archive-selection cell).
df_archives = pd.read_json(path_or_buf="../data/archives_selected.json")

In [10]:
def filter_group(group, min_size=10):
    """Return True when ``group`` holds at least ``min_size`` items.

    Intended as the predicate for ``DataFrame.groupby(...).filter``, which
    calls it once per group and keeps the groups for which it returns True.

    Parameters
    ----------
    group : sized object
        Anything supporting ``len()``; ``GroupBy.filter`` passes each group's
        sub-DataFrame.
    min_size : int, default 10
        Minimum number of items a group needs in order to be kept. Can be
        overridden through ``GroupBy.filter``, e.g.
        ``df.groupby('decade').filter(filter_group, min_size=5)``.

    Returns
    -------
    bool
    """
    return len(group) >= min_size

# Folium HTML map

In [11]:
# Keep only the charters whose decade group passes filter_group.
df_inliers = df.groupby('decade').filter(filter_group)

# Decade axis for the per-decade summary: every decade from the earliest to the
# latest one that survived the filter, inclusive. The upper bound is derived
# from the data (not hard-coded) so the summary never lists decades beyond it.
# NOTE(review): assumes `decade` holds integer multiples of 10 - confirm.
min_decade = df_inliers['decade'].min()
max_decade = df_inliers['decade'].max()
decade_range = range(min_decade, max_decade + 10, 10)

# Centre the map on the mean archive coordinates.
center_lat = df_archives['latitude'].mean()
center_lon = df_archives['longitude'].mean()
m = folium.Map(location=[center_lat, center_lon], zoom_start=6)

# Cluster layer that receives one marker per archive.
archive_cluster = MarkerCluster(name="Archives")

def create_mini_chart(archive_data):
    """Render a small per-decade bar chart and return it as base64 PNG text.

    Parameters
    ----------
    archive_data : DataFrame
        Must provide the columns 'decade' (x axis) and 'count' (bar height).

    Returns
    -------
    str
        The PNG image encoded as a base64 string, suitable for embedding in a
        ``data:image/png;base64,...`` URI.
    """
    img = io.BytesIO()
    fig = plt.figure(figsize=(4, 2))
    try:
        # `fig` is the current figure, so the pyplot calls below draw onto it.
        sns.barplot(x='decade', y='count', data=archive_data)
        plt.title('Charters per Decade', fontsize=10)
        plt.xticks(rotation=45, fontsize=8)
        plt.yticks(fontsize=8)
        plt.tight_layout()
        fig.savefig(img, format='png')
    finally:
        # This function is called once per archive; always release the figure,
        # even when plotting or saving fails, so figures do not accumulate.
        plt.close(fig)

    return base64.b64encode(img.getvalue()).decode()

# Add one marker per archive that has at least one charter in df_inliers.
# Each popup shows the charter total and an embedded per-decade bar chart.
for _, row in df_archives.iterrows():
    archive_name = row['supercuration_name']
    archive_data = df_inliers[df_inliers['supercuration_name'] == archive_name]

    # Archives without any charter in df_inliers get no marker.
    if len(archive_data) == 0:
        continue

    # Per-decade charter counts for this archive, in chronological order.
    decade_counts = archive_data['decade'].value_counts().reset_index()
    decade_counts.columns = ['decade', 'count']
    decade_counts = decade_counts.sort_values('decade')

    chart_img = create_mini_chart(decade_counts)

    popup_content = f"""
        <h4>{archive_name}</h4>
        <p>Total Charters: {len(archive_data)}</p>
        <img src="data:image/png;base64,{chart_img}" alt="Charter distribution">
        """

    folium.Marker(
        location=[row['latitude'], row['longitude']],
        popup=folium.Popup(popup_content, max_width=300),
        tooltip=f"{archive_name}: {len(archive_data)} charters"
    ).add_to(archive_cluster)

m.add_child(archive_cluster)

# Create the output directory if needed so saving works on a fresh checkout.
os.makedirs("out", exist_ok=True)
m.save("out/archive_locations.html")

# Text summary of the data shown on the map.
used_archive_names = df_inliers['supercuration_name'].unique()
used_archive_mask = df_archives['supercuration_name'].isin(used_archive_names)
print(f"Total number of archives used in classification: {len(df_archives[used_archive_mask])}")
print(f"Total number of charters used in classification: {len(df_inliers)}")
print("\nNumber of charters per decade (Used in Classification):")

# Decades of decade_range that have no charters are reported with a count of 0.
inlier_counts = (
    df_inliers['decade']
    .value_counts()
    .sort_index()
)
for decade in decade_range:
    count = inlier_counts.get(decade, 0)
    print(f"{decade}: {count}")

Total number of archives used in classification: 25
Total number of charters used in classification: 7271

Number of charters per decade (Used in Classification):
1270: 25
1280: 97
1290: 302
1300: 457
1310: 533
1320: 595
1330: 716
1340: 893
1350: 897
1360: 854
1370: 965
1380: 685
1390: 252
1400: 0


# Charts of distribution

In [12]:
# Start from matplotlib's default style, then layer the poster settings on top.
plt.style.use('default')

poster_rc = {
    # Fonts
    'font.family': 'STIXGeneral',
    'font.size': 24,
    'mathtext.fontset': 'stix',
    # Legend
    'legend.fontsize': 28,
    'legend.handlelength': 2.0,
    'legend.fancybox': True,
    # Hide tick marks but keep the tick labels
    'xtick.bottom': False,
    'xtick.top': False,
    'ytick.left': False,
    'ytick.right': False,
    'xtick.labelbottom': True,
    'ytick.labelleft': True,
    # Axes and grid
    'axes.grid': True,
    'axes.linewidth': 1.6,
    'axes.formatter.use_locale': True,
    'axes.formatter.use_mathtext': True,
    'grid.linewidth': 0.5,
    # Figure size: okay for an A1 poster
    'figure.figsize': [40, 20],
}
plt.rcParams.update(poster_rc)

# Single bar colour shared by the distribution charts.
main_color = '#0072B2'

# Date distribution plot
df_date_inliers = df.groupby('decade').filter(filter_group)
min_decade = df_date_inliers['decade'].min()
max_decade = df_date_inliers['decade'].max()
decade_range = range(min_decade, max_decade + 10, 10)

# Count charters for every decade of decade_range. Decades removed by the
# filter are kept as zero-height bars, so bar i always belongs to
# decade_range[i] and the tick labels set below cannot drift out of alignment.
# NOTE(review): assumes `decade` holds integer multiples of 10 - confirm.
date_counts = (
    df_date_inliers['decade']
    .value_counts()
    .reindex(list(decade_range), fill_value=0)
    .rename_axis('decade')
    .reset_index(name='count')
)

fig, ax = plt.subplots(figsize=(40, 20))
sns.barplot(x='decade', y='count', data=date_counts, color=main_color, ax=ax)
ax.set_xlabel('Decade', fontsize=32)
ax.set_ylabel('Number of Charters', fontsize=32)
ax.set_xticks(range(len(decade_range)))
ax.set_xticklabels([str(d) for d in decade_range], fontsize=24, rotation=45)
ax.tick_params(axis='y', labelsize=24)
ax.grid(axis='y', linestyle='--', alpha=0.7)
sns.despine(ax=ax, left=True, bottom=True)
fig.tight_layout()
# Create the output directory if needed so saving works on a fresh checkout.
os.makedirs('out', exist_ok=True)
fig.savefig('out/charter_date_distribution.png', dpi=300, bbox_inches='tight')
plt.close(fig)

print(f"Date Distribution Summary:")
print(f"Earliest decade: {min_decade}")
print(f"Latest decade: {max_decade}")
print(f"Number of charters used in classification: {len(df_date_inliers)}")
print(f"\nNumber of charters per decade:")
inlier_counts = df_date_inliers['decade'].value_counts().sort_index()
for decade in decade_range:
    # Decades without charters are reported as 0.
    count = inlier_counts.get(decade, 0)
    print(f"{decade}: {count}")

# Archive distribution plot
# Drop the "COLLECTIONS" pseudo-archive, then keep archives passing filter_group.
is_real_archive = df['supercuration_name'] != "COLLECTIONS"
df_archive_inliers = (
    df[is_real_archive]
    .groupby('supercuration_name')
    .filter(filter_group)
)
archive_counts = df_archive_inliers['supercuration_name'].value_counts()
top_archives = archive_counts.nlargest(20)  # Top 20 archives

# Horizontal bars: one per archive, length = number of charters.
fig, ax = plt.subplots(figsize=(40, 20))
sns.barplot(x=top_archives.values, y=top_archives.index, color=main_color, ax=ax)
ax.set_xlabel('Number of Charters', fontsize=32)
ax.set_ylabel('Archive', fontsize=32)
for tick_label in ax.get_xticklabels():
    tick_label.set_fontsize(24)
for tick_label in ax.get_yticklabels():
    tick_label.set_fontsize(24)
ax.grid(axis='x', linestyle='--', alpha=0.7)
sns.despine(ax=ax, left=True, bottom=True)
fig.tight_layout()
fig.savefig('out/charter_archive_distribution.png', dpi=300, bbox_inches='tight')
plt.close(fig)

print("\nArchive Distribution Summary:")
print(f"Number of archives used in classification: {len(archive_counts)}")
print("\nNumber of charters per archive (Top 20):")
for archive, count in top_archives.items():
    print(f"{archive}: {count}")

Date Distribution Summary:
Earliest decade: 1270
Latest decade: 1390
Number of charters used in classification: 7271

Number of charters per decade:
1270: 25
1280: 97
1290: 302
1300: 457
1310: 533
1320: 595
1330: 716
1340: 893
1350: 897
1360: 854
1370: 965
1380: 685
1390: 252

Archive Distribution Summary:
Number of archives used in classification: 20

Number of charters per archive (Top 20):
AT-OOeLA: 356
AT-StiASF: 321
CH-StiASG: 247
AT-StiAH: 238
DE-BayHStA: 233
AT-StiAScho: 208
AT-StiAHe: 192
AT-HHStA: 177
AT-StiASei: 151
AT-StiAL: 150
CH-StaASG: 150
AT-StiAW: 131
AT-StiAR: 120
AT-StiAKr: 102
AT-StiAA: 94
AT-StiAG: 90
AT-StiASch: 75
AT-StaABdW: 56
AT-StiAK: 36
DE-AKR: 12
