This notebook allows you to reproduce the analysis steps we performed for our study *They crossed the valley of Catamarca: A study of narrative space in novel openings*. The analysis is based on the annotation data saved as a [CATMA](https://www.catma.de) project in `/CATMA_4AA4ADC0-4C28-54F9-B6A1-5DCEFF34B90B_DH2025_CANSpiN` and as `.tsv` files in `/canspin-deu-19`, `/canspin-deu-20`, `/canspin-lat-19`, and `/canspin-spa-19`.

If you wish to see the analysis results only, it is not necessary to execute this notebook. In this case, see our paper and the content of the `/results` folder.

To use the notebook, install the [gitma-canspin package](https://github.com/CANSpiNproject/gitma-canspin/tree/v1.6.2) first, following the instructions of its README.

In [1]:
# imports
import itertools
import json
import os
from typing import Tuple

import pandas as pd
import plotly
import plotly.express
import plotly.graph_objects

from gitma_canspin.canspin import AnnotationAnalyzer

In [2]:
# load the analyzer with the catma project from the CATMA_4AA4ADC0-4C28-54F9-B6A1-5DCEFF34B90B_DH2025_CANSpiN folder
# (while loading, the tagsets, documents and annotation collections that were found are logged)
# NOTE(review): presumably the project folder is looked up relative to the notebook's working directory -- confirm in the gitma-canspin README
analyzer = AnnotationAnalyzer(init_settings={'project_name': 'CATMA_4AA4ADC0-4C28-54F9-B6A1-5DCEFF34B90B_DH2025_CANSpiN'})

gitma_canspin.project - INFO - Loading tagsets ...
gitma_canspin.project - INFO - 	Found 3 tagset(s).
gitma_canspin.project - INFO - Loading documents ...
gitma_canspin.project - INFO - 	Found 6 document(s).
gitma_canspin.project - INFO - Loading annotation collections ...
gitma_canspin.project - INFO - 	Found 6 annotation collection(s).
gitma_canspin.project - INFO - 	Annotation collection "Collection CS1 v1.1.0 - Gold Standard" for document "El pozo del Yocci"
gitma_canspin.project - INFO - 		Annotations: 201
gitma_canspin.project - INFO - 	Annotation collection "CS1 v1.1.0 - Nils (Gold: 1)" for document "El Señor de Bembibre"
gitma_canspin.project - INFO - 		Annotations: 285
gitma_canspin.project - INFO - 	Annotation collection "CS1 v1.1.0 - Ulrike (Gold standard: 1)" for document "CANSpiN-spa-19-008"
gitma_canspin.project - INFO - 		Annotations: 1332
gitma_canspin.project - INFO - 	Annotation collection "Nils -- CS1 1.1.0 (Gold: 1-1-1)" for document "DEU-19_001"
gitma_canspin.proje

In [3]:
# display loaded tsv annotations from corpus folders
# (prints the names of the tsv files, grouped by annotation schema)
analyzer.print_tsv_annotations_overview()

gitma_canspin.canspin - INFO - tsv files found in canspin project!



overview:
- schema "cs1"
	CANSpiN-deu-19_001_1-1-1.tsv
	CANSpiN-deu-19_030_1-1-1.tsv
	CANSpiN-deu-20_002_1_shuffled.tsv
	CANSpiN-deu-20_021_1_shuffled.tsv
	CANSpiN-lat-19_004_1.tsv
	CANSpiN-lat-19_041_1.tsv
	CANSpiN-spa-19_001_1.tsv
	CANSpiN-spa-19_008_1.tsv


In [4]:
# display loaded annotation collections from catma project
# (prints one row per annotation collection: index, collection name and text title)
analyzer.print_projects_annotation_collection_list()


Annotation collection list:
index	collection_name	text_title
0	Collection CS1 v1.1.0 - Gold Standard	El pozo del Yocci
1	CS1 v1.1.0 - Nils (Gold: 1)	El Señor de Bembibre
2	CS1 v1.1.0 - Ulrike (Gold standard: 1)	CANSpiN-spa-19-008
3	Nils -- CS1 1.1.0 (Gold: 1-1-1)	DEU-19_001
4	Nils -- CS1 V.1.1.0 (Gold:1-1-1)	DEU-19_030
5	Collection CS1 v1.1.0 - Gold Standard	El falso Inca


## steps
Perform all steps in the specified order to obtain a correct result. You must not skip any steps in between.

### 1 - get CS1 annotation statistics as JSON file

In [None]:
# collect the annotation statistics of the tsv files twice: once for the entire chapters and once
# restricted to the token selection defined below (the first 1000 tokens);
# the class names are translated into English at the end of this cell

token_selection: Tuple[int, int] = (0, 1000)

# settings of the token-limited run: all calculations listed here are switched on,
# no custom grouping is applied and the texts are cut to the token selection
limited_run_settings: dict = {
    'calculations': {
        'amount_of_annotations': True,
        'amount_of_annotations_by_class': True,
        'amount_of_token': True,
        'amount_of_annotated_token': True,
        'amount_of_annotated_token_by_class': True,
        'ratios': True,
        'word_lists_by_class': True
    },
    'custom_grouping': None,
    'text_borders': token_selection
}

results: dict = {}
# default settings of the analyzer -> statistics of the whole chapters
results['whole_chapters'] = analyzer.get_corpus_annotation_statistics()
results['first_1000_token'] = analyzer.get_corpus_annotation_statistics(limited_run_settings)

# Maps the German and the Spanish CS1 class names to their English equivalents.
# Every English name is the target of exactly one German and one Spanish key;
# the suffixes -BK (German) and -CM (Spanish) both become -MC.
# Keep the order of the entries in mind when editing: translate_dict inspects the value
# stored under the first key of this mapping to decide how a level has to be merged.
key_translation = {
    # German class names
    'Ort-Container': 'Place-Container',
    'Ort-Container-BK': 'Place-Container-MC',
    'Ort-Objekt': 'Place-Object',
    'Ort-Objekt-BK': 'Place-Object-MC',
    'Ort-Abstrakt': 'Place-Abstract',
    'Ort-Abstrakt-BK': 'Place-Abstract-MC',
    'Ort-ALT': 'Place-ALT',
    'Bewegung-Subjekt': 'Movement-Subject',
    'Bewegung-Objekt': 'Movement-Object',
    'Bewegung-Licht': 'Movement-Light',
    'Bewegung-Schall': 'Movement-Sound',
    'Bewegung-Geruch': 'Movement-Smell',
    'Bewegung-ALT': 'Movement-ALT',
    'Dimensionierung-Groesse': 'Dimensioning-Size',
    'Dimensionierung-Abstand': 'Dimensioning-Distance',
    'Dimensionierung-Menge': 'Dimensioning-Amount',
    'Dimensionierung-ALT': 'Dimensioning-ALT',
    'Positionierung': 'Positioning',
    'Positionierung-ALT': 'Positioning-ALT',
    'Richtung': 'Direction',
    'Richtung-ALT': 'Direction-ALT',
    # Spanish class names
    'Lugar-Contenedor': 'Place-Container',
    'Lugar-Contenedor-CM': 'Place-Container-MC',
    'Lugar-Objeto': 'Place-Object',
    'Lugar-Objeto-CM': 'Place-Object-MC',
    'Lugar-Abstracto': 'Place-Abstract',
    'Lugar-Abstracto-CM': 'Place-Abstract-MC',
    'Lugar-ALT': 'Place-ALT',
    'Movimiento-Sujeto': 'Movement-Subject',
    'Movimiento-Objeto': 'Movement-Object',
    'Movimiento-Luz': 'Movement-Light',
    'Movimiento-Sonido': 'Movement-Sound',
    'Movimiento-Olfato': 'Movement-Smell',
    'Movimiento-ALT': 'Movement-ALT',
    'Dimensionamiento-Tamaño': 'Dimensioning-Size',
    'Dimensionamiento-Distancia': 'Dimensioning-Distance',
    # NOTE(review): spelled 'Cantitad' (Spanish would be 'Cantidad') -- presumably this mirrors
    # the tag name used in the annotation data; confirm before changing it
    'Dimensionamiento-Cantitad': 'Dimensioning-Amount',
    'Dimensionamiento-ALT': 'Dimensioning-ALT',
    'Posicionamiento': 'Positioning',
    'Posicionamiento-ALT': 'Positioning-ALT',
    'Dirección': 'Direction',
    'Dirección-ALT': 'Direction-ALT'
}

def merge_word_list_by_class_dicts(first: dict, second: dict) -> dict:
    """Merge two token -> amount mappings into a new dict sorted by amount.

    Amounts of tokens that occur in both mappings are added up. Neither of
    the two input mappings is modified.

    Args:
        first: mapping of a token to its amount.
        second: mapping of a token to its amount, added on top of ``first``.

    Returns:
        A new dict holding the summed amounts in descending order. Tokens
        with equal amounts keep their insertion order (tokens of ``first``
        before tokens that only occur in ``second``).
    """
    # work on a copy: binding the result to `first` itself would silently
    # change the dict owned by the caller
    merged: dict = dict(first)
    for token, token_amount in second.items():
        merged[token] = merged[token] + token_amount if token in merged else token_amount
    return dict(sorted(merged.items(), key=lambda item: int(item[1]), reverse=True))

def translate_dict(input: dict, translation: dict) -> dict:
    """Recursively rename the keys of a nested statistics dict.

    Two kinds of levels are distinguished:

    * A "mixed" level contains every key of ``translation``. Several source
      keys can share one target key there, so their values are merged:
      integers are added up (and the level is sorted by amount in
      descending order), word list dicts are combined with
      ``merge_word_list_by_class_dicts``.
    * On every other level the keys are simply replaced; keys without a
      translation are kept as they are.

    Nested dicts are translated recursively afterwards.

    Args:
        input: the (nested) dict to translate. The name shadows the builtin
            but is kept because callers pass it as keyword argument.
        translation: mapping of source key to target key.

    Returns:
        A new dict with translated keys. ``input`` itself is not modified.
    """
    translated: dict = {}
    # an empty translation can never describe a mixed level; without this guard
    # the lookup of the first translation key below would fail
    is_mixed_level: bool = bool(translation) and all(key in input for key in translation)

    if is_mixed_level:
        # for translating schema totals per class with mixed german and spanish classes...
        # the value stored under the first translation key decides how the level is merged
        sample_value = input[next(iter(translation))]
        if isinstance(sample_value, int):
            # ...in case of class instances amounts
            for key, value in input.items():
                # keys without a translation (if any) are kept unchanged
                translated_key: str = translation.get(key, key)
                translated[translated_key] = translated.get(translated_key, 0) + value
            translated = dict(sorted(translated.items(), key=lambda item: int(item[1]), reverse=True))
        elif isinstance(sample_value, dict):
            # ...in case of word lists
            for key, value in input.items():
                translated_key: str = translation.get(key, key)
                already_collected = translated.get(translated_key)
                if not already_collected:
                    translated[translated_key] = value
                else:
                    # hand over a copy so that the word lists of `input` are never changed by the merge
                    translated[translated_key] = merge_word_list_by_class_dicts(dict(already_collected), value)
        # NOTE(review): a mixed level whose values are neither int nor dict (e.g. float)
        # results in an empty dict, as before -- confirm that such levels cannot occur
    else:
        # for translating everything else
        translated = {translation.get(key, key): value for key, value in input.items()}

    # NOTE(review): the recursion also renames keys inside word lists, so a token that is
    # spelled exactly like a class name would be renamed as well -- confirm this is acceptable
    for key, value in translated.items():
        if isinstance(value, dict):
            translated[key] = translate_dict(value, translation)
    return translated

# replace the German and Spanish class names in every statistics dict by their English equivalents
for result_type, untranslated_statistics in list(results.items()):
    results[result_type] = translate_dict(input=untranslated_statistics, translation=key_translation)


In [6]:
# print annotation statistics

for result_type in results:
    # keep the key order of the statistics and print non-ASCII characters as they are
    statistics_json: str = json.dumps(results[result_type], indent=2, sort_keys=False, ensure_ascii=False)
    print(statistics_json)


{
  "amount_of_annotations": {
    "cs1": {
      "canspin-lat-19": {
        "CANSpiN-lat-19_004_1.tsv": 128,
        "CANSpiN-lat-19_041_1.tsv": 199,
        "TOTAL": 327
      },
      "canspin-deu-19": {
        "CANSpiN-deu-19_001_1-1-1.tsv": 898,
        "CANSpiN-deu-19_030_1-1-1.tsv": 1056,
        "TOTAL": 1954
      },
      "canspin-deu-20": {
        "CANSpiN-deu-20_002_1_shuffled.tsv": 412,
        "CANSpiN-deu-20_021_1_shuffled.tsv": 154,
        "TOTAL": 566
      },
      "canspin-spa-19": {
        "CANSpiN-spa-19_001_1.tsv": 269,
        "CANSpiN-spa-19_008_1.tsv": 728,
        "TOTAL": 997
      },
      "TOTAL": 3844
    }
  },
  "amount_of_annotations_by_class": {
    "cs1": {
      "canspin-lat-19": {
        "CANSpiN-lat-19_004_1.tsv": {
          "Movement-ALT": 20,
          "Movement-Subject": 19,
          "Place-Container": 16,
          "Positioning": 16,
          "Dimensioning-Amount": 10,
          "Movement-Object": 9,
          "Movement-Sound": 8,
    

In [7]:
# save annotation statistics to JSON files
# (the file names are relative, so the files end up in the current working directory --
# the root of the dh2025 repo if the notebook is run from there)

for result_type in results:
    filename: str = f'annotation_statistics__{result_type}.json'
    json_file_str: str = json.dumps(results[result_type], indent=2, sort_keys=False, ensure_ascii=False)

    if os.path.isfile(filename):
        print(f'JSON file {filename} already exists and will be overwritten.')

    # ensure_ascii=False writes non-ASCII characters literally, so the encoding is fixed to UTF-8
    # instead of relying on the platform default
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(json_file_str)
    # report success only after the file has been closed
    print(f'JSON file {filename} successfully created.')

JSON file annotation_statistics__whole_chapters.json already exists and will be overwritten.
JSON file annotation_statistics__whole_chapters.json successfully created.
JSON file annotation_statistics__first_1000_token.json already exists and will be overwritten.
JSON file annotation_statistics__first_1000_token.json successfully created.


### 2 - get bar charts with annotation amounts of all chapters normalized to their token amount

In [8]:
# for annotation data with all tokens
# you may manipulate and save the plotly diagram as png image file by using the interface of the html output

# x axis label of each chapter -> (corpus folder, tsv file) under which its statistics are stored in `results`
chapter_sources: dict = {
    'DEU19_030': ('canspin-deu-19', 'CANSpiN-deu-19_030_1-1-1.tsv'),
    'DEU19_001': ('canspin-deu-19', 'CANSpiN-deu-19_001_1-1-1.tsv'),
    'DEU20_002': ('canspin-deu-20', 'CANSpiN-deu-20_002_1_shuffled.tsv'),
    'DEU20_021': ('canspin-deu-20', 'CANSpiN-deu-20_021_1_shuffled.tsv'),
    'SPA19_001': ('canspin-spa-19', 'CANSpiN-spa-19_001_1.tsv'),
    'SPA19_008': ('canspin-spa-19', 'CANSpiN-spa-19_008_1.tsv'),
    'LAT19_004': ('canspin-lat-19', 'CANSpiN-lat-19_004_1.tsv'),
    'LAT19_041': ('canspin-lat-19', 'CANSpiN-lat-19_041_1.tsv')
}

# CS1 classes in the order in which their rows are added to the data frame
annotation_classes: list = [
    'Place-Container',
    'Place-Container-MC',
    'Place-Object',
    'Place-Object-MC',
    'Place-Abstract',
    'Place-Abstract-MC',
    'Place-ALT',
    'Movement-Subject',
    'Movement-Object',
    'Movement-Light',
    'Movement-Sound',
    'Movement-Smell',
    'Movement-ALT',
    'Dimensioning-Size',
    'Dimensioning-Distance',
    'Dimensioning-Amount',
    'Dimensioning-ALT',
    'Positioning',
    'Positioning-ALT',
    'Direction',
    'Direction-ALT'
]

# one row per (chapter, class) pair; filling the three columns in a single pass keeps them
# aligned without hard-coded repetition counts
data_dict: dict = {'text': [], 'annotation_class': [], 'amount': []}
for chapter_label, (corpus_name, tsv_name) in chapter_sources.items():
    class_ratios: dict = results['whole_chapters']['ratios']['cs1'][corpus_name][tsv_name]['annotations_by_class_in_file:total_token_amount_in_file']
    for annotation_class in annotation_classes:
        data_dict['text'].append(chapter_label)
        data_dict['annotation_class'].append(annotation_class)
        # a class missing from the chapter's statistics counts as 0; the ratio is scaled to percent
        data_dict['amount'].append(class_ratios.get(annotation_class, 0) * 100)

data: pd.DataFrame = pd.DataFrame(data_dict)

figure: plotly.graph_objects.Figure = plotly.express.bar(
    data_frame=data,
    x='text',
    y='amount',
    color='annotation_class',
    labels={
        "text": "texts",
        "amount": "annotation amount (in %)",
        "annotation_class": "annotation classes"
    },
    title='CS1 annotation amounts inside the initial chapters with all tokens <br><sub>(in percentage of the total token amount of the respective chapter)</sub>',
    # fixed colour per class: blue tones for Place, red tones for Movement, etc.
    color_discrete_map={
        'Place-Container': '#B6D3FF',
        'Place-Container-MC': '#CCDEFF',
        'Place-Object': '#D4EAFF',
        'Place-Object-MC': '#E6F2FF',
        'Place-Abstract': '#89A8F6',
        'Place-Abstract-MC': '#98C3FA',
        'Place-ALT': '#90A6C7',
        'Movement-Subject': '#FF6D6D',
        'Movement-Object': '#F60D00',
        'Movement-Sound': '#FF4949',
        'Movement-Light': '#CA0B0B',
        'Movement-Smell': '#B60000',
        'Movement-ALT': '#960000',
        'Direction': '#92FFBD',
        'Direction-ALT': '#75CC96',
        'Positioning': '#DB8300',
        'Positioning-ALT': '#B56A01',
        'Dimensioning-Distance': '#8AB6AD',
        'Dimensioning-Size': '#7CD3C0',
        'Dimensioning-Amount': '#7EF5D9',
        'Dimensioning-ALT': '#60847B'
    },
    width=1400,
    height=800
)

figure.update_layout(font={'size': 18})

figure.show()

In [9]:
# for annotation data limited to the token selection (at most 1000 tokens per chapter)
# you may manipulate and save the plotly diagram as png image file by using the interface of the html output

# x axis label of each chapter -> (corpus folder, tsv file) under which its statistics are stored in `results`
chapter_sources: dict = {
    'DEU19_030': ('canspin-deu-19', 'CANSpiN-deu-19_030_1-1-1.tsv'),
    'DEU19_001': ('canspin-deu-19', 'CANSpiN-deu-19_001_1-1-1.tsv'),
    'DEU20_002': ('canspin-deu-20', 'CANSpiN-deu-20_002_1_shuffled.tsv'),
    'DEU20_021': ('canspin-deu-20', 'CANSpiN-deu-20_021_1_shuffled.tsv'),
    'SPA19_001': ('canspin-spa-19', 'CANSpiN-spa-19_001_1.tsv'),
    'SPA19_008': ('canspin-spa-19', 'CANSpiN-spa-19_008_1.tsv'),
    'LAT19_004': ('canspin-lat-19', 'CANSpiN-lat-19_004_1.tsv'),
    'LAT19_041': ('canspin-lat-19', 'CANSpiN-lat-19_041_1.tsv')
}

# CS1 classes in the order in which their rows are added to the data frame
annotation_classes: list = [
    'Place-Container',
    'Place-Container-MC',
    'Place-Object',
    'Place-Object-MC',
    'Place-Abstract',
    'Place-Abstract-MC',
    'Place-ALT',
    'Movement-Subject',
    'Movement-Object',
    'Movement-Light',
    'Movement-Sound',
    'Movement-Smell',
    'Movement-ALT',
    'Dimensioning-Size',
    'Dimensioning-Distance',
    'Dimensioning-Amount',
    'Dimensioning-ALT',
    'Positioning',
    'Positioning-ALT',
    'Direction',
    'Direction-ALT'
]

# one row per (chapter, class) pair; filling the three columns in a single pass keeps them
# aligned without hard-coded repetition counts
data_dict: dict = {'text': [], 'annotation_class': [], 'amount': []}
for chapter_label, (corpus_name, tsv_name) in chapter_sources.items():
    class_ratios: dict = results['first_1000_token']['ratios']['cs1'][corpus_name][tsv_name]['annotations_by_class_in_file:total_token_amount_in_file']
    for annotation_class in annotation_classes:
        data_dict['text'].append(chapter_label)
        data_dict['annotation_class'].append(annotation_class)
        # a class missing from the chapter's statistics counts as 0; the ratio is scaled to percent
        data_dict['amount'].append(class_ratios.get(annotation_class, 0) * 100)

data: pd.DataFrame = pd.DataFrame(data_dict)

figure: plotly.graph_objects.Figure = plotly.express.bar(
    data_frame=data,
    x='text',
    y='amount',
    color='annotation_class',
    labels={
        "text": "texts",
        "amount": "annotation amount (in %)",
        "annotation_class": "annotation classes"
    },
    # the token amount in the title is derived from the borders of the token selection
    title=f'CS1 annotation amounts inside the initial chapters with {token_selection[1] - token_selection[0]} tokens <br><sub>(in percentage of the selected token amount of the respective chapter)</sub>',
    # fixed colour per class: blue tones for Place, red tones for Movement, etc.
    color_discrete_map={
        'Place-Container': '#B6D3FF',
        'Place-Container-MC': '#CCDEFF',
        'Place-Object': '#D4EAFF',
        'Place-Object-MC': '#E6F2FF',
        'Place-Abstract': '#89A8F6',
        'Place-Abstract-MC': '#98C3FA',
        'Place-ALT': '#90A6C7',
        'Movement-Subject': '#FF6D6D',
        'Movement-Object': '#F60D00',
        'Movement-Sound': '#FF4949',
        'Movement-Light': '#CA0B0B',
        'Movement-Smell': '#B60000',
        'Movement-ALT': '#960000',
        'Direction': '#92FFBD',
        'Direction-ALT': '#75CC96',
        'Positioning': '#DB8300',
        'Positioning-ALT': '#B56A01',
        'Dimensioning-Distance': '#8AB6AD',
        'Dimensioning-Size': '#7CD3C0',
        'Dimensioning-Amount': '#7EF5D9',
        'Dimensioning-ALT': '#60847B'
    },
    width=1400,
    height=800
)

figure.update_layout(font={'size': 18})

figure.show()