In [1]:
# imports
import itertools
import json
import os
from typing import Tuple

import pandas as pd
import plotly
import plotly.express
import plotly.graph_objects

from gitma_canspin.canspin import AnnotationAnalyzer

In [2]:
# create the analyzer for the selected CATMA project located in the CATMA project folder
analyzer = AnnotationAnalyzer(
    init_settings={
        'project_name': 'CATMA_4AA4ADC0-4C28-54F9-B6A1-5DCEFF34B90B_DH2025_CANSpiN',
    },
)

gitma_canspin.project - INFO - Loading tagsets ...
gitma_canspin.project - INFO - 	Found 3 tagset(s).
gitma_canspin.project - INFO - Loading documents ...
gitma_canspin.project - INFO - 	Found 6 document(s).
gitma_canspin.project - INFO - Loading annotation collections ...
gitma_canspin.project - INFO - 	Found 6 annotation collection(s).
gitma_canspin.project - INFO - 	Annotation collection "Collection CS1 v1.1.0 - Gold Standard" for document "El pozo del Yocci"
gitma_canspin.project - INFO - 		Annotations: 201
gitma_canspin.project - INFO - 	Annotation collection "CS1 v1.1.0 - Nils (Gold: 1)" for document "El Señor de Bembibre"
gitma_canspin.project - INFO - 		Annotations: 285
gitma_canspin.project - INFO - 	Annotation collection "CS1 v1.1.0 - Ulrike (Gold standard: 1)" for document "CANSpiN-spa-19-008"
gitma_canspin.project - INFO - 		Annotations: 1332
gitma_canspin.project - INFO - 	Annotation collection "Nils -- CS1 1.1.0 (Gold: 1-1-1)" for document "DEU-19_001"
gitma_canspin.proje

In [3]:
# display an overview of the tsv annotation files loaded from the corpus folders, listed per annotation schema
analyzer.print_tsv_annotations_overview()

gitma_canspin.canspin - INFO - tsv files found in canspin project!



overview:
- schema "cs1"
	CANSpiN-deu-19_001_1-1-1.tsv
	CANSpiN-deu-19_030_1-1-1.tsv
	CANSpiN-deu-20_002_1.tsv
	CANSpiN-deu-20_021_1.tsv
	CANSpiN-lat-19_004_1.tsv
	CANSpiN-lat-19_041_1.tsv
	CANSpiN-spa-19_001_1.tsv
	CANSpiN-spa-19_008_1.tsv


In [4]:
# get the annotation statistics from the tsv files (for the entire chapters and for the first 1000 tokens respectively)
# and translate the class names into English

# TODO: verify the translation of schema totals by class: German and Spanish class names translate to the same
#       English keys, and if the Spanish results overwrite the German ones instead of being merged with them,
#       the TOTAL by-class dictionaries referring to the cs1 schema are wrong

# token range (start, end) used for the statistics that are restricted to the beginning of each chapter
token_selection: Tuple[int, int] = (0, 1000)

# statistics for the complete chapters (default settings) and for the selected token range (explicit settings)
results: dict = {
    'whole_chapters': analyzer.get_corpus_annotation_statistics(),
    'first_1000_token': analyzer.get_corpus_annotation_statistics(
        {
            # every available calculation is switched on
            'calculations': dict.fromkeys(
                (
                    'amount_of_annotations',
                    'amount_of_annotations_by_class',
                    'amount_of_token',
                    'amount_of_annotated_token',
                    'amount_of_annotated_token_by_class',
                    'ratios',
                    'word_lists_by_class',
                ),
                True,
            ),
            'custom_grouping': None,
            'text_borders': token_selection,
        }
    ),
}

def translate_dict(input: dict, translation: dict) -> dict:
    """Return a copy of ``input`` whose keys are renamed according to ``translation``.

    Nested dictionaries are translated recursively. Keys without an entry in
    ``translation`` are kept unchanged. The input dictionary is not modified.

    When several keys of one dictionary end up with the same translated key
    (e.g. a German and a Spanish class name that share one English name), their
    values are merged instead of the later one silently replacing the earlier one:
    numbers are added, lists are concatenated and dictionaries are merged key by
    key with the same rules. Values of any other (or mixed) type cannot be merged;
    for those the later value wins, as before.

    NOTE(review): adding up colliding ratio values is only meaningful if they
    share the same denominator - confirm for the ratio dictionaries.

    Args:
        input: the (possibly nested) dictionary to translate.
        translation: mapping from original key to translated key.

    Returns:
        A new dictionary with translated keys, in order of first appearance.
    """
    def _merge(first, second):
        # combine two values that were stored under the same translated key
        if isinstance(first, dict) and isinstance(second, dict):
            merged: dict = dict(first)
            for key, value in second.items():
                merged[key] = _merge(merged[key], value) if key in merged else value
            return merged
        if isinstance(first, list) and isinstance(second, list):
            return first + second
        # bool is a subclass of int, but flags must not be added up
        first_is_number: bool = isinstance(first, (int, float)) and not isinstance(first, bool)
        second_is_number: bool = isinstance(second, (int, float)) and not isinstance(second, bool)
        if first_is_number and second_is_number:
            return first + second
        return second

    translated: dict = {}
    for key, value in input.items():
        translated_key = translation.get(key, key)
        translated_value = translate_dict(value, translation) if isinstance(value, dict) else value
        if translated_key in translated:
            translated_value = _merge(translated[translated_key], translated_value)
        translated[translated_key] = translated_value
    return translated

# annotation class names of the tagsets: (English, German, Spanish)
# NOTE(review): 'Dimensionamiento-Cantitad' looks like a misspelling of 'Cantidad', but the key has to match
#               the class name used in the annotation data - confirm before changing it
_class_names = [
    ('Place-Container', 'Ort-Container', 'Lugar-Contenedor'),
    ('Place-Container-MC', 'Ort-Container-BK', 'Lugar-Contenedor-CM'),
    ('Place-Object', 'Ort-Objekt', 'Lugar-Objeto'),
    ('Place-Object-MC', 'Ort-Objekt-BK', 'Lugar-Objeto-CM'),
    ('Place-Abstract', 'Ort-Abstrakt', 'Lugar-Abstracto'),
    ('Place-Abstract-MC', 'Ort-Abstrakt-BK', 'Lugar-Abstracto-CM'),
    ('Place-ALT', 'Ort-ALT', 'Lugar-ALT'),
    ('Movement-Subject', 'Bewegung-Subjekt', 'Movimiento-Sujeto'),
    ('Movement-Object', 'Bewegung-Objekt', 'Movimiento-Objeto'),
    ('Movement-Light', 'Bewegung-Licht', 'Movimiento-Luz'),
    ('Movement-Sound', 'Bewegung-Schall', 'Movimiento-Sonido'),
    ('Movement-Smell', 'Bewegung-Geruch', 'Movimiento-Olfato'),
    ('Movement-ALT', 'Bewegung-ALT', 'Movimiento-ALT'),
    ('Dimensioning-Size', 'Dimensionierung-Groesse', 'Dimensionamiento-Tamaño'),
    ('Dimensioning-Distance', 'Dimensionierung-Abstand', 'Dimensionamiento-Distancia'),
    ('Dimensioning-Amount', 'Dimensionierung-Menge', 'Dimensionamiento-Cantitad'),
    ('Dimensioning-ALT', 'Dimensionierung-ALT', 'Dimensionamiento-ALT'),
    ('Positioning', 'Positionierung', 'Posicionamiento'),
    ('Positioning-ALT', 'Positionierung-ALT', 'Posicionamiento-ALT'),
    ('Direction', 'Richtung', 'Dirección'),
    ('Direction-ALT', 'Richtung-ALT', 'Dirección-ALT'),
]

# lookup from German / Spanish class name to the English class name (German entries first, then Spanish)
key_translation = {
    **{german: english for english, german, _ in _class_names},
    **{spanish: english for english, _, spanish in _class_names},
}

# replace each statistics dictionary by its translated counterpart (the set of keys of `results` stays the same)
for result_type, statistics in results.items():
    results[result_type] = translate_dict(
        input=statistics,
        translation=key_translation
    )


In [5]:
# print every statistics dictionary as indented JSON (keys in their original order, non-ASCII characters unescaped)

for statistics in results.values():
    statistics_json: str = json.dumps(statistics, indent=2, sort_keys=False, ensure_ascii=False)
    print(statistics_json)


{
  "amount_of_annotations": {
    "cs1": {
      "canspin-spa-19": {
        "CANSpiN-spa-19_001_1.tsv": 269,
        "CANSpiN-spa-19_008_1.tsv": 728,
        "TOTAL": 997
      },
      "canspin-lat-19": {
        "CANSpiN-lat-19_004_1.tsv": 128,
        "CANSpiN-lat-19_041_1.tsv": 199,
        "TOTAL": 327
      },
      "canspin-deu-20": {
        "CANSpiN-deu-20_002_1.tsv": 412,
        "CANSpiN-deu-20_021_1.tsv": 154,
        "TOTAL": 566
      },
      "canspin-deu-19": {
        "CANSpiN-deu-19_001_1-1-1.tsv": 898,
        "CANSpiN-deu-19_030_1-1-1.tsv": 1056,
        "TOTAL": 1954
      },
      "TOTAL": 3844
    }
  },
  "amount_of_annotations_by_class": {
    "cs1": {
      "canspin-spa-19": {
        "CANSpiN-spa-19_001_1.tsv": {
          "Movement-ALT": 72,
          "Movement-Subject": 37,
          "Direction": 35,
          "Place-Container": 24,
          "Movement-Sound": 20,
          "Positioning": 15,
          "Direction-ALT": 12,
          "Place-Container-MC": 

In [6]:
# save annotation statistics to files (one JSON file per result type, written to the current working directory)

for result_type in results:
    filename: str = f'annotation_statistics__{result_type}.json'
    json_file_str: str = json.dumps(results[result_type], indent=2, sort_keys=False, ensure_ascii=False)

    if os.path.isfile(filename):
        print(f'JSON file {filename} already exists and will be overwritten.')

    # ensure_ascii=False leaves non-ASCII characters unescaped in the JSON string, so the file is written
    # explicitly as UTF-8 instead of relying on the platform's default encoding
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(json_file_str)

    # report success only after the file has been written and closed
    print(f'JSON file {filename} successfully created.')

JSON file annotation_statistics__whole_chapters.json already exists and will be overwritten.
JSON file annotation_statistics__whole_chapters.json successfully created.
JSON file annotation_statistics__first_1000_token.json already exists and will be overwritten.
JSON file annotation_statistics__first_1000_token.json successfully created.


In [7]:
# create bar chart with annotation amounts of all first chapters normalized to their token amount: all tokens

# chapters in the order of their bars on the x axis: (chart label, corpus folder, tsv file name)
corpus_files: list = [
    ('DEU19_030', 'canspin-deu-19', 'CANSpiN-deu-19_030_1-1-1.tsv'),
    ('DEU19_001', 'canspin-deu-19', 'CANSpiN-deu-19_001_1-1-1.tsv'),
    ('DEU20_002', 'canspin-deu-20', 'CANSpiN-deu-20_002_1.tsv'),
    ('DEU20_021', 'canspin-deu-20', 'CANSpiN-deu-20_021_1.tsv'),
    ('SPA19_001', 'canspin-spa-19', 'CANSpiN-spa-19_001_1.tsv'),
    ('SPA19_008', 'canspin-spa-19', 'CANSpiN-spa-19_008_1.tsv'),
    ('LAT19_004', 'canspin-lat-19', 'CANSpiN-lat-19_004_1.tsv'),
    ('LAT19_041', 'canspin-lat-19', 'CANSpiN-lat-19_041_1.tsv'),
]

# annotation classes (translated names) in the order in which they are added to the data frame
annotation_classes: list = [
    'Place-Container',
    'Place-Container-MC',
    'Place-Object',
    'Place-Object-MC',
    'Place-Abstract',
    'Place-Abstract-MC',
    'Place-ALT',
    'Movement-Subject',
    'Movement-Object',
    'Movement-Light',
    'Movement-Sound',
    'Movement-Smell',
    'Movement-ALT',
    'Dimensioning-Size',
    'Dimensioning-Distance',
    'Dimensioning-Amount',
    'Dimensioning-ALT',
    'Positioning',
    'Positioning-ALT',
    'Direction',
    'Direction-ALT',
]

# per-file ratio dictionary that is plotted, and the statistics it is taken from
ratio_key: str = 'annotations_by_class_in_file:total_token_amount_in_file'
ratios_by_corpus: dict = results['whole_chapters']['ratios']['cs1']

# one row per (chapter, annotation class); all three columns are filled together, so their lengths
# cannot get out of sync when a chapter or a class is added or removed
data_dict: dict = {'text': [], 'annotation_class': [], 'amount': []}
for text_label, corpus_name, file_name in corpus_files:
    class_ratios: dict = ratios_by_corpus[corpus_name][file_name][ratio_key]
    for annotation_class in annotation_classes:
        data_dict['text'].append(text_label)
        data_dict['annotation_class'].append(annotation_class)
        # classes without an entry for this file count as 0; the ratio is converted to percent
        data_dict['amount'].append(class_ratios.get(annotation_class, 0) * 100)

data: pd.DataFrame = pd.DataFrame(data_dict)

figure: plotly.graph_objects.Figure = plotly.express.bar(
    data_frame=data,
    x='text',
    y='amount',
    color='annotation_class',
    labels={
        "text": "texts",
        "amount": "annotation amount (in %)",
        "annotation_class": "annotation classes"
    },
    title='CS1 annotation amounts inside the initial chapters with all tokens <br><sub>(in percentage of the total token amount of the respective chapter)</sub>',
    color_discrete_map={
        'Place-Container': '#B6D3FF',
        'Place-Container-MC': '#CCDEFF',
        'Place-Object': '#D4EAFF',
        'Place-Object-MC': '#E6F2FF',
        'Place-Abstract': '#89A8F6',
        'Place-Abstract-MC': '#98C3FA',
        'Place-ALT': '#90A6C7',
        'Movement-Subject': '#FF6D6D',
        'Movement-Object': '#F60D00',
        'Movement-Sound': '#FF4949',
        'Movement-Light': '#CA0B0B',
        'Movement-Smell': '#B60000',
        'Movement-ALT': '#960000',
        'Direction': '#92FFBD',
        'Direction-ALT': '#75CC96',
        'Positioning': '#DB8300',
        'Positioning-ALT': '#B56A01',
        'Dimensioning-Distance': '#8AB6AD',
        'Dimensioning-Size': '#7CD3C0',
        'Dimensioning-Amount': '#7EF5D9',
        'Dimensioning-ALT': '#60847B'
    },
    width=1400,
    height=800
)

figure.update_layout(font={'size': 18})

figure.show()

In [8]:
# create bar chart with annotation amounts of all first chapters normalized to their token amount: selected tokens

# chapters in the order of their bars on the x axis: (chart label, corpus folder, tsv file name)
corpus_files: list = [
    ('DEU19_030', 'canspin-deu-19', 'CANSpiN-deu-19_030_1-1-1.tsv'),
    ('DEU19_001', 'canspin-deu-19', 'CANSpiN-deu-19_001_1-1-1.tsv'),
    ('DEU20_002', 'canspin-deu-20', 'CANSpiN-deu-20_002_1.tsv'),
    ('DEU20_021', 'canspin-deu-20', 'CANSpiN-deu-20_021_1.tsv'),
    ('SPA19_001', 'canspin-spa-19', 'CANSpiN-spa-19_001_1.tsv'),
    ('SPA19_008', 'canspin-spa-19', 'CANSpiN-spa-19_008_1.tsv'),
    ('LAT19_004', 'canspin-lat-19', 'CANSpiN-lat-19_004_1.tsv'),
    ('LAT19_041', 'canspin-lat-19', 'CANSpiN-lat-19_041_1.tsv'),
]

# annotation classes (translated names) in the order in which they are added to the data frame
annotation_classes: list = [
    'Place-Container',
    'Place-Container-MC',
    'Place-Object',
    'Place-Object-MC',
    'Place-Abstract',
    'Place-Abstract-MC',
    'Place-ALT',
    'Movement-Subject',
    'Movement-Object',
    'Movement-Light',
    'Movement-Sound',
    'Movement-Smell',
    'Movement-ALT',
    'Dimensioning-Size',
    'Dimensioning-Distance',
    'Dimensioning-Amount',
    'Dimensioning-ALT',
    'Positioning',
    'Positioning-ALT',
    'Direction',
    'Direction-ALT',
]

# per-file ratio dictionary that is plotted, and the statistics it is taken from
ratio_key: str = 'annotations_by_class_in_file:total_token_amount_in_file'
ratios_by_corpus: dict = results['first_1000_token']['ratios']['cs1']

# one row per (chapter, annotation class); all three columns are filled together, so their lengths
# cannot get out of sync when a chapter or a class is added or removed
data_dict: dict = {'text': [], 'annotation_class': [], 'amount': []}
for text_label, corpus_name, file_name in corpus_files:
    class_ratios: dict = ratios_by_corpus[corpus_name][file_name][ratio_key]
    for annotation_class in annotation_classes:
        data_dict['text'].append(text_label)
        data_dict['annotation_class'].append(annotation_class)
        # classes without an entry for this file count as 0; the ratio is converted to percent
        data_dict['amount'].append(class_ratios.get(annotation_class, 0) * 100)

data: pd.DataFrame = pd.DataFrame(data_dict)

figure: plotly.graph_objects.Figure = plotly.express.bar(
    data_frame=data,
    x='text',
    y='amount',
    color='annotation_class',
    labels={
        "text": "texts",
        "amount": "annotation amount (in %)",
        "annotation_class": "annotation classes"
    },
    title=f'CS1 annotation amounts inside the initial chapters with {token_selection[1] - token_selection[0]} tokens <br><sub>(in percentage of the selected token amount of the respective chapter)</sub>',
    color_discrete_map={
        'Place-Container': '#B6D3FF',
        'Place-Container-MC': '#CCDEFF',
        'Place-Object': '#D4EAFF',
        'Place-Object-MC': '#E6F2FF',
        'Place-Abstract': '#89A8F6',
        'Place-Abstract-MC': '#98C3FA',
        'Place-ALT': '#90A6C7',
        'Movement-Subject': '#FF6D6D',
        'Movement-Object': '#F60D00',
        'Movement-Sound': '#FF4949',
        'Movement-Light': '#CA0B0B',
        'Movement-Smell': '#B60000',
        'Movement-ALT': '#960000',
        'Direction': '#92FFBD',
        'Direction-ALT': '#75CC96',
        'Positioning': '#DB8300',
        'Positioning-ALT': '#B56A01',
        'Dimensioning-Distance': '#8AB6AD',
        'Dimensioning-Size': '#7CD3C0',
        'Dimensioning-Amount': '#7EF5D9',
        'Dimensioning-ALT': '#60847B'
    },
    width=1400,
    height=800
)

# use the same font size as the whole-chapter chart so that both figures are rendered consistently
figure.update_layout(font={'size': 18})

figure.show()