In [1]:
import pandas as pd
import numpy as np
import difflib
import random

In [2]:
# Load the dataset into the DataFrame that every later cell works on.
df = pd.read_json(path_or_buf="../data/df.json")

## Tokens

In [3]:
# Per-row count of whitespace-separated tokens in the 'text' column.
df['token_count'] = df['text'].map(lambda text: len(text.split()))

# Total over all rows.
total_token_count = df['token_count'].sum()
print(total_token_count)

2808918


# Class imbalance

## Dating

In [4]:
# Number of rows per 'decade' label (value_counts sorts most frequent first).
class_counts = df['decade'].value_counts()

# Size-based summary statistics.
total_samples = class_counts.sum()
num_classes = class_counts.count()
most_frequent_class = class_counts.max()
least_frequent_class = class_counts.min()
imbalance_ratio = most_frequent_class / least_frequent_class

# Distribution-based statistics, both derived from the relative class frequencies.
class_probabilities = class_counts / total_samples
# NOTE: 1 - sum(p_i^2) is the Gini impurity (Gini-Simpson index); the printed
# label below calls it "Gini coefficient".
gini_coefficient = 1 - sum(class_probabilities ** 2)
# Shannon entropy in bits (log base 2).
entropy = -sum(class_probabilities * np.log2(class_probabilities))

print(f"Class counts:\n{class_counts}\n")
for summary_line in (
    f"Most frequent class count: {most_frequent_class}",
    f"Least frequent class count: {least_frequent_class}",
    f"Imbalance ratio: {imbalance_ratio:.2f}",
    f"Total number of samples: {total_samples}",
    f"Total number of classes: {num_classes}",
    f"Gini coefficient: {gini_coefficient:.2f}",
    f"Entropy: {entropy:.2f}",
):
    print(summary_line)

Class counts:
decade
1370    965
1350    897
1340    893
1360    854
1330    716
1380    685
1320    595
1310    533
1300    457
1290    302
1390    252
1280     97
1270     25
1260      6
1250      4
1220      3
1230      1
1130      1
Name: count, dtype: int64

Most frequent class count: 965
Least frequent class count: 1
Imbalance ratio: 965.00
Total number of samples: 7286
Total number of classes: 18
Gini coefficient: 0.90
Entropy: 3.46


In [5]:
# Median and interquartile range of the per-class sample counts
# (class_counts comes from the preceding cell).
q25 = class_counts.quantile(0.25)
q75 = class_counts.quantile(0.75)
median_count = class_counts.median()
iqr = q75 - q25

print(f"Median class count: {median_count}", f"IQR: {iqr}", sep="\n")

Median class count: 379.5
IQR: 697.5


In [6]:
# Keep only the rows whose decade has at least 10 samples, then recount the
# classes that survive the cut.
decade_rows_kept = df.groupby('decade').filter(lambda group: len(group) >= 10)
decade_rows_kept['decade'].value_counts()

decade
1370    965
1350    897
1340    893
1360    854
1330    716
1380    685
1320    595
1310    533
1300    457
1290    302
1390    252
1280     97
1270     25
Name: count, dtype: int64

## Locating

In [7]:
# Number of rows per 'supercuration_name' label (value_counts sorts most frequent first).
# This rebinds class_counts and the statistics computed for the 'decade' column.
class_counts = df['supercuration_name'].value_counts()

# Size-based summary statistics.
total_samples = class_counts.sum()
num_classes = class_counts.count()
most_frequent_class = class_counts.max()
least_frequent_class = class_counts.min()
imbalance_ratio = most_frequent_class / least_frequent_class

# Distribution-based statistics, both derived from the relative class frequencies.
class_probabilities = class_counts / total_samples
# NOTE: 1 - sum(p_i^2) is the Gini impurity (Gini-Simpson index); the printed
# label below calls it "Gini coefficient".
gini_coefficient = 1 - sum(class_probabilities ** 2)
# Shannon entropy in bits (log base 2).
entropy = -sum(class_probabilities * np.log2(class_probabilities))

print(f"Class counts:\n{class_counts}\n")
for summary_line in (
    f"Most frequent class count: {most_frequent_class}",
    f"Least frequent class count: {least_frequent_class}",
    f"Imbalance ratio: {imbalance_ratio:.2f}",
    f"Total number of samples: {total_samples}",
    f"Total number of classes: {num_classes}",
    f"Gini coefficient: {gini_coefficient:.2f}",
    f"Entropy: {entropy:.2f}",
):
    print(summary_line)

Class counts:
supercuration_name
COLLECTIONS    4129
AT-OOeLA        356
AT-StiASF       321
CH-StiASG       247
AT-StiAH        238
DE-BayHStA      233
AT-StiAScho     208
AT-StiAHe       192
AT-HHStA        177
AT-StiASei      151
AT-StiAL        150
CH-StaASG       150
AT-StiAW        131
AT-StiAR        120
AT-StiAKr       102
AT-StiAA         94
AT-StiAG         90
AT-StiASch       75
AT-StaABdW       56
AT-StiAK         36
DE-AKR           12
SI-PAM            6
AT-StiAMh         6
DE-AEK            3
AT-StiAGe         2
AT-StiAM          1
Name: count, dtype: int64

Most frequent class count: 4129
Least frequent class count: 1
Imbalance ratio: 4129.00
Total number of samples: 7286
Total number of classes: 26
Gini coefficient: 0.67
Entropy: 2.77


In [8]:
# Median and interquartile range of the per-class sample counts
# (class_counts comes from the preceding cell).
q25 = class_counts.quantile(0.25)
q75 = class_counts.quantile(0.75)
median_count = class_counts.median()
iqr = q75 - q25

print(f"Median class count: {median_count}", f"IQR: {iqr}", sep="\n")

Median class count: 125.5
IQR: 163.0


In [9]:
# Keep only the rows whose supercuration_name has at least 10 samples, then
# recount the classes that survive the cut.
supercuration_rows_kept = df.groupby('supercuration_name').filter(
    lambda group: len(group) >= 10
)
supercuration_rows_kept['supercuration_name'].value_counts()

supercuration_name
COLLECTIONS    4129
AT-OOeLA        356
AT-StiASF       321
CH-StiASG       247
AT-StiAH        238
DE-BayHStA      233
AT-StiAScho     208
AT-StiAHe       192
AT-HHStA        177
AT-StiASei      151
AT-StiAL        150
CH-StaASG       150
AT-StiAW        131
AT-StiAR        120
AT-StiAKr       102
AT-StiAA         94
AT-StiAG         90
AT-StiASch       75
AT-StaABdW       56
AT-StiAK         36
DE-AKR           12
Name: count, dtype: int64

# Deltas

In [10]:
# Flag the rows whose 'text' is exactly equal to 'text_normalized'.
df["same_text"] = df["text"].eq(df["text_normalized"])

print(len(df))                # total number of rows
print(df["same_text"].sum())  # rows where both columns hold the same string

7286
1224


In [11]:
def get_deltas(original, normalized, autojunk=True):
    """Describe how ``original`` is turned into ``normalized`` as a list of edit operations.

    Both inputs are compared element by element (character by character for
    strings) with ``difflib.SequenceMatcher``.

    Parameters
    ----------
    original, normalized : sliceable sequences (e.g. ``str``)
        The sequence before and after the change.
    autojunk : bool, default True
        Forwarded to ``difflib.SequenceMatcher``. The default keeps difflib's
        own default. With it enabled, difflib applies a "popular element"
        heuristic to sequences of 200 or more items, which can merge many
        small edits in long texts into one large ``'replace'`` span. Pass
        ``False`` for finer-grained diffs; this can be noticeably slower on
        long inputs.

    Returns
    -------
    list of tuple
        One ``(tag, original_slice, normalized_slice)`` triple per opcode, in
        sequence order. ``tag`` is one of ``'replace'``, ``'delete'``,
        ``'insert'`` or ``'equal'``. Two empty inputs give an empty list.
    """
    matcher = difflib.SequenceMatcher(None, original, normalized, autojunk=autojunk)
    return [
        (tag, original[i1:i2], normalized[j1:j2])
        for tag, i1, i2, j1, j2 in matcher.get_opcodes()
    ]

# Diff each row's 'text' against its 'text_normalized'; every cell of the new
# 'deltas' column holds the list returned by get_deltas for that row.
df["deltas"] = df[["text", "text_normalized"]].apply(
    lambda pair: get_deltas(*pair), axis=1
)

In [12]:
# Tally how often each opcode tag occurs across all rows of the 'deltas' column.
differences_dict = dict.fromkeys(("replace", "delete", "insert", "equal"), 0)

for row_deltas in df["deltas"]:
    for tag, _original_part, _normalized_part in row_deltas:
        differences_dict[tag] += 1

differences_dict

{'replace': 68112, 'delete': 15217, 'insert': 115, 'equal': 90587}

In [13]:
n = 10  # number of example diffs to print per opcode type
# Dedicated, seeded generator: the printed examples are the same on every run
# and the global `random` state is left untouched.
rng = random.Random(42)

for difference_type in differences_dict.keys():
    # Collect every delta of the current type across all rows. The variable
    # names say "replacements", but the list holds whichever tag is being processed.
    all_replacements = [
        delta
        for deltas in df["deltas"]
        for delta in deltas
        if delta[0] == difference_type
    ]

    # Sample exactly n items (the original hard-coded 10 here and ignored n).
    # With fewer than n candidates, show them all in their original order.
    if len(all_replacements) >= n:
        random_replacements = rng.sample(all_replacements, n)
    else:
        random_replacements = all_replacements

    print(f"Random diffs for type '{difference_type}':")
    for replacement in random_replacements:
        print(replacement)

Random diffs for type 'replace':
('replace', 'ü', 'u')
('replace', 'úr vns vnd vnserv´ geswistergide, daz ietz genande gůt zevertegende, alle die wile wir vnd vnserv´ geswistergide ze vnsern tagen nicht sint comen, vnd daz wir daz selbe tů', 'ur vns vnd vnserv geswistergide, daz ietz genande gut zevertegende, alle die wile wir vnd vnserv geswistergide ze vnsern tagen nicht sint comen, vnd daz wir daz selbe tu')
('replace', 'ú', 'u')
('replace', 'ů', 'u')
('replace', 'ů', 'u')
('replace', 'æut mæ', 'aeut mae')
('replace', '|| wir angesehen haben die núclichen willigen dienst, so vns vnser lieber ohan der wolerboren ||', 'wir angesehen haben die nuclichen willigen dienst, so vns vnser lieber ohan der wolerboren')
('replace', 'ů', 'u')
('replace', '´ns als komenlich vnd fůgklich ist, mit v´', 'ns als komenlich vnd fugklich ist, mit v')
('replace', 'ů', 'u')
Random diffs for type 'delete':
('delete', '(ich)', '')
('delete', ' ', '')
('delete', '´', '')
('delete', '|| ', '')
('delete', '||'