In [10]:
import pandas as pd
import numpy as np
import difflib
import random

In [2]:
df = pd.read_json("../data/df.json")

## Tokens

In [None]:
# Per-row token count: split each text on whitespace and count the pieces.
df['token_count'] = df['text'].map(str.split).map(len)

# Corpus-wide total of whitespace-delimited tokens.
total_token_count = df['token_count'].sum()
print(total_token_count)

# Class imbalance

## Dating

In [None]:
# Number of samples per decade label.
class_counts = df['decade'].value_counts()

most_frequent_class = class_counts.max()
least_frequent_class = class_counts.min()
# Size of the largest class relative to the smallest; 1.0 would mean perfectly balanced.
imbalance_ratio = most_frequent_class / least_frequent_class

total_samples = class_counts.sum()
num_classes = class_counts.count()

print(f"Class counts:\n{class_counts}\n")
print(f"Most frequent class count: {most_frequent_class}")
print(f"Least frequent class count: {least_frequent_class}")
print(f"Imbalance ratio: {imbalance_ratio:.2f}")
print(f"Total number of samples: {total_samples}")
print(f"Total number of classes: {num_classes}")

# Empirical class distribution, computed once and shared by both measures below.
class_probabilities = class_counts / total_samples

# NOTE: 1 - sum(p_i^2) is the Gini *impurity* (Gini-Simpson index), not the
# Lorenz-curve Gini coefficient. The variable name and printed label are kept
# unchanged so existing references and outputs stay comparable.
# Series.sum() is vectorised, unlike the builtin sum() over a Series.
gini_coefficient = 1 - (class_probabilities ** 2).sum()
print(f"Gini coefficient: {gini_coefficient:.2f}")

# Shannon entropy in bits. Series.sum() skips NaN, so a zero-probability class
# (0 * log2(0)) contributes nothing instead of turning the result into NaN.
entropy = -(class_probabilities * np.log2(class_probabilities)).sum()
print(f"Entropy: {entropy:.2f}")

In [None]:
# Robust summary of the class sizes held in `class_counts`: the median and the
# interquartile range (distance between the 25th and 75th percentiles).
q25, q75 = class_counts.quantile([0.25, 0.75])
median_count = class_counts.median()
iqr = q75 - q25

print(f"Median class count: {median_count}")
print(f"IQR: {iqr}")

In [None]:
# Sample counts for the decades that have at least `min_class_size` rows.
# A boolean mask on value_counts() gives the same counts as
# groupby().filter(...) without calling a Python lambda per group or copying
# every surviving row of the frame.
min_class_size = 10
decade_counts = df['decade'].value_counts()
decade_counts[decade_counts >= min_class_size]

## Locating

In [None]:
# Number of samples per supercuration_name label.
class_counts = df['supercuration_name'].value_counts()

most_frequent_class = class_counts.max()
least_frequent_class = class_counts.min()
# Size of the largest class relative to the smallest; 1.0 would mean perfectly balanced.
imbalance_ratio = most_frequent_class / least_frequent_class

total_samples = class_counts.sum()
num_classes = class_counts.count()

print(f"Class counts:\n{class_counts}\n")
print(f"Most frequent class count: {most_frequent_class}")
print(f"Least frequent class count: {least_frequent_class}")
print(f"Imbalance ratio: {imbalance_ratio:.2f}")
print(f"Total number of samples: {total_samples}")
print(f"Total number of classes: {num_classes}")

# Empirical class distribution, computed once and shared by both measures below.
class_probabilities = class_counts / total_samples

# NOTE: 1 - sum(p_i^2) is the Gini *impurity* (Gini-Simpson index), not the
# Lorenz-curve Gini coefficient. The variable name and printed label are kept
# unchanged so existing references and outputs stay comparable.
# Series.sum() is vectorised, unlike the builtin sum() over a Series.
gini_coefficient = 1 - (class_probabilities ** 2).sum()
print(f"Gini coefficient: {gini_coefficient:.2f}")

# Shannon entropy in bits. Series.sum() skips NaN, so a zero-probability class
# (0 * log2(0)) contributes nothing instead of turning the result into NaN.
entropy = -(class_probabilities * np.log2(class_probabilities)).sum()
print(f"Entropy: {entropy:.2f}")

In [None]:
# Robust summary of the class sizes held in `class_counts`: the median and the
# interquartile range (distance between the 25th and 75th percentiles).
q25, q75 = class_counts.quantile([0.25, 0.75])
median_count = class_counts.median()
iqr = q75 - q25

print(f"Median class count: {median_count}")
print(f"IQR: {iqr}")

In [None]:
# Sample counts for the supercuration names that have at least
# `min_class_size` rows. A boolean mask on value_counts() gives the same counts
# as groupby().filter(...) without calling a Python lambda per group or copying
# every surviving row of the frame.
min_class_size = 10
supercuration_counts = df['supercuration_name'].value_counts()
supercuration_counts[supercuration_counts >= min_class_size]

# Deltas

In [None]:
# Flag rows whose normalized text is exactly equal to the raw text.
df["same_text"] = df["text"] == df["text_normalized"]

# Total number of rows, followed by the number of unchanged rows.
print(len(df))
# Summing a boolean column counts its True values directly; no `== True`
# comparison and no filtered copy of the frame are needed.
print(df["same_text"].sum())

In [29]:
def get_deltas(original, normalized, autojunk=False):
    """Return the edit operations that turn ``original`` into ``normalized``.

    Args:
        original: Source sequence; sliced with the opcode bounds for side "a".
        normalized: Target sequence; sliced with the opcode bounds for side "b".
        autojunk: Forwarded to ``difflib.SequenceMatcher``. Disabled by default
            because the heuristic ignores any element that makes up more than
            about 1% of a target of 200+ items, which for character-level
            diffs of longer texts merges small edits into oversized
            ``replace`` blocks. Pass ``True`` to get difflib's default
            (faster, less precise) behaviour.

    Returns:
        A list of ``(tag, original_span, normalized_span)`` tuples, one per
        opcode and in opcode order. ``tag`` is one of ``'replace'``,
        ``'delete'``, ``'insert'`` or ``'equal'``.
    """
    matcher = difflib.SequenceMatcher(None, original, normalized, autojunk=autojunk)
    return [
        (tag, original[i1:i2], normalized[j1:j2])
        for tag, i1, i2, j1, j2 in matcher.get_opcodes()
    ]

# One list of deltas per row, comparing that row's text with its normalized
# form; the result is aligned on the frame's own index.
df["deltas"] = pd.Series(
    [
        get_deltas(raw_text, normalized_text)
        for raw_text, normalized_text in zip(df["text"], df["text_normalized"])
    ],
    index=df.index,
    dtype=object,
)

In [None]:
# Tally how often each opcode tag occurs across every row's delta list.
differences_dict = dict.fromkeys(("replace", "delete", "insert", "equal"), 0)

for row_deltas in df["deltas"]:
    for delta in row_deltas:
        # The first element of each delta tuple is its opcode tag.
        differences_dict[delta[0]] += 1

differences_dict

In [None]:
# Print up to `n` randomly chosen example deltas for every opcode type.
n = 10
for difference_type in differences_dict.keys():
    # Collect every delta of this type across all rows.
    all_replacements = [
        diff
        for deltas in df["deltas"]
        for diff in deltas
        if diff[0] == difference_type
    ]

    if len(all_replacements) >= n:
        # The sample size follows `n`, so changing `n` changes the number of
        # examples shown as well as the threshold.
        random_replacements = random.sample(all_replacements, n)
    else:
        # Fewer than `n` examples exist: show them all, in their original order.
        random_replacements = all_replacements

    print(f"Random diffs for type '{difference_type}':")
    for replacement in random_replacements:
        print(replacement)