# Calculate kappa score

In [13]:
import json
from sklearn.metrics import cohen_kappa_score
from collections import defaultdict, Counter
import numpy as np

def _first_choice_label(annotation):
    """
    Return the first selected choice of the annotation's first 'choices' result.

    Returns None when the annotation has no 'choices' result or when that
    result carries an empty choice list.
    """
    for region in annotation.get('result') or []:
        if region.get('type') == 'choices':
            choices = (region.get('value') or {}).get('choices') or []
            # Only the first 'choices' region is considered (one choice field
            # per annotation is assumed).
            return choices[0] if choices else None
    return None


def extract_labels_from_file(file_path):
    """
    Extract the labels of the first two annotations of every item in a
    Label Studio annotation file.

    Args:
        file_path: Path to a UTF-8 JSON file holding a list of items. Each
            item must have an 'id'; its 'annotations' list (if any) is read
            in order.

    Returns:
        A tuple ``(labels_1, labels_2)`` of dictionaries mapping item IDs to
        the first choice label of the item's first and second annotation,
        respectively. An item appears in a dictionary only if the
        corresponding annotation exists and contains a non-empty 'choices'
        result, so the two dictionaries may have different key sets.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If an item has no 'id'.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    labels_1 = {}
    labels_2 = {}

    for item in data:
        item_id = item['id']
        annotations = item.get('annotations') or []

        # zip() stops at the shorter sequence, so items with zero or one
        # annotation are handled without indexing past the end of the list,
        # and each dictionary is filled from its OWN annotation.
        for labels, annotation in zip((labels_1, labels_2), annotations):
            label = _first_choice_label(annotation)
            if label is not None:
                labels[item_id] = label

    return labels_1, labels_2

def calculate_kappa(file1_path):
    """
    Compute Cohen's Kappa between the two label sets read from one file.

    ``extract_labels_from_file(file1_path)`` supplies two dictionaries keyed
    by item ID; only IDs present in both dictionaries contribute to the score.

    Returns:
        The value of ``cohen_kappa_score`` over the paired labels. If that
        call raises, a warning is printed and kappa is computed by hand
        (``np.nan`` when the expected agreement is not below 1).

    Raises:
        ValueError: If the two dictionaries share no item ID.
    """
    first_labels, second_labels = extract_labels_from_file(file1_path)

    # Items labelled in both dictionaries
    shared_ids = set(first_labels) & set(second_labels)
    if len(shared_ids) == 0:
        raise ValueError("No common items found between the two files")

    # Paired label sequences; both comprehensions walk the same set, so the
    # positions line up.
    y1 = [first_labels[item_id] for item_id in shared_ids]
    y2 = [second_labels[item_id] for item_id in shared_ids]

    # Passing the full label vocabulary explicitly covers the single-label case
    all_labels = sorted(set(y1) | set(y2))

    try:
        kappa = cohen_kappa_score(y1, y2, labels=all_labels)
    except Exception as e:
        print(f"Warning: Could not calculate kappa score properly: {str(e)}")
        # Manual fallback: observed vs. chance agreement
        n_items = len(y1)
        agreement = sum(a == b for a, b in zip(y1, y2)) / n_items
        expected_agreement = sum(
            (y1.count(l) / n_items) * (y2.count(l) / n_items) for l in all_labels
        )
        if expected_agreement < 1:
            kappa = (agreement - expected_agreement) / (1 - expected_agreement)
        else:
            kappa = np.nan

    return kappa



In [None]:

# Example usage
if __name__ == "__main__":
    # Annotation export to score; the path is hard-coded for this notebook run.
    file1 = "/content/output_with_lable.json"

    try:
        kappa_score = calculate_kappa(file1)
        print(f"Cohen's Kappa Score: {kappa_score:.3f}")
    except Exception as e:
        # Any failure (e.g. unreadable file, no shared items) is reported as a
        # single line rather than a traceback.
        print(f"Error: {str(e)}")


Cohen's Kappa Score: 0.947


# Computing statistical parameters

loading the data set

In [14]:
import json
from collections import defaultdict

# Read the dataset from its JSON file
with open('/content/augmented-data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Downstream cells iterate over records, so a lone top-level object is
# wrapped in a one-element list.
data = [data] if isinstance(data, dict) else data


Compute the total number of words

In [15]:
# 2. Total number of words
def count_words(value):
    """
    Count whitespace-separated words in every string nested inside ``value``.

    A string contributes ``len(value.split())``; lists and dicts are walked
    recursively (dict keys are ignored, only values are counted). Any other
    type contributes 0.
    """
    if isinstance(value, str):
        return len(value.split())

    if isinstance(value, list):
        children = value
    elif isinstance(value, dict):
        children = value.values()
    else:
        return 0

    total = 0
    for child in children:
        total += count_words(child)
    return total

calculate average string length of each field

In [16]:
# 3. Average string length for each field
def get_string_lengths(value):
    """
    Collect the character length of every string nested inside ``value``.

    A string yields a one-element list with its ``len``; lists and dicts are
    flattened recursively in iteration order (dict keys are ignored). Any
    other type yields an empty list.
    """
    if isinstance(value, str):
        return [len(value)]

    if isinstance(value, list):
        children = value
    elif isinstance(value, dict):
        children = value.values()
    else:
        return []

    return [length for child in children for length in get_string_lengths(child)]


Compute all parameters

In [17]:
# Gather, per top-level field, the lengths of all strings found under it.
string_lengths_by_field = defaultdict(list)
for obj in data:
    for key, value in obj.items():
        string_lengths_by_field[key] += get_string_lengths(value)

# Mean string length per field; a field with no strings at all averages to 0.
average_lengths_by_field = {}
for key, lengths in string_lengths_by_field.items():
    if lengths:
        average_lengths_by_field[key] = sum(lengths) / len(lengths)
    else:
        average_lengths_by_field[key] = 0

record_count = len(data)

# Words are counted over every field of every record.
total_word_count = 0
for obj in data:
    total_word_count += count_words(obj)

# Report: record count, total word count, then the per-field averages.
print("تعداد رکوردها:", record_count)
print("تعداد کل کلمات:", total_word_count)
print("میانگین طول رشته برای هر ویژگی:")
for key, avg_len in average_lengths_by_field.items():
    # These fields are left out of the printed report.
    if key in ["sex", "birth", "death", "image"]:
        continue
    print(f"  {key}: {avg_len:.2f}")


تعداد رکوردها: 983
تعداد کل کلمات: 32403
میانگین طول رشته برای هر ویژگی:
  name: 15.21
  nick-names: 12.24
  era: 7.31
  occupation: 14.35
  works: 28.82
  events: 38.20
