# OLID-BR (Build Dataset)

In this notebook, we will build the OLID-BR dataset from the processed data.

## Imports

In [1]:
import sys
from pathlib import Path

if str(Path(".").absolute().parent) not in sys.path:
    sys.path.append(str(Path(".").absolute().parent.parent))

In [2]:
from dotenv import load_dotenv

# Initialize the env vars
load_dotenv("../../.env")

True

In [3]:
import os
import json
import datetime
import pandas as pd
from pandas_profiling import ProfileReport
from kaggle.api.kaggle_api_extended import KaggleApi
from src.s3 import Bucket
from src.settings import AppSettings

  from pandas_profiling import ProfileReport


In [35]:
args = AppSettings()

version = "v1.1"

bucket = Bucket(args.AWS_S3_BUCKET)

bucket.get_session_from_aksk(
    args.AWS_ACCESS_KEY_ID,
    args.AWS_SECRET_ACCESS_KEY)

## Load data

In the next cells, we will load all processed data.

In [5]:
iterations = [
    {
        "data": "processed/olid-br/iterations/2/olidbr.json",
        "metadata": "processed/olid-br/iterations/2/metadata.json",
        "full_data": "processed/olid-br/iterations/2/full_olidbr.json"
    },
    {
        "data": "processed/olid-br/iterations/3/olidbr.json",
        "metadata": "processed/olid-br/iterations/3/metadata.json",
        "full_data": "processed/olid-br/iterations/3/full_olidbr.json"
    },
    {
        "data": "processed/olid-br/iterations/4/olidbr.json",
        "metadata": "processed/olid-br/iterations/4/metadata.json",
        "full_data": "processed/olid-br/iterations/4/full_olidbr.json",
        "additional_data": "processed/olid-br/iterations/4/additional_texts.json"
    }
]

In [40]:
data = []
metadata = []
full_data = []
additional_data = []

for iteration in iterations:
    if iteration.get("data"):
        print(f"Loading {iteration['data']}")
        iteration_data = bucket.download_json(key=iteration["data"])
        data.extend(iteration_data)
        print(f"Data iteration size: {len(iteration_data)}")

    if iteration.get("metadata"):
        iteration_metadata = bucket.download_json(key=iteration["metadata"])
        metadata.extend(iteration_metadata)
        print(f"Metadata iteration size: {len(iteration_metadata)}")

    if iteration.get("full_data"):
        iteration_full_data = bucket.download_json(key=iteration["full_data"])
        full_data.extend(iteration_full_data)
        print(f"Full data iteration size: {len(iteration_full_data)}")

    if iteration.get("additional_data"):
        iteration_additional_data = bucket.download_json(key=iteration["additional_data"])
        additional_data.extend(iteration_additional_data)
        print(f"Additional data iteration size: {len(iteration_additional_data)}")

    print("\n")
    
print(f"Data: {len(data)}")
print(f"Metadata: {len(metadata)}")
print(f"Full data: {len(full_data)}")
print(f"Additional data: {len(additional_data)}")

Loading processed/olid-br/iterations/2/olidbr.json
Data iteration size: 2996
Metadata iteration size: 11984
Full data iteration size: 2996


Loading processed/olid-br/iterations/3/olidbr.json
Data iteration size: 2987
Metadata iteration size: 11948
Full data iteration size: 2987


Loading processed/olid-br/iterations/4/olidbr.json
Data iteration size: 1995
Metadata iteration size: 7980
Full data iteration size: 1995
Additional data iteration size: 6478


Data: 7978
Metadata: 31912
Full data: 7978
Additional data: 6478


## Iteration 1

The first iteration has only the processed data. So, we will need to prepare it specifically.

In [41]:
iteration_1 = {
    "data": "processed/olid-br/iterations/1/olidbr.json",
    "metadata": "processed/olid-br/iterations/1/metadata.json"
}

iteration_1_data = bucket.download_json(key=iteration_1["data"])
iteration_1_metadata = bucket.download_json(key=iteration_1["metadata"])

print(f"Data iteration 1 size: {len(iteration_1_data)}")
print(f"Metadata iteration 1 size: {len(iteration_1_metadata)}")

Data iteration 1 size: 706
Metadata iteration 1 size: 1520


In [42]:
df_metadata = pd.DataFrame(iteration_1_metadata)

for i in iteration_1_data:
    item = {
        "id": i["id"],
        "text": i["text"],
        "metadata": {},
        "annotations": []
    }

    # Get the metadata
    i_metadata = df_metadata[
        df_metadata["id"] == i["id"]].to_dict(orient="records")

    for m in i_metadata:
        annotators = df_metadata[df_metadata["id"] == i["id"]]["annotator_id"].unique()
        if 1.0 in annotators:
            main_annotator = 1.0
        elif 32.0 in annotators:
            main_annotator = 32.0
        else:
            ValueError(
                f"Annotator not found for {i['id']} - Annotators: {annotators}")
        
        if not pd.isnull(m["source"]):
            for key in ["source",
                        "created_at",
                        "collected_at",
                        "toxicity_score"]:
                item["metadata"][key] = m[key]

        elif (
                not pd.isnull(m["annotator_id"])
                and m["annotator_id"] == main_annotator):
            annotations = {
                "annotator_id": m["annotator_id"],
            }

            for key in ["is_offensive",
                        "is_targeted",
                        "targeted_type",
                        "toxic_spans",
                        "health",
                        "ideology",
                        "insult",
                        "lgbtqphobia",
                        "other_lifestyle",
                        "physical_aspects",
                        "profanity_obscene",
                        "racism",
                        "religious_intolerance",
                        "sexism",
                        "xenophobia"]:
                annotations[key] = i[key]

            item["annotations"].append(annotations)

            assert len(annotations) == 16
            assert len(item["annotations"]) == 1
            assert len(item["metadata"]) == 4
            
    additional_data.append(item)

print(f"Additional data: {len(additional_data)}")

Additional data: 7184


# Add additional data when 2 annotations are equal

In this section, we will add additional data when the 2 annotations are equal.

In [43]:
# Examples with 2 annotations
items_with_2_annotations = [item for item in additional_data if len(item["annotations"]) == 2]

print(
    f"Count: {len(items_with_2_annotations)} "
    f"({len(items_with_2_annotations) / len(additional_data) * 100:.2f}%)"
)

Count: 3134 (43.62%)


In [44]:
from typing import Dict, List

def match_annotations(
    annotations: List[Dict[str, str]],
    columns: List[str]
) -> bool:
    """Check if all annotations match the given columns.
    
    Args:
    - annotations: List of annotations.
    - columns: List of columns to match.

    Returns:
    - True if all annotations match the given columns.
    """
    for column in columns:
        values = [annotation[column] for annotation in annotations]
        if len(set(values)) > 1:
            return False
    return True

In [45]:
questions = []
for item in items_with_2_annotations:
    item_questions = {
        "id": item["id"],
        "is_offensive": match_annotations(
            item["annotations"],
            ["is_offensive"]
        ),
        "is_targeted": match_annotations(
            item["annotations"],
            ["is_targeted"]
        ),
        "targeted_type": match_annotations(
            item["annotations"],
            ["targeted_type"]
        ),
        "toxicity_labels": match_annotations(
            item["annotations"],
            [
                "health",
                "ideology",
                "insult",
                "lgbtqphobia",
                "other_lifestyle",
                "physical_aspects",
                "profanity_obscene",
                "racism",
                "religious_intolerance",
                "sexism",
                "xenophobia"
            ]
        )
    }
    questions.append(item_questions)

questions = pd.DataFrame(questions)

# Add column when all questions are the same
questions["all"] = questions.drop("id", axis=1).all(axis=1)

questions.head()

Unnamed: 0,id,is_offensive,is_targeted,targeted_type,toxicity_labels,all
0,1cf4ad796f424bc3a65c1ef9a9642e6d,True,True,False,False,False
1,a15e06a6409e4c53b4aba057d5e03f38,True,True,False,False,False
2,7bda1b737c1a4e3c9f572bad82ff7611,True,False,False,True,False
3,8d2af58a56cf4894bfa88482e5dece74,False,False,False,False,False
4,d46136cdbfb349e3bbff488948a6c04d,True,True,True,True,True


In [46]:
questions.drop(columns=["id"]).sum() / len(questions) * 100

is_offensive       64.645820
is_targeted        56.924059
targeted_type      51.084876
toxicity_labels    43.044033
all                24.090619
dtype: float64

In [47]:
selected_items = [
    item for item in additional_data
    if item["id"] in questions[questions["all"] == True]["id"].values
]

print(f"Count: {len(selected_items)}")

Count: 755


In [48]:
full_data.extend(selected_items)

print(f"Count: {len(full_data)}")

Count: 8733


In [49]:
from src.labeling.assignment import all_labeled_spans

for item in selected_items:
    data.append(
        {
            "id": item["id"],
            "text": item["text"],
            "is_offensive": item["annotations"][0]["is_offensive"],
            "is_targeted": item["annotations"][0]["is_targeted"],
            "targeted_type": item["annotations"][0]["targeted_type"],
            "toxic_spans": all_labeled_spans(
                [i["toxic_spans"] for i in item["annotations"]]
            ),
            "health": item["annotations"][0]["health"],
            "ideology": item["annotations"][0]["ideology"],
            "insult": item["annotations"][0]["insult"],
            "lgbtqphobia": item["annotations"][0]["lgbtqphobia"],
            "other_lifestyle": item["annotations"][0]["other_lifestyle"],
            "physical_aspects": item["annotations"][0]["physical_aspects"],
            "profanity_obscene": item["annotations"][0]["profanity_obscene"],
            "racism": item["annotations"][0]["racism"],
            "religious_intolerance": item["annotations"][0]["religious_intolerance"],
            "sexism": item["annotations"][0]["sexism"],
            "xenophobia": item["annotations"][0]["xenophobia"]
        }
    )

print(f"Count: {len(data)}")   

Count: 8733


In [51]:
assert len(data) == len(full_data), "Data and full data should have the same length"

In [50]:
# Remove selected items from additional data
additional_data = [
    item for item in additional_data
    if item["id"] not in questions[questions["all"] == True]["id"].values
]

print(f"Count: {len(additional_data)}")

Count: 6429


## Data processing

In this section, we will perform some data processing in order to clean and fix some issues in the dataset.

### Remove duplicated entries

In [53]:
df = pd.DataFrame(data)
print(f"Duplicated text: {df['text'].duplicated().sum()}")

df.drop_duplicates(subset="text", inplace=True)
print(df.shape)

data = df.to_dict("records")

Duplicated text: 43
(8690, 17)


In [54]:
# Remove duplicated texts from full data
full_data = [i for i in full_data if i["id"] in df["id"].values]

print(f"Full data: {len(full_data)}")

Full data: 8690


In [55]:
# Remove duplicated texts from metadata
print(f"Count metadata (before): {len(metadata)}")
metadata = [i for i in metadata if i["id"] in df["id"].values]
print(f"Count metadata (after): {len(metadata)}")

Count metadata (before): 31912
Count metadata (after): 31772


## Profiling Report

In this section, we will generate a profiling report for the dataset.

In [56]:
profile = ProfileReport(
    pd.DataFrame(data),
    title=f"OLID-BR {version}",
    explorative=True)

profile.to_file(f"../../docs/reports/olidbr_{version}.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## Split into train, validation and test sets

- **Training set**: 60% of the dataset.
- **Validation set**: 20% of the dataset.
- **Test set**: 20% of the dataset.

Convert `is_offensive`, `is_targeted`, and `targeted_type` categories values to int

In [57]:
# is_offensive
df["is_offensive"] = df["is_offensive"].replace({
    "OFF": 1,
    "NOT": 0
})

# is_targeted
df["is_targeted"] = df["is_targeted"].replace({
    "TIN": 1,
    "UNT": 0
})

# targeted_type
df["targeted_type"].fillna(0, inplace=True)
df["targeted_type"] = df["targeted_type"].replace({
    "IND": 1,
    "GRP": 2,
    "OTH": 3
})

In [58]:
params = {
    "seed": 1993,
    "test_size": 0.2,
    "val_size": 0.25 # 0.25 x 0.8 = 0.2
}

labels = [
    "is_offensive",
    "is_targeted",
    "targeted_type",
    "health",
    "ideology",
    "insult",
    "lgbtqphobia",
    "other_lifestyle",
    "physical_aspects",
    "profanity_obscene",
    "racism",
    "religious_intolerance",
    "sexism",
    "xenophobia"
]

X = df[["id", "text"]].values
y = df[labels].astype(int).values

In [59]:
from src.modeling.selection import multilabel_train_test_split

X_train, X_test, y_train, y_test = multilabel_train_test_split(
    X, y, test_size=params["test_size"],
    random_state=params["seed"], stratify=y)

X_train, X_val, y_train, y_val = multilabel_train_test_split(
    X_train, y_train, test_size=params["val_size"],
    random_state=params["seed"], stratify=y_train)

print(f"Train: {X_train.shape}")
print(f"Validation: {X_val.shape}")
print(f"Test: {X_test.shape}")

Train: (5214, 2)
Validation: (1738, 2)
Test: (1738, 2)


In [60]:
train_data = [i for i in data if i["id"] in X_train[:, 0]]
val_data = [i for i in data if i["id"] in X_val[:, 0]]
test_data = [i for i in data if i["id"] in X_test[:, 0]]

train_metadata = [i for i in metadata if i["id"] in [i["id"] for i in train_data]]
val_metadata = [i for i in metadata if i["id"] in [i["id"] for i in val_data]]
test_metadata = [i for i in metadata if i["id"] in [i["id"] for i in test_data]]

full_train_data = [i for i in full_data if i["id"] in X_train[:, 0]]
full_val_data = [i for i in full_data if i["id"] in X_val[:, 0]]
full_test_data = [i for i in full_data if i["id"] in X_test[:, 0]]

print(f"Train: {len(train_data)}")
print(f"Validation: {len(val_data)}")
print(f"Test: {len(test_data)}")

print(f"Full train: {len(full_train_data)}")
print(f"Full validation: {len(full_val_data)}")
print(f"Full test: {len(full_test_data)}")

Train: 5214
Validation: 1738
Test: 1738
Full train: 5214
Full validation: 1738
Full test: 1738


## Sanity check

In [61]:
assert len(full_data) == len(train_data) + len(val_data) + len(test_data)

assert len(metadata) == len(train_metadata) + len(val_metadata) + len(test_metadata)

## Upload data to S3

In [62]:
files = {
    "train.csv": pd.DataFrame(train_data),
    "validation.csv": pd.DataFrame(val_data),
    "test.csv": pd.DataFrame(test_data),
    "train_metadata.csv": pd.DataFrame(train_metadata),
    "validation_metadata.csv": pd.DataFrame(val_metadata),
    "test_metadata.csv": pd.DataFrame(test_metadata),
    "train.json": full_train_data,
    "validation.json": full_val_data,
    "test.json": full_test_data,
    "additional_data.json": additional_data
}

for file, obj in files.items():
    print(f"Saving {file}", end="")
    if file.endswith(".csv"):
        bucket.upload_csv(
            data=obj,
            key=f"processed/olid-br/{version}/{file}")
    elif file.endswith(".json"):
        bucket.upload_json(
            data=obj,
            key=f"processed/olid-br/{version}/{file}")
    else:
        raise ValueError("Invalid file format")
    print(" - Done")

print("All files uploaded.")

Saving train.csv - Done
Saving validation.csv - Done
Saving test.csv - Done
Saving train_metadata.csv - Done
Saving validation_metadata.csv - Done
Saving test_metadata.csv - Done
Saving train.json - Done
Saving validation.json - Done
Saving test.json - Done
Saving additional_data.json - Done
All files uploaded.


## Upload dataset to Kaggle and Hugging Face

Kaggle

In [63]:
date = datetime.datetime.now().strftime('%Y-%m-%d')

temp_dir = "temp"

In [64]:
private_files = [
    "validation.csv",
    "validation_metadata.csv",
    "validation.json"
]

dataset_metadata = {
    "id": "dougtrajano/olidbr",
    "licenses": [{"name": "CC BY 4.0"}],
    "title": "OLID-BR"
}

os.makedirs(temp_dir, exist_ok=True)
if not os.path.exists(f"{temp_dir}/dataset-metadata.json"):
    with open(os.path.join(temp_dir, "dataset-metadata.json"), "w") as f:
        json.dump(dataset_metadata, f, indent=4)

for file, obj in files.items():
    if file.endswith(".csv") and file not in private_files:
        obj.to_csv(f"{temp_dir}/{file}", index=False, encoding="utf-8")
    elif file.endswith(".json") and file not in private_files:
        with open(f"{temp_dir}/{file}", "w") as f:
            json.dump(obj, f, indent=4)

In [65]:
api = KaggleApi()
api.authenticate()

api.dataset_create_version(
    folder=temp_dir,
    version_notes=f"OLID-BR {version} - {date}",
    convert_to_csv=False,
    delete_old_versions=True
)

print("Dataset uploaded to Kaggle.")

Starting upload for file additional_data.json


100%|██████████| 9.66M/9.66M [00:02<00:00, 4.16MB/s]


Upload successful: additional_data.json (10MB)
Starting upload for file test.csv


100%|██████████| 542k/542k [00:02<00:00, 269kB/s]  


Upload successful: test.csv (542KB)
Starting upload for file test.json


100%|██████████| 5.19M/5.19M [00:01<00:00, 2.72MB/s]


Upload successful: test.json (5MB)
Starting upload for file test_metadata.csv


100%|██████████| 600k/600k [00:01<00:00, 327kB/s]  


Upload successful: test_metadata.csv (600KB)
Starting upload for file train.csv


100%|██████████| 1.58M/1.58M [00:01<00:00, 880kB/s] 


Upload successful: train.csv (2MB)
Starting upload for file train.json


100%|██████████| 15.6M/15.6M [00:03<00:00, 4.85MB/s]


Upload successful: train.json (16MB)
Starting upload for file train_metadata.csv


100%|██████████| 1.76M/1.76M [00:01<00:00, 972kB/s] 


Upload successful: train_metadata.csv (2MB)
Dataset uploaded to Kaggle.


Hugging Face

In [66]:
import datasets

dataset = datasets.dataset_dict.DatasetDict({
    "train": datasets.Dataset.from_pandas(pd.DataFrame(train_data), split="train"),
    "test": datasets.Dataset.from_pandas(pd.DataFrame(test_data), split="test"),
})

dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'is_offensive', 'is_targeted', 'targeted_type', 'toxic_spans', 'health', 'ideology', 'insult', 'lgbtqphobia', 'other_lifestyle', 'physical_aspects', 'profanity_obscene', 'racism', 'religious_intolerance', 'sexism', 'xenophobia'],
        num_rows: 5214
    })
    test: Dataset({
        features: ['id', 'text', 'is_offensive', 'is_targeted', 'targeted_type', 'toxic_spans', 'health', 'ideology', 'insult', 'lgbtqphobia', 'other_lifestyle', 'physical_aspects', 'profanity_obscene', 'racism', 'religious_intolerance', 'sexism', 'xenophobia'],
        num_rows: 1738
    })
})

In [67]:
dataset.push_to_hub(
    repo_id="dougtrajano/olid-br",
    private=True,
    token=args.HUGGINGFACE_HUB_TOKEN)

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split test to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Delete local files

In [68]:
import shutil
shutil.rmtree(temp_dir)

## Inter-Rater Reliability (IRR) analysis

a.k.a inter-rater agreement (IRA) or concordance.

In the next cells, we will perform an agreement analysis to check if the annotations are consistent.

See [Inter-Rater Reliability - OLID-BR](https://dougtrajano.github.io/olid-br/annotation/inter-rater-reliability.html) for more details.

In [94]:
from typing import Any, Dict, List

from irrCAC.raw import CAC
from src.labeling.metrics import percent_agreement
from src.utils import get_annotations_by_rater

import nltk
from nltk.metrics import agreement
from nltk.metrics.agreement import AnnotationTask
from nltk.metrics import masi_distance, jaccard_distance

def get_annotations(
    annotations: Dict[str, Any],
    label: str,
    annotation_id_key: str = "annotator_id") -> List[Any]:
    """
    Get annotations for a given label.

    Args:
    - annotations: Dictionary with annotations.
    - label: Label to get annotations for.
    - annotation_id_key: Key to use for annotation id.

    Returns:
    - List of annotations for the given label.
    """
    data = {}
    for annotation in annotations:
        for key, value in annotation.items():
            if key == label:
                data[annotation[annotation_id_key]] = value
                
    
    # Replace annotator ids to A, B, C, etc.
    annotators = {
        126: "A",
        127: "B",
        128: "C",
        260: "C",
        504: "A",
    }

    return {
        annotators.get(key, key): value
        for key, value in data.items()
    }

### `is_offensive`

In [70]:
is_offensive = pd.DataFrame(
    [
        get_annotations(
            annotations=item["annotations"],
            label="is_offensive",
            annotation_id_key="annotator_id")
        for item in full_data
    ]
)

is_offensive.transpose().head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8680,8681,8682,8683,8684,8685,8686,8687,8688,8689
A,OFF,OFF,OFF,OFF,OFF,NOT,OFF,OFF,OFF,OFF,...,OFF,OFF,NOT,NOT,NOT,OFF,OFF,NOT,NOT,NOT
B,OFF,OFF,OFF,OFF,OFF,NOT,OFF,OFF,NOT,OFF,...,,,,,,,,,,
C,OFF,OFF,OFF,OFF,OFF,NOT,OFF,OFF,OFF,OFF,...,OFF,OFF,NOT,NOT,NOT,OFF,OFF,NOT,NOT,NOT


In [71]:
cac = CAC(is_offensive)

print("CAC:", cac)
print(f"Percent agreement: {percent_agreement(cac.ratings):.4f}")
print(f"Krippendorff's alpha: {cac.krippendorff()['est']['coefficient_value']:.4f}")
print(f"Gwet's AC1: {cac.gwet()['est']['coefficient_value']:.4f}")

CAC: <irrCAC.raw.CAC Subjects: 8690, Raters: 3, Categories: ['NOT', 'OFF'], Weights: "identity">
Percent agreement: 0.6070
Krippendorff's alpha: 0.3180
Gwet's AC1: 0.6967


### `is_targeted`

In [72]:
is_targeted = pd.DataFrame(
    [
        get_annotations(
            annotations=item["annotations"],
            label="is_targeted",
            annotation_id_key="annotator_id")
        for item in full_data
    ]
)

is_targeted.transpose().head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8680,8681,8682,8683,8684,8685,8686,8687,8688,8689
A,TIN,TIN,TIN,TIN,TIN,TIN,TIN,TIN,TIN,TIN,...,TIN,TIN,UNT,UNT,UNT,TIN,TIN,UNT,UNT,UNT
B,UNT,UNT,UNT,UNT,TIN,UNT,UNT,UNT,UNT,UNT,...,,,,,,,,,,
C,UNT,TIN,UNT,UNT,UNT,UNT,UNT,TIN,UNT,UNT,...,TIN,TIN,UNT,UNT,UNT,TIN,TIN,UNT,UNT,UNT


In [73]:
cac = CAC(is_targeted)

print("CAC:", cac)
print(f"Percent agreement: {percent_agreement(cac.ratings):.4f}")
print(f"Krippendorff's alpha: {cac.krippendorff()['est']['coefficient_value']:.4f}")
print(f"Gwet's AC1: {cac.gwet()['est']['coefficient_value']:.4f}")

CAC: <irrCAC.raw.CAC Subjects: 8690, Raters: 3, Categories: ['TIN', 'UNT'], Weights: "identity">
Percent agreement: 0.2742
Krippendorff's alpha: 0.1017
Gwet's AC1: 0.1623


### `targeted_type`

In [74]:
targeted_type = pd.DataFrame(
    [
        get_annotations(
            annotations=item["annotations"],
            label="targeted_type",
            annotation_id_key="annotator_id")
        for item in full_data
    ]
)

targeted_type.transpose().head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8680,8681,8682,8683,8684,8685,8686,8687,8688,8689
A,IND,GRP,GRP,GRP,GRP,OTH,GRP,GRP,IND,IND,...,IND,IND,,,,IND,OTH,,,
B,,,,,GRP,,,,,,...,,,,,,,,,,
C,,IND,,,,,,IND,,,...,IND,IND,,,,IND,OTH,,,


In [75]:
cac = CAC(targeted_type)

print("CAC:", cac)
print(f"Percent agreement: {percent_agreement(cac.ratings):.4f}")
print(f"Krippendorff's alpha: {cac.krippendorff()['est']['coefficient_value']:.4f}")
print(f"Gwet's AC1: {cac.gwet()['est']['coefficient_value']:.4f}")

CAC: <irrCAC.raw.CAC Subjects: 7570, Raters: 3, Categories: ['GRP', 'IND', 'OTH'], Weights: "identity">
Percent agreement: 0.1456
Krippendorff's alpha: 0.4380
Gwet's AC1: 0.5897


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.ratings.replace(to_replace="", value=np.nan, inplace=True)


### `toxic_spans`

In [108]:
toxic_spans = pd.DataFrame(
    [
        get_annotations(
            annotations=item["annotations"],
            label="toxic_spans",
            annotation_id_key="annotator_id")
        for item in full_data
    ]
)

# fill missing values with empty list
toxic_spans = toxic_spans.fillna("").applymap(lambda x: [] if x == "" else x)

toxic_spans.head()

Unnamed: 0,A,B,C
0,[],"[52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 6...",[]
1,"[20, 21, 22, 23, 24, 25, 93, 94, 95, 96, 97]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",[]
2,"[14, 15, 16, 17, 18]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",[]
3,"[10, 11, 12, 13, 14, 17, 18, 19, 20, 21, 22, 2...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",[]
4,"[10, 11, 12, 13, 14, 165, 166, 167, 176, 177, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[10, 11, 12, 13, 14, 176, 177, 178, 179, 180, ..."


In [112]:
task_data = []
for annotator in toxic_spans.columns:
    for item in range(len(toxic_spans)):
        temp = toxic_spans.iloc[item][annotator]
        if temp != []:
            task_data.append((
                annotator,
                item,
                frozenset(temp)
            ))

jaccard_task = AnnotationTask(distance=jaccard_distance)
masi_task = AnnotationTask(distance=masi_distance)

for task in [jaccard_task, masi_task]:
    task.load_array(task_data)
    print(f"Krippendorff's alpha using {task.distance}")
    print(f"Krippendorff's alpha: {task.alpha():.4f}", "\n")

print(f"Percent agreement: {percent_agreement(toxic_spans):.4f}")

Krippendorff's alpha using <function jaccard_distance at 0x0000016A3B1DDFC0>
Krippendorff's alpha: 0.5444 

Krippendorff's alpha using <function masi_distance at 0x0000016A3B1DE050>
Krippendorff's alpha: 0.3961 

Percent agreement: 0.2110


### `health`

In [113]:
health = pd.DataFrame(
    [
        get_annotations(
            annotations=item["annotations"],
            label="health",
            annotation_id_key="annotator_id")
        for item in full_data
    ]
)

health.transpose().head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8680,8681,8682,8683,8684,8685,8686,8687,8688,8689
A,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
B,False,False,False,False,False,False,False,False,False,False,...,,,,,,,,,,
C,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [114]:
cac = CAC(health)

print("CAC:", cac)
print(f"Percent agreement: {percent_agreement(cac.ratings):.4f}")
print(f"Krippendorff's alpha: {cac.krippendorff()['est']['coefficient_value']:.4f}")
print(f"Gwet's AC1: {cac.gwet()['est']['coefficient_value']:.4f}")

CAC: <irrCAC.raw.CAC Subjects: 8690, Raters: 3, Categories: [False, True], Weights: "identity">
Percent agreement: 0.8910
Krippendorff's alpha: 0.1823
Gwet's AC1: 0.9844


### `ideology`

In [115]:
ideology = pd.DataFrame(
    [
        get_annotations(
            annotations=item["annotations"],
            label="ideology",
            annotation_id_key="annotator_id")
        for item in full_data
    ]
)

ideology.transpose().head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8680,8681,8682,8683,8684,8685,8686,8687,8688,8689
A,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
B,False,False,False,False,False,False,False,False,False,False,...,,,,,,,,,,
C,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [116]:
cac = CAC(ideology)

print("CAC:", cac)
print(f"Percent agreement: {percent_agreement(cac.ratings):.4f}")
print(f"Krippendorff's alpha: {cac.krippendorff()['est']['coefficient_value']:.4f}")
print(f"Gwet's AC1: {cac.gwet()['est']['coefficient_value']:.4f}")

CAC: <irrCAC.raw.CAC Subjects: 8690, Raters: 3, Categories: [False, True], Weights: "identity">
Percent agreement: 0.7545
Krippendorff's alpha: 0.3698
Gwet's AC1: 0.8719


### `insult`

In [117]:
insult = pd.DataFrame(
    [
        get_annotations(
            annotations=item["annotations"],
            label="insult",
            annotation_id_key="annotator_id")
        for item in full_data
    ]
)

insult.transpose().head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8680,8681,8682,8683,8684,8685,8686,8687,8688,8689
A,True,True,False,True,True,True,True,False,False,True,...,True,True,False,False,False,True,False,False,False,False
B,True,True,True,True,False,False,True,True,False,True,...,,,,,,,,,,
C,True,True,True,True,True,False,True,False,True,True,...,True,True,False,False,False,True,False,False,False,False


In [118]:
cac = CAC(insult)

print("CAC:", cac)
print(f"Percent agreement: {percent_agreement(cac.ratings):.4f}")
print(f"Krippendorff's alpha: {cac.krippendorff()['est']['coefficient_value']:.4f}")
print(f"Gwet's AC1: {cac.gwet()['est']['coefficient_value']:.4f}")

CAC: <irrCAC.raw.CAC Subjects: 8690, Raters: 3, Categories: [False, True], Weights: "identity">
Percent agreement: 0.4629
Krippendorff's alpha: 0.3154
Gwet's AC1: 0.4451


### `lgbtqphobia`

In [119]:
lgbtqphobia = pd.DataFrame(
    [
        get_annotations(
            annotations=item["annotations"],
            label="lgbtqphobia",
            annotation_id_key="annotator_id")
        for item in full_data
    ]
)

lgbtqphobia.transpose().head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8680,8681,8682,8683,8684,8685,8686,8687,8688,8689
A,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
B,False,False,False,False,False,False,False,False,False,False,...,,,,,,,,,,
C,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [120]:
cac = CAC(lgbtqphobia)

print("CAC:", cac)
print(f"Percent agreement: {percent_agreement(cac.ratings):.4f}")
print(f"Krippendorff's alpha: {cac.krippendorff()['est']['coefficient_value']:.4f}")
print(f"Gwet's AC1: {cac.gwet()['est']['coefficient_value']:.4f}")

CAC: <irrCAC.raw.CAC Subjects: 8690, Raters: 3, Categories: [False, True], Weights: "identity">
Percent agreement: 0.8719
Krippendorff's alpha: 0.5837
Gwet's AC1: 0.9699


### `other_lifestyle`

In [121]:
other_lifestyle = pd.DataFrame(
    [
        get_annotations(
            annotations=item["annotations"],
            label="other_lifestyle",
            annotation_id_key="annotator_id")
        for item in full_data
    ]
)

other_lifestyle.transpose().head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8680,8681,8682,8683,8684,8685,8686,8687,8688,8689
A,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
B,False,False,False,False,False,False,False,False,False,False,...,,,,,,,,,,
C,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [122]:
cac = CAC(other_lifestyle)

print("CAC:", cac)
print(f"Percent agreement: {percent_agreement(cac.ratings):.4f}")
print(f"Krippendorff's alpha: {cac.krippendorff()['est']['coefficient_value']:.4f}")
print(f"Gwet's AC1: {cac.gwet()['est']['coefficient_value']:.4f}")

CAC: <irrCAC.raw.CAC Subjects: 8690, Raters: 3, Categories: [False, True], Weights: "identity">
Percent agreement: 0.8952
Krippendorff's alpha: 0.3214
Gwet's AC1: 0.9872


### `physical_aspects`

In [123]:
physical_aspects = pd.DataFrame(
    [
        get_annotations(
            annotations=item["annotations"],
            label="physical_aspects",
            annotation_id_key="annotator_id")
        for item in full_data
    ]
)

physical_aspects.transpose().head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8680,8681,8682,8683,8684,8685,8686,8687,8688,8689
A,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
B,False,False,False,False,False,False,False,False,False,False,...,,,,,,,,,,
C,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [124]:
cac = CAC(physical_aspects)

print("CAC:", cac)
print(f"Percent agreement: {percent_agreement(cac.ratings):.4f}")
print(f"Krippendorff's alpha: {cac.krippendorff()['est']['coefficient_value']:.4f}")
print(f"Gwet's AC1: {cac.gwet()['est']['coefficient_value']:.4f}")

CAC: <irrCAC.raw.CAC Subjects: 8690, Raters: 3, Categories: [False, True], Weights: "identity">
Percent agreement: 0.8707
Krippendorff's alpha: 0.3786
Gwet's AC1: 0.9697


### `profanity_obscene`

In [125]:
profanity_obscene = pd.DataFrame(
    [
        get_annotations(
            annotations=item["annotations"],
            label="profanity_obscene",
            annotation_id_key="annotator_id")
        for item in full_data
    ]
)

profanity_obscene.transpose().head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8680,8681,8682,8683,8684,8685,8686,8687,8688,8689
A,True,False,False,False,False,False,False,True,False,False,...,True,True,False,False,False,False,True,False,False,False
B,False,False,False,False,False,False,False,False,False,False,...,,,,,,,,,,
C,False,False,False,False,False,False,True,True,False,False,...,True,True,False,False,False,False,True,False,False,False


In [126]:
cac = CAC(profanity_obscene)

print("CAC:", cac)
print(f"Percent agreement: {percent_agreement(cac.ratings):.4f}")
print(f"Krippendorff's alpha: {cac.krippendorff()['est']['coefficient_value']:.4f}")
print(f"Gwet's AC1: {cac.gwet()['est']['coefficient_value']:.4f}")

CAC: <irrCAC.raw.CAC Subjects: 8690, Raters: 3, Categories: [False, True], Weights: "identity">
Percent agreement: 0.6471
Krippendorff's alpha: 0.4354
Gwet's AC1: 0.7378


### `racism`

In [127]:
racism = pd.DataFrame(
    [
        get_annotations(
            annotations=item["annotations"],
            label="racism",
            annotation_id_key="annotator_id")
        for item in full_data
    ]
)

racism.transpose().head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8680,8681,8682,8683,8684,8685,8686,8687,8688,8689
A,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
B,False,False,False,False,False,False,False,False,False,False,...,,,,,,,,,,
C,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [128]:
cac = CAC(racism)

print("CAC:", cac)
print(f"Percent agreement: {percent_agreement(cac.ratings):.4f}")
print(f"Krippendorff's alpha: {cac.krippendorff()['est']['coefficient_value']:.4f}")
print(f"Gwet's AC1: {cac.gwet()['est']['coefficient_value']:.4f}")

CAC: <irrCAC.raw.CAC Subjects: 8690, Raters: 3, Categories: [False, True], Weights: "identity">
Percent agreement: 0.9013
Krippendorff's alpha: 0.2931
Gwet's AC1: 0.9914


### `religious_intolerance`

In [129]:
religious_intolerance = pd.DataFrame(
    [
        get_annotations(
            annotations=item["annotations"],
            label="religious_intolerance",
            annotation_id_key="annotator_id")
        for item in full_data
    ]
)

religious_intolerance.transpose().head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8680,8681,8682,8683,8684,8685,8686,8687,8688,8689
A,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
B,False,False,False,False,False,False,False,False,False,False,...,,,,,,,,,,
C,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [130]:
cac = CAC(religious_intolerance)

print("CAC:", cac)
print(f"Percent agreement: {percent_agreement(cac.ratings):.4f}")
print(f"Krippendorff's alpha: {cac.krippendorff()['est']['coefficient_value']:.4f}")
print(f"Gwet's AC1: {cac.gwet()['est']['coefficient_value']:.4f}")

CAC: <irrCAC.raw.CAC Subjects: 8690, Raters: 3, Categories: [False], Weights: "identity">
Percent agreement: 0.9140
Krippendorff's alpha: 1.0000
Gwet's AC1: 1.0000


  p_value = 2 * (1 - stats.t.cdf(abs(krippen_alpha / stderr), n - 1))
  lower_bound = _a * scale + loc
  upper_bound = _b * scale + loc
  self.z = round(krippen_alpha_est / stderr, self.digits)
  (weights_mat_sum / (self.q * (self.q - 1)))
  (weights_mat_sum / (self.q * (self.q - 1)))


### `sexism`

In [132]:
sexism = pd.DataFrame(
    [
        get_annotations(
            annotations=item["annotations"],
            label="sexism",
            annotation_id_key="annotator_id")
        for item in full_data
    ]
)

sexism.transpose().head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8680,8681,8682,8683,8684,8685,8686,8687,8688,8689
A,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
B,False,False,False,False,False,False,False,False,False,False,...,,,,,,,,,,
C,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [133]:
cac = CAC(sexism)

print("CAC:", cac)
print(f"Percent agreement: {percent_agreement(cac.ratings):.4f}")
print(f"Krippendorff's alpha: {cac.krippendorff()['est']['coefficient_value']:.4f}")
print(f"Gwet's AC1: {cac.gwet()['est']['coefficient_value']:.4f}")

CAC: <irrCAC.raw.CAC Subjects: 8690, Raters: 3, Categories: [False, True], Weights: "identity">
Percent agreement: 0.8475
Krippendorff's alpha: 0.1826
Gwet's AC1: 0.9531


### `xenophobia`

In [134]:
xenophobia = pd.DataFrame(
    [
        get_annotations(
            annotations=item["annotations"],
            label="xenophobia",
            annotation_id_key="annotator_id")
        for item in full_data
    ]
)

xenophobia.transpose().head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8680,8681,8682,8683,8684,8685,8686,8687,8688,8689
A,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
B,False,False,True,False,False,False,False,False,False,False,...,,,,,,,,,,
C,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [135]:
cac = CAC(xenophobia)

print("CAC:", cac)
print(f"Percent agreement: {percent_agreement(cac.ratings):.4f}")
print(f"Krippendorff's alpha: {cac.krippendorff()['est']['coefficient_value']:.4f}")
print(f"Gwet's AC1: {cac.gwet()['est']['coefficient_value']:.4f}")

CAC: <irrCAC.raw.CAC Subjects: 8690, Raters: 3, Categories: [False, True], Weights: "identity">
Percent agreement: 0.8954
Krippendorff's alpha: 0.1956
Gwet's AC1: 0.9874


### Krispendorff's alpha Multi-Label

In the next cells, we will calculate the Krippendorff's alpha considering as a multi-label problem instead of several binary problems.

In [136]:
ratings = {
    "health": health,
    "ideology": ideology,
    "insult": insult,
    "lgbtqphobia": lgbtqphobia,
    "other_lifestyle": other_lifestyle,
    "physical_aspects": physical_aspects,
    "profanity_obscene": profanity_obscene,
    "racism": racism,
    "religious_intolerance": religious_intolerance,
    "sexism": sexism,
    "xenophobia": xenophobia
}

task_data = []
for annotator in health.columns.tolist():
    for item in range(len(health)):
        temp = get_annotations_by_rater(ratings, annotator, item)
        if temp != []:
            task_data.append((
                annotator,
                item,
                frozenset(temp)
            ))

jaccard_task = AnnotationTask(distance=jaccard_distance)
masi_task = AnnotationTask(distance=masi_distance)

for task in [jaccard_task, masi_task]:
    task.load_array(task_data)
    print(f"Krippendorff's alpha using {task.distance}")
    print(f"Krippendorff's alpha: {task.alpha():.4f}", "\n")

pa_mlabels = {}
for item in range(len(health)):
    for annotator in health.columns.tolist():
        temp = get_annotations_by_rater(ratings, annotator, item)
        
        if annotator not in pa_mlabels.keys():
            pa_mlabels[annotator] = []
        
        pa_mlabels[annotator].append(temp)

print(f"Percent agreement: {percent_agreement(pd.DataFrame(pa_mlabels)):.4f}")

Krippendorff's alpha using <function jaccard_distance at 0x0000016A3B1DDFC0>
Krippendorff's alpha: 0.4096 

Krippendorff's alpha using <function masi_distance at 0x0000016A3B1DE050>
Krippendorff's alpha: 0.3773 

Percent agreement: 0.2799
