# OLID-BR (Build Dataset)

In this notebook, we will build the OLID-BR dataset from the processed data.

## Imports

In [1]:
import sys
from pathlib import Path

# Make the directory two levels above the working directory importable so the
# project-local `src` package resolves (presumably the repository root).
# The membership check uses the same path that gets appended, so re-running
# this cell does not add duplicate entries to sys.path.
project_root = str(Path(".").absolute().parent.parent)
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
from dotenv import load_dotenv

# Initialize the env vars from the `.env` file located two directories above
# the notebook's working directory.
load_dotenv("../../.env")

True

In [3]:
import os
import json
import datetime
import pandas as pd
from pandas_profiling import ProfileReport
from kaggle.api.kaggle_api_extended import KaggleApi
from src.s3 import Bucket
from src.settings import AppSettings

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Application settings; supplies the AWS and Hugging Face values read below
# (presumably populated from the environment loaded earlier - confirm).
args = AppSettings()

# Release tag of the dataset; reused in the S3 keys, the report file name and
# the Kaggle version notes.
version = "v1.0"

# Handle to the S3 bucket that stores the processed OLID-BR files.
bucket = Bucket(args.AWS_S3_BUCKET)

# Open the bucket session with the access key id / secret access key pair.
bucket.get_session_from_aksk(
    args.AWS_ACCESS_KEY_ID,
    args.AWS_SECRET_ACCESS_KEY)

## Load data

In the next cells, we will load all processed data.

In [5]:
# Iterations 2-4 share the same file layout under a common S3 prefix.
iterations_prefix = "processed/olid-br/iterations"

iterations = [
    {
        "data": f"{iterations_prefix}/{number}/olidbr.json",
        "metadata": f"{iterations_prefix}/{number}/metadata.json",
        "full_data": f"{iterations_prefix}/{number}/full_olidbr.json",
    }
    for number in (2, 3, 4)
]

# Only iteration 4 has an extra file with additional texts.
iterations[-1]["additional_data"] = f"{iterations_prefix}/4/additional_texts.json"

In [6]:
data, metadata, full_data, additional_data = [], [], [], []

# (key in the iteration dict, list that accumulates the records, label used in
# the progress messages)
targets = [
    ("data", data, "Data"),
    ("metadata", metadata, "Metadata"),
    ("full_data", full_data, "Full data"),
    ("additional_data", additional_data, "Additional data"),
]

for iteration in iterations:
    for field, destination, label in targets:
        s3_key = iteration.get(field)
        if not s3_key:
            # This iteration does not provide that kind of file.
            continue

        if field == "data":
            print(f"Loading {s3_key}")

        records = bucket.download_json(key=s3_key)
        destination.extend(records)
        print(f"{label} iteration size: {len(records)}")

    print("\n")

# Totals accumulated over all iterations.
for _, destination, label in targets:
    print(f"{label}: {len(destination)}")

Loading processed/olid-br/iterations/2/olidbr.json
Data iteration size: 2996
Metadata iteration size: 11984
Full data iteration size: 2996


Loading processed/olid-br/iterations/3/olidbr.json
Data iteration size: 2987
Metadata iteration size: 11948
Full data iteration size: 2987


Loading processed/olid-br/iterations/4/olidbr.json
Data iteration size: 1995
Metadata iteration size: 7980
Full data iteration size: 1995
Additional data iteration size: 6478


Data: 7978
Metadata: 31912
Full data: 7978
Additional data: 6478


## Iteration 1

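The first iteration only has the flat processed data and its metadata (there is no `full_data` file), so we rebuild the full record structure for it here and add the result to the additional data.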

In [7]:
# S3 keys of the two files available for the first iteration.
iteration_1 = dict(
    data="processed/olid-br/iterations/1/olidbr.json",
    metadata="processed/olid-br/iterations/1/metadata.json",
)

# Download in the same order: data first, then metadata.
iteration_1_data, iteration_1_metadata = (
    bucket.download_json(key=iteration_1[part])
    for part in ("data", "metadata")
)

print(f"Data iteration 1 size: {len(iteration_1_data)}")
print(f"Metadata iteration 1 size: {len(iteration_1_metadata)}")

Data iteration 1 size: 706
Metadata iteration 1 size: 1520


In [8]:
df_metadata = pd.DataFrame(iteration_1_metadata)

# Annotators whose labels are kept for iteration 1, in order of preference.
main_annotator_candidates = (1.0, 32.0)

# Fields copied from the "source" metadata row into the item's metadata.
metadata_keys = [
    "source",
    "created_at",
    "collected_at",
    "toxicity_score",
]

# Label fields copied from the processed record into the annotation entry.
annotation_keys = [
    "is_offensive",
    "is_targeted",
    "targeted_type",
    "toxic_spans",
    "health",
    "ideology",
    "insult",
    "lgbtqphobia",
    "other_lifestyle",
    "physical_aspects",
    "profanity_obscene",
    "racism",
    "religious_intolerance",
    "sexism",
    "xenophobia",
]

# NOTE: this cell appends to `additional_data`; re-running it without
# re-running the loading cell duplicates the iteration 1 items.
for i in iteration_1_data:
    item = {
        "id": i["id"],
        "text": i["text"],
        "metadata": {},
        "annotations": []
    }

    # Get the metadata rows of this text once and reuse them below.
    item_rows = df_metadata[df_metadata["id"] == i["id"]]
    i_metadata = item_rows.to_dict(orient="records")

    if i_metadata:
        # Pick the main annotator once per text (it does not depend on the row).
        annotators = item_rows["annotator_id"].unique()
        main_annotator = next(
            (a for a in main_annotator_candidates if a in annotators), None)

        if main_annotator is None:
            # The original built this exception without raising it, which let
            # the main annotator of a previous text leak into this one.
            raise ValueError(
                f"Annotator not found for {i['id']} - Annotators: {annotators}")

    for m in i_metadata:
        if not pd.isnull(m["source"]):
            # Row describing where the text comes from.
            for key in metadata_keys:
                item["metadata"][key] = m[key]

        elif (
                not pd.isnull(m["annotator_id"])
                and m["annotator_id"] == main_annotator):
            # Row of the main annotator: the label values themselves are taken
            # from the processed record `i`, not from the metadata row.
            annotations = {
                "annotator_id": m["annotator_id"],
            }

            for key in annotation_keys:
                annotations[key] = i[key]

            item["annotations"].append(annotations)

            # These checks run only when an annotation is added; the metadata
            # check relies on the source row having been seen already.
            assert len(annotations) == 16
            assert len(item["annotations"]) == 1
            assert len(item["metadata"]) == 4

    additional_data.append(item)

print(f"Additional data: {len(additional_data)}")

Additional data: 7184


## Data processing

In this section, we will perform some data processing in order to clean and fix some issues in the dataset.

### Remove duplicated entries

In [9]:
df = pd.DataFrame(data)

duplicated_count = df["text"].duplicated().sum()
print(f"Duplicated text: {duplicated_count}")

# Keep the first occurrence of every text and discard the later repeats.
df = df.drop_duplicates(subset="text")
print(df.shape)

# Keep the list-of-dicts view in sync with the de-duplicated frame.
data = df.to_dict(orient="records")

Duplicated text: 35
(7943, 17)


In [10]:
# Remove duplicated texts from full data
# Build the set of surviving ids once: a set lookup replaces a scan of the
# whole id array for every record (assumes ids are hashable, e.g. strings).
kept_ids = set(df["id"])
full_data = [i for i in full_data if i["id"] in kept_ids]

print(f"Full data: {len(full_data)}")

Full data: 7943


In [11]:
# Remove duplicated texts from metadata
print(f"Count metadata (before): {len(metadata)}")
# Build the set of surviving ids once: a set lookup replaces a scan of the
# whole id array for every metadata row (assumes ids are hashable, e.g. strings).
kept_ids = set(df["id"])
metadata = [i for i in metadata if i["id"] in kept_ids]
print(f"Count metadata (after): {len(metadata)}")

Count metadata (before): 31912
Count metadata (after): 31772


## Profiling Report

In this section, we will generate a profiling report for the dataset.

In [12]:
# Profile the de-duplicated records and save the report as an HTML file in the
# docs folder, named after the dataset version.
profile_frame = pd.DataFrame(data)
report_path = f"../../docs/reports/olidbr_{version}.html"

profile = ProfileReport(profile_frame, title=f"OLID-BR {version}", explorative=True)
profile.to_file(report_path)

Summarize dataset: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:09<00:00,  3.40it/s, Completed]
Generate report structure: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.40s/it]
Render HTML: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.95it/s]
Export report to file: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 23.35it/s]


## Split into train, validation and test sets

- **Training set**: 60% of the dataset.
- **Validation set**: 20% of the dataset.
- **Test set**: 20% of the dataset.

Convert the category values of `is_offensive`, `is_targeted`, and `targeted_type` to integers.

In [13]:
# Encode the categorical labels as integers so they can be used for the
# stratified split below.

# is_offensive: OFF -> 1, NOT -> 0
df["is_offensive"] = df["is_offensive"].replace({
    "OFF": 1,
    "NOT": 0
})

# is_targeted: TIN -> 1, UNT -> 0
df["is_targeted"] = df["is_targeted"].replace({
    "TIN": 1,
    "UNT": 0
})

# targeted_type: missing -> 0, IND -> 1, GRP -> 2, OTH -> 3
# Assign the result back instead of calling `fillna(..., inplace=True)` on the
# selected column: the chained in-place form is deprecated and does not modify
# `df` when pandas Copy-on-Write is enabled.
df["targeted_type"] = df["targeted_type"].fillna(0).replace({
    "IND": 1,
    "GRP": 2,
    "OTH": 3
})

In [14]:
# Split configuration. The validation share is relative to what remains after
# the test split: 0.25 x 0.8 = 0.2 of the whole dataset.
params = dict(
    seed=1993,
    test_size=0.2,
    val_size=0.25,
)

# Columns used as targets for the stratified split.
labels = [
    # Hierarchical labels
    "is_offensive", "is_targeted", "targeted_type",
    # Toxicity categories
    "health", "ideology", "insult", "lgbtqphobia", "other_lifestyle",
    "physical_aspects", "profanity_obscene", "racism",
    "religious_intolerance", "sexism", "xenophobia",
]

# X keeps the id next to the text so each split can be mapped back to records.
X = df[["id", "text"]].to_numpy()
y = df[labels].astype(int).to_numpy()

In [15]:
from src.modeling.selection import multilabel_train_test_split

# Hold out the test set first, then carve the validation set out of what is
# left. Both splits use the same seed and are stratified on the label matrix.
split_seed = params["seed"]

X_train, X_test, y_train, y_test = multilabel_train_test_split(
    X,
    y,
    test_size=params["test_size"],
    random_state=split_seed,
    stratify=y,
)

X_train, X_val, y_train, y_val = multilabel_train_test_split(
    X_train,
    y_train,
    test_size=params["val_size"],
    random_state=split_seed,
    stratify=y_train,
)

for split_name, split_X in (("Train", X_train), ("Validation", X_val), ("Test", X_test)):
    print(f"{split_name}: {split_X.shape}")

Train: (4765, 2)
Validation: (1589, 2)
Test: (1589, 2)


In [16]:
# Ids assigned to each split (first column of X holds the record id). Sets make
# every membership test constant-time instead of scanning an array per record.
train_ids = set(X_train[:, 0])
val_ids = set(X_val[:, 0])
test_ids = set(X_test[:, 0])

train_data = [item for item in data if item["id"] in train_ids]
val_data = [item for item in data if item["id"] in val_ids]
test_data = [item for item in data if item["id"] in test_ids]

# The metadata follows the ids that actually ended up in each split. The id
# sets are built once; the original rebuilt an id list for every metadata row.
train_data_ids = {i["id"] for i in train_data}
val_data_ids = {i["id"] for i in val_data}
test_data_ids = {i["id"] for i in test_data}

train_metadata = [item for item in metadata if item["id"] in train_data_ids]
val_metadata = [item for item in metadata if item["id"] in val_data_ids]
test_metadata = [item for item in metadata if item["id"] in test_data_ids]

full_train_data = [item for item in full_data if item["id"] in train_ids]
full_val_data = [item for item in full_data if item["id"] in val_ids]
full_test_data = [item for item in full_data if item["id"] in test_ids]

print(f"Train: {len(train_data)}")
print(f"Validation: {len(val_data)}")
print(f"Test: {len(test_data)}")

print(f"Full train: {len(full_train_data)}")
print(f"Full validation: {len(full_val_data)}")
print(f"Full test: {len(full_test_data)}")

Train: 4765
Validation: 1589
Test: 1589
Full train: 4765
Full validation: 1589
Full test: 1589


## Sanity check

In [17]:
# The split sizes must add up to the totals, for the records and the metadata.
split_sizes = [len(train_data), len(val_data), len(test_data)]
assert len(full_data) == sum(split_sizes)

metadata_split_sizes = [len(train_metadata), len(val_metadata), len(test_metadata)]
assert len(metadata) == sum(metadata_split_sizes)

## Upload data to S3

In [18]:
# File name -> object to upload. CSV entries are DataFrames, JSON entries are
# lists of records.
files = {
    "train.csv": pd.DataFrame(train_data),
    "validation.csv": pd.DataFrame(val_data),
    "test.csv": pd.DataFrame(test_data),
    "train_metadata.csv": pd.DataFrame(train_metadata),
    "validation_metadata.csv": pd.DataFrame(val_metadata),
    "test_metadata.csv": pd.DataFrame(test_metadata),
    "train.json": full_train_data,
    "validation.json": full_val_data,
    "test.json": full_test_data,
    "additional_data.json": additional_data
}

for file, obj in files.items():
    print(f"Saving {file}", end="")

    # Pick the uploader that matches the file extension.
    if file.endswith(".csv"):
        upload = bucket.upload_csv
    elif file.endswith(".json"):
        upload = bucket.upload_json
    else:
        raise ValueError("Invalid file format")

    # Every file of a release lives under the versioned prefix.
    upload(data=obj, key=f"processed/olid-br/{version}/{file}")
    print(" - Done")

print("All files uploaded.")

Saving train.csv - Done
Saving validation.csv - Done
Saving test.csv - Done
Saving train_metadata.csv - Done
Saving validation_metadata.csv - Done
Saving test_metadata.csv - Done
Saving train.json - Done
Saving validation.json - Done
Saving test.json - Done
Saving additional_data.json - Done
All files uploaded.


## Upload dataset to Kaggle and Hugging Face

Kaggle

In [19]:
# Today's local date as YYYY-MM-DD; it is included in the Kaggle version notes.
date = datetime.date.today().isoformat()

# Local folder where the files for the Kaggle upload are staged.
temp_dir = "temp"

In [20]:
# Files that are not staged for the public upload (the validation split).
private_files = [
    "validation.csv",
    "validation_metadata.csv",
    "validation.json"
]

# Descriptor that identifies the dataset on Kaggle.
dataset_metadata = {
    "id": "dougtrajano/olidbr",
    "licenses": [{"name": "CC BY 4.0"}],
    "title": "OLID-BR"
}

os.makedirs(temp_dir, exist_ok=True)

# Write the descriptor only when the staging folder does not have one yet.
metadata_path = os.path.join(temp_dir, "dataset-metadata.json")
if not os.path.exists(metadata_path):
    with open(metadata_path, "w") as f:
        json.dump(dataset_metadata, f, indent=4)

for file, obj in files.items():
    if file in private_files:
        continue

    target_path = f"{temp_dir}/{file}"
    if file.endswith(".csv"):
        obj.to_csv(target_path, index=False, encoding="utf-8")
    elif file.endswith(".json"):
        with open(target_path, "w") as f:
            json.dump(obj, f, indent=4)

In [21]:
api = KaggleApi()
api.authenticate()

# Publish the staged folder as a new version of the dataset. Note that
# `delete_old_versions=True` asks Kaggle to drop the previous versions.
version_notes = f"OLID-BR {version} - {date}"
api.dataset_create_version(
    folder=temp_dir,
    version_notes=version_notes,
    convert_to_csv=False,
    delete_old_versions=True,
)

print("Dataset uploaded to Kaggle.")

Starting upload for file additional_data.json


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10.8M/10.8M [00:05<00:00, 1.89MB/s]


Upload successful: additional_data.json (11MB)
Starting upload for file test.csv


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 499k/499k [00:02<00:00, 252kB/s]


Upload successful: test.csv (499KB)
Starting upload for file test.json


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4.76M/4.76M [00:03<00:00, 1.48MB/s]


Upload successful: test.json (5MB)
Starting upload for file test_metadata.csv


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 595k/595k [00:02<00:00, 298kB/s]


Upload successful: test_metadata.csv (595KB)
Starting upload for file train.csv


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.47M/1.47M [00:02<00:00, 697kB/s]


Upload successful: train.csv (1MB)
Starting upload for file train.json


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14.3M/14.3M [00:07<00:00, 2.07MB/s]


Upload successful: train.json (14MB)
Starting upload for file train_metadata.csv


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.74M/1.74M [00:02<00:00, 681kB/s]


Upload successful: train_metadata.csv (2MB)
Dataset uploaded to Kaggle.


Hugging Face

In [22]:
import datasets

# Only the train and test splits are packaged for the Hugging Face Hub.
hf_splits = {"train": train_data, "test": test_data}

dataset = datasets.DatasetDict({
    split_name: datasets.Dataset.from_pandas(pd.DataFrame(records), split=split_name)
    for split_name, records in hf_splits.items()
})

dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'is_offensive', 'is_targeted', 'targeted_type', 'toxic_spans', 'health', 'ideology', 'insult', 'lgbtqphobia', 'other_lifestyle', 'physical_aspects', 'profanity_obscene', 'racism', 'religious_intolerance', 'sexism', 'xenophobia'],
        num_rows: 4765
    })
    test: Dataset({
        features: ['id', 'text', 'is_offensive', 'is_targeted', 'targeted_type', 'toxic_spans', 'health', 'ideology', 'insult', 'lgbtqphobia', 'other_lifestyle', 'physical_aspects', 'profanity_obscene', 'racism', 'religious_intolerance', 'sexism', 'xenophobia'],
        num_rows: 1589
    })
})

In [23]:
# Push the splits to a private dataset repository on the Hugging Face Hub,
# authenticating with the token read from the application settings.
dataset.push_to_hub(
    repo_id="dougtrajano/olid-br",
    private=True,
    token=args.HUGGINGFACE_HUB_TOKEN)

Pushing split train to the Hub.
Resuming upload of the dataset shards.
Pushing dataset shards to the dataset hub: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 8701.88it/s]
Pushing split test to the Hub.
Resuming upload of the dataset shards.
Pushing dataset shards to the dataset hub: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 6647.07it/s]


Delete local files

In [24]:
import shutil
# Remove the local staging folder used for the Kaggle upload.
# NOTE(review): this raises if the folder is already gone, e.g. when the cell is re-run.
shutil.rmtree(temp_dir)

## Inter-Rater Reliability (IRR) analysis

Also known as inter-rater agreement (IRA) or concordance.

In the next cells, we will perform an agreement analysis to check if the annotations are consistent.

See [Inter-Rater Reliability - OLID-BR](https://dougtrajano.github.io/olid-br/annotation/inter-rater-reliability.html) for more details.

In [25]:
from typing import Any, Dict, List

from irrCAC.raw import CAC
from src.labeling.metrics import percent_agreement
from src.utils import get_annotations_by_rater

import nltk
from nltk.metrics import agreement
from nltk.metrics.agreement import AnnotationTask
from nltk.metrics import masi_distance, jaccard_distance

def get_annotations(
    annotations: List[Dict[str, Any]],
    label: str,
    annotation_id_key: str = "annotator_id") -> Dict[Any, Any]:
    """
    Get each rater's value for a given label.

    Args:
    - annotations: List of annotation dictionaries, one per rater.
    - label: Label to get annotations for.
    - annotation_id_key: Key that holds the rater id in each annotation.

    Returns:
    - Dictionary mapping each rater to the value that rater gave for the
      label. Known rater ids are replaced by a letter (A, B, C); unknown ids
      are kept as they are. Annotations without the label are skipped.

    Raises:
    - KeyError: If an annotation has the label but not `annotation_id_key`.
    """
    # Replace annotator ids to A, B, C, etc.
    # NOTE(review): two ids share the letters A and C; if both ids of a pair
    # annotate the same text, the later one overwrites the earlier one.
    annotators = {
        126: "A",
        127: "B",
        128: "C",
        260: "C",
        504: "A",
    }

    # Collect the label by raw rater id first (the last annotation of a rater
    # wins), then translate the ids, so the result matches the two-step
    # construction this function has always used.
    values_by_rater_id = {
        annotation[annotation_id_key]: annotation[label]
        for annotation in annotations
        if label in annotation
    }

    return {
        annotators.get(rater_id, rater_id): value
        for rater_id, value in values_by_rater_id.items()
    }
### `is_offensive`

In [26]:
# One row per text and one column per rater, holding the `is_offensive` labels.
is_offensive_rows = [
    get_annotations(entry["annotations"], "is_offensive", "annotator_id")
    for entry in full_data
]
is_offensive = pd.DataFrame(is_offensive_rows)

# Preview with the raters as rows.
is_offensive.T.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7933,7934,7935,7936,7937,7938,7939,7940,7941,7942
A,OFF,OFF,OFF,OFF,OFF,NOT,OFF,OFF,OFF,OFF,...,OFF,OFF,OFF,OFF,OFF,NOT,OFF,OFF,OFF,OFF
B,OFF,OFF,OFF,OFF,OFF,NOT,OFF,OFF,NOT,OFF,...,OFF,OFF,NOT,NOT,OFF,NOT,OFF,OFF,NOT,OFF
C,OFF,OFF,OFF,OFF,OFF,NOT,OFF,OFF,OFF,OFF,...,OFF,NOT,NOT,OFF,OFF,NOT,OFF,OFF,NOT,NOT


In [27]:
# Agreement between the raters on `is_offensive`.
cac = CAC(is_offensive)
print("CAC:", cac)

pct_agreement = percent_agreement(cac.ratings)
print(f"Percent agreement: {pct_agreement:.4f}")

kripp_alpha = cac.krippendorff()["est"]["coefficient_value"]
print(f"Krippendorff's alpha: {kripp_alpha:.4f}")

gwet_ac1 = cac.gwet()["est"]["coefficient_value"]
print(f"Gwet's AC1: {gwet_ac1:.4f}")

CAC: <irrCAC.raw.CAC Subjects: 7943, Raters: 3, Categories: ['NOT', 'OFF'], Weights: "identity">
Percent agreement: 0.6641
Krippendorff's alpha: 0.1733
Gwet's AC1: 0.6929


### `is_targeted`

In [28]:
# One row per text and one column per rater, holding the `is_targeted` labels.
is_targeted_rows = [
    get_annotations(entry["annotations"], "is_targeted", "annotator_id")
    for entry in full_data
]
is_targeted = pd.DataFrame(is_targeted_rows)

# Preview with the raters as rows.
is_targeted.T.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7933,7934,7935,7936,7937,7938,7939,7940,7941,7942
A,TIN,TIN,TIN,TIN,TIN,TIN,TIN,TIN,TIN,TIN,...,TIN,UNT,TIN,UNT,TIN,UNT,UNT,UNT,UNT,UNT
B,UNT,UNT,UNT,UNT,TIN,UNT,UNT,UNT,UNT,UNT,...,TIN,TIN,UNT,UNT,TIN,UNT,TIN,TIN,UNT,TIN
C,UNT,TIN,UNT,UNT,UNT,UNT,UNT,TIN,UNT,UNT,...,TIN,UNT,UNT,UNT,TIN,UNT,TIN,TIN,UNT,UNT


In [29]:
# Agreement between the raters on `is_targeted`.
cac = CAC(is_targeted)
print("CAC:", cac)

pct_agreement = percent_agreement(cac.ratings)
print(f"Percent agreement: {pct_agreement:.4f}")

kripp_alpha = cac.krippendorff()["est"]["coefficient_value"]
print(f"Krippendorff's alpha: {kripp_alpha:.4f}")

gwet_ac1 = cac.gwet()["est"]["coefficient_value"]
print(f"Gwet's AC1: {gwet_ac1:.4f}")

CAC: <irrCAC.raw.CAC Subjects: 7943, Raters: 3, Categories: ['TIN', 'UNT'], Weights: "identity">
Percent agreement: 0.3000
Krippendorff's alpha: 0.0355
Gwet's AC1: 0.0960


### `targeted_type`

In [30]:
# One row per text and one column per rater, holding the `targeted_type` labels.
targeted_type_rows = [
    get_annotations(entry["annotations"], "targeted_type", "annotator_id")
    for entry in full_data
]
targeted_type = pd.DataFrame(targeted_type_rows)

# Preview with the raters as rows.
targeted_type.T.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7933,7934,7935,7936,7937,7938,7939,7940,7941,7942
A,IND,GRP,GRP,GRP,GRP,OTH,GRP,GRP,IND,IND,...,OTH,,IND,,IND,,,,,
B,,,,,GRP,,,,,,...,OTH,GRP,,,IND,,GRP,GRP,,GRP
C,,IND,,,,,,IND,,,...,OTH,,,,IND,,IND,GRP,,


In [32]:
# Agreement between the raters on `targeted_type`.
cac = CAC(targeted_type)
print("CAC:", cac)

pct_agreement = percent_agreement(cac.ratings)
print(f"Percent agreement: {pct_agreement:.4f}")

kripp_alpha = cac.krippendorff()["est"]["coefficient_value"]
print(f"Krippendorff's alpha: {kripp_alpha:.4f}")

gwet_ac1 = cac.gwet()["est"]["coefficient_value"]
print(f"Gwet's AC1: {gwet_ac1:.4f}")

CAC: <irrCAC.raw.CAC Subjects: 7322, Raters: 3, Categories: ['GRP', 'IND', 'OTH'], Weights: "identity">
Percent agreement: 0.1505
Krippendorff's alpha: 0.4149
Gwet's AC1: 0.5689


### `toxic_spans`

In [33]:
# One row per text and one column per rater, holding the `toxic_spans` values.
toxic_spans_rows = [
    get_annotations(entry["annotations"], "toxic_spans", "annotator_id")
    for entry in full_data
]
toxic_spans = pd.DataFrame(toxic_spans_rows)

toxic_spans.head()

Unnamed: 0,A,B,C
0,[],"[52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 6...",[]
1,"[20, 21, 22, 23, 24, 25, 93, 94, 95, 96, 97]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",[]
2,"[14, 15, 16, 17, 18]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",[]
3,"[10, 11, 12, 13, 14, 17, 18, 19, 20, 21, 22, 2...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",[]
4,"[10, 11, 12, 13, 14, 165, 166, 167, 176, 177, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[10, 11, 12, 13, 14, 176, 177, 178, 179, 180, ..."


In [34]:
# Build (rater, text position, set of span values) triples for NLTK, skipping
# the cells where a rater marked no span at all.
task_data = [
    (annotator, position, frozenset(spans))
    for annotator in toxic_spans.columns
    for position, spans in enumerate(toxic_spans[annotator])
    if spans != []
]

# Same data scored with two set-based distances.
jaccard_task = AnnotationTask(distance=jaccard_distance)
masi_task = AnnotationTask(distance=masi_distance)

for task in (jaccard_task, masi_task):
    task.load_array(task_data)
    print(f"Krippendorff's alpha using {task.distance}")
    print(f"Krippendorff's alpha: {task.alpha():.4f}", "\n")

print(f"Percent agreement: {percent_agreement(toxic_spans):.4f}")

Krippendorff's alpha using <function jaccard_distance at 0x7f9bb1cad240>
Krippendorff's alpha: 0.5403 

Krippendorff's alpha using <function masi_distance at 0x7f9bb1cad2d0>
Krippendorff's alpha: 0.3918 

Percent agreement: 0.1679


### `health`

In [35]:
# One row per text and one column per rater, holding the `health` labels.
health_rows = [
    get_annotations(entry["annotations"], "health", "annotator_id")
    for entry in full_data
]
health = pd.DataFrame(health_rows)

# Preview with the raters as rows.
health.T.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7933,7934,7935,7936,7937,7938,7939,7940,7941,7942
A,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
B,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
C,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [36]:
# Agreement between the raters on `health`.
cac = CAC(health)
print("CAC:", cac)

pct_agreement = percent_agreement(cac.ratings)
print(f"Percent agreement: {pct_agreement:.4f}")

kripp_alpha = cac.krippendorff()["est"]["coefficient_value"]
print(f"Krippendorff's alpha: {kripp_alpha:.4f}")

gwet_ac1 = cac.gwet()["est"]["coefficient_value"]
print(f"Gwet's AC1: {gwet_ac1:.4f}")

CAC: <irrCAC.raw.CAC Subjects: 7943, Raters: 3, Categories: [False, True], Weights: "identity">
Percent agreement: 0.9748
Krippendorff's alpha: 0.1752
Gwet's AC1: 0.9829


### `ideology`

In [37]:
# One row per text and one column per rater, holding the `ideology` labels.
ideology_rows = [
    get_annotations(entry["annotations"], "ideology", "annotator_id")
    for entry in full_data
]
ideology = pd.DataFrame(ideology_rows)

# Preview with the raters as rows.
ideology.T.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7933,7934,7935,7936,7937,7938,7939,7940,7941,7942
A,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
B,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
C,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [38]:
# Agreement between the raters on `ideology`.
cac = CAC(ideology)
print("CAC:", cac)

pct_agreement = percent_agreement(cac.ratings)
print(f"Percent agreement: {pct_agreement:.4f}")

kripp_alpha = cac.krippendorff()["est"]["coefficient_value"]
print(f"Krippendorff's alpha: {kripp_alpha:.4f}")

gwet_ac1 = cac.gwet()["est"]["coefficient_value"]
print(f"Gwet's AC1: {gwet_ac1:.4f}")

CAC: <irrCAC.raw.CAC Subjects: 7943, Raters: 3, Categories: [False, True], Weights: "identity">
Percent agreement: 0.8255
Krippendorff's alpha: 0.3619
Gwet's AC1: 0.8577


### `insult`

In [39]:
# One row per text and one column per rater, holding the `insult` labels.
insult_rows = [
    get_annotations(entry["annotations"], "insult", "annotator_id")
    for entry in full_data
]
insult = pd.DataFrame(insult_rows)

# Preview with the raters as rows.
insult.T.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7933,7934,7935,7936,7937,7938,7939,7940,7941,7942
A,True,True,False,True,True,True,True,False,False,True,...,True,False,True,False,True,False,True,True,False,True
B,True,True,True,True,False,False,True,True,False,True,...,True,True,False,False,True,False,True,True,False,True
C,True,True,True,True,True,False,True,False,True,True,...,True,False,False,False,True,False,True,True,False,False


In [40]:
# Agreement between the raters on `insult`.
cac = CAC(insult)
print("CAC:", cac)

pct_agreement = percent_agreement(cac.ratings)
print(f"Percent agreement: {pct_agreement:.4f}")

kripp_alpha = cac.krippendorff()["est"]["coefficient_value"]
print(f"Krippendorff's alpha: {kripp_alpha:.4f}")

gwet_ac1 = cac.gwet()["est"]["coefficient_value"]
print(f"Gwet's AC1: {gwet_ac1:.4f}")

CAC: <irrCAC.raw.CAC Subjects: 7943, Raters: 3, Categories: [False, True], Weights: "identity">
Percent agreement: 0.5065
Krippendorff's alpha: 0.2491
Gwet's AC1: 0.4144


### `lgbtqphobia`

In [41]:
# One row per text and one column per rater, holding the `lgbtqphobia` labels.
lgbtqphobia_rows = [
    get_annotations(entry["annotations"], "lgbtqphobia", "annotator_id")
    for entry in full_data
]
lgbtqphobia = pd.DataFrame(lgbtqphobia_rows)

# Preview with the raters as rows.
lgbtqphobia.T.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7933,7934,7935,7936,7937,7938,7939,7940,7941,7942
A,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
B,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
C,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [42]:
# Agreement between the raters on `lgbtqphobia`.
cac = CAC(lgbtqphobia)
print("CAC:", cac)

pct_agreement = percent_agreement(cac.ratings)
print(f"Percent agreement: {pct_agreement:.4f}")

kripp_alpha = cac.krippendorff()["est"]["coefficient_value"]
print(f"Krippendorff's alpha: {kripp_alpha:.4f}")

gwet_ac1 = cac.gwet()["est"]["coefficient_value"]
print(f"Gwet's AC1: {gwet_ac1:.4f}")

CAC: <irrCAC.raw.CAC Subjects: 7943, Raters: 3, Categories: [False, True], Weights: "identity">
Percent agreement: 0.9539
Krippendorff's alpha: 0.5765
Gwet's AC1: 0.9669


### `other_lifestyle`

In [43]:
# One row per text and one column per rater, holding the `other_lifestyle` labels.
other_lifestyle_rows = [
    get_annotations(entry["annotations"], "other_lifestyle", "annotator_id")
    for entry in full_data
]
other_lifestyle = pd.DataFrame(other_lifestyle_rows)

# Preview with the raters as rows.
other_lifestyle.T.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7933,7934,7935,7936,7937,7938,7939,7940,7941,7942
A,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
B,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
C,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [44]:
# Agreement between the raters on `other_lifestyle`.
cac = CAC(other_lifestyle)
print("CAC:", cac)

pct_agreement = percent_agreement(cac.ratings)
print(f"Percent agreement: {pct_agreement:.4f}")

kripp_alpha = cac.krippendorff()["est"]["coefficient_value"]
print(f"Krippendorff's alpha: {kripp_alpha:.4f}")

gwet_ac1 = cac.gwet()["est"]["coefficient_value"]
print(f"Gwet's AC1: {gwet_ac1:.4f}")

CAC: <irrCAC.raw.CAC Subjects: 7943, Raters: 3, Categories: [False, True], Weights: "identity">
Percent agreement: 0.9794
Krippendorff's alpha: 0.3209
Gwet's AC1: 0.9859


### `physical_aspects`

In [45]:
# One row per text and one column per rater, holding the `physical_aspects` labels.
physical_aspects_rows = [
    get_annotations(entry["annotations"], "physical_aspects", "annotator_id")
    for entry in full_data
]
physical_aspects = pd.DataFrame(physical_aspects_rows)

# Preview with the raters as rows.
physical_aspects.T.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7933,7934,7935,7936,7937,7938,7939,7940,7941,7942
A,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
B,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
C,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [46]:
# Agreement between the raters on `physical_aspects`.
cac = CAC(physical_aspects)
print("CAC:", cac)

pct_agreement = percent_agreement(cac.ratings)
print(f"Percent agreement: {pct_agreement:.4f}")

kripp_alpha = cac.krippendorff()["est"]["coefficient_value"]
print(f"Krippendorff's alpha: {kripp_alpha:.4f}")

gwet_ac1 = cac.gwet()["est"]["coefficient_value"]
print(f"Gwet's AC1: {gwet_ac1:.4f}")

CAC: <irrCAC.raw.CAC Subjects: 7943, Raters: 3, Categories: [False, True], Weights: "identity">
Percent agreement: 0.9525
Krippendorff's alpha: 0.3698
Gwet's AC1: 0.9667


### `profanity_obscene`

In [47]:
# One row per text and one column per rater, holding the `profanity_obscene` labels.
profanity_obscene_rows = [
    get_annotations(entry["annotations"], "profanity_obscene", "annotator_id")
    for entry in full_data
]
profanity_obscene = pd.DataFrame(profanity_obscene_rows)

# Preview with the raters as rows.
profanity_obscene.T.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7933,7934,7935,7936,7937,7938,7939,7940,7941,7942
A,True,False,False,False,False,False,False,True,False,False,...,False,False,False,False,True,False,False,True,True,False
B,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
C,False,False,False,False,False,False,True,True,False,False,...,False,False,False,False,True,False,False,False,False,False


In [48]:
# Agreement between the raters on `profanity_obscene`.
cac = CAC(profanity_obscene)
print("CAC:", cac)

pct_agreement = percent_agreement(cac.ratings)
print(f"Percent agreement: {pct_agreement:.4f}")

kripp_alpha = cac.krippendorff()["est"]["coefficient_value"]
print(f"Krippendorff's alpha: {kripp_alpha:.4f}")

gwet_ac1 = cac.gwet()["est"]["coefficient_value"]
print(f"Gwet's AC1: {gwet_ac1:.4f}")

CAC: <irrCAC.raw.CAC Subjects: 7943, Raters: 3, Categories: [False, True], Weights: "identity">
Percent agreement: 0.7079
Krippendorff's alpha: 0.4132
Gwet's AC1: 0.7086


### `racism`

In [49]:
# One row per text and one column per rater, holding the `racism` labels.
racism_rows = [
    get_annotations(entry["annotations"], "racism", "annotator_id")
    for entry in full_data
]
racism = pd.DataFrame(racism_rows)

# Preview with the raters as rows.
racism.T.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7933,7934,7935,7936,7937,7938,7939,7940,7941,7942
A,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
B,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
C,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [50]:
# Agreement statistics for the `racism` ratings.
cac = CAC(racism)
print("CAC:", cac)

# Each statistic is computed right before it is printed, so the order of
# computation and output is unchanged.
agreement_value = percent_agreement(cac.ratings)
print(f"Percent agreement: {agreement_value:.4f}")

alpha_value = cac.krippendorff()["est"]["coefficient_value"]
print(f"Krippendorff's alpha: {alpha_value:.4f}")

ac1_value = cac.gwet()["est"]["coefficient_value"]
print(f"Gwet's AC1: {ac1_value:.4f}")

CAC: <irrCAC.raw.CAC Subjects: 7943, Raters: 3, Categories: [False, True], Weights: "identity">
Percent agreement: 0.9860
Krippendorff's alpha: 0.2928
Gwet's AC1: 0.9906


### `religious_intolerance`

In [51]:
# Call get_annotations() once per entry of full_data, asking for the
# `religious_intolerance` label, and stack the results into a frame
# (one row per entry of full_data).
religious_intolerance_rows = [
    get_annotations(
        annotations=entry["annotations"],
        label="religious_intolerance",
        annotation_id_key="annotator_id",
    )
    for entry in full_data
]
religious_intolerance = pd.DataFrame(religious_intolerance_rows)

# Preview the transposed frame (.T is the accessor form of .transpose()).
religious_intolerance.T.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7933,7934,7935,7936,7937,7938,7939,7940,7941,7942
A,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
B,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
C,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [52]:
# Agreement statistics for the `religious_intolerance` ratings.
# NOTE(review): the recorded output of this cell lists a single category
# ([False]) and is followed by source lines echoed from warnings raised inside
# the coefficient computations, so the 1.0000 values deserve a sanity check
# before being reported - TODO confirm.
cac = CAC(religious_intolerance)
print("CAC:", cac)

# Each statistic is computed right before it is printed, so the order of
# computation and output is unchanged.
agreement_value = percent_agreement(cac.ratings)
print(f"Percent agreement: {agreement_value:.4f}")

alpha_value = cac.krippendorff()["est"]["coefficient_value"]
print(f"Krippendorff's alpha: {alpha_value:.4f}")

ac1_value = cac.gwet()["est"]["coefficient_value"]
print(f"Gwet's AC1: {ac1_value:.4f}")

CAC: <irrCAC.raw.CAC Subjects: 7943, Raters: 3, Categories: [False], Weights: "identity">
Percent agreement: 1.0000
Krippendorff's alpha: 1.0000
Gwet's AC1: 1.0000


  p_value = 2 * (1 - stats.t.cdf(abs(krippen_alpha / stderr), n - 1))
  lower_bound = _a * scale + loc
  upper_bound = _b * scale + loc
  self.z = round(krippen_alpha_est / stderr, self.digits)
  (weights_mat_sum / (self.q * (self.q - 1)))
  ac1_ivec_x = ac1_ivec - 2 * (1 - ac1) * (pe_ivec - pe) / (1 - pe)


### `sexism`

In [53]:
# Call get_annotations() once per entry of full_data, asking for the
# `sexism` label, and stack the results into a frame
# (one row per entry of full_data).
sexism_rows = [
    get_annotations(
        annotations=entry["annotations"],
        label="sexism",
        annotation_id_key="annotator_id",
    )
    for entry in full_data
]
sexism = pd.DataFrame(sexism_rows)

# Preview the transposed frame (.T is the accessor form of .transpose()).
sexism.T.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7933,7934,7935,7936,7937,7938,7939,7940,7941,7942
A,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
B,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
C,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [54]:
# Agreement statistics for the `sexism` ratings.
cac = CAC(sexism)
print("CAC:", cac)

# Each statistic is computed right before it is printed, so the order of
# computation and output is unchanged.
agreement_value = percent_agreement(cac.ratings)
print(f"Percent agreement: {agreement_value:.4f}")

alpha_value = cac.krippendorff()["est"]["coefficient_value"]
print(f"Krippendorff's alpha: {alpha_value:.4f}")

ac1_value = cac.gwet()["est"]["coefficient_value"]
print(f"Gwet's AC1: {ac1_value:.4f}")

CAC: <irrCAC.raw.CAC Subjects: 7943, Raters: 3, Categories: [False, True], Weights: "identity">
Percent agreement: 0.9272
Krippendorff's alpha: 0.1789
Gwet's AC1: 0.9484


### `xenophobia`

In [55]:
# Call get_annotations() once per entry of full_data, asking for the
# `xenophobia` label, and stack the results into a frame
# (one row per entry of full_data).
xenophobia_rows = [
    get_annotations(
        annotations=entry["annotations"],
        label="xenophobia",
        annotation_id_key="annotator_id",
    )
    for entry in full_data
]
xenophobia = pd.DataFrame(xenophobia_rows)

# Preview the transposed frame (.T is the accessor form of .transpose()).
xenophobia.T.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7933,7934,7935,7936,7937,7938,7939,7940,7941,7942
A,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
B,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
C,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [56]:
# Agreement statistics for the `xenophobia` ratings.
cac = CAC(xenophobia)
print("CAC:", cac)

# Each statistic is computed right before it is printed, so the order of
# computation and output is unchanged.
agreement_value = percent_agreement(cac.ratings)
print(f"Percent agreement: {agreement_value:.4f}")

alpha_value = cac.krippendorff()["est"]["coefficient_value"]
print(f"Krippendorff's alpha: {alpha_value:.4f}")

ac1_value = cac.gwet()["est"]["coefficient_value"]
print(f"Gwet's AC1: {ac1_value:.4f}")

CAC: <irrCAC.raw.CAC Subjects: 7943, Raters: 3, Categories: [False, True], Weights: "identity">
Percent agreement: 0.9796
Krippendorff's alpha: 0.1951
Gwet's AC1: 0.9862


### Krippendorff's alpha Multi-Label

In the next cells, we will calculate Krippendorff's alpha treating the annotation as a single multi-label problem instead of several independent binary problems.

In [57]:
# Per-label rating frames, keyed by label name.
# `health` serves below as the reference frame for the annotator list (its
# columns) and the number of items (its length); this assumes every frame in
# `ratings` shares that layout - TODO confirm.
ratings = {
    "health": health,
    "ideology": ideology,
    "insult": insult,
    "lgbtqphobia": lgbtqphobia,
    "other_lifestyle": other_lifestyle,
    "physical_aspects": physical_aspects,
    "profanity_obscene": profanity_obscene,
    "racism": racism,
    "religious_intolerance": religious_intolerance,
    "sexism": sexism,
    "xenophobia": xenophobia
}

annotators = health.columns.tolist()
n_items = len(health)

# Query every (annotator, item) pair exactly once and reuse the result for
# both the alpha computation and the percent agreement below.
# Resulting shape: {annotator: [result for item 0, result for item 1, ...]}.
pa_mlabels = {
    annotator: [
        get_annotations_by_rater(ratings, annotator, item)
        for item in range(n_items)
    ]
    for annotator in annotators
}

# (annotator, item, labels) triples for AnnotationTask.load_array, ordered by
# annotator and then by item. Pairs for which get_annotations_by_rater
# returned an empty list contribute no triple.
task_data = [
    (annotator, item, frozenset(labels))
    for annotator, labels_per_item in pa_mlabels.items()
    for item, labels in enumerate(labels_per_item)
    if labels != []
]

jaccard_task = AnnotationTask(distance=jaccard_distance)
masi_task = AnnotationTask(distance=masi_distance)

for task in [jaccard_task, masi_task]:
    task.load_array(task_data)
    # Print the distance function's name rather than its repr: the repr embeds
    # a memory address that changes on every run.
    print(f"Krippendorff's alpha using {task.distance.__name__}")
    print(f"Krippendorff's alpha: {task.alpha():.4f}", "\n")

# Percent agreement over the per-annotator results (one column per annotator,
# one row per item).
print(f"Percent agreement: {percent_agreement(pd.DataFrame(pa_mlabels)):.4f}")

Krippendorff's alpha using <function jaccard_distance at 0x7f9bb1cad240>
Krippendorff's alpha: 0.3984 

Krippendorff's alpha using <function masi_distance at 0x7f9bb1cad2d0>
Krippendorff's alpha: 0.3648 

Percent agreement: 0.2435
