# OLID-BR (Build Dataset)

In this notebook, we will build the OLID-BR dataset from the processed data.

## Imports

In [2]:
import sys
from pathlib import Path

if str(Path(".").absolute().parent) not in sys.path:
    sys.path.append(str(Path(".").absolute().parent.parent))

In [3]:
from dotenv import load_dotenv

# Initialize the env vars
load_dotenv("../../.env")

True

In [4]:
import os
import json
import datetime
import pandas as pd
from pandas_profiling import ProfileReport
from kaggle.api.kaggle_api_extended import KaggleApi
from src.s3 import Bucket
from src.settings import AppSettings

In [5]:
args = AppSettings()

version = "v0.4-alpha"

bucket = Bucket(args.AWS_S3_BUCKET)

bucket.get_session_from_aksk(
    args.AWS_ACCESS_KEY_ID,
    args.AWS_SECRET_ACCESS_KEY)

## Load data

In the next cells, we will load all processed data.

In [59]:
iterations = [
    # {
    #     "data": "processed/olid-br/iterations/1/olidbr.json",
    #     "metadata": "processed/olid-br/iterations/1/metadata.json"
    # },
    {
        "data": "processed/olid-br/iterations/2/olidbr.json",
        "metadata": "processed/olid-br/iterations/2/metadata.json",
        "full_data": "processed/olid-br/iterations/2/full_olidbr.json"
    },
    {
        "data": "processed/olid-br/iterations/3/olidbr.json",
        "metadata": "processed/olid-br/iterations/3/metadata.json",
        "full_data": "processed/olid-br/iterations/3/full_olidbr.json"
    },
    {
        "data": "processed/olid-br/iterations/4/olidbr.json",
        "metadata": "processed/olid-br/iterations/4/metadata.json",
        "full_data": "processed/olid-br/iterations/4/full_olidbr.json"
    }
]

In [60]:
data = []
metadata = []
full_data = []

for iteration in iterations:
    print(f"Loading {iteration['data']}")

    iteration_data = bucket.download_json(key=iteration["data"])
    iteration_metadata = bucket.download_json(key=iteration["metadata"])

    data.extend(iteration_data)
    metadata.extend(iteration_metadata)
    
    print(f"Data iteration size: {len(iteration_data)}")
    print(f"Metadata iteration size: {len(iteration_metadata)}")

    if iteration.get("full_data"):
        iteration_full_data = bucket.download_json(key=iteration["full_data"])
        full_data.extend(iteration_full_data)
        print(f"Full data iteration size: {len(iteration_full_data)}")

print(f"Data: {len(data)}")
print(f"Metadata: {len(metadata)}")
print(f"Full data: {len(full_data)}")

Loading processed/olid-br/iterations/2/olidbr.json
Data iteration size: 2996
Metadata iteration size: 11984
Full data iteration size: 2996
Loading processed/olid-br/iterations/3/olidbr.json
Data iteration size: 2987
Metadata iteration size: 11948
Full data iteration size: 2987
Loading processed/olid-br/iterations/4/olidbr.json
Data iteration size: 1851
Metadata iteration size: 7404
Full data iteration size: 1851
Data: 7834
Metadata: 31336
Full data: 7834


## Data processing

In this section, we will perform some data processing in order to clean and fix some issues in the dataset.

### Remove duplicated entries

In [61]:
df = pd.DataFrame(data)
print(f"Duplicated text: {df['text'].duplicated().sum()}")

df.drop_duplicates(subset="text", inplace=True)
print(df.shape)

data = df.to_dict("records")

Duplicated text: 34
(7800, 17)


In [62]:
# Remove duplicated texts from full data
full_data = [i for i in full_data if i["text"] in df["text"].values]

print(f"Full data: {len(full_data)}")

Full data: 7834


In [63]:
# Remove duplicated texts from metadata
print(f"Count metadata (before): {len(metadata)}")
metadata = [i for i in metadata if i["id"] in df["id"].values]
print(f"Count metadata (after): {len(metadata)}")

Count metadata (before): 31336
Count metadata (after): 31200


## Profiling Report

In this section, we will generate a profiling report for the dataset.

In [64]:
profile = ProfileReport(
    pd.DataFrame(data),
    title=f"OLID-BR {version}",
    explorative=True)

profile.to_file(f"../../docs/reports/olidbr_{version}.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## Split into train, validation and test sets

- **Training set**: 60% of the dataset.
- **Validation set**: 20% of the dataset.
- **Test set**: 20% of the dataset.

Convert `is_offensive`, `is_targeted`, and `targeted_type` categories values to int

In [65]:
# is_offensive
df["is_offensive"] = df["is_offensive"].replace({
    "OFF": 1,
    "NOT": 0
})

# is_targeted
df["is_targeted"] = df["is_targeted"].replace({
    "TIN": 1,
    "UNT": 0
})

# targeted_type
df["targeted_type"].fillna(0, inplace=True)
df["targeted_type"] = df["targeted_type"].replace({
    "IND": 1,
    "GRP": 2,
    "OTH": 3
})

In [66]:
params = {
    "seed": 1993,
    "test_size": 0.2,
    "val_size": 0.25 # 0.25 x 0.8 = 0.2
}

labels = [
    "is_offensive",
    "is_targeted",
    "targeted_type",
    "health",
    "ideology",
    "insult",
    "lgbtqphobia",
    "other_lifestyle",
    "physical_aspects",
    "profanity_obscene",
    "racism",
    "sexism",
    "xenophobia"
]

X = df[["id", "text"]].values
y = df[labels].astype(int).values

In [67]:
from src.modeling.selection import multilabel_train_test_split

X_train, X_test, y_train, y_test = multilabel_train_test_split(
    X, y, test_size=params["test_size"],
    random_state=params["seed"], stratify=y)

X_train, X_val, y_train, y_val = multilabel_train_test_split(
    X_train, y_train, test_size=params["val_size"],
    random_state=params["seed"], stratify=y_train)

print(f"Train: {X_train.shape}")
print(f"Validation: {X_val.shape}")
print(f"Test: {X_test.shape}")

Train: (4680, 2)
Validation: (1560, 2)
Test: (1560, 2)


In [68]:
train_data = [item for item in data if item["id"] in X_train[:, 0]]
val_data = [item for item in data if item["id"] in X_val[:, 0]]
test_data = [item for item in data if item["id"] in X_test[:, 0]]

train_metadata = [item for item in metadata if item["id"] in [i["id"] for i in train_data]]
val_metadata = [item for item in metadata if item["id"] in [i["id"] for i in val_data]]
test_metadata = [item for item in metadata if item["id"] in [i["id"] for i in test_data]]

full_train_data = [item for item in full_data if item["id"] in X_train[:, 0]]
full_val_data = [item for item in full_data if item["id"] in X_val[:, 0]]
full_test_data = [item for item in full_data if item["id"] in X_test[:, 0]]

print(f"Train: {len(train_data)}")
print(f"Validation: {len(val_data)}")
print(f"Test: {len(test_data)}")

print(f"Full train: {len(full_train_data)}")
print(f"Full validation: {len(full_val_data)}")
print(f"Full test: {len(full_test_data)}")

Train: 4680
Validation: 1560
Test: 1560
Full train: 4680
Full validation: 1560
Full test: 1560


## Upload data to S3

In [70]:
files = {
    "train.csv": pd.DataFrame(train_data),
    "validation.csv": pd.DataFrame(val_data),
    "test.csv": pd.DataFrame(test_data),
    "train_metatada.csv": pd.DataFrame(train_metadata),
    "validation_metatada.csv": pd.DataFrame(val_metadata),
    "test_metatada.csv": pd.DataFrame(test_metadata),
    "train.json": full_train_data,
    "validation.json": full_val_data,
    "test.json": full_test_data
}

for file, obj in files.items():
    print(f"Saving {file}", end="")
    if file.endswith(".csv"):
        bucket.upload_csv(
            data=obj,
            key=f"processed/olid-br/{version}/{file}")
    elif file.endswith(".json"):
        bucket.upload_json(
            data=obj,
            key=f"processed/olid-br/{version}/{file}")
    else:
        raise ValueError("Invalid file format")
    print(" - Done")

print("All files uploaded.")

Saving train.csv - Done
Saving validation.csv - Done
Saving test.csv - Done
Saving train_metatada.csv - Done
Saving validation_metatada.csv - Done
Saving test_metatada.csv - Done
Saving train.json - Done
Saving validation.json - Done
Saving test.json - Done
All files uploaded.


## Upload dataset to Kaggle and Hugging Face

Kaggle

In [None]:
date = datetime.datetime.now().strftime('%Y-%m-%d')

temp_dir = "temp"

In [76]:
private_files = [
    "validation.csv",
    "validation_metatada.csv",
    "validation.json"
]

dataset_metadata = {
    "id": "dougtrajano/olidbr",
    "licenses": [{"name": "CC0-1.0"}],
    "title": "OLID-BR"
}

os.makedirs(temp_dir, exist_ok=True)
if not os.path.exists(f"{temp_dir}/dataset-metadata.json"):
    with open(os.path.join(temp_dir, "dataset-metadata.json"), "w") as f:
        json.dump(dataset_metadata, f, indent=4)

for file, obj in files.items():
    if file.endswith(".csv") and file not in private_files:
        obj.to_csv(f"{temp_dir}/{file}", index=False, encoding="utf-8")
    elif file.endswith(".json") and file not in private_files:
        with open(f"{temp_dir}/{file}", "w") as f:
            json.dump(obj, f, indent=4)

In [78]:
api = KaggleApi()
api.authenticate()

api.dataset_create_version(
    folder=temp_dir,
    version_notes=f"OLID-BR {version} - {date}",
    convert_to_csv=False
)

print("Dataset uploaded to Kaggle.")

Starting upload for file test.csv


100%|██████████| 493k/493k [00:02<00:00, 171kB/s]  


Upload successful: test.csv (493KB)
Starting upload for file test.json


100%|██████████| 4.78M/4.78M [00:02<00:00, 1.81MB/s]


Upload successful: test.json (5MB)
Starting upload for file test_metatada.csv


100%|██████████| 590k/590k [00:02<00:00, 273kB/s]  


Upload successful: test_metatada.csv (590KB)
Starting upload for file train.csv


100%|██████████| 1.45M/1.45M [00:02<00:00, 633kB/s] 


Upload successful: train.csv (1MB)
Starting upload for file train.json


100%|██████████| 14.6M/14.6M [00:03<00:00, 5.01MB/s]


Upload successful: train.json (15MB)
Starting upload for file train_metatada.csv


100%|██████████| 1.73M/1.73M [00:02<00:00, 694kB/s] 


Upload successful: train_metatada.csv (2MB)
Dataset uploaded to Kaggle.


Hugging Face

In [89]:
import datasets

dataset = datasets.dataset_dict.DatasetDict({
    "train": datasets.Dataset.from_pandas(pd.DataFrame(train_data), split="train"),
    "test": datasets.Dataset.from_pandas(pd.DataFrame(test_data), split="test")
})

dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'is_offensive', 'is_targeted', 'targeted_type', 'toxic_spans', 'health', 'ideology', 'insult', 'lgbtqphobia', 'other_lifestyle', 'physical_aspects', 'profanity_obscene', 'racism', 'religious_intolerance', 'sexism', 'xenophobia'],
        num_rows: 4680
    })
    test: Dataset({
        features: ['id', 'text', 'is_offensive', 'is_targeted', 'targeted_type', 'toxic_spans', 'health', 'ideology', 'insult', 'lgbtqphobia', 'other_lifestyle', 'physical_aspects', 'profanity_obscene', 'racism', 'religious_intolerance', 'sexism', 'xenophobia'],
        num_rows: 1560
    })
})

In [93]:
dataset.push_to_hub(
    repo_id="dougtrajano/olid-br",
    private=True,
    token=args.HUGGINGFACE_HUB_TOKEN)

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split test to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Delete local files

In [94]:
import shutil
shutil.rmtree(temp_dir)