# OLID-BR (Build Dataset)

## Imports

In [3]:
import sys
from pathlib import Path

# Make the directory two levels above the notebook's working directory
# importable (presumably the project root that holds the `src` package
# imported in the next cell — confirm against the repository layout).
# The membership check tests the very same path that gets appended, so
# re-running this cell never adds a duplicate entry.
project_root = str(Path(".").absolute().parent.parent)

if project_root not in sys.path:
    sys.path.append(project_root)

In [4]:
from dotenv import load_dotenv

# Initialize the env vars
load_dotenv("../../.env")

import uuid
import pandas as pd
from src.utils import read_json
from src.settings import AppSettings
from src.s3 import Uploader
from src.data_models import Annotator, ProcessedText, Metadata
from src.annotations import (
    get_is_offensive,
    get_is_targeted,
    get_targeted_type,
    get_offensive_type,
    get_toxic_spans
)

# Instantiate the project settings; the upload cells at the end of the
# notebook read the bucket name and the AWS credentials from this object.
args = AppSettings()

# Override the S3 prefix for this notebook; it is later handed to the
# Uploader as `bucket_prefix`.
args.AWS_S3_BUCKET_PREFIX = "processed/pilot"

## Load data

In [3]:
# Read the pilot export. The build cell below iterates over `data` and reads
# the "data" and "annotations" keys of every item.
data = read_json("olid-br_pilot.json")

# Number of items loaded.
print(f"Count: {len(data)}")

Count: 731


In [4]:
def preprocessing_text(text: str):
    """
    Normalize the whitespace of a text.

    Every run of whitespace (new lines included) is collapsed into a single
    space, and leading/trailing whitespace is removed.
    """
    # str.split() without a separator splits on any whitespace run and drops
    # empty leading/trailing pieces, so joining the pieces with single spaces
    # removes new lines, collapses repeated spaces and strips in one pass.
    tokens = text.split()
    return " ".join(tokens)

In [5]:
# ProcessedText field name -> label passed to get_offensive_type for it.
OFFENSIVE_TYPE_LABELS = {
    "health": "Health",
    "ideology": "Ideology",
    "insult": "Insult",
    "lgbtqphobia": "Identity Attack",
    "other_lifestyle": "Other-Lifestyle",
    "physical_aspects": "Body",
    "profanity_obscene": "Profanity",
    "racism": "Racism",
    "religious_intolerance": "Religious intolerance",
    "sexism": "Sexism",
    "xenophobia": "Xenophobia",
}


def _annotator_profile(annotator_id):
    """
    Return the Annotator keyword arguments for a known annotator id.

    Only the ids 1 and 32 are known; any other id yields None.
    """
    if annotator_id == 1:
        return {
            "gender": "Male",
            "age": 28,
            "education_level": "Bachelor's Degree",
            "annotator_type": "Researcher",
        }
    if annotator_id == 32:
        return {
            "gender": "Female",
            "age": 30,
            "education_level": "Bachelor's Degree",
            "annotator_type": "Volunteer",
        }
    return None


texts = []
metadata = []

for item in data:
    # Every text gets a fresh random id; the metadata and annotator records
    # built below are tied to the text through that id.
    record_id = uuid.uuid4().hex
    clean_text = preprocessing_text(item["data"]["text"])
    annotations = item["annotations"]

    processed = ProcessedText(
        id=record_id,
        text=clean_text,
        is_offensive=get_is_offensive(annotations),
        is_targeted=get_is_targeted(annotations),
        targeted_type=get_targeted_type(annotations),
        toxic_spans=get_toxic_spans(annotations),
        **{
            field: get_offensive_type(annotations, label)
            for field, label in OFFENSIVE_TYPE_LABELS.items()
        },
    )

    # Metadata ("category" is optional, the other fields are required)
    meta_info = item["data"]["meta_info"]
    metadata.append(
        Metadata(
            id=processed.id,
            source=meta_info["source"],
            created_at=meta_info["created_at"],
            collected_at=meta_info["collected_at"],
            toxicity_score=meta_info["toxicity_score"],
            category=meta_info.get("category"),
        )
    )

    # Annotators: one record per annotation, stored in the same `metadata`
    # list right after the Metadata record of the text.
    for annotation in annotations:
        completed_by = annotation["completed_by"]
        profile = _annotator_profile(completed_by)
        if profile is None:
            raise ValueError("Invalid annotator ID.", item)
        metadata.append(
            Annotator(id=processed.id, annotator_id=completed_by, **profile)
        )

    texts.append(processed)

def _to_serializable(record):
    """
    Dump a metadata record to a dict via its .dict() method.

    When the dumped record has a "created_at" or "collected_at" field, the
    value is replaced by the result of calling .isoformat() on it.
    """
    dumped = record.dict()
    for field in ("created_at", "collected_at"):
        if field in dumped:
            dumped[field] = dumped[field].isoformat()
    return dumped


# Turn the model instances collected above into plain dicts.
texts = [text.dict() for text in texts]
metadata = [_to_serializable(record) for record in metadata]

print(f"Texts Count: {len(texts)}")
print(f"Metadata Count: {len(metadata)}")


Texts Count: 731
Metadata Count: 1520


## Validate data

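In the next cells, we validate the data: texts with duplicated content are dropped, and texts that contain all of the words `USER`, `HASHTAG` and `URL` are removed.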

In [8]:
# One row per text; when the same text appears more than once, only its
# first occurrence is kept.
df = pd.DataFrame(texts).drop_duplicates(subset=["text"])

print(f"Shape: {df.shape}")
df.head()

Shape: (711, 17)


Unnamed: 0,id,text,is_offensive,is_targeted,targeted_type,toxic_spans,health,ideology,insult,lgbtqphobia,other_lifestyle,physical_aspects,profanity_obscene,racism,religious_intolerance,sexism,xenophobia
0,6391e3dbb12848ac8ef6131edf2d69f9,USER Canalha URL,OFF,TIN,IND,"[5, 6, 7, 8, 9, 10, 11, 12]",False,False,True,False,False,False,False,False,False,False,False
1,5a21508c2afc4c3295137451baf8b981,USER VTNSC FDP,OFF,TIN,IND,"[5, 6, 7, 8, 9, 10, 11, 12, 13]",False,False,True,False,False,False,True,False,False,False,False
2,060b0390f99041cd8cd38ca5a2b66907,USER O partido do Kim votou à favor do fundão....,OFF,TIN,IND,,False,False,True,False,False,False,False,False,False,False,False
3,3264fc0b06974c2daf423a9e5998d003,USER Grande homem. Em um país de covardes. Ete...,OFF,TIN,GRP,"[34, 35, 36, 37, 38, 39, 40, 41]",False,False,True,False,False,False,False,False,False,False,False
4,002bced1b7444833bbe5e054c5aa6512,USER USER QAnon Brasil é um bicho estranho. Ob...,OFF,TIN,IND,"[28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 3...",False,False,True,False,False,False,True,False,False,False,False


In [9]:
# Back to a list of dicts (one per remaining row) now that duplicates are gone.
texts = df.to_dict(orient="records")
print(f"Texts Count: {len(texts)}")

Texts Count: 711


In [10]:
def check_words(text: str, words: list):
    """
    Tell whether every word in `words` occurs in `text` as a substring.

    Returns True when `words` is empty.
    """
    return all(word in text for word in words)

# Drop every record whose text contains all three of these words at once.
FILTER_WORDS = ["USER", "HASHTAG", "URL"]

texts = [
    record
    for record in texts
    if not check_words(record["text"], FILTER_WORDS)
]

print(f"Texts Count: {len(texts)}")

Texts Count: 706


## Upload to S3

In [11]:
# Uploader bound to the bucket and prefix configured in `args`.
uploader = Uploader(bucket=args.AWS_S3_BUCKET,
                    bucket_prefix=args.AWS_S3_BUCKET_PREFIX)

# Upload the validated texts under the key "olidbr.json", authenticating with
# the access/secret key pair from `args`. The call is the last expression of
# the cell, so the notebook displays the response it returns.
uploader.upload_aksk(access_key=args.AWS_ACCESS_KEY_ID,
                     secret_key=args.AWS_SECRET_ACCESS_KEY,
                     key="olidbr.json", data=texts)

{'ResponseMetadata': {'RequestId': '2WN3Y4QTQ1QW37M1',
  'HostId': 'YYwdJTNRk9gPkDOwzrQsaCkumf1fWHSu8hUB6yMtuu1yVJyrLxkzYdyi3koLOHknOPq+d5/CAG0=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'YYwdJTNRk9gPkDOwzrQsaCkumf1fWHSu8hUB6yMtuu1yVJyrLxkzYdyi3koLOHknOPq+d5/CAG0=',
   'x-amz-request-id': '2WN3Y4QTQ1QW37M1',
   'date': 'Fri, 04 Feb 2022 03:01:01 GMT',
   'x-amz-version-id': 'rgjB7ewBDPakKt8YwkfnUzdAiobpIsa6',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"ffa1ae4f738ce8f7983d93f61c7046ac"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"ffa1ae4f738ce8f7983d93f61c7046ac"',
 'ServerSideEncryption': 'AES256',
 'VersionId': 'rgjB7ewBDPakKt8YwkfnUzdAiobpIsa6'}

In [9]:
# Upload the metadata and annotator records under the key "metadata.json".
# NOTE(review): `metadata` is built before the validation cells and is never
# filtered afterwards, so it still holds records whose ids belong to texts
# removed as duplicates or by the word filter — confirm this is intended.
uploader.upload_aksk(access_key=args.AWS_ACCESS_KEY_ID,
                     secret_key=args.AWS_SECRET_ACCESS_KEY,
                     key="metadata.json", data=metadata)

{'ResponseMetadata': {'RequestId': '7B0C9CMX1RPCSKEG',
  'HostId': 'u/AQWjaLmY5p/pWIvT7LEvyLoMp6ygvExYhy3/G7zJpbdqlCvy04zCULqx2N4CfWzFiyB1slWFA=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'u/AQWjaLmY5p/pWIvT7LEvyLoMp6ygvExYhy3/G7zJpbdqlCvy04zCULqx2N4CfWzFiyB1slWFA=',
   'x-amz-request-id': '7B0C9CMX1RPCSKEG',
   'date': 'Mon, 24 Jan 2022 02:30:17 GMT',
   'x-amz-version-id': '1VhIUK3e4WlGnfqwj84lg4X1TV6tkL3e',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"e8ad2844319dc1a02d9ee97639f11cb4"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"e8ad2844319dc1a02d9ee97639f11cb4"',
 'ServerSideEncryption': 'AES256',
 'VersionId': '1VhIUK3e4WlGnfqwj84lg4X1TV6tkL3e'}