#### Let's create a curated dataset from the WinoBias dataset

In [17]:
"""
    Here, we will have the occupations that are available in the dataset
"""

# Relative path of the text file that is loaded into `male_occupations`
# (read line by line via `text_to_list`).
male_path: str = "../misc/corefBias/WinoBias/wino/data/male_occupations.txt"
# Relative path of the text file that is loaded into `female_occupations`
# (read line by line via `text_to_list`).
female_path: str = "../misc/corefBias/WinoBias/wino/data/female_occupations.txt"

def text_to_list(path: str) -> list:
    """Read a text file and return its lines as a list of strings.

    Line terminators are removed from every entry. The resulting list is
    also printed to standard output before it is returned.

    Args:
        path: Location of the text file to read.

    Returns:
        The lines of the file, in order, without line endings.
    """
    with open(path, 'r') as handle:
        contents = handle.read()
    entries = contents.splitlines()
    print(entries)
    return entries
    
# Load both occupation lists at cell/module run time. `text_to_list` also
# prints each list. `female_occupations` is the list `get_the_context`
# consults to decide an occupation's coding.
male_occupations = text_to_list(male_path)
female_occupations = text_to_list(female_path)

['driver', 'supervisor', 'janitor', 'cook', 'mover', 'laborer', 'construction worker', 'chief', 'developer', 'carpenter', 'manager', 'lawyer', 'farmer', 'salesperson', 'physician', 'guard', 'analyst', 'mechanic', 'sheriff', 'CEO']
['attendant', 'cashier', 'teacher', 'nurse', 'assistant', 'secretary', 'auditor', 'cleaner', 'receptionist', 'clerk', 'counselor', 'designer', 'hairdresser', 'writer', 'housekeeper', 'baker', 'accountant', 'editor', 'librarian', 'tailor']


In [25]:
import json
# Gendered pronoun forms.
# NOTE(review): the name looks like a typo for "pronouns", and the list is not
# referenced anywhere in the visible code -- confirm before renaming/removing.
pronounse = ["he", "she", "his", "her", "him", "hers", "himself", "herself"]


def get_the_context(text: str):
    """Parse one tab-separated WinoBias line into its annotated parts.

    The line is expected to hold at least three tab-separated fields
    (index, sentence, label). Inside the sentence, the first
    square-bracketed span is taken as the occupation and the second one as
    the pronoun.

    The occupation coding is decided from a single word of the occupation
    span: the second space-separated word when there is one (skipping a
    leading word such as an article), otherwise the only word. That word is
    lower-cased and looked up in the module-level ``female_occupations``
    list; anything not found there is reported as ``"male-coded"``.

    Args:
        text: One raw line of the TSV file; a trailing newline is allowed.

    Returns:
        A tuple ``(sentence, occupation, pronoun, label, occupation_coding)``
        where ``label`` is the third field as a string with surrounding
        whitespace (such as the line terminator) removed.

    Raises:
        IndexError: If the line has fewer than three tab-separated fields or
            the sentence has fewer than two bracketed spans.
    """
    fields = text.split("\t")
    sentence = fields[1]
    # Drop the line terminator that is still attached to the last column.
    label = fields[2].strip()

    # Pieces after each "[": piece 1 starts the occupation, piece 2 the pronoun.
    bracket_pieces = sentence.split("[")
    occupation = bracket_pieces[1].split("]")[0]
    pronoun = bracket_pieces[2].split("]")[0]

    # Pick the word to look up explicitly instead of relying on a bare
    # `except` around an out-of-range index.
    words = occupation.split(" ")
    key_word = words[1] if len(words) > 1 else words[0]
    if key_word.lower() in female_occupations:
        occupation_coding = "female-coded"
    else:
        occupation_coding = "male-coded"

    return sentence, occupation, pronoun, label, occupation_coding


def create_the_dataset(file_name: str, *,
                       input_dir: str = "../data/WinoBias/new/TSV/",
                       output_dir: str = "../data/WinoBias/new/JSON/"):
    """Convert one WinoBias TSV file into a JSON file of parsed examples.

    Every non-blank line of the input file (columns: index, sentence, label)
    is parsed with ``get_the_context`` and turned into a dict with the keys
    ``sentence``, ``occupation``, ``occupation_coding``, ``pronoun`` and
    ``label`` (the label converted to ``int``). The examples are written as
    ``{"data": [...]}`` to a file in ``output_dir`` whose name is
    ``file_name`` with its final extension replaced by ``.json``.

    Args:
        file_name: Name of the TSV file inside ``input_dir``.
        input_dir: Directory containing the TSV file.
        output_dir: Directory the JSON file is written to; it must exist.

    Raises:
        OSError: If the input file cannot be read or the output file cannot
            be written.
        ValueError: If a label column cannot be converted to ``int``.
    """
    # Local import so this function does not depend on the cell's import list.
    import os

    new_data = []
    with open(os.path.join(input_dir, file_name), "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line) carry no example and
            # would otherwise fail inside get_the_context.
            if not line.strip():
                continue
            sentence, occupation, pronoun, label, occupation_coding = get_the_context(line)
            new_data.append({
                "sentence": sentence,
                "occupation": occupation,
                "occupation_coding": occupation_coding,
                "pronoun": pronoun,
                "label": int(label),
            })

    # "name.split.tsv" -> "name.split.json": only the last extension changes.
    new_file_name = os.path.splitext(file_name)[0] + ".json"
    with open(os.path.join(output_dir, new_file_name), "w", encoding="utf-8") as f:
        json.dump({"data": new_data}, f)
    print("The new dataset has been created and saved as: ", new_file_name)


# Names of the WinoBias TSV files to convert; `run()` passes each one to
# `create_the_dataset`.
file_names = ["anti_stereotyped_type1.dev.tsv", "anti_stereotyped_type1.test.tsv", "anti_stereotyped_type2.dev.tsv", "anti_stereotyped_type2.test.tsv",
              "pro_stereotyped_type1.dev.tsv", "pro_stereotyped_type1.test.tsv", "pro_stereotyped_type2.test.tsv", "pro_stereotyped_type2.dev.tsv"]

def run():
    """Convert every file listed in ``file_names`` with ``create_the_dataset``."""
    for tsv_name in file_names:
        create_the_dataset(tsv_name)

# Kick off the conversion of all listed files when this cell/module executes.
run()

The new dataset has been created and saved as:  anti_stereotyped_type1.dev.json
The new dataset has been created and saved as:  anti_stereotyped_type1.test.json
The new dataset has been created and saved as:  anti_stereotyped_type2.dev.json
The new dataset has been created and saved as:  anti_stereotyped_type2.test.json
The new dataset has been created and saved as:  pro_stereotyped_type1.dev.json
The new dataset has been created and saved as:  pro_stereotyped_type1.test.json
The new dataset has been created and saved as:  pro_stereotyped_type2.test.json
The new dataset has been created and saved as:  pro_stereotyped_type2.dev.json
