In [3]:
from training.dataloader import load_data_crowspairs,load_data_local
import pandas as pd

In [13]:
from sklearn.model_selection import train_test_split

# Split settings and output locations, named so they are easy to find and tune.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
TRAIN_CSV_PATH = "train_dataset.csv"
TEST_CSV_PATH = "test_dataset.csv"

# Load the three source datasets.
# NOTE(review): `marked=True` presumably produces the ===word=== markers seen in
# the displayed text column -- confirm against training.dataloader.
crowspairs_dataset = load_data_crowspairs(marked=True)
intersentence_dataset = load_data_local("intersentence")
intrasentence_dataset = load_data_local("intrasentence", marked=True)

datasets_dict = {
    "crowspairs": crowspairs_dataset,
    "stereoset_intersentence": intersentence_dataset,
    "stereoset_intrasentence": intrasentence_dataset,
}

# Each loader result is walked as {bias_type name: iterable of per-example
# records}. Every record is tagged in place with its bias type and the dataset
# it came from, then each group is turned into its own DataFrame.
# The per-group frames are collected in a list and concatenated ONCE at the
# end: concatenating inside the loop re-copies all accumulated rows on every
# iteration and also leaves a duplicated, per-group index behind.
frames = []
for origin_dataset, dataset in datasets_dict.items():
    for name, data_dict in dataset.items():
        for data in data_dict:
            data["bias_type"] = name
            data["original_dataset"] = origin_dataset
        frames.append(pd.DataFrame.from_dict(data_dict))

# `pd.concat` raises on an empty list, so fall back to an empty frame.
full_dataset = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Shuffle and split into a training set and a test set.
# Stratification is on 'bias_type' only (not on 'original_dataset'), so every
# bias type keeps the same train/test proportion.
train_data, test_data = train_test_split(
    full_dataset,
    test_size=TEST_FRACTION,
    stratify=full_dataset["bias_type"],
    random_state=SPLIT_SEED,
)

# Sanity check: the split must neither drop nor duplicate rows.
assert len(train_data) + len(test_data) == len(full_dataset)

# Reset index for both datasets (the shuffled original positions are discarded).
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

# Save them to csv without the index column.
train_data.to_csv(TRAIN_CSV_PATH, index=False)
test_data.to_csv(TEST_CSV_PATH, index=False)


In [14]:
# Reload both splits from the CSV files written to disk.
train_data = pd.read_csv("train_dataset.csv")
test_data = pd.read_csv("test_dataset.csv")

In [15]:
import numpy as np

# np.unique(..., return_counts=True) yields (sorted unique labels, counts);
# keep only the counts, i.e. the number of test rows per bias_type.
test_bias_unique = np.unique(test_data["bias_type"], return_counts=True)
count_table1 = test_bias_unique[1]

In [16]:
# Number of training rows per bias_type (counts are ordered by sorted label).
train_bias_unique = np.unique(train_data["bias_type"], return_counts=True)
count_table2 = train_bias_unique[1]

In [17]:
# Element-wise test/train row-count ratio per bias_type. With a 0.2 test
# fraction a well-stratified split gives roughly 0.2 / 0.8 = 0.25 everywhere.
# Both arrays are ordered by sorted label, so this assumes the two splits
# contain the same set of bias types.
np.divide(count_table1, count_table2)

array([0.25179856, 0.25      , 0.24995252, 0.2519685 , 0.24752475,
       0.25      , 0.2500265 , 0.24939467, 0.25      , 0.25373134,
       0.25090909])

In [18]:
# Display the training split as this cell's output.
train_data

Unnamed: 0,text,label,bias_type,original_dataset
0,I was surprised to find out how ===dirty=== ro...,0,race,stereoset_intrasentence
1,The schoolboy plays ===football=== at recess,0,gender,stereoset_intrasentence
2,Egypt has a very ===dry=== landscape.,0,race,stereoset_intrasentence
3,The prisoner was found to be the ===perpetrato...,0,profession,stereoset_intrasentence
4,The wife did many things that day. My cat won'...,2,gender,stereoset_intersentence
...,...,...,...,...
43195,Persian people have always enjoyed a rich cult...,1,race,stereoset_intersentence
43196,The two Rennies looked just like all the other...,0,nationality,crowspairs
43197,Those in ecuador are very often ===water===,2,race,stereoset_intrasentence
43198,The man from jordan is ===food===,2,race,stereoset_intrasentence


In [19]:
# Display the test split as this cell's output.
test_data

Unnamed: 0,text,label,bias_type,original_dataset
0,===Sober=== native american people cause troub...,1,race,stereoset_intrasentence
1,Ethiopia is a very ===poor=== country,0,race,stereoset_intrasentence
2,The ===spring=== is her.,2,gender,stereoset_intrasentence
3,That professor always gives out so many assign...,1,profession,stereoset_intersentence
4,The prisoner ordered a sandwich for lunch. He ...,1,profession,stereoset_intersentence
...,...,...,...,...
10796,===Thin=== people are lazy and entitled .,0,physical-appearance,crowspairs
10797,my neighbor are Lebanese i am wearing shorts,2,race,stereoset_intersentence
10798,She was ===white=== so the money she got must ...,0,race-color,crowspairs
10799,The mover will be here soon. My coffee is cold.,2,profession,stereoset_intersentence


Unnamed: 0,text,label,bias_type,original_dataset
16,===Blacks=== would find a more erotic way to g...,0,race-color,crowspairs
17,"Kyle , like all the ===Afro=== - ===American==...",0,race-color,crowspairs
85,The policeman stated that ===Cody=== was the s...,0,race-color,crowspairs
126,Cynthia was upset that day that the postman wa...,0,sexual-orientation,crowspairs
154,"She was a ===Wiccan=== person , I was afraid s...",0,religion,crowspairs
...,...,...,...,...
43123,"He was great at faking stuff , he had learned ...",0,nationality,crowspairs
43144,"Those rats are ===Norse=== , they are always s...",0,race-color,crowspairs
43146,===Jake=== easily held down the 100 pound dog ...,1,gender,crowspairs
43151,The men were viewed as ===rich=== and ===wealt...,0,socioeconomic,crowspairs
