# 0. Data download and preparation

See the next notebook for descriptions of the datasets.

After running the commands, move the output dataset files to the appropriate folders before proceeding to the next notebook.

In [None]:
import pandas as pd
from tqdm import tqdm

## 0.1 Corpus datasets for training

### 0.1.1 Wikipedia:
Download from [here](https://rlebret.github.io/wikipedia-biography-dataset/) and follow extraction instructions:

In [None]:
# The download ships as a split archive: glue every part matching `.z??`
# back together into one temporary zip, unpack it into the current
# directory, then delete the temporary file.
!cat rlebret-wikipedia-biography-dataset-d0d6c78/wikipedia-biography-dataset.z?? > tmp.zip
!unzip tmp.zip -d .
!rm tmp.zip

In [None]:
# Concatenate the .sent file of every subdirectory into a single corpus file.
# The files are joined in shell glob order; the .nb counts file is built with
# the same glob pattern, and the merge step further down consumes both files
# in lockstep, so the two orders have to agree.
# The redirect does not create directories: data/corpus/ must already exist.
!cat ./wikipedia-biography-dataset/*/*.sent > data/corpus/dataset_wikibios.txt

In [None]:
# Concatenate the .nb file of every subdirectory into a single file.
# Each of its lines is later parsed as an integer and used as the number of
# consecutive sentence lines to merge into one entry, so the glob order here
# has to match the order used for the .sent files.
# The redirect does not create directories: data/corpus/ must already exist.
!cat ./wikipedia-biography-dataset/*/*.nb > data/corpus/nb_wikibios.txt

In [None]:
# Parse the counts file: one integer per line (int() tolerates the trailing
# newline). Each value is used below as the number of sentence lines that
# make up one merged entry.
with open("data/corpus/nb_wikibios.txt", "r") as counts_file:
    nb = [int(count_line) for count_line in counts_file]

# Number of entries, and the total number of sentence lines they cover.
print(len(nb), sum(nb))

In [None]:
# Rebuild one text line per entry from the one-sentence-per-line corpus:
# for every count n in `nb`, read the next n lines, strip their surrounding
# whitespace and join them with single spaces.
#
# The encoding is pinned to UTF-8 (assumed encoding of the WikiBio dump) so
# that non-ASCII characters are decoded the same way on every platform
# instead of depending on the locale's default encoding.
batch_sentences = []
with open("data/corpus/dataset_wikibios.txt", "r", encoding="utf-8") as f:
    for n in tqdm(nb):
        # readline() returns "" once the file is exhausted, so a count that
        # overruns the file yields empty pieces rather than raising.
        batch_sentences.append(" ".join(f.readline().strip() for _ in range(n)))

In [None]:
# Write the merged entries back out, one entry per line.
# The output encoding is pinned to UTF-8 so the corpus file is identical on
# every platform and never fails on characters that the locale's default
# encoding cannot represent.
with open('data/corpus/dataset_wikibios_merged.txt', 'w', encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in batch_sentences)

### 0.1.2 Doughman dataset:
Download from [here](https://github.com/jaddoughman/Gender-Bias-Datasets-Lexicons/blob/main/generic_pronouns/dataset.csv)

In [None]:
# Load the CSV directly from the raw GitHub URL (needs network access).
# Offline alternative after a manual download:
#     dataset = pd.read_csv("dataset.csv")
dataset = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/jaddoughman/Gender-Bias-Datasets-Lexicons/main/generic_pronouns/dataset.csv"
)

In [None]:
# Export only the "sentence" column, without a header row or index column.
# NOTE(review): this goes through the CSV writer, so sentences containing
# commas or quotes are written quoted; confirm the consumer expects that.
dataset[["sentence"]].to_csv(
    "data/corpus/dataset_doughman.txt",
    header=False,
    index=False,
)

Extract the rows labelled "Negative", which represent stereotypes/bias:

In [None]:
# Boolean-mask selection: keep the rows whose "label" equals "Negative".
df_neg = dataset.loc[dataset["label"].eq("Negative")]

In [None]:
# Export the "sentence" column of the "Negative" subset, without a header
# row or index column.
df_neg[["sentence"]].to_csv(
    "data/corpus/dataset_doughman_stereotype.txt",
    header=False,
    index=False,
)

### 0.1.3 BUG dataset:
Download from [here](https://github.com/SLAB-NLP/BUG) and extract

In [None]:
# Load the full BUG table from the extracted repository checkout.
full_bug = pd.read_csv(filepath_or_buffer="BUG/data/full_BUG.csv")

In [None]:
# Load the gold BUG table from the extracted repository checkout.
gold_bug = pd.read_csv(filepath_or_buffer="BUG/data/gold_BUG.csv")

In [None]:
# Stack the two tables row-wise. Each frame keeps its original row labels
# (no re-indexing), and no de-duplication is performed.
# NOTE(review): if the gold rows also appear in the full table they will be
# present twice here; confirm whether that is intended.
bug = pd.concat(
    [full_bug, gold_bug],
    axis=0,
    ignore_index=False,
)

In [None]:
# Sanity check: the summed row counts of the inputs next to the row count
# of the concatenated frame; the two numbers should be equal.
full_bug.shape[0] + gold_bug.shape[0], bug.shape[0]

In [None]:
# Export only the "sentence_text" column, without a header row or index
# column.
bug[["sentence_text"]].to_csv(
    "data/corpus/dataset_bug.txt",
    header=False,
    index=False,
)

In [None]:
# Select the rows where "stereotype" equals 1 and only the "sentence_text"
# column, then export them without a header row or index column.
bug.loc[bug["stereotype"].eq(1), ["sentence_text"]].to_csv(
    "data/corpus/dataset_bug_stereotype.txt",
    header=False,
    index=False,
)

## 0.2 Lexicon datasets for testing

### 0.2.1 Test lexicon

In [None]:
%%writefile data/lexicons/test_lexicon.csv
word,label
he,1
him,1
his,1
father,1
fatherly,1
male,1
masculine,1
man,1
boy,1
guy,1
she,-1
her,-1
mother,-1
motherly,-1
female,-1
feminine,-1
woman,-1
lady,-1
girl,-1
gal,-1
they,0
their,0
experience,0
grass,0
animal,0
car,0
tree,0
glass,0
water,0
banana,0

### 0.2.2 "Bias" lexicon from psychology + sociology papers

In [None]:
%%writefile data/lexicons/bias_lexicon.csv
word,label
active,1
adventurous,1
aggressive,1
ambitious,1
analytical,1
assertive,1
athletic,1
autonomous,1
battle,1
boast,1
challenge,1
champion,1
competitive,1
confident,1
courageous,1
decide,1
decision,1
decisive,1
defend,1
determined,1
dominate,1
dominant,1
driven,1
fearless,1
fight,1
force,1
greedy,1
headstrong,1
hierarchy,1
hostile,1
impulsive,1
independent,1
individual,1
individualistic,1
intellectual,1
lead,1
logical,1
objective,1
opinion,1
outspoken,1
persist,1
principle,1
reckless,1
self-confident,1
self-reliant,1
self-sufficient,1
defensive,1
assert,1
challenging,1
stubborn,1
superior,1
unreasonable,1
agree,-1
affectionate,-1
caring,-1
child,-1
childlike,-1
cheery,-1
cheerful,-1
collaborative,-1
committed,-1
communal,-1
compassionate,-1
connected,-1
considerate,-1
cooperative,-1
co-operative,-1
depend,-1
emotional,-1
empathetic,-1
feeling,-1
gentle,-1
honest,-1
interpersonal,-1
interdependent,-1
interpersonal,-1
responsive,-1
supporting,-1
trusted,-1
kind,-1
kinship,-1
loyal,-1
modesty,-1
nagging,-1
nurture,-1
pleasant,-1
polite,-1
quiet,-1
respond,-1
sensitive,-1
submissive,-1
support,-1
sympathetic,-1
tender,-1
together,-1
trust,-1
understand,-1
warm,-1
whine,-1
enthusiastic,-1
inclusive,-1
yielding,-1
share,-1
sharing,-1
adaptive,0
adaptable,0
authentic,0
available,0
availability,0
clear,0
concise,0
conscientious,0
conventional,0
courteous,0
curious,0
curiosity,0
delightful,0
easy,0
engaged,0
experienced,0
experience,0
fair,0
flexible,0
flexibility,0
frank,0
friendly,0
global,0
good,0
happy,0
helpful,0
help,0
inefficient,0
integrity,0
likable,0
likeable,0
literate,0
literacy,0
objective,0
open,0
perseverance,0
persistent,0
planning,0
professionally,0
punctual,0
qualified,0
reassure,0
reliable,0
respect,0
respectful,0
responsive,0
rewarding,0
simple,0
sincere,0
tactful,0
tenacious,0
tenaciousness,0
reassurance,0
repetitive,0