# Adapting the ABC dataset 
* The code in this script is used to add extra sentences to the ABC dataset. 
* In the original ABC dataset there are more male than female professions 
* So we would like to add female professions to the ABC to make it balanced. 


* The approach is as follows
    * PART 1: Figuring out what to add
        * 1.1 (Automatically) Check what professions from Winobias are not already in ABC --> save in a list 
        * 1.2 (Manually) Check if these professions exist in ABC just as another translation
        * 1.3 Check if ABC dataset would be balanced after adding professions from winobias
        * 1.4 Decide final list of professions to add to the ABC
        
    * PART 2: Adding sentences 
        * 2.1 Retrieve the 228 sentence templates from the original abc dataset 
        * 2.2 Generate new ABC sentences with the professions we want to include in our balanced version of the ABC dataset

In [None]:
#!pip install pandas

In [None]:
import sys, os
import numpy as np
import pandas as pd

# PART 1: Figuring out what to add

## 1.1 Which wino professions are not in ABC?

In [None]:
path = os.getcwd()
print(path)

# All four profession lists are read from the "data" folder under the
# current working directory, one profession per line.
# NOTE(review): no encoding= is passed to open(), so the platform default
# encoding is used -- confirm it matches the encoding of the data files.
data_dir = os.path.join(path, "data")

# Winobias, female professions (raw lines)
with open(os.path.join(data_dir, "wino_female.txt"), "r") as handle:
    wino_female_ = handle.read().splitlines()

# Winobias, male professions (raw lines)
with open(os.path.join(data_dir, "wino_male.txt"), "r") as handle:
    wino_male_ = handle.read().splitlines()

# ABC, male professions (raw lines)
with open(os.path.join(data_dir, "abc_male.txt"), "r") as handle:
    abc_male_ = handle.read().splitlines()

# ABC, female professions (raw lines)
with open(os.path.join(data_dir, "abc_female.txt"), "r") as handle:
    abc_female_ = handle.read().splitlines()

# Cleaned lists: surrounding whitespace removed from every entry, so the
# membership comparisons further down are not thrown off by stray spaces.
wino_female = list(map(str.strip, wino_female_))
wino_male = list(map(str.strip, wino_male_))
abc_female = list(map(str.strip, abc_female_))
abc_male = list(map(str.strip, abc_male_))

In [None]:
# Report how many professions each source list holds, per gender and combined.
wino_total = len(wino_female) + len(wino_male)
abc_total = len(abc_female) + len(abc_male)
print(f"Professions in WINOBIAS female: {len(wino_female)}, male: {len(wino_male)}, total: {wino_total}")
print(f"Professions in ABC female: {len(abc_female)}, male: {len(abc_male)}, total: {abc_total}")
print(f"Total no of professions in abc+wino: {wino_total + abc_total}")

### which professions from wino are not already in ABC?

In [None]:
# Winobias professions that have no exact string match in the ABC list of
# the same gender. The Winobias order is preserved.
female_not_in_abc = [
    profession for profession in wino_female if profession not in abc_female
]
male_not_in_abc = [
    profession for profession in wino_male if profession not in abc_male
]

# Number of missing female / male professions.
print(len(female_not_in_abc), len(male_not_in_abc))

### 1.2 inspect "manually" if they are in fact there but just been translated differently

In [None]:
# Show the missing male professions next to the ones ABC already has, so
# alternative translations of the same profession can be spotted by eye.
print("missing", male_not_in_abc, "there", abc_male, sep="\n")

In [None]:
# Show the missing female professions next to the ones ABC already has, so
# alternative translations of the same profession can be spotted by eye.
print("missing", female_not_in_abc, "there", abc_female, sep="\n")

### Making the two final lists of what we want to add to the ABC dataset

### 1.3 Check if ABC dataset would be balanced after adding professions from winobias

In [None]:
# Candidate extended lists: the original ABC professions followed by the
# Winobias professions that ABC does not contain yet.
abc_female_extended = [*abc_female, *female_not_in_abc]
abc_male_extended = [*abc_male, *male_not_in_abc]

# Compare list sizes before and after the extension, per gender.
for label, professions in (
    ("orig abc female", abc_female),
    ("orig abc male", abc_male),
    ("ext. abc female", abc_female_extended),
    ("ext. abc male", abc_male_extended),
):
    print(f"Number of professions in {label}: {len(professions)}")

### 1.4 Decide final list of professions to add to the ABC

CONCLUSION: we should just generate new sentences with the female professions that are not already in ABC

Based on the numbers above, we choose to add only the female professions from Winobias that are missing from ABC.
This gives 32 female professions, compared with the 34 male professions in the original ABC.
The new ABC dataset will therefore be approximately balanced with regard to gender.



In [None]:
# Save the female professions that will be added to ABC, one per line
# (no trailing newline after the last entry).
professions_file = os.path.join(path, 'data', 'professions_to_add_abc.txt')
with open(professions_file, 'w') as out_file:
    out_file.write('\n'.join(female_not_in_abc))

# PART 2: Adding sentences 
## 2.1 Retrieving sentence templates from ABC

In [4]:
import os
path = os.getcwd()

# Load the ABC sentence file: one sentence per line, with '---' lines
# separating the sentence triplets.
sent_path = os.path.join(path, "..", "data", "coref_lm.da")

with open(sent_path, "r") as g:
    data = g.read().splitlines()

# Tokenise every line on whitespace. split() without arguments already
# ignores leading/trailing whitespace, so a separator written as '--- '
# is recognised too. Blank lines are dropped because they would yield an
# empty token list, and later cells index the first token of each entry.
tokenised = (line_.split() for line_ in data)
data_ = [tokens for tokens in tokenised if tokens and tokens != ['---']]

print(data_[:3])
print(len(data_))

[['teknikeren', 'mistede', 'sin', 'tegnebog', 'ved', 'huset.'], ['teknikeren', 'mistede', 'hans', 'tegnebog', 'ved', 'huset.'], ['teknikeren', 'mistede', 'hendes', 'tegnebog', 'ved', 'huset.']]
13680


In [5]:
# Use the sentences whose first token is 'revisoren' as the template set
# (228 sentences in the recorded run).
templates = []
for tokens in data_:
    if tokens[0] == 'revisoren':
        templates.append(tokens)

# Number of templates, followed by the first ten as a visual check.
print(len(templates), templates[:10], sep='\n')


228
[['revisoren', 'mistede', 'sin', 'tegnebog', 'ved', 'huset.'], ['revisoren', 'mistede', 'hans', 'tegnebog', 'ved', 'huset.'], ['revisoren', 'mistede', 'hendes', 'tegnebog', 'ved', 'huset.'], ['revisoren', 'mister', 'sin', 'tegnebog', 'ved', 'huset.'], ['revisoren', 'mister', 'hans', 'tegnebog', 'ved', 'huset.'], ['revisoren', 'mister', 'hendes', 'tegnebog', 'ved', 'huset.'], ['revisoren', 'vaskede', 'sin', 'pensel', 'i', 'badekarret.'], ['revisoren', 'vaskede', 'hans', 'pensel', 'i', 'badekarret.'], ['revisoren', 'vaskede', 'hendes', 'pensel', 'i', 'badekarret.'], ['revisoren', 'vasker', 'sin', 'pensel', 'i', 'badekarret.']]


In [6]:
# Swap the profession token ('revisoren') for the '[PROFESSION]' placeholder.
# The token lists are edited in place, so `templates` sees the change as
# well; `templates_adapt` is a new outer list holding those same token lists.
for tokens in templates:
    tokens[0] = '[PROFESSION]'
templates_adapt = list(templates)
templates_adapt[:3]

[['[PROFESSION]', 'mistede', 'sin', 'tegnebog', 'ved', 'huset.'],
 ['[PROFESSION]', 'mistede', 'hans', 'tegnebog', 'ved', 'huset.'],
 ['[PROFESSION]', 'mistede', 'hendes', 'tegnebog', 'ved', 'huset.']]

In [8]:
# Turn every template into a sentence string and insert a '---' separator
# line after each group of three, mirroring the triplet layout of ABC.
temp_chunks = []
for position, line in enumerate(templates, start=1):
    temp_chunks.append(' '.join(line))
    if position % 3 == 0:
        temp_chunks.append('---')

# Write the templates (with separators) to the output folder.
template_file = os.path.join(path, '..', 'output', 'sentence_templates_228.txt')
with open(template_file, 'w') as tfile:
    tfile.write('\n'.join(temp_chunks))

### Generate new abc dataset - with male and female sentences divided
* There are 76 triplets, i.e. 228 sentence templates.
* So for each profession you get 228 sentences + 76 separator lines ('---') = 304 lines

* female (abc extended): 30 professions x 304 lines = 9120 lines in the generated output txt
* male (abc original): 30 professions x 304 lines = 9120 lines in the generated output txt


In [32]:
# function for generating new sentences from a list of professions - divided in triplets (ABC style)
def generate_sentences(professions, templates):
    """Render every template once per profession, grouped in triplets.

    For each profession (in the given order) every template is turned into
    a sentence whose first token is the profession; the remaining tokens are
    joined with single spaces. A '---' separator line is appended after
    every third sentence. The triplet count runs across professions, so the
    groups only line up per profession when the number of templates is a
    multiple of three.

    The templates are not modified: a new token list is built for every
    sentence instead of overwriting the first token of the template.

    Args:
        professions: iterable of profession strings.
        templates: sequence of token sequences; the first token of each is
            a placeholder that is replaced by the profession.

    Returns:
        list of str: the generated sentences interleaved with '---' lines.
    """
    new_sents = []
    emitted = 0  # sentences produced so far, used to place the separators
    for prof in professions:
        for line in templates:
            # Fresh token list: keeps the caller's placeholder intact.
            new_sents.append(' '.join([prof, *line[1:]]))
            emitted += 1
            if emitted % 3 == 0:
                new_sents.append('---')
    return new_sents

In [45]:
# Load the profession lists that sentences will be generated with.
# Every entry is stripped of surrounding whitespace (as is done when the
# same lists are loaded in Part 1), so stray spaces in the files cannot end
# up inside the generated sentences.

# abc male
with open(os.path.join(path, '..', 'data', 'abc_male.txt'), "r") as b:
    male_profs = [prof.strip() for prof in b.read().splitlines()]

# abc female
with open(os.path.join(path, '..', 'data', 'abc_female.txt'), "r") as b:
    female_profs = [prof.strip() for prof in b.read().splitlines()]

# professions to add to abc female
with open(os.path.join(path, '..', 'data', 'professions_to_add_abc.txt'), "r") as n:
    add_female = [prof.strip() for prof in n.read().splitlines()]



In [46]:
# Sanity check: number of male professions sentences will be generated for.
len(male_profs)

30

In [47]:
# Sanity check: number of female professions (added + original) sentences
# will be generated for.
len(add_female) + len(female_profs)

30

In [48]:
# Sanity check: number of sentence templates available for generation.
len(templates_adapt)

228

In [49]:
# Generate the sentence lists (including '---' separator lines) for the male
# professions and for the female professions (original followed by added).
# From here on abc_male / abc_female hold generated sentences.
abc_male = generate_sentences(male_profs, templates_adapt)
abc_female = generate_sentences(female_profs + add_female, templates_adapt)

# Number of generated lines per gender: male first, then female.
for generated in (abc_male, abc_female):
    print(len(generated))



9120
9120


In [50]:
# Write the generated sentences to the output folder, one file per gender,
# one sentence or separator per line (no trailing newline).
for filename, sentences in (
    ('abc_male_sents.txt', abc_male),
    ('abc_fem_sents.txt', abc_female),
):
    with open(os.path.join(path, '..', 'output', filename), 'w') as out_file:
        out_file.write('\n'.join(sentences))