# Divide ABC professions
* The code in this script is used to divide the professions in the ABC dataset into two portions: male and female. 

* Firstly, it uses the knowledge from the WinoBias dataset to assign gender to the professions that also appear in that corpus, and for which we therefore know the gender.

* Then a Danish word embedding is used to assign gender to the remaining professions in the ABC corpus.

* It makes use of the following embedding:
https://sprogteknologi.dk/dataset/danish-dsl-and-reddit-word2vec-word-embeddings 

* The script uses a set of definitional gender pairs (e.g. han/hun, mand/kvinde etc.) and computes a PCA to find a "gender direction", which represents the difference between the male and female gender in the word embedding. This approach is inspired by Bolukbasi et al. (2016).

* It then computes the cosine similarity between the professions and this gender direction. 
* The professions that are the most similar to the gender direction are said to be female skewed; the remaining ones are said to be male skewed.
* Using common sense, a decision boundary is picked, and in this way the remaining part of the dataset is divided in two.

In [None]:
#!pip install openpyxl
#!pip install pandas
#!pip install gensim
#!pip install scikit-learn
#!pip install matplotlib

In [None]:
import sys, os
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import json
import matplotlib.pyplot as plt
# The bare "seaborn" style name was deprecated in matplotlib 3.6 (renamed to
# "seaborn-v0_8") and is not available in newer releases. Prefer the new name
# and fall back to the old one so older matplotlib installs keep working.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")


## Use Winobias to assign gender to the professions we know

In [None]:
# Load the ABC professions (one profession per line).
# The file is read explicitly as UTF-8 so that Danish characters (æ, ø, å) are
# decoded the same way on every platform instead of relying on the locale
# default. NOTE(review): assumes the data files are UTF-8 encoded - confirm.
abc_path = os.path.join(os.getcwd(), "..", "data", "abc_all.txt")
with open(abc_path, "r", encoding="utf-8") as f:
    abc_ = f.read().splitlines()

# Remove surrounding whitespace from every entry
abc = [b.strip() for b in abc_]

In [None]:
# Number of entries in the ABC profession list (shown as the cell output)
len(abc)

In [None]:
def _read_lines(filename):
    """Return the lines of ``<cwd>/../data/<filename>`` without line endings.

    The file is opened explicitly as UTF-8 so that Danish characters are
    decoded identically on every platform.
    NOTE(review): assumes the data files are UTF-8 encoded - confirm.
    """
    file_path = os.path.join(os.getcwd(), "..", "data", filename)
    with open(file_path, "r", encoding="utf-8") as handle:
        return handle.read().splitlines()


# Raw lines of the WinoBias female / male profession lists and of the ABC list.
# The ABC list is (re)loaded here so this cell can be run on its own.
wino_female_ = _read_lines("wino_fem.txt")
wino_male_ = _read_lines("wino_male.txt")
abc_ = _read_lines("abc_all.txt")

# Remove surrounding whitespace from every entry
wino_female = [s.strip() for s in wino_female_]
wino_male = [g.strip() for g in wino_male_]
abc = [b.strip() for b in abc_]

In [None]:
# Split the ABC professions that also occur in the WinoBias lists by gender,
# keeping the order in which they appear in the ABC list.
female_abc = []
male_abc = []
for profession in abc:
    if profession in wino_female:
        female_abc.append(profession)
    if profession in wino_male:
        male_abc.append(profession)

# Show both groups, each under its own heading
print("male:", male_abc, sep="\n")

print("female:", female_abc, sep="\n")

In [None]:
# Professions that WinoBias did not assign a gender to.
# The set of already-assigned professions is built once up front, instead of
# concatenating the two lists again for every element of `abc`.
already_assigned = set(female_abc) | set(male_abc)
remaining_ = [i for i in abc if i not in already_assigned]
len(remaining_)

## Divide remaining 41 ABC professions using the word embedding

In [None]:
from gensim import models

# Location of the word vectors, stored in word2vec text format
model_path = "/work/cool-programmer-astrid/dsl_skipgram_2020_m5_f500_epoch2_w5.model.txtvectors"

# Read the vectors from the text file; undecodable bytes are replaced
emb_model = models.KeyedVectors.load_word2vec_format(
    model_path,
    binary=False,
    unicode_errors='replace',
)


In [None]:
# Keep only the professions that the embedding has an entry for
vocabulary = emb_model.key_to_index
remaining = [word for word in remaining_ if word in vocabulary]

# Number of professions before and after the vocabulary filter
print(len(remaining_), len(remaining))

In [None]:
# Professions that were dropped because the embedding has no vector for them
kept = set(remaining)
print("Profession not existing in the word embedding:")
[word for word in remaining_ if word not in kept]


In [None]:
# test that profession word vectors are not just filled with zeroes - but have a vector
#for i in abc:
#    print(emb_model[i][:5])

### Compute the gender direction

In [None]:
# Project root. `path` was never defined in this notebook, which made this cell
# fail with a NameError; the other cells read their data from <cwd>/../data, so
# the root is taken to be the parent of the working directory.
path = os.path.join(os.getcwd(), "..")
definitional_filename = os.path.join(path, "data", "da_definitional_pairs.json")

# Load the definitional gender pairs.
# Opened explicitly as UTF-8 so Danish characters decode the same everywhere.
# NOTE(review): assumes the JSON file is UTF-8 encoded - confirm.
with open(definitional_filename, "r", encoding="utf-8") as f:
    definitional = json.load(f)

In [None]:
def doPCA(pairs, embedding, num_components = 0.95):
    """
    Fit a PCA on the centred vectors of the definitional gender pairs.

    For every pair, the midpoint of the two word vectors is subtracted from
    each of them, and both resulting vectors become rows of the matrix the
    PCA is fitted on.

    pairs:          iterable of two-element word pairs
    embedding:      object that returns a vector when indexed with a word
    num_components: forwarded to PCA as n_components
    Returns the fitted PCA object.
    """
    rows = []
    for first, second in pairs:
        vec_first, vec_second = embedding[first], embedding[second]
        midpoint = (vec_first + vec_second)/2
        # one row per word of the pair, each centred on the pair's midpoint
        rows.extend((vec_first - midpoint, vec_second - midpoint))
    return PCA(n_components = num_components).fit(np.array(rows))
   
def plotPCA(pca, n_components=None, output_dir=None):
    """
    Plot the explained variance ratio of each PCA component and save the figure.

    pca:          fitted PCA object (reads n_components_ and explained_variance_ratio_)
    n_components: unused; kept so existing calls keep working
    output_dir:   folder the figure is written to as "pca_plot.png". When None,
                  the "output" folder under the notebook-level `path` is used if
                  `path` is defined, otherwise "<cwd>/../output".
    """
    if output_dir is None:
        # Honour a notebook-level `path` when one exists; fall back to the parent
        # of the working directory instead of failing with a NameError.
        project_root = globals().get("path", os.path.join(os.getcwd(), ".."))
        output_dir = os.path.join(project_root, "output")
    # savefig does not create missing folders
    os.makedirs(output_dir, exist_ok=True)

    plt.bar(range(pca.n_components_), pca.explained_variance_ratio_, color = "seagreen")
    plt.title("PCA: Gender direction", fontsize=20)
    plt.ylim([0.0, 0.7])
    plt.xlabel("PCA components", fontsize=22)
    plt.ylabel("Explained variance", fontsize=22)
    plt.savefig(os.path.join(output_dir, "pca_plot.png"))
    # report success only after the file has actually been written
    print ("PCA plot saved to output folder")
    

In [None]:
# Fit the PCA on the definitional gender pairs, requesting 10 components
pca = doPCA(definitional, emb_model, num_components=10)
    
# Plot the explained variance per component and save the figure
# (the n_components argument is not used inside plotPCA)
plotPCA(pca, n_components=0.95)

# Take the first principal component as the gender direction
gender_direction = pca.components_[0]

#save gender subspace
#np.savetxt(os.path.join(os.getcwd(), "..", "output", f"{model_alias}_gender_subspace.csv"), gender_direction, delimiter=',')


In [None]:
from scipy.spatial.distance import cdist

# Compare every remaining profession vector with the gender direction.
# NOTE: cdist with the 'cosine' metric returns the cosine *distance*
# (1 - cosine similarity), so smaller values mean the profession vector is
# more closely aligned with the gender direction.
direction_row = gender_direction.reshape(1, -1)
female_sim = [
    cdist(emb_model[word].reshape(1, -1), direction_row, 'cosine')[0][0]
    for word in remaining
]


In [None]:
# Table with one row per profession and its value relative to the gender direction
columns = {'profession': remaining, 'female_sim': female_sim}
df = pd.DataFrame(columns)

# Preview the first five rows
df.head()

### Inspect professions
* for this part we inspected the ranking of professions and made a decision boundary 
* subsequently this splitting was validated manually

In [None]:
# Show the professions ordered by their female_sim value, smallest first
df.sort_values(by='female_sim', ascending=True)

In [None]:
# Manually chosen decision boundary on the female_sim values
decision_boundary = 0.917873

# Professions strictly below the boundary are flagged as female skewed
df['female_skewed'] = df['female_sim'] < decision_boundary
df.head()


In [None]:
# Count the professions on each side of the decision boundary
skew_mask = df['female_skewed']
female_count = len(df.loc[skew_mask, 'profession'])
male_count = len(df.loc[~skew_mask, 'profession'])
print(f"Number of female skewed professions {female_count}")
print(f"Number of male skewed professions {male_count}")

In [None]:
# List the professions flagged as female skewed
female_rows = df.loc[df['female_skewed'], 'profession']
print("Female professions:")
print(female_rows.values)

In [None]:
# List the professions that were not flagged as female skewed
male_rows = df.loc[~df['female_skewed'], 'profession']
print("Male professions:")
print(male_rows.values)

In [None]:
# Combine the professions whose gender is known from WinoBias with the ones
# assigned through the word embedding
is_female = df['female_skewed']
female_professions = female_abc + df.loc[is_female, 'profession'].tolist()
male_professions = male_abc + df.loc[~is_female, 'profession'].tolist()

In [None]:
# Save the two lists of professions, one profession per line.
# The files are written to <cwd>/../data, the same folder every input of this
# notebook is read from (the previous <cwd>/data target was inconsistent with
# those reads). UTF-8 is set explicitly so Danish characters are written the
# same way on every platform.
data_dir = os.path.join(os.getcwd(), "..", "data")

with open(os.path.join(data_dir, 'abc_female.txt'), 'w', encoding="utf-8") as a:
    a.write('\n'.join(female_professions))

with open(os.path.join(data_dir, 'abc_male.txt'), 'w', encoding="utf-8") as b:
    b.write('\n'.join(male_professions))