# Importing Professions

STEP 1: Importing Professions CSV format

In [1]:
import pandas as pd
import os
os.getcwd()
data = pd.read_csv("myprofessions.csv", encoding="latin1")
print(data.head(4))
# 320 professions.I added she, he at the end for embedding and pojection purposes.

# I only select the professions column I want to use
myprofessions = data.iloc[:,1]
professions = myprofessions.values.tolist()

   Unnamed: 0 myprofessions
0           1    accountant
1           2  acquaintance
2           3         actor
3           4       actress


STEP 2: Some Preprocessing 

In [2]:
#I want to remomve the '_', so that the word for ex, vice_chancellor
# become vice-chancellor. I need the correct format to find it in glove
import re
professions = [re.sub('_', "-", p) for p in professions]

# Mapping Professions to their GloVe embedding

STEP 1: Load embedding 100d 

In [6]:
# load the whole embedding into memory
from numpy import array
from numpy import asarray
embeddings_index = dict()
#first download the glove word emedding and then search for them
f = open('glove.6B.50d.txt',encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


STEP 2: Find Professions in GloVe Embeddings

In [7]:
# Selecting professions and she,he from glove 100d word embedding
professions_embedding = [(key,value) for (key,value) in embeddings_index.items() if key in professions]

# Saving in a CSV Dataframe for future use: 
professions_embedding_df = pd.DataFrame(professions_embedding)
#professions_embedding_df.to_csv(r'professions_embedding_df.csv')
a = (professions_embedding_df.iloc[:,1].values.tolist())
b = pd.DataFrame(a)
c = professions_embedding_df.iloc[:,0]
professions_embedding_df = pd.concat([c,b], axis=1)
#professions_embedding_df.to_csv(r'professions_embedding100d_df.csv')

In [8]:
# Note to self: 299(-2) instead of 327 professions are found
len(professions_embedding_df)

299

# Projecting Professions onto She-He Vector

STEP 1: I normalize the vectors

In [10]:
import numpy as np

def squared (x):
    return(x**2)

def magn(df):
    t = df.apply(squared, axis=1)
    magnitude=np.array(np.sqrt(t.sum(axis=1)))
    return(magnitude)

# Normalizing function
def norm(df):
    t = df.apply(squared, axis=1)
    magnitude=np.array(np.sqrt(t.sum(axis=1)))
    for i in range(0,len(magnitude)):
        df.iloc[i,:] = df.iloc[i,:]/float(magnitude[i])
    return(df)

# Embeddings Normalized:
embedd = professions_embedding_df.iloc[:,1:101]
norm_embedd = pd.concat([professions_embedding_df.iloc[:,0],norm(embedd)], axis=1)

# Professions Subset
professions_embedding = norm_embedd.iloc[2:len(norm_embedd),:]

# he vector
he = norm_embedd.iloc[0:1,:]

# He vector
she = norm_embedd.iloc[1:2,:]

# She-He array:
she_he_arr = np.array(she.iloc[0,1:101])-np.array(he.iloc[0,1:101])

# Extracting profession names:
professions_name = professions_embedding_df.iloc[2:299,0]

STEP 2: Projection of occupation onto She-He vector

In [48]:
# Projection function: Projecting vec onto she_he
def projection(she_he,vec):
    myvec = np.array(vec)
    a = (np.sum(she_he*myvec))*she_he
    return(np.sqrt(np.sum([x**2 for x in a])))

In [50]:
# She-He Projection dataframe
she_he_projection = []
#she_emb = she.iloc[0,1:101]
for i in (range(len(professions_embedding))):
    vec = professions_embedding.iloc[i,1:101]
    she_he_projection.append([projection(she_he_arr, vec)])
    
# Concatenating with Professions
she_he_projection_df = pd.DataFrame((she_he_projection), (professions_name.values.tolist()), columns=['Projection'])

STEP 3: Listing the 15 most Extreme she Occupations: 

In [51]:
print("Extreme she occupations:")
she_he_projection_df.sort_values(by="Projection", ascending=False).head(15)
# 6 out of 10 (from Bolukbasi's paper) match with the 15 most Extreme she occupations

Extreme she occupations:


Unnamed: 0,Projection
actress,0.106859
ballerina,0.104006
stylist,0.098891
socialite,0.096341
waitress,0.095476
coach,0.089692
maid,0.089131
narrator,0.083403
caretaker,0.082049
housewife,0.08195


STEP 4: Listing the 10 most Extreme she Occupations: 

In [52]:
print("Extreme he occupations:")
she_he_projection_df.sort_values(by="Projection", ascending=True).head(15)
# 1 out of 10 (from Bolukbasi's paper) match with the 15 most Extreme he occupations

Extreme he occupations:


Unnamed: 0,Projection
bookkeeper,7.7e-05
clerk,0.000141
bodyguard,0.000262
ambassador,0.000281
pathologist,0.000602
lyricist,0.000694
medic,0.000867
columnist,0.000999
planner,0.001068
magician,0.001432


# Some Notes and Inductions: 
The He extreme occupations from GloVe 100d don't include any of the paper's extreme he occupations, unless we go to Top 25 (then we can find some similarities).
The She extreme occupations from GloVe 100d include most of the paper's top extreme she occupations. 

The paper uses w2vecNEWS trained on Google News. 
We use GloVe 100d, trained on wikipedia articles.

It seems that both embeddings are having similar 'she' bias in occupations. The 'he' bias is less consistent. It would be nice to repeat this process with other embeddings (like GloVe trained on common crawl or twitter) and see if that pattern also exists in those (common she biased occupations are more similar than common he biased occupations).

# Potential Ideas

- Project those 15 extreme occupations onto the she-he space and plot for some visualization
- Project the 10 extreme occupations listed in bolukbasi onto the she-he space and plot for some visualization