In [None]:
""" Import packages """

from gensim.models import Word2Vec, KeyedVectors
from gensim.utils import tokenize
from gensim.corpora import Dictionary


import pandas as pd
import numpy as np
import statistics
import re 

import string
from string import punctuation

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import matplotlib.pyplot as plt

In [None]:
""" Import the cleaned dataset and double check that it's dropped duplicates & na's """

# The "clean" column is stored in the CSV as the text of a list; the converter
# strips the surrounding brackets and the quotes and splits on ", " so every
# cell becomes a list of token strings.
data = pd.read_csv(
    'data_cleaned_properly.csv',
    encoding='utf-8',
    converters={"clean": lambda x: x.strip("[]").replace("'", "").split(", ")},
)

# Chain the clean-up steps so each one builds on the previous result; the
# earlier version restarted from `data` on every line and finally rebound `df`
# to the untouched frame, so nothing was actually dropped.
# Duplicates are detected on a string view of the frame because the list-valued
# "clean" column is unhashable and cannot be compared by drop_duplicates().
# NOTE(review): the converter always returns a list, so dropna on "clean"
# probably never removes a row - confirm whether empty token lists should be
# filtered out instead.
df = (
    data.loc[~data.astype(str).duplicated()]
    .dropna(how='any', subset=['Job Description', 'clean'])
    .copy()  # independent frame, so adding columns later does not touch `data`
)

print(df['clean'])

In [None]:
# check shape of dataframe: (number of rows, number of columns)

df.shape

Creating Gender Bias Dictionary 

In [None]:
""" Function for creating gender bias dictionary with assigned gender bias values """

# Kept for backward compatibility: mirrors the result of the most recent call
# to calculate_gender_bias_dictionary.
unique_words_dic = {}


def calculate_gender_bias_dictionary(df_column, w2vmodel, word1, word2):
    """Map every unique word in ``df_column`` to a gender bias score.

    The score of a word is ``similarity(word, word1) - similarity(word, word2)``,
    so positive values lean towards ``word1`` and negative values towards
    ``word2``. Words that are not in the model's vocabulary get the sentinel
    value ``-1000.0``.

    Parameters
    ----------
    df_column : iterable of str
        Words (or space-separated text fragments) taken from the descriptions.
    w2vmodel : object
        Embedding model exposing ``key_to_index`` and ``similarity(a, b)``.
    word1, word2 : str
        The two gender identifier words to compare against.

    Returns
    -------
    dict
        A new ``{word: score}`` dictionary built for this call only. Results of
        earlier calls are neither included nor modified.
    """
    vocabulary = w2vmodel.key_to_index

    # Join and re-split on single spaces so entries holding several
    # space-separated words are broken up before de-duplicating.
    unique_words = set(' '.join(df_column).split(' '))

    bias_by_word = {}
    for word in unique_words:
        if word in vocabulary:
            male_sim = float(w2vmodel.similarity(word, word1))
            female_sim = float(w2vmodel.similarity(word, word2))
            bias_by_word[word] = male_sim - female_sim
        else:
            bias_by_word[word] = -1000.0  # sentinel: word unknown to the model

    # Refresh the legacy module-level dictionary without sharing the object
    # that is handed back to the caller.
    unique_words_dic.clear()
    unique_words_dic.update(bias_by_word)
    return bias_by_word

In [None]:
""" Getting df_column to be  an indexed list of all unique words in dataset """

# One list entry per row of `data`, holding that row's "clean" value.
df_kol = data['clean'].tolist()

In [None]:
# Convert every entry of df_kol with str() and join the pieces with single
# spaces (presumably each entry is a token list, so str() yields its list
# repr - brackets, quotes and commas included).
string1 = ' '.join(map(str, df_kol))

In [None]:
# Delete every character listed in string.punctuation from the joined text.
punctuation_remover = str.maketrans('', '', string.punctuation)
new_string = string1.translate(punctuation_remover)

In [None]:
# Split on single spaces; consecutive spaces therefore yield empty-string entries.
list1 = new_string.split(' ')

In [None]:
# Alias the word list under the name passed to calculate_gender_bias_dictionary.
df_column = list1


In [None]:
""" Specifying model and gender identifier vectors """

# model: vectors stored in binary word2vec format; `limit` caps how many
# vectors are read from the file
w2vmodel = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
    limit=500000,
)

# specify gender identifiers
word1, word2 = "man", "woman"

In [None]:
""" Calling the function """

# Build the {word: bias score} lookup for the whole vocabulary of the dataset.
gender_bias_dict = calculate_gender_bias_dictionary(df_column, w2vmodel, word1, word2)

In [None]:
# checking the type of the value returned by calculate_gender_bias_dictionary
type(gender_bias_dict)

Calculating Gender Bias per Job Description 

In [None]:
""" Defining function that calculates average gender bias for each job description """

def calculate_gender_bias(posting, gender_bias_dict, missing_value=-1000.0):
    """Return the average gender bias of the scored words in one posting.

    Parameters
    ----------
    posting : iterable of str
        The words of a single job description. Note that a plain string would
        be iterated character by character.
    gender_bias_dict : dict
        Mapping ``{word: bias score}``; entries equal to ``missing_value`` mark
        words without a usable score.
    missing_value : float, optional
        Sentinel identifying unscored words (default ``-1000.0``).

    Returns
    -------
    float
        Mean score of all words that have a real score, or ``0.0`` when the
        posting contains no scored word at all.
    """
    gender_bias_total = 0.0
    count = 0
    for word in posting:
        # Words absent from the dictionary are treated like unscored words
        # instead of raising KeyError (the dictionary keys may have been
        # normalised differently from the posting tokens).
        bias = gender_bias_dict.get(word, missing_value)
        if bias == missing_value:
            continue
        gender_bias_total += bias
        count += 1
    return float(gender_bias_total / count) if count else 0.0

In [None]:
# Score every posting's word list, keeping the row order of df['clean'].
gender_bias = [
    calculate_gender_bias(posting_words, gender_bias_dict)
    for posting_words in df['clean']
]

In [None]:
""" Finding mean of overall gender bias in data """ 

# Arithmetic mean of the per-posting scores; statistics.mean raises
# StatisticsError when gender_bias is empty.
mean_bias = statistics.mean(gender_bias)
print(mean_bias)


In [None]:
""" Appending column to dataframe and saving as .csv for further analysis """ 

# One score per row: gender_bias must have exactly as many entries as df has rows.
df["man_woman"] = gender_bias

# saving full dataframe as csv (with default arguments the index is written too)
df.to_csv("dataset_final.csv")

In [None]:
""" Extracting gender bias score for each word in gender bias dictionary and saving as .csv """ 

# orient='index' turns every dictionary key (word) into a row label, with the
# scores in a single column.
pd_bias = pd.DataFrame.from_dict(gender_bias_dict, orient='index')

# save csv with gender scores
pd_bias.to_csv("gender_bias_dict.csv")

Checking Analogies 

In [None]:
# Analogy probe: "man" is to "computer_programmer" as "woman" is to ...?
# NOTE(review): presumably raises KeyError if a key is missing from the loaded vectors - confirm.
vec = w2vmodel["computer_programmer"] - w2vmodel["man"] + w2vmodel["woman"]
# Entries of the model most similar to the composed vector.
w2vmodel.most_similar([vec])

In [None]:
# Analogy probe: "man" is to "king" as "woman" is to ...?
vec = w2vmodel["king"] - w2vmodel["man"] + w2vmodel["woman"]
# Entries of the model most similar to the composed vector.
w2vmodel.most_similar([vec])