In [1]:
import gensim.downloader
from gensim.models import Word2Vec
import copy
import numpy as np
import random
from scipy.stats import spearmanr

_glove_model = gensim.downloader.load('glove-wiki-gigaword-50')

In [39]:
careers = ['technician', 'accountant', 'supervisor', 'engineer', 'worker', 'educator', 'clerk', 'counselor', 'inspector', 'mechanic', 'manager', 'therapist', 'administrator', 'salesperson', 'receptionist', 'librarian', 'advisor', 'pharmacist', 'janitor', 'psychologist', 'physician', 'carpenter', 'nurse', 'investigator', 'bartender', 'specialist', 'electrician', 'officer', 'pathologist', 'teacher', 'lawyer', 'planner', 'practitioner', 'plumber', 'instructor', 'surgeon', 'veterinarian', 'paramedic', 'examiner', 'chemist', 'machinist', 'appraiser', 'nutritionist', 'architect', 'hairdresser', 'baker', 'programmer', 'paralegal', 'hygienist', 'scientist']
female_attributes = ['female', 'woman', 'girl', 'sister', 'she', 'her', 'hers', 'daughter']
male_attributes = ['male', 'man', 'boy', 'brother', 'he', 'him', 'his', 'son']

# Função para carregar o dataset de pares de palavras
def load_dataset(filepath):
    word_pairs = []
    human_scores = []
    with open(filepath, 'r') as file:
        for line in file:
            parts = line.strip().split()
            if len(parts) == 3:
                word1, word2, score = parts
                score = float(score)
                word_pairs.append((word1.split('-')[0], word2.split('-')[0]))
                human_scores.append(score)
    return word_pairs, human_scores

filepath = 'MEN_dataset_lemma_form.dev'
word_pairs, human_scores = load_dataset(filepath)

men_words = []
for word1, word2 in word_pairs:
    men_words.append(word1)
    men_words.append(word2)

all_words = list(set(female_attributes + male_attributes + careers + men_words))

# Reduzir o modelo GloVe para o subconjunto relevante
word_vectors = {word: _glove_model[word] for word in all_words if word in _glove_model}
vectors = np.array([word_vectors[word] for word in word_vectors.keys()])
words = list(word_vectors.keys())
reduced_glove = Word2Vec(vector_size=50, min_count=1)
reduced_glove.build_vocab([words])
reduced_glove.wv.vectors = vectors
reduced_glove.wv.index_to_key = words
reduced_glove.wv.key_to_index = {word: idx for idx, word in enumerate(words)}

POP_SIZE = 50
GEN_MAX = 3
MUT_RATE = 0.2
CROSS_RATE = 0.6

def noise_vec(_size=50):
    return np.random.uniform(-2, 2, size=_size)

def init_population():
    population = np.ones((POP_SIZE - 1, 50)) + np.array([noise_vec() for _ in range(POP_SIZE - 1)])
    population = np.vstack([population, np.ones(50)])
    np.random.shuffle(population)
    return population

def crossover(parent1, parent2):
    point1, point2 = sorted(random.sample(range(len(parent1)), 2))
    child1 = np.concatenate((parent1[:point1], parent2[point1:point2], parent1[point2:]))
    child2 = np.concatenate((parent2[:point1], parent1[point1:point2], parent2[point2:]))
    return child1, child2

def mutate(child):
    if random.random() < MUT_RATE:
        child += noise_vec()
    return child

#modificar para seleção por dominância
def selection(population, costs, _k=2):
    return random.choices(population, k=_k)

In [40]:
def wordembedding_similarity(word_pairs, model):
    embedding_similarities = []
    for word1, word2 in word_pairs:
        if word1 in model and word2 in model:
            similarity = model.similarity(word1, word2)
        else:
            similarity = 0.0
        embedding_similarities.append(similarity)
    return embedding_similarities

def calculate_costs(s, word_pairs, human_scores):
    glove_model_modified = copy.copy(reduced_glove.wv)
    
    for _word in all_words:
        glove_model_modified[_word] = np.multiply(glove_model_modified[_word], s)
        
    differences = [
        abs(
            sum(glove_model_modified.similarity(career, attr) for attr in female_attributes) / len(female_attributes) - 
            sum(glove_model_modified.similarity(career, attr) for attr in male_attributes) / len(male_attributes)
        )
        for career in careers
    ]
    bias = sum(differences)
    
    embedding_similarities = wordembedding_similarity(word_pairs, glove_model_modified)
    spearman_corr, _ = spearmanr(human_scores, embedding_similarities)
    cost_spearman = 1 - spearman_corr
    
    return bias, cost_spearman

In [41]:
def fast_non_dominated_sort(population, objectives):
    ranks = np.zeros(len(population))
    domination_count = np.zeros(len(population))
    domination_set = {i: [] for i in range(len(population))}
    fronts = [[]]
    
    for p in range(len(population)):
        for q in range(len(population)):
            if all(obj_p <= obj_q for obj_p, obj_q in zip(objectives[p], objectives[q])) and any(obj_p < obj_q for obj_p, obj_q in zip(objectives[p], objectives[q])):
                domination_set[p].append(q)
            elif all(obj_q <= obj_p for obj_q, obj_p in zip(objectives[q], objectives[p])) and any(obj_q < obj_p for obj_q, obj_p in zip(objectives[q], objectives[p])):
                domination_count[p] += 1

        if domination_count[p] == 0:
            ranks[p] = 1
            fronts[0].append(p)
    
    current_front = 0
    while len(fronts[current_front]) > 0:
        next_front = []
        for p in fronts[current_front]:
            for q in domination_set[p]:
                domination_count[q] -= 1
                if domination_count[q] == 0:
                    ranks[q] = current_front + 2
                    next_front.append(q)
        current_front += 1
        fronts.append(next_front)
    
    return ranks, fronts[:-1]


In [42]:
def calculate_crowding_distance(front, objectives):
    num_objectives = len(objectives[0])
    distances = np.zeros(len(front))
    
    for m in range(num_objectives):
        sorted_indices = np.argsort([objectives[i][m] for i in front])
        distances[sorted_indices[0]] = distances[sorted_indices[-1]] = float('inf')
        for i in range(1, len(front) - 1):
            distances[sorted_indices[i]] += (objectives[front[sorted_indices[i+1]]][m] - objectives[front[sorted_indices[i-1]]][m]) / (max(objectives[front][m]) - min(objectives[front][m]))
    
    return distances

def selection_nsga_ii(population, objectives):
    ranks, fronts = fast_non_dominated_sort(population, objectives)
    new_population = []
    
    for front in fronts:
        if len(new_population) + len(front) > POP_SIZE:
            distances = calculate_crowding_distance(front, objectives)
            sorted_front = [front[i] for i in np.argsort(distances)[::-1]]
            new_population.extend(sorted_front[:POP_SIZE - len(new_population)])
            break
        else:
            new_population.extend(front)
    
    return [population[i] for i in new_population]

In [36]:
population = init_population()
objectives = [calculate_costs(ind, word_pairs, human_scores) for ind in population]

In [38]:
selection(population, objectives)

TypeError: can only concatenate tuple (not "int") to tuple

In [37]:
len(selection(population, objectives))

TypeError: can only concatenate tuple (not "int") to tuple

In [44]:
def nsga2(word_pairs, human_scores):
    population = init_population()
    
    for gen in range(GEN_MAX):
        objectives = [calculate_costs(ind, word_pairs, human_scores) for ind in population]
        new_population = []
        new_population.extend(population)

        for i in range(POP_SIZE//2):
            parent1, parent2 = selection(population, objectives)
            if random.random() < CROSS_RATE:
                child1, child2 = crossover(parent1, parent2)
                child1 = mutate(child1)
                child2 = mutate(child2)
                new_population.extend([child1, child2])
            else:
                new_population.extend([parent1, parent2])        

        objectives = [calculate_costs(ind, word_pairs, human_scores) for ind in new_population]
        ranks, fronts = fast_non_dominated_sort(new_population, objectives)
        print(len(fronts[0]))
        population = selection_nsga_ii(population, objectives)
        
        if gen % 10 == 0:
            best_objectives = min(objectives, key=lambda obj: obj[0])
            print(f"Generation {gen}/{GEN_MAX}: Best Bias {best_objectives[0]}, Best Spearman {best_objectives[1]}")
    
    # Obter a frente de Pareto final
    final_objectives = [calculate_costs(ind, word_pairs, human_scores) for ind in population]
    ranks, fronts = fast_non_dominated_sort(population, final_objectives)
    pareto_front = [population[i] for i in fronts[0]]
    
    return pareto_front, [calculate_costs(population[i], word_pairs, human_scores) for i in fronts[0]]



In [45]:
pareto_solutions = nsga2(word_pairs, human_scores)

10
Generation 0/3: Best Bias 1.9887493399437517, Best Spearman 0.45650751707287673
10
7


In [46]:
pareto_solutions

([array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
  array([ 2.18529027,  1.09601834,  2.83097224,  2.88154863,  2.61286285,
          0.89544458,  2.71568744, -0.19843181,  0.58437036,  2.34050117,
         -0.35754094, -0.06492122,  1.14852577,  0.50673585,  0.13571062,
          1.42786148,  1.73901011,  2.46039317,  1.57703055,  1.32540116,
          2.36899926, -0.56346959,  1.05265828,  0.7682263 ,  0.51143468,
          1.87995678,  2.02514139, -0.42163414,  0.57064072, -0.77646464,
         -0.84354246,  1.52487547,  2.65424105,  1.95168548,  0.35645736,
          0.82116585,  0.9905573 ,  1.81640531,  2.65556665,  1.18950853,
          1.22982868,  2.47176513,  2.98150582,  2.8618161 ,  0.32958091,
          1.87194878, -0.03707514,  0.34181332,  0.68709645, -0.13145034]),
  array([ 2.98635448, -0.2191