In [None]:
import gensim.downloader
from gensim.models import Word2Vec
import copy
import numpy as np
import random
from scipy.stats import spearmanr

In [None]:
# Pretrained embeddings and the word lists used by the bias measurement (originally labelled "weat 6").
# NOTE(review): the target list below is math vs. arts terms with male/female attribute
# terms — confirm that "6" is the intended WEAT test number.
_glove_model = gensim.downloader.load('glove-wiki-gigaword-50')
# NOTE(review): this loads the same pretrained model a second time and the name is not
# referenced again in the visible cells — confirm it is still needed.
_glove_model_changed = gensim.downloader.load('glove-wiki-gigaword-50')

# Alternative target list (occupations), currently disabled in favour of the list further down.
#careers = ['technician', 'accountant', 'supervisor', 'engineer', 'worker', 'educator', 'clerk', 'counselor', 'inspector', 'mechanic', 'manager', 'therapist', 'administrator', 'salesperson', 'receptionist', 'librarian', 'advisor', 'pharmacist', 'janitor', 'psychologist', 'physician', 'carpenter', 'nurse', 'investigator', 'bartender', 'specialist', 'electrician', 'officer', 'pathologist', 'teacher', 'lawyer', 'planner', 'practitioner', 'plumber', 'instructor', 'surgeon', 'veterinarian', 'paramedic', 'examiner', 'chemist', 'machinist', 'appraiser', 'nutritionist', 'architect', 'hairdresser', 'baker', 'programmer', 'paralegal', 'hygienist', 'scientist']
# Attribute word sets; the two lists pair up position by position (female/male, woman/man, ...).
female_attributes = ['female', 'woman', 'girl', 'sister', 'she', 'her', 'hers', 'daughter']
male_attributes = ['male', 'man', 'boy', 'brother', 'he', 'him', 'his', 'son']

# Target words whose similarity to each attribute set is compared (first eight: math terms, last eight: arts terms).
careers = ['math', 'algebra', 'geometry', 'calculus', 'equations', 'computation', 'numbers', 'addition', 'poetry', 'art', 'dance', 'literature', 'novel', 'symphony', 'drama', 'sculpture']

# De-duplicated union of all attribute and target words; going through set() makes the order arbitrary.
all_words = list(set(female_attributes + male_attributes + careers))

In [27]:
# Keep only the attribute/target words that the pretrained model actually contains.
word_vectors = {token: _glove_model[token] for token in all_words if token in _glove_model}
vectors = np.array(list(word_vectors.values()))
words = list(word_vectors)

# Create a Word2Vec shell over that vocabulary, then overwrite its keyed vectors and both
# index mappings so that reduced_glove.wv row i is the pretrained vector of words[i].
reduced_glove = Word2Vec(vector_size=50, min_count=1)
reduced_glove.build_vocab([words])
reduced_glove.wv.vectors = vectors
reduced_glove.wv.index_to_key = words
reduced_glove.wv.key_to_index = dict(zip(words, range(len(words))))


In [28]:
# Number of individuals in the population (see init_population and the refill loop).
POP_SIZE = 50
# Number of generations the evolutionary loop runs.
GEN_MAX = 100
# Probability that mutate() perturbs a child.
MUT_RATE = 0.2
# Probability that a selected pair of parents is recombined with crossover().
CROSS_RATE = 0.6

def noise_vec(_size=50):
    """Draw a random perturbation vector.

    Returns a NumPy array with `_size` entries sampled uniformly from [-2, 2)
    using NumPy's global random state.
    """
    lower, upper = -2, 2
    return np.random.uniform(lower, upper, size=_size)

def init_population():
    """Create the initial population of scaling vectors.

    Every one of the POP_SIZE individuals is an all-ones vector of length 50
    plus one independent noise_vec() draw; the result is a (POP_SIZE, 50) array.
    """
    perturbations = np.array([noise_vec() for _ in range(POP_SIZE)])
    baseline = np.ones((POP_SIZE, 50))
    return baseline + perturbations

def selection(population, costs, _k=2):
    """Pick `_k` individuals by weighted sampling with replacement.

    Each individual is weighted by 1 / (cost + 1), so lower-cost individuals
    are more likely to be drawn. Returns a list of `_k` members of `population`.
    """
    fitness = []
    for cost in costs:
        fitness.append(1 / (cost + 1))
    return random.choices(population, weights=fitness, k=_k)

def crossover(parent1, parent2):
    """Two-point crossover producing two complementary children.

    Two distinct cut indices are sampled from range(len(parent1)) and the genes
    between them are exchanged between the parents. Because the upper cut is at
    most len(parent1) - 1, the final gene is never part of the exchanged slice.
    Returns (child1, child2) as new arrays.
    """
    lo, hi = sorted(random.sample(range(len(parent1)), 2))
    head1, mid1, tail1 = parent1[:lo], parent1[lo:hi], parent1[hi:]
    head2, mid2, tail2 = parent2[:lo], parent2[lo:hi], parent2[hi:]
    first_child = np.concatenate((head1, mid2, tail1))
    second_child = np.concatenate((head2, mid1, tail2))
    return first_child, second_child

def mutate(child):
    """Randomly perturb an individual.

    With probability MUT_RATE a noise_vec() draw of the same length as `child`
    is added to it. The input array is never modified: a new array is returned
    when mutation happens, otherwise `child` itself is returned unchanged.

    Args:
        child: one-dimensional NumPy array.

    Returns:
        The (possibly perturbed) individual.
    """
    if random.random() < MUT_RATE:
        # Out-of-place addition so that arrays shared with the caller (parents,
        # elites, views into a population matrix) can never be altered here.
        return child + noise_vec(len(child))
    return child

def load_dataset(filepath):
    """Read a word-similarity dataset from a whitespace-separated text file.

    Each usable line has exactly three fields: two words and a numeric score.
    Lines with any other number of fields are skipped. The text after the last
    hyphen of each word is dropped (e.g. ``sun-n`` -> ``sun``); only the last
    hyphen is used so that hyphenated words keep their full form
    (``t-shirt-n`` -> ``t-shirt``).

    Args:
        filepath: path of the dataset file, read as UTF-8.

    Returns:
        (word_pairs, human_scores): a list of (word1, word2) tuples and the
        list of float scores, aligned position by position.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the third field of a three-field line is not a number.
    """
    word_pairs = []
    human_scores = []
    # Explicit encoding keeps the result independent of the platform's locale.
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.split()
            if len(parts) != 3:
                continue
            word1, word2, score = parts
            score = float(score)
            # rsplit removes only the trailing suffix, not everything after the first hyphen.
            word_pairs.append((word1.rsplit('-', 1)[0], word2.rsplit('-', 1)[0]))
            human_scores.append(score)
    return word_pairs, human_scores

def wordembedding_similarity(word_pairs, model):
    """Score every word pair with the model's similarity.

    A pair in which either word is missing from `model` is scored 0.0, so the
    returned list always lines up, position by position, with `word_pairs`.
    """
    return [
        model.similarity(first, second) if first in model and second in model else 0.0
        for first, second in word_pairs
    ]

def combined_cost(s, word_pairs, human_scores, alpha=0.5):
    """Score a scaling vector by gender bias and by loss of similarity quality.

    A private copy of ``reduced_glove.wv`` is made and every word of
    ``all_words`` present in it has its vector multiplied element-wise by `s`.
    Two terms are then computed on the scaled copy:

    * bias: for each word in ``careers``, the absolute difference between its
      mean similarity to ``female_attributes`` and to ``male_attributes``,
      summed over all of ``careers``;
    * Spearman cost: ``1 - rho``, where ``rho`` is the Spearman correlation
      between `human_scores` and the model similarities of `word_pairs`.

    Args:
        s: per-dimension scaling vector applied to the word vectors.
        word_pairs: list of (word1, word2) tuples.
        human_scores: reference scores aligned with `word_pairs`.
        alpha: weight of the bias term; ``1 - alpha`` weights the Spearman cost.

    Returns:
        ``alpha * bias + (1 - alpha) * (1 - rho)`` as a float.
    """
    # Work on a copy so the shared reduced_glove vectors are never altered.
    glove_model_modified = copy.deepcopy(reduced_glove.wv)

    for word in all_words:
        if word in glove_model_modified:
            glove_model_modified[word] = np.multiply(glove_model_modified[word], s)

    differences = [
        abs(
            sum(glove_model_modified.similarity(career, attr) for attr in female_attributes) / len(female_attributes) -
            sum(glove_model_modified.similarity(career, attr) for attr in male_attributes) / len(male_attributes)
        )
        for career in careers
    ]
    bias = sum(differences)

    # NOTE(review): the scaled copy only contains the words of reduced_glove, so any
    # pair with a word outside that vocabulary is scored 0.0 by
    # wordembedding_similarity — confirm this is the intended quality signal.
    embedding_similarities = wordembedding_similarity(word_pairs, glove_model_modified)
    spearman_corr, _ = spearmanr(human_scores, embedding_similarities)
    # spearmanr is NaN when an input is constant (e.g. no pair is covered by the
    # model). A NaN cost would poison np.argmin and the selection weights, so an
    # undefined correlation is treated as "no correlation".
    if np.isnan(spearman_corr):
        spearman_corr = 0.0
    cost_spearman = 1 - spearman_corr

    return alpha * bias + (1 - alpha) * cost_spearman

def evolutionary_algorithm(word_pairs, human_scores, alpha=0.5, seed=None):
    """Search for the scaling vector with the lowest combined_cost.

    Starting from init_population(), the loop runs GEN_MAX generations. In each
    one every individual is scored with combined_cost, the best individual seen
    so far is carried over unchanged (elitism), and the rest of the next
    population is filled with pairs drawn by selection(): with probability
    CROSS_RATE a pair is recombined with crossover() and each child is passed
    through mutate(); otherwise the two parents are carried over as they are.
    Progress is printed every 10 generations and once more after the loop.

    Args:
        word_pairs: list of (word1, word2) tuples forwarded to combined_cost.
        human_scores: reference scores forwarded to combined_cost.
        alpha: bias/quality trade-off forwarded to combined_cost.
        seed: optional integer; when given, both the `random` module and NumPy's
            global random state are seeded with it so a run can be reproduced.
            The default (None) leaves both generators untouched.

    Returns:
        The lowest-cost individual evaluated during the run.
    """
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    population = init_population()
    best_individual = None
    best_cost = float('inf')

    for gen in range(GEN_MAX):
        costs = [combined_cost(individual, word_pairs, human_scores, alpha) for individual in population]
        best_idx = np.argmin(costs)
        current_best_cost = costs[best_idx]

        if current_best_cost < best_cost:
            best_cost = current_best_cost
            # Snapshot the winner: in generation 0 the row is a view into the
            # population matrix, and a copy cannot be changed by later in-place edits.
            best_individual = np.array(population[best_idx], copy=True)

        # Elitism: the best individual so far always survives.
        new_population = [best_individual]
        if gen % 10 == 0:
            print(f"Generation {gen}/{GEN_MAX}: Best Cost {best_cost}")

        while len(new_population) < POP_SIZE:
            parent1, parent2 = selection(population, costs)
            if random.random() < CROSS_RATE:
                child1, child2 = crossover(parent1, parent2)
                child1 = mutate(child1)
                child2 = mutate(child2)
                new_population.extend([child1, child2])
            else:
                new_population.extend([parent1, parent2])

        # Pairs are appended two at a time, so trim any overshoot.
        population = new_population[:POP_SIZE]

    # Report against GEN_MAX rather than a literal so the message stays correct
    # when the generation count is changed.
    print(f"Generation {GEN_MAX}/{GEN_MAX}: Best Cost {best_cost}")
    return best_individual


In [29]:
# Load the word pairs and their reference similarity scores used by the cost function.
filepath = 'MEN_dataset_lemma_form.dev'
word_pairs, human_scores = load_dataset(filepath)

In [30]:
# Baseline cost: an all-ones scaling vector leaves every word vector unchanged.
print("Avaliação inicial:")
combined_cost(np.ones(50), word_pairs, human_scores, alpha=0.5)

Avaliação inicial:


0.8004963350659737

In [31]:
# Run the evolutionary search and keep the lowest-cost scaling vector it returns.
best_vector = evolutionary_algorithm(word_pairs, human_scores, alpha=0.5)

Generation 0/100: Best Cost 0.7545248057405034
Generation 10/100: Best Cost 0.6911004585391411
Generation 20/100: Best Cost 0.6547464189762751
Generation 30/100: Best Cost 0.6407461542813608
Generation 40/100: Best Cost 0.6407461542813608
Generation 50/100: Best Cost 0.6081665392676869
Generation 60/100: Best Cost 0.6081665392676869
Generation 70/100: Best Cost 0.5899446915546933
Generation 80/100: Best Cost 0.5834471133249381
Generation 90/100: Best Cost 0.5822042028131464
Generation 100/100: Best Cost 0.5757988010156175


In [32]:
# Re-evaluate the returned vector with the same alpha to compare against the baseline cost.
print("Avaliação final:")
print(combined_cost(best_vector, word_pairs, human_scores, alpha=0.5))

Avaliação final:
0.5757988010156175
