In [1]:
import gensim.downloader
import copy
import numpy as np
import random
from scipy.stats import spearmanr

In [2]:
POP_SIZE = 50
GEN_MAX = 20
MUT_RATE = 0.1
CROSS_RATE = 0.8

In [3]:
#weat 6
_glove_model = gensim.downloader.load('glove-wiki-gigaword-50')
_glove_model_changed = gensim.downloader.load('glove-wiki-gigaword-50')

#careers = ['technician', 'accountant', 'supervisor', 'engineer', 'worker', 'educator', 'clerk', 'counselor', 'inspector', 'mechanic', 'manager', 'therapist', 'administrator', 'salesperson', 'receptionist', 'librarian', 'advisor', 'pharmacist', 'janitor', 'psychologist', 'physician', 'carpenter', 'nurse', 'investigator', 'bartender', 'specialist', 'electrician', 'officer', 'pathologist', 'teacher', 'lawyer', 'planner', 'practitioner', 'plumber', 'instructor', 'surgeon', 'veterinarian', 'paramedic', 'examiner', 'chemist', 'machinist', 'appraiser', 'nutritionist', 'architect', 'hairdresser', 'baker', 'programmer', 'paralegal', 'hygienist', 'scientist']
female_attributes = ['female', 'woman', 'girl', 'sister', 'she', 'her', 'hers', 'daughter']
male_attributes = ['male', 'man', 'boy', 'brother', 'he', 'him', 'his', 'son']

careers = ['math', 'algebra', 'geometry', 'calculus', 'equations', 'computation', 'numbers', 'addition', 'poetry', 'art', 'dance', 'literature', 'novel', 'symphony', 'drama', 'sculpture']


In [4]:
def noise_vec(_size=50):
    return np.random.uniform(-2, 2, size=_size)

def init_population():
    return [np.ones(50) + noise_vec() for _ in range(POP_SIZE)]

def selection(population, costs, _k=2):
    return random.choices(population, weights=[1 / (cost + 1) for cost in costs], k=_k)

def crossover(parent1, parent2):
    crossover_point = random.randint(0, len(parent1) - 1)
    child1 = np.concatenate((parent1[:crossover_point], parent2[crossover_point:]))
    child2 = np.concatenate((parent2[:crossover_point], parent1[crossover_point:]))
    return child1, child2

def mutate(child):
    if random.random() < MUT_RATE:
        child += noise_vec()
    return child

In [5]:
#melhorar desempenho
def evaluate_cost(s, glove_model):
    differences = []
    glove_model_modified = copy.deepcopy(glove_model)
    for i in range(len(glove_model_modified.vectors)):
        glove_model_modified.vectors[i] = np.multiply(glove_model_modified.vectors[i],s)

    for career in careers:
        female_similarity = sum([glove_model_modified.similarity(career, attr) for attr in female_attributes]) / len(female_attributes)
        male_similarity = sum([glove_model_modified.similarity(career, attr) for attr in male_attributes]) / len(male_attributes)
        difference = abs(male_similarity - female_similarity)
        differences.append(difference)

    return glove_model_modified, sum(differences)

In [6]:
def load_dataset(filepath):
    word_pairs = []
    human_scores = []
    with open(filepath, 'r') as file:
        for line in file:
            parts = line.strip().split()
            if len(parts) == 3:
                word1, word2, score = parts
                score = float(score)
                word_pairs.append((word1.split('-')[0], word2.split('-')[0]))
                human_scores.append(score)
    return word_pairs, human_scores

def wordembedding_similarity(word_pairs, model):
    embedding_similarities = []
    for word1, word2 in word_pairs:
        if word1 in model and word2 in model:
            similarity = model.similarity(word1, word2)
        else:
            similarity = 0.0
        embedding_similarities.append(similarity)
    return embedding_similarities

def semantic_correction(human_judgements, embedding_similarities):
    spearman_corr = spearmanr(human_judgements, embedding_similarities)
    return spearman_corr


In [7]:
def combined_cost(s, glove_model, word_pairs, human_scores, alpha=0.5):
    glove_model_modified, cost_diff = evaluate_cost(s, glove_model)
    
    embedding_similarities = wordembedding_similarity(word_pairs, glove_model_modified)
    spearman_corr, _ = semantic_correction(human_scores, embedding_similarities)
    
    cost_spearman = 1 - spearman_corr
    
    combined_cost = alpha * cost_diff + (1 - alpha) * cost_spearman
    
    return combined_cost, glove_model_modified


In [8]:
def evolutionary_algorithm(glove_model, word_pairs, human_scores, alpha=0.5):
    population = init_population()
    best_individual = None
    best_cost = float('inf')
    
    for gen in range(GEN_MAX):
        costs = [combined_cost(individual, glove_model, word_pairs, human_scores, alpha)[0] for individual in population]
        best_idx = np.argmin(costs)
        current_best_cost = costs[best_idx]
        current_best_individual = population[best_idx]
        
        if current_best_cost < best_cost:
            best_cost = current_best_cost
            best_individual = current_best_individual
        
        if best_cost < 0.2:
            break
        
        new_population = [best_individual]
        print(f"Generation {gen}/{GEN_MAX}: Best Cost {best_cost}")
        
        while len(new_population) < POP_SIZE:
            parent1, parent2 = selection(population, costs)
            if random.random() < CROSS_RATE:
                child1, child2 = crossover(parent1, parent2)
                child1 = mutate(child1)
                child2 = mutate(child2)
                new_population.extend([child1, child2])
            else:
                new_population.extend([parent1, parent2])
        
        population = new_population[:POP_SIZE]
    
    return best_individual


In [9]:
filepath = 'MEN_dataset_lemma_form.dev'
word_pairs, human_scores = load_dataset(filepath)

best_vector = evolutionary_algorithm(_glove_model, word_pairs, human_scores, alpha=0.5)

print("Evaluation with vector of ones:")
print(evaluate_cost(np.ones(50), _glove_model_changed))

print("Evaluation with best evolved vector:")
print(evaluate_cost(best_vector, _glove_model_changed))

embedding_similarities = wordembedding_similarity(word_pairs, _glove_model_changed)
correction = semantic_correction(human_scores, embedding_similarities)
print(f'Initial semantic correlation: {correction[0]}')

glove_test, _ = evaluate_cost(best_vector, _glove_model_changed)
embedding_similarities = wordembedding_similarity(word_pairs, glove_test)
correction = semantic_correction(human_scores, embedding_similarities)
print(f'Evolved semantic correlation: {correction[0]}')


Generation 0/20: Best Cost 0.34711314200226595
Generation 1/20: Best Cost 0.34711314200226595
Generation 2/20: Best Cost 0.34711314200226595
Generation 3/20: Best Cost 0.34711314200226595
Generation 4/20: Best Cost 0.34711314200226595
Generation 5/20: Best Cost 0.3301178997171804
Generation 6/20: Best Cost 0.3301178997171804
Generation 7/20: Best Cost 0.3249410016332582
Generation 8/20: Best Cost 0.3249410016332582
Generation 9/20: Best Cost 0.3249410016332582
Generation 10/20: Best Cost 0.3249410016332582
Generation 11/20: Best Cost 0.3249410016332582
Generation 12/20: Best Cost 0.3249410016332582
Generation 13/20: Best Cost 0.3249410016332582
Generation 14/20: Best Cost 0.3249410016332582
Generation 15/20: Best Cost 0.3249410016332582
Generation 16/20: Best Cost 0.3249410016332582
Generation 17/20: Best Cost 0.3249410016332582
Generation 18/20: Best Cost 0.3249410016332582
Generation 19/20: Best Cost 0.3249410016332582
Evaluation with vector of ones:
(<gensim.models.keyedvectors.Keye