
We use the [Word Embedding Benchmark Repo](https://github.com/kudkudak/word-embeddings-benchmarks) to evaluate embedding quality.

## Are SCM dimensions crucial to semantics?

Compare the embedding quality of PP$_{SCM}$ with PP$_{G+R+A}$.


In [None]:
import sys
import os
import random
import logging
import pandas as pd
from tqdm import tqdm

# Root logger: show DEBUG and above, each record prefixed with a 12-hour HH:MM:SS timestamp.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(levelname)s:%(message)s',
    datefmt='%I:%M:%S',
)

In [None]:
# Put the word-embeddings-benchmarks checkout on sys.path so its `web` package can be imported.
# NOTE(review): the path is relative, so this presumably assumes the notebook is run from
# its own directory — confirm.
sys.path.insert(1,"../external_code/quality/word-embeddings-benchmarks/")
from web.datasets.analogy import fetch_google_analogy
from web.embeddings import fetch_SG_GoogleNews
from web.evaluate import evaluate_on_all
from web.embeddings import load_embedding

# Same for ../src/, which presumably holds the project's own config, utils and debias modules.
# Inserting at index 1 again places this entry ahead of the benchmark path added above.
sys.path.insert(1, '../src/')
import config
from utils import  load_pairs
from debias import Debias


## PP$_{SCM}$

In [None]:
# Shared Debias object used by every partial projection below; constructed from
# config.sgns_wikitext_path and config.debiased_model_dir.
debiasing_instance = Debias(
    config.sgns_wikitext_path,
    config.debiased_model_dir,
)

In [None]:
# Number of definitional pairs drawn for each SCM dimension.
sub_sample_size = 8

# Full pair lists for the two SCM dimensions (warmth, competence).
warmth_pairs = load_pairs(config.warmth_pair_path)
competence_pairs = load_pairs(config.competence_pair_path)

# Random subsets of the pairs; no seed is set in this cell, so each run draws
# a different subset. Warmth is sampled before competence.
sub_warmth_pairs = random.sample(warmth_pairs, k=sub_sample_size)
sub_competence_pairs = random.sample(competence_pairs, k=sub_sample_size)

# Chain two partial projections: warmth first, then competence applied to the
# embedding returned by the warmth pass.
e = debiasing_instance.partial_project(def_pairs=sub_warmth_pairs)
e_scm = debiasing_instance.partial_project(
    def_pairs=sub_competence_pairs,
    embedding=e,
)

# Persist the result in binary word2vec format for the evaluation cell.
scm_model_path = os.path.join(config.debiased_model_dir, "quality_scm_word2vec.bin")
e_scm.save_word2vec_format(scm_model_path, binary=True)

In [None]:
# Reload the saved PP_SCM vectors through the benchmark library's loader.
scm_embedding = load_embedding(os.path.join(config.debiased_model_dir, "quality_scm_word2vec.bin"))
# standardize_words() returns a transformed copy unless inplace=True. The previous
# call discarded that copy, so the lower-casing never reached the embedding that
# gets evaluated; inplace=True applies it to scm_embedding itself.
scm_embedding.standardize_words(lower=True, inplace=True)
# Kept as the last expression of the cell so the notebook displays the results.
evaluate_on_all(scm_embedding)

## PP$_{G+R+A}$

In [None]:
# Load the definitional pair lists for gender, race and age (in that order).
gender_pairs, race_pairs, age_pairs = map(
    load_pairs,
    (config.gender_pair_path, config.race_pair_path, config.age_pair_path),
)

In [None]:
# Number of definitional pairs drawn for each of gender, race and age.
sub_sample_size = 8

# Random subsets of the pairs; no seed is set in this cell. Sampling order is
# gender, race, age.
sub_gender_pairs = random.sample(gender_pairs, k=sub_sample_size)
sub_race_pairs = random.sample(race_pairs, k=sub_sample_size)
sub_age_pairs = random.sample(age_pairs, k=sub_sample_size)

# Chain three partial projections, each applied to the embedding returned by
# the previous pass: gender -> race -> age.
eg = debiasing_instance.partial_project(def_pairs=sub_gender_pairs)
e_gr = debiasing_instance.partial_project(
    def_pairs=sub_race_pairs,
    embedding=eg,
)
e_gra = debiasing_instance.partial_project(
    def_pairs=sub_age_pairs,
    embedding=e_gr,
)

# Persist the result in binary word2vec format for the evaluation cell.
gra_model_path = os.path.join(config.debiased_model_dir, "quality_gra_word2vec.bin")
e_gra.save_word2vec_format(gra_model_path, binary=True)

In [None]:
# Reload the saved PP_G+R+A vectors through the benchmark library's loader.
gra_embedding = load_embedding(os.path.join(config.debiased_model_dir, "quality_gra_word2vec.bin"))
# standardize_words() returns a transformed copy unless inplace=True. The previous
# call discarded that copy, so the lower-casing never reached the embedding that
# gets evaluated; inplace=True applies it to gra_embedding itself.
gra_embedding.standardize_words(lower=True, inplace=True)
# Kept as the last expression of the cell so the notebook displays the results.
evaluate_on_all(gra_embedding)

------------------------------------------------------------