In [1]:
import numpy as np
#from sympy.utilities.iterables import multiset_permutations

# 这一部分是WEAT的代码，不用跑

In [16]:
def unit_vector(vec):
    """
    Returns the input scaled to unit Euclidean length along its last axis.

    A 1-D input is treated as one vector. For an input with more axes (e.g. a
    (len(words), dim) matrix) every vector along the last axis is normalised
    on its own. Calling np.linalg.norm without ``axis`` would instead divide
    the whole matrix by a single Frobenius norm, which does not produce unit
    rows.

    :param vec: array-like with at least one axis; vectors lie along the last axis
    :return: numpy ndarray with the same shape as vec. An all-zero vector
        yields NaN entries because it is divided by a zero norm.
    """
    vec = np.asarray(vec)
    # keepdims=True leaves a trailing axis of length 1 so that the division
    # broadcasts each norm over its own vector.
    return vec / np.linalg.norm(vec, axis=-1, keepdims=True)


def cos_sim(v1, v2):
    """
    Returns the cosine similarity between v1 and v2.

    Both inputs are passed through unit_vector, their last axes are contracted
    with np.tensordot, and the result is clipped to [-1.0, 1.0].

    :param v1: first vector (or array of vectors along the last axis)
    :param v2: second vector (or array of vectors along the last axis)
    :return: clipped result of the contraction
    """
    normalized_first, normalized_second = (unit_vector(operand) for operand in (v1, v2))
    similarity = np.tensordot(normalized_first, normalized_second, axes=(-1, -1))
    # Clipping guards against values that drift just outside [-1, 1].
    return np.clip(similarity, -1.0, 1.0)


def weat_association(W, A, B):
    """
    Returns s(w, A, B), the association of each word w in W with the two
    attribute sets, as used by the WEAT score.

    The value is the mean cosine similarity to A minus the mean cosine
    similarity to B, with both means taken over the last axis.

    :param W: target words' vector representations
    :param A: attribute words' vector representations
    :param B: attribute words' vector representations
    :return: (len(W), ) shaped numpy ndarray; each entry is the association of one word w in W
    """
    similarity_to_a = cos_sim(W, A)
    similarity_to_b = cos_sim(W, B)
    mean_to_a = np.mean(similarity_to_a, axis=-1)
    mean_to_b = np.mean(similarity_to_b, axis=-1)
    return mean_to_a - mean_to_b


def weat_differential_association(X, Y, A, B):
    """
    Returns s(X, Y, A, B), the differential association of two target word
    sets with the attribute sets, as used by the WEAT score.

    It is the sum of weat_association over X minus the sum over Y.

    :param X: target words' vector representations
    :param Y: target words' vector representations
    :param A: attribute words' vector representations
    :param B: attribute words' vector representations
    :return: differential association (float value)
    """
    total_for_x = np.sum(weat_association(X, A, B))
    total_for_y = np.sum(weat_association(Y, A, B))
    return total_for_x - total_for_y


def weat_p_value(X, Y, A, B):
    """
    Returns one-sided p-value of the permutation test for WEAT score.

    Every way of splitting the pooled target words (X followed by Y) into a
    group of len(X) words and a group holding the rest is enumerated, and the
    differential association is recomputed for each split. The p-value is the
    fraction of splits whose statistic is strictly greater than the observed
    one.

    CAUTION: the enumeration is exhaustive, so the number of evaluated splits
    is "len(X) + len(Y) choose len(X)"; this is slow for large target sets.

    :param X: target words' vector representations
    :param Y: target words' vector representations
    :param A: attribute words' vector representations
    :param B: attribute words' vector representations
    :return: p-value (float value)
    """
    # Imported locally: the sympy import at the top of the notebook is
    # commented out, and the standard library can enumerate the splits.
    from itertools import combinations

    diff_association = weat_differential_association(X, Y, A, B)
    target_words = np.concatenate((X, Y), axis=0)
    n_words = len(target_words)

    partition_diff_association = []
    for x_indices in combinations(range(n_words), len(X)):
        # A boolean mask selects the rows of the partition. Indexing with an
        # integer array of 0s and 1s would pick rows 0 and 1 repeatedly.
        in_partition_x = np.zeros(n_words, dtype=bool)
        in_partition_x[list(x_indices)] = True
        partition_X = target_words[in_partition_x]
        partition_Y = target_words[~in_partition_x]
        partition_diff_association.append(weat_differential_association(partition_X, partition_Y, A, B))

    partition_diff_association = np.array(partition_diff_association)

    return np.sum(partition_diff_association > diff_association) / len(partition_diff_association)


def weat_score(X, Y, A, B):
    """
    Returns WEAT score (effect size).

    The score is the difference between the mean association of X and the mean
    association of Y, divided by the standard deviation (np.std) of the
    associations of X and Y pooled together.

    X, Y, A, B must be (len(words), dim) shaped numpy ndarray
    CAUTION: this function assumes that there's no intersection word between X and Y
    :param X: target words' vector representations
    :param Y: target words' vector representations
    :param A: attribute words' vector representations
    :param B: attribute words' vector representations
    :return: WEAT score
    """
    association_of_x = weat_association(X, A, B)
    association_of_y = weat_association(Y, A, B)

    pooled_associations = np.concatenate((association_of_x, association_of_y), axis=0)
    mean_gap = np.mean(association_of_x, axis=-1) - np.mean(association_of_y, axis=-1)

    return mean_gap / np.std(pooled_associations)


def wefat_p_value(W, A, B):
    """
    Placeholder for the WEFAT p-value.

    W, A, B must be (len(words), dim) shaped numpy ndarray
    CAUTION: not implemented yet; the arguments are ignored and None is returned
    :param W: target words' vector representations
    :param A: attribute words' vector representations
    :param B: attribute words' vector representations
    :return: None (no p-value is computed)
    """
    return None


def wefat_score(W, A, B):
    """
    Returns WEFAT score, averaged over the words in W.

    For each word w the association s(w, A, B) is divided by the standard
    deviation of that word's own cosine similarities to the words of A and B
    together; the mean of these per-word values is returned. The similarities
    are joined and reduced along the last (attribute) axis, so A and B may
    contain different numbers of words and one word's spread does not affect
    another word's normalisation.

    W, A, B must be (len(words), dim) shaped numpy ndarray
    CAUTION: this function assumes that there's no intersection word between A and B
    :param W: target words' vector representations
    :param A: attribute words' vector representations
    :param B: attribute words' vector representations
    :return: WEFAT score
    """
    association = weat_association(W, A, B)
    # Shape (..., len(A) + len(B)): all attribute similarities of one word side by side.
    attribute_similarities = np.concatenate((cos_sim(W, A), cos_sim(W, B)), axis=-1)
    per_word_std = np.std(attribute_similarities, axis=-1)

    return np.mean(association / per_word_std)

In [17]:
# Sanity check of the contraction used in cos_sim: with axes=(-1, -1),
# entry [i, j] of the result is the dot product of v1_u[i] and v2_u[j].
v1_u=np.array([[1,2,3],[1,1,1],[0,1,0]])
v2_u=np.array([[0,0,1],[4,5,6],[1,1,2]])
np.tensordot(v1_u, v2_u, axes=(-1, -1))

array([[ 3, 32,  9],
       [ 1, 15,  4],
       [ 0,  5,  1]])

# 训练好的word2vec模型

In [2]:
def load_model(embedding_path):
    """
    Loads word embeddings from a UTF-8 text file.

    Each non-blank line is parsed as ``word<TAB>v1,v2,...,vn``. Empty items
    between or after commas are ignored. Blank lines are skipped.

    :param embedding_path: path of the embedding file
    :return: tuple (word_embedding_dict, word_dict, word_embeddings) where
        word_embedding_dict maps word -> embedding (numpy ndarray),
        word_dict maps row index -> word, and
        word_embeddings is a numpy ndarray stacking the embeddings in file order
    """
    print('loading models....')
    word_embedding_dict = {}
    word_embeddings = []
    word_dict = {}
    # The context manager closes the file even if a line fails to parse.
    with open(embedding_path, encoding='UTF-8') as embedding_file:
        for line in embedding_file:
            line = line.strip()
            if not line:
                # A blank line carries no word; skip it instead of failing on fields[1].
                continue
            fields = line.split('\t')
            word = fields[0]
            word_embedding = np.array([float(item) for item in fields[1].split(',') if item])
            word_embedding_dict[word] = word_embedding
            # The row index is the position the embedding will take in word_embeddings.
            word_dict[len(word_embeddings)] = word
            word_embeddings.append(word_embedding)
    return word_embedding_dict, word_dict, np.array(word_embeddings)

In [3]:
we,wd,wea=load_model('model/cbow_wordvec2.bin')
# we:  word_embedding_dict, mapping each vocabulary word to its embedding vector
# wd:  word_dict, mapping an integer index to the (Chinese) vocabulary word
# wea: the word embeddings as a single numpy array

loading models....


In [5]:
# Display the word -> embedding dictionary.
# NOTE(review): this prints the whole dictionary; showing one entry would keep the output short.
we

{'UNK': array([-1.08010710e-02,  3.72548250e-02, -8.78214200e-02,  5.14114700e-02,
        -5.93654140e-02, -3.33209080e-02,  3.72364930e-02,  7.81355500e-02,
        -4.43201700e-03, -4.37716730e-02, -6.82218100e-02, -1.06358920e-02,
        -6.75334860e-02, -1.04048020e-01,  4.18944020e-02,  5.81817960e-02,
         1.03716820e-01,  8.33175600e-02,  3.07985360e-02,  5.08206700e-03,
        -7.89188400e-03,  3.07616610e-02, -1.39895660e-01, -3.07719410e-02,
        -1.03135936e-01, -1.00219370e-01,  1.11939006e-01, -1.41080630e-01,
        -4.94283440e-03, -3.90353800e-02, -2.31792260e-04, -5.70529500e-02,
         4.62116040e-02, -3.95369160e-02,  4.88524300e-02, -3.31922400e-02,
        -9.73715500e-02, -1.04913875e-01,  7.37086760e-02,  1.27734960e-01,
         3.64545360e-02, -1.20553020e-01,  1.79776470e-02,  3.55357050e-03,
         6.82173450e-02, -1.93186420e-02,  1.15365535e-01, -1.41270320e-01,
        -2.42673620e-02, -6.62713500e-02, -7.64202850e-02,  5.42048180e-02,
     

In [4]:
# Display the index -> word dictionary.
# NOTE(review): this prints the whole dictionary; a small slice would keep the output short.
wd

{0: 'UNK',
 1: '的',
 2: '是',
 3: '在',
 4: '、',
 5: '和',
 6: '了',
 7: ')',
 8: '(',
 9: '”',
 10: '“',
 11: '为',
 12: '中',
 13: '与',
 14: '对',
 15: '农业',
 16: '.',
 17: '发展',
 18: '有',
 19: '也',
 20: '经济',
 21: '：',
 22: '上',
 23: '而',
 24: '这',
 25: '1',
 26: '年',
 27: '不',
 28: '（',
 29: '一个',
 30: '）',
 31: '以',
 32: '其',
 33: '从',
 34: '》',
 35: '《',
 36: '-',
 37: '就',
 38: '要',
 39: '；',
 40: '将',
 41: '2',
 42: '问题',
 43: '生产',
 44: '可以',
 45: '到',
 46: '进行',
 47: '地',
 48: '都',
 49: '但',
 50: '艺术',
 51: '使',
 52: '并',
 53: '等',
 54: '它',
 55: '时',
 56: '=',
 57: '中国',
 58: '研究',
 59: '我们',
 60: '下',
 61: '人',
 62: '市场',
 63: '我国',
 64: '3',
 65: '系统',
 66: '一',
 67: '社会',
 68: '主要',
 69: '或',
 70: '由',
 71: '可',
 72: '企业',
 73: '她',
 74: '新',
 75: '由于',
 76: '这种',
 77: '技术',
 78: '则',
 79: '所',
 80: '之',
 81: '％',
 82: '提高',
 83: '一种',
 84: '又',
 85: '—',
 86: '［',
 87: '通过',
 88: '把',
 89: '影响',
 90: '～',
 91: '］',
 92: '过程',
 93: '具有',
 94: '不同',
 95: '大',
 96: '后',
 97: '１',


In [14]:
# Number of embedding rows, followed by the (abbreviated) embedding array itself.
print(len(wea))
wea

56001


array([[-0.01080107,  0.03725482, -0.08782142, ...,  0.0166816 ,
         0.09195076,  0.0498424 ],
       [-0.00274683,  0.02312648, -0.02235239, ...,  0.0040629 ,
        -0.0465328 , -0.01639906],
       [-0.00383557,  0.03204186,  0.03741058, ...,  0.02852387,
         0.01999717, -0.01131764],
       ...,
       [-0.07715703, -0.06993089,  0.05882506, ...,  0.07517095,
        -0.03222522, -0.00061972],
       [-0.00167508, -0.02331988, -0.11844875, ...,  0.05983521,
        -0.09418002, -0.02916176],
       [ 0.06803297,  0.06975625, -0.06706452, ...,  0.06499384,
         0.0558008 ,  0.00812868]])

# weat可直接用；构造XYAB是关键
## 对应的数组最好保证词数一致(但我想了一下过程，感觉也不是完全需要啊……)

Female-A Male-B Careers-W Career-Y1 Family-X1 Science-Y21 Math-Y22 Arts-X2

A
Female
female, woman, girl, sister, she, her, hers, daughter, mother, aunt, grandmother

B
Male
male, man, boy, brother, he, him, his, son, father, uncle, grandfather

W
Careers
technician, accountant, supervisor, engineer, worker, educator, clerk, counselor, inspector, mechanic, manager, therapist, administrator, salesperson, receptionist, librarian, advisor, pharmacist, janitor, psychologist, physician, carpenter, nurse, investigator, bartender, specialist, electrician, officer, pathologist, teacher, lawyer, planner, practitioner, plumber, instructor, surgeon, veterinarian, paramedic, examiner, chemist, machinist, appraiser, nutritionist, architect, hairdresser, baker, programmer, paralegal, hygienist, scientist

Y1
Career
executive, management, professional, corporation, salary, office, business, career

X1
Family
home, parents, children, family, cousins, marriage, wedding, relatives

Y21
Science
science, technology, physics, chemistry, Einstein, NASA, experiment, astronomy

Y22
Math
math, algebra, geometry, calculus, equations, computation, numbers, addition

X2
Arts
poetry, art, Shakespeare, dance, literature, novel, symphony, drama

In [15]:
# Chinese words for attribute set A (labelled "Female" in the notes above).
# NOTE(review): only two words so far; presumably still to be extended — confirm.
A=['女','女人',]

## 下面都是复制的WEAT原文档，不用看

Careers : technician, accountant, supervisor, engineer, worker, educator, clerk, counselor, inspector, mechanic, manager, therapist, administrator, salesperson, receptionist, librarian, advisor, pharmacist, janitor, psychologist, physician, carpenter, nurse, investigator, bartender, specialist, electrician, officer, pathologist, teacher, lawyer, planner, practitioner, plumber, instructor, surgeon, veterinarian, paramedic, examiner, chemist, machinist, appraiser, nutritionist, architect, hairdresser, baker, programmer, paralegal, hygienist, scientist
Female attributes: female, woman, girl, sister, she, her, hers, daughter
Male attributes: male, man, boy, brother, he, him, his, son

Male names: John, Paul, Mike, Kevin, Steve, Greg, Jeff, Bill
Female names: Amy, Joan, Lisa, Sarah, Diana, Kate, Ann, Donna
Career words : executive, management, professional, corporation, salary, office, business, career
Family words : home, parents, children, family, cousins, marriage, wedding, relatives

Math words : math, algebra, geometry, calculus, equations, computation, numbers, addition
Arts Words : poetry, art, dance, literature, novel, symphony, drama, sculpture
Male attributes: male, man, boy, brother, he, him, his, son
Female attributes: female, woman, girl, sister, she, her, hers, daughter

Science words : science, technology, physics, chemistry, Einstein, NASA, experiment, astronomy
Arts words : poetry, art, Shakespeare, dance, literature, novel, symphony, drama
Male attributes: brother, father, uncle, grandfather, son, he, his, him
Female attributes: sister, mother, aunt, grandmother, daughter, she, hers, her