In [1]:
# generate w2v for 984 words from David's code
import pandas as pd
import numpy as np
import pickle

In [2]:
# Read the pickled word-frequency table from disk.
# NOTE: pickle can execute arbitrary code on load; only open files you trust.
with open('simu4_data/simu4_word_freq.pkl', mode='rb') as pkl_file:
    df = pickle.load(pkl_file)
df

Unnamed: 0,item,itemno_old,itemno,freq,quantile
0,ABDOMEN,1,1,110,2
1,ACROBAT,4,2,17,0
2,ADULT,8,3,906,8
3,ALLEY,15,4,165,3
4,ALLIGATOR,16,5,12,0
...,...,...,...,...,...
979,YOKE,1632,980,12,0
980,YOLK,1633,981,47,1
981,YOUTH,1634,982,1159,8
982,ZIPPER,1636,983,24,0


In [3]:
# Replace the spelling DOUGHNUT with DONUT in the item column.
df.loc[df['item'].eq('DOUGHNUT'), 'item'] = 'DONUT'

In [4]:
# Keep only the word and its item number; this is the lookup table used
# for the merges further down.
item_num_df = df.loc[:, ['item', 'itemno']]
item_num_df

Unnamed: 0,item,itemno
0,ABDOMEN,1
1,ACROBAT,2
2,ADULT,3
3,ALLEY,4
4,ALLIGATOR,5
...,...,...
979,YOKE,980
980,YOLK,981
981,YOUTH,982
982,ZIPPER,983


In [5]:
import gensim.downloader
# Load the 'word2vec-google-news-300' vectors through gensim's downloader API.
# NOTE(review): presumably this downloads the model on first use and caches it
# locally afterwards — confirm before running offline.
word2vec_vectors = gensim.downloader.load('word2vec-google-news-300')

In [6]:
def word_similarity(df, col1, col2, keyed_vector=None):
    """Return the similarity between the two words stored in one row.

    Intended to be used with ``DataFrame.apply(..., axis=1)``, in which case
    ``df`` is a single row; any mapping that supports ``df[col]`` works.

    Parameters
    ----------
    df : mapping (e.g. a pandas row)
        Row holding the two words.
    col1, col2 : hashable
        Keys of the two word entries in ``df``.
    keyed_vector : object with a ``similarity(w1, w2)`` method, optional
        Word-vector model used for the lookup. Both words are lower-cased
        before being passed to it.

    Returns
    -------
    float
        ``keyed_vector.similarity(word1, word2)``, or ``np.nan`` when the
        score cannot be computed: a ``KeyError`` from a missing column or
        from the lookup itself, a non-string cell, or ``keyed_vector``
        left as ``None``.
    """
    try:
        first_word = df[col1].lower()
        second_word = df[col2].lower()
        return keyed_vector.similarity(first_word, second_word)
    except (KeyError, AttributeError, TypeError):
        # Only lookup/type failures are treated as "no score". A bare
        # ``except:`` would also swallow KeyboardInterrupt/SystemExit, which
        # makes a long row-wise apply impossible to interrupt.
        return np.nan

In [7]:
# Build every ordered pair of items (the word pool crossed with itself,
# so each word is also paired with itself).
items = df['item'].values
sem_sim_df = (
    pd.MultiIndex
    .from_product([items, items], names=['wordpool_item_1', 'wordpool_item_2'])
    .to_frame(index=False)
)
sem_sim_df

Unnamed: 0,wordpool_item_1,wordpool_item_2
0,ABDOMEN,ABDOMEN
1,ABDOMEN,ACROBAT
2,ABDOMEN,ADULT
3,ABDOMEN,ALLEY
4,ABDOMEN,ALLIGATOR
...,...,...
968251,ZOO,YOKE
968252,ZOO,YOLK
968253,ZOO,YOUTH
968254,ZOO,ZIPPER


In [8]:
# Score each pair row by row: word_similarity is called once per row with
# the word2vec model as its keyed_vector.
sem_sim_df['similarity'] = sem_sim_df.apply(
    word_similarity,
    axis='columns',
    col1='wordpool_item_1',
    col2='wordpool_item_2',
    keyed_vector=word2vec_vectors,
)
sem_sim_df

Unnamed: 0,wordpool_item_1,wordpool_item_2,similarity
0,ABDOMEN,ABDOMEN,1.000000
1,ABDOMEN,ACROBAT,0.043753
2,ABDOMEN,ADULT,0.061627
3,ABDOMEN,ALLEY,0.280596
4,ABDOMEN,ALLIGATOR,0.213597
...,...,...,...
968251,ZOO,YOKE,-0.090943
968252,ZOO,YOLK,0.027731
968253,ZOO,YOUTH,0.112560
968254,ZOO,ZIPPER,0.048066


In [9]:
# Attach item numbers to both words of every pair: the first merge looks up
# wordpool_item_1, the second looks up wordpool_item_2. The suffixes on the
# second merge turn the clashing item/itemno columns into *_1 and *_2.
sem_sim_num_df = (
    sem_sim_df
    .merge(item_num_df, left_on='wordpool_item_1', right_on='item')
    .merge(item_num_df, left_on='wordpool_item_2', right_on='item', suffixes=('_1', '_2'))
    .drop(columns=['wordpool_item_1', 'wordpool_item_2'])
)
sem_sim_num_df

Unnamed: 0,similarity,item_1,itemno_1,item_2,itemno_2
0,1.000000,ABDOMEN,1,ABDOMEN,1
1,0.043753,ACROBAT,2,ABDOMEN,1
2,0.061627,ADULT,3,ABDOMEN,1
3,0.280596,ALLEY,4,ABDOMEN,1
4,0.213597,ALLIGATOR,5,ABDOMEN,1
...,...,...,...,...,...
968251,-0.090943,YOKE,980,ZOO,984
968252,0.027731,YOLK,981,ZOO,984
968253,0.112560,YOUTH,982,ZOO,984
968254,0.048066,ZIPPER,983,ZOO,984


In [15]:
# Reshape the long pair table into a grid with itemno_1 as rows and itemno_2
# as columns, then extract it as a plain NumPy array.
sem_sims_df = sem_sim_num_df.pivot_table(
    values='similarity',
    index='itemno_1',
    columns='itemno_2',
    dropna=False,
)
smat_w2v = sem_sims_df.to_numpy()
smat_w2v

array([[ 1.        ,  0.04375301,  0.06162726, ...,  0.09605465,
         0.21163696,  0.12027507],
       [ 0.04375301,  1.        ,  0.18831113, ...,  0.06560268,
         0.14024682,  0.14679891],
       [ 0.06162726,  0.18831113,  1.        , ...,  0.38248006,
         0.13602898,  0.22491077],
       ...,
       [ 0.09605465,  0.06560268,  0.38248006, ...,  1.        ,
        -0.01495953,  0.11256015],
       [ 0.21163696,  0.14024682,  0.13602898, ..., -0.01495953,
         0.99999994,  0.04806608],
       [ 0.12027507,  0.14679891,  0.22491077, ...,  0.11256015,
         0.04806608,  1.        ]], dtype=float32)

In [16]:
# np.save('../Data/wordpools/simu1_300_smat.npy',s_mat)
# This appears to be the same as the smat from Ada.