## Word similarity measure with spaCy

#### Install required library

In [None]:
#!pip install spacy

##### Compute similarity

In [4]:
def compare_word(nlp_model, w, words_vectors):
    """
    Compares new word with those in the words vectors dictionary
    """
    vec = nlp_model(w)
    return {w1:vec.similarity(vec1) for w1,vec1 in words_vectors.items()}

## For English   
#### Download english models : https://spacy.io/models/en

In [None]:
#!python -m spacy download en_core_web_md 

In [5]:
import spacy

# load the language model
en_nlp_model = spacy.load('en_core_web_md')

# set based keyword list
word_list = ['queen', 'beauty', 'reform', 'bar', 'car', 'drink', 'lesson music']
word_list1 = [ 'travel', 'restaurant', 'architecture', 'massage']

# convert the strings to spaCy Token objects
tokens_ = {}
for ww in word_list:
  tokens_[ww] = (en_nlp_model(ww)[0])

tokens_1 = {}
for ww in word_list1:
  tokens_1[ww] = (en_nlp_model(ww)[0])


In [6]:
print('girl : ', compare_word(en_nlp_model, 'girl', tokens_))
print('girl : ', compare_word(en_nlp_model, 'girl', tokens_1))

girl :  {'queen': 0.4620418872631475, 'beauty': 0.3470903688146996, 'reform': -0.01668207995090511, 'bar': 0.19038626272144732, 'car': 0.22550734404683506, 'drink': 0.2563003838616516, 'lesson music': 0.13454798024404474}
girl :  {'travel': 0.00431965427181635, 'restaurant': 0.15304958562159815, 'architecture': -0.030297284505522805, 'massage': 0.20896072402710156}


In [7]:
print('hair style : ', compare_word(en_nlp_model, 'hair style', tokens_))
print('hair style : ', compare_word(en_nlp_model, 'hair style', tokens_1))

hair style :  {'queen': 0.2138813478817842, 'beauty': 0.5440184398678954, 'reform': 0.05707112084494526, 'bar': 0.21484520763149056, 'car': 0.1840519490356011, 'drink': 0.21416462322399799, 'lesson music': 0.11556897228343867}
hair style :  {'travel': 0.12237028136630919, 'restaurant': 0.20786229246414947, 'architecture': 0.3594509572861326, 'massage': 0.35209446729072374}


## For Japanese
#### Download english models : https://spacy.io/models/ja

In [None]:
#!python -m spacy download ja_core_news_lg

In [8]:
import spacy

# load the language model
ja_nlp_model = spacy.load('ja_core_news_lg')



# set based keyword list
word_list = ['女王', '美容', '改革', 'バー', '車', '飲み物', 'レッスン音楽']
            #['Queen', 'Beauty', 'Reform', 'Bar', 'Car', 'Drink', 'Lesson Music']
word_list1 = [ '旅行', 'レストラン', '建築', 'マッサージ']
            #[ 'Travel', 'Restaurant', 'Architecture', 'Massage']

# convert the strings to spaCy Token objects
tokens_ = {}
for ww in word_list:
  tokens_[ww] = (ja_nlp_model(ww)[0])

tokens_1 = {}
for ww in word_list1:
  tokens_1[ww] = (ja_nlp_model(ww)[0])

In [9]:
print('Fast food : ', compare_word(ja_nlp_model, 'ファストフード', tokens_))
print('ファストフード : ', compare_word(ja_nlp_model, 'ファストフード', tokens_1))

Fast food :  {'女王': 0.07120987830348414, '美容': 0.2375892347426994, '改革': 0.1336707593366052, 'バー': 0.21089440233405077, '車': 0.15480723803406446, '飲み物': 0.3816738421577496, 'レッスン音楽': 0.1301331524993787}
ファストフード :  {'旅行': 0.17283721771662436, 'レストラン': 0.412974824210537, '建築': 0.12923340154345833, 'マッサージ': 0.1876916523216345}


In [10]:
print('hair style : ', compare_word(ja_nlp_model, 'ヘアスタイル', tokens_))
print('ヘアスタイル : ', compare_word(ja_nlp_model, 'ヘアスタイル', tokens_1))

hair style :  {'女王': 0.11064131229204496, '美容': 0.4185264741302789, '改革': 0.03756975943847968, 'バー': 0.14951228099306335, '車': 0.10908257623830947, '飲み物': 0.20861923996778534, 'レッスン音楽': 0.20778039978945873}
ヘアスタイル :  {'旅行': 0.16705126851787047, 'レストラン': 0.21044082423393054, '建築': 0.17307159712980988, 'マッサージ': 0.28862216560994336}


### Similar words 

In [33]:
import gensim.downloader as api
model = api.load("glove-wiki-gigaword-100")  
bests = model.most_similar(["drink"], topn= 20)
print("......Most Similar Words......")
for best in bests:
    print(best)

......Most Similar Words......
('drinks', 0.8851481080055237)
('beer', 0.8182137608528137)
('drinking', 0.750116229057312)
('liquor', 0.7198688983917236)
('bottled', 0.7179968953132629)
('soda', 0.71770179271698)
('beverages', 0.7114107608795166)
('bottle', 0.7083661556243896)
('drank', 0.7047054767608643)
('coffee', 0.699092447757721)
('beverage', 0.6927410364151001)
('eat', 0.6849386096000671)
('alcohol', 0.6841266751289368)
('wine', 0.6808558106422424)
('tea', 0.6683973670005798)
('milk', 0.66713547706604)
('snack', 0.6655228137969971)
('alcoholic', 0.6637224555015564)
('bottles', 0.6606711149215698)
('vodka', 0.6571992635726929)


In [34]:
bests = model.most_similar(["beauty"], topn= 5)
print("......Most Similar Words......")
print(bests)

tokens_2 = {}

for best in bests:
    
    tokens_2[best[0]] = (en_nlp_model(best[0])[0])
    

......Most Similar Words......
[('beautiful', 0.6740165948867798), ('fashion', 0.6238462328910828), ('glamour', 0.6215600967407227), ('nature', 0.609063446521759), ('love', 0.6026946902275085)]


In [35]:
print('hair style : ', compare_word(en_nlp_model, 'hair style', tokens_2))

hair style :  {'beautiful': 0.49262654504701126, 'fashion': 0.5497506854341697, 'glamour': 0.4657121984238619, 'nature': 0.38719293301649566, 'love': 0.34706353199867246}


In [36]:
bests = model.most_similar(["construction"], topn= 5)
print("......Most Similar Words......")
print(bests)

tokens_3 = {}

for best in bests:
    
    tokens_3[best[0]] = (en_nlp_model(best[0])[0])

......Most Similar Words......
[('building', 0.7728806138038635), ('projects', 0.7253748774528503), ('industrial', 0.7133155465126038), ('built', 0.7087695002555847), ('project', 0.7039004564285278)]


In [37]:
print('hair style : ', compare_word(en_nlp_model, 'hair style', tokens_3))

hair style :  {'building': 0.18545111781117804, 'projects': 0.1859823358533846, 'industrial': 0.17800051018304502, 'built': 0.09773349029991571, 'project': 0.14033476156478736}
