# FastText Model

In [1]:
import logging
# Configure the root logger to emit INFO-level records formatted as
# "timestamp : level : message"; this is what produces the log lines shown
# underneath the cells that follow.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Training models

In [2]:
# NOTE: from here on `print` is pprint.pprint, so printed values are shown as
# their repr (strings keep their quotes, objects show their default repr).
from pprint import pprint as print
from gensim.models.fasttext import FastText
from gensim.test.utils import datapath

# Path of the training corpus, resolved through gensim's datapath() helper.
corpus_file = datapath('lee_background.cor')

# Create an untrained model with 100-dimensional vectors.
model = FastText(vector_size=100)

# First pass over the corpus file: build the vocabulary.
model.build_vocab(corpus_file=corpus_file)

# Second pass: train, reusing the epoch setting and the corpus counts that are
# stored on the model object.
model.train(
    corpus_file=corpus_file,
    epochs=model.epochs,
    total_examples=model.corpus_count,
    total_words=model.corpus_total_words,
)

print(model)

2023-04-04 15:00:52,631 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2023-04-04 15:00:52,633 : INFO : built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)
2023-04-04 15:00:52,636 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)", 'datetime': '2023-04-04T15:00:52.635640', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'created'}
2023-04-04 15:00:52,968 : INFO : FastText lifecycle event {'params': 'FastText<vocab=0, vector_size=100, alpha=0.025>', 'datetime': '2023-04-04T15:00:52.968193', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'created'}
2023-04

<gensim.models.fasttext.FastText object at 0x000002DD1E1410D0>


# Saving/loading models

In [3]:
# Save a model trained via Gensim's fastText implementation to a temporary
# location, then load it back.
#
# A temporary *directory* is used instead of a single NamedTemporaryFile:
# save() can write companion files next to the main one (the log of the
# recorded run shows the n-gram matrix going to
# "<name>.wv.vectors_ngrams.npy"). Unlinking only the main file would leave
# those companions behind; removing the whole directory cleans up everything.
import tempfile
import os
with tempfile.TemporaryDirectory(prefix='saved_model_gensim-') as tmp_dir:
    model_path = os.path.join(tmp_dir, 'fasttext.model')
    model.save(model_path, separately=[])

    # Load back the same model while the saved files still exist.
    # NOTE(review): load() is called without mmap, so the arrays are presumably
    # read fully into memory and nothing keeps the files open when the
    # directory is removed -- confirm on Windows.
    loaded_model = FastText.load(model_path)

# Demonstration complete: the temporary directory and all files in it have
# been removed at this point, but the loaded model remains usable.
print(loaded_model)

2023-04-04 15:01:17,348 : INFO : FastText lifecycle event {'fname_or_handle': 'C:\\Users\\AxelArcidiaco\\AppData\\Local\\Temp\\saved_model_gensim-q0ka8l3l', 'separately': '[]', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-04-04T15:01:17.348278', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'saving'}
2023-04-04 15:01:17,350 : INFO : storing np array 'vectors_ngrams' to C:\Users\AxelArcidiaco\AppData\Local\Temp\saved_model_gensim-q0ka8l3l.wv.vectors_ngrams.npy
2023-04-04 15:01:19,505 : INFO : not storing attribute buckets_word
2023-04-04 15:01:19,506 : INFO : not storing attribute vectors
2023-04-04 15:01:19,509 : INFO : not storing attribute cum_table
2023-04-04 15:01:19,518 : INFO : saved C:\Users\AxelArcidiaco\AppData\Local\Temp\saved_model_gensim-q0ka8l3l
2023-04-04 15:01:19,520 : INFO : loading FastText object from C:\Users\AxelArcidiaco\AppData\Local\Temp\saved_m

<gensim.models.fasttext.FastText object at 0x000002DD2EC4C130>


# Word vector lookup

In [4]:
# Keep a short handle on the trained model's word vectors and show the object.
wv = model.wv
print(wv)

#
# FastText models support vector lookups for out-of-vocabulary words by summing up character ngrams belonging to the word.
#
# Start with a word that IS a vocabulary key (the recorded run prints True).
print('night' in wv.key_to_index)

<gensim.models.fasttext.FastTextKeyedVectors object at 0x000002DD1E141400>
True


In [5]:
# The plural form is not a vocabulary key (the recorded run prints False).
print('nights' in wv.key_to_index)

False


In [6]:
# Vector lookup for a word that is a vocabulary key.
print(wv['night'])

array([-0.16486326,  0.13170786, -0.25996584, -0.09558579,  0.05301844,
        0.37700224,  0.28572264,  0.52917683,  0.27456078, -0.20208956,
        0.02990697, -0.14047895, -0.22274669,  0.5838353 , -0.39185223,
       -0.58972156,  0.1920398 , -0.2308735 , -0.49477252, -0.5629791 ,
       -0.50117314, -0.02018334, -0.502249  , -0.11196041, -0.13157889,
       -0.29609406, -0.6553267 , -0.09967615, -0.31026295,  0.2649233 ,
       -0.34264475,  0.32188314,  0.8136607 , -0.29243678,  0.18573925,
        0.32878488,  0.35876936, -0.11037865, -0.4102512 , -0.3027385 ,
        0.46218187, -0.45188645,  0.03345847, -0.39535823, -0.53306264,
       -0.35503432, -0.08606107,  0.16408224,  0.35535842,  0.01490612,
        0.36518428, -0.4993883 ,  0.30531123, -0.39880216, -0.17576866,
       -0.20381944, -0.19917205, -0.19154863,  0.05843408, -0.32309493,
       -0.33911535, -0.40221113, -0.16612582,  0.3202992 , -0.06200859,
        0.6695492 ,  0.03227623,  0.0621837 ,  0.47355634,  0.27

In [7]:
# Vector lookup for 'nights', which is NOT a vocabulary key; a vector is still
# returned, as the recorded output shows.
print(wv['nights'])

array([-0.14328378,  0.11492597, -0.22516447, -0.0824601 ,  0.04469592,
        0.32516178,  0.24854523,  0.4596679 ,  0.23820755, -0.17666438,
        0.02767511, -0.12000947, -0.19394472,  0.5032246 , -0.34057686,
       -0.5108788 ,  0.16554886, -0.19928062, -0.42703536, -0.48793808,
       -0.43073958, -0.0186529 , -0.43467772, -0.09833384, -0.11253047,
       -0.25489897, -0.56595165, -0.08392761, -0.26834813,  0.2306978 ,
       -0.29466927,  0.27804974,  0.70230687, -0.25275812,  0.16091175,
        0.2839972 ,  0.31172648, -0.09546613, -0.35508126, -0.26252434,
        0.39922836, -0.38993633,  0.02853139, -0.341561  , -0.46216   ,
       -0.30603343, -0.07153537,  0.1425913 ,  0.30897653,  0.01407885,
        0.31751734, -0.43235323,  0.26489508, -0.34491524, -0.15173377,
       -0.17515151, -0.17413944, -0.1639372 ,  0.05188529, -0.27698585,
       -0.29228833, -0.3483383 , -0.14332771,  0.276501  , -0.05306455,
        0.5803583 ,  0.02811984,  0.05130509,  0.4093889 ,  0.23

# Similarity operations

In [8]:
# Reminder: "nights" is out of vocabulary (prints False in the recorded run).
print("nights" in wv.key_to_index)

False


In [9]:
# ...while "night" is in the vocabulary (prints True in the recorded run).
print("night" in wv.key_to_index)

True


In [10]:
# Similarity between an in-vocabulary word and an out-of-vocabulary word.
print(wv.similarity("night", "nights"))

0.99999213


# Other similarity operations

In [11]:
# Words ranked as most similar to the out-of-vocabulary query "nights";
# the result is a list of (word, score) pairs.
print(wv.most_similar("nights"))

[('night', 0.9999920725822449),
 ('rights', 0.9999875426292419),
 ('flights', 0.9999875426292419),
 ('overnight', 0.9999872446060181),
 ('fighting', 0.9999858736991882),
 ('entered', 0.9999855756759644),
 ('fight', 0.9999853372573853),
 ('fighters', 0.9999850988388062),
 ('treated', 0.999984622001648),
 ('fighter', 0.9999845027923584)]


In [12]:
# Compare two word lists as wholes; a single similarity score is printed.
print(wv.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant']))

0.9999407


In [13]:
# Ask which word of the list does not fit with the others.
print(wv.doesnt_match("breakfast cereal dinner lunch".split()))

'lunch'


In [14]:
# Analogy-style query: words similar to 'baghdad' and 'england' (positive)
# and dissimilar to 'london' (negative).
print(wv.most_similar(positive=['baghdad', 'england'], negative=['london']))

[('find', 0.9996632933616638),
 ('capital,', 0.999660849571228),
 ('field', 0.9996559023857117),
 ('findings', 0.9996548891067505),
 ('finding', 0.999653697013855),
 ('storm', 0.9996536374092102),
 ('seekers.', 0.999652624130249),
 ('had', 0.9996517896652222),
 ('abuse', 0.9996517300605774),
 ('playing', 0.9996488690376282)]


In [15]:
# Evaluate the vectors on the analogy questions file resolved via datapath().
# As the recorded output shows, the result is a tuple of
# (overall accuracy, list of per-section dicts with 'correct'/'incorrect').
print(wv.evaluate_word_analogies(datapath('questions-words.txt')))

2023-04-04 15:03:55,933 : INFO : Evaluating word analogies for top 300000 words in the model on c:\Users\AxelArcidiaco\anaconda3\envs\SpacyEnv\lib\site-packages\gensim\test\test_data\questions-words.txt
2023-04-04 15:03:55,992 : INFO : family: 0.0% (0/2)
2023-04-04 15:03:56,022 : INFO : gram3-comparative: 8.3% (1/12)
2023-04-04 15:03:56,037 : INFO : gram4-superlative: 33.3% (4/12)
2023-04-04 15:03:56,061 : INFO : gram5-present-participle: 45.0% (9/20)
2023-04-04 15:03:56,092 : INFO : gram6-nationality-adjective: 25.0% (5/20)
2023-04-04 15:03:56,122 : INFO : gram7-past-tense: 5.0% (1/20)
2023-04-04 15:03:56,140 : INFO : gram8-plural: 33.3% (4/12)
2023-04-04 15:03:56,149 : INFO : Quadruplets with out-of-vocabulary words: 99.5%
2023-04-04 15:03:56,150 : INFO : NB: analogies containing OOV words were skipped from evaluation! To change this behavior, use "dummy4unknown=True"
2023-04-04 15:03:56,152 : INFO : Total accuracy: 24.5% (24/98)


(0.24489795918367346,
 [{'correct': [], 'incorrect': [], 'section': 'capital-common-countries'},
  {'correct': [], 'incorrect': [], 'section': 'capital-world'},
  {'correct': [], 'incorrect': [], 'section': 'currency'},
  {'correct': [], 'incorrect': [], 'section': 'city-in-state'},
  {'correct': [],
   'incorrect': [('HE', 'SHE', 'HIS', 'HER'), ('HIS', 'HER', 'HE', 'SHE')],
   'section': 'family'},
  {'correct': [], 'incorrect': [], 'section': 'gram1-adjective-to-adverb'},
  {'correct': [], 'incorrect': [], 'section': 'gram2-opposite'},
  {'correct': [('LONG', 'LONGER', 'GREAT', 'GREATER')],
   'incorrect': [('GOOD', 'BETTER', 'GREAT', 'GREATER'),
                 ('GOOD', 'BETTER', 'LONG', 'LONGER'),
                 ('GOOD', 'BETTER', 'LOW', 'LOWER'),
                 ('GREAT', 'GREATER', 'LONG', 'LONGER'),
                 ('GREAT', 'GREATER', 'LOW', 'LOWER'),
                 ('GREAT', 'GREATER', 'GOOD', 'BETTER'),
                 ('LONG', 'LONGER', 'LOW', 'LOWER'),
             

# Word Mover's Distance

In [16]:
# Two example sentences, split on whitespace into lower-cased tokens.
sentence_obama = [token.lower() for token in 'Obama speaks to the media in Illinois'.split()]
sentence_president = [token.lower() for token in 'The president greets the press in Chicago'.split()]

In [17]:
from gensim.parsing.preprocessing import STOPWORDS

# Drop every token that appears in gensim's STOPWORDS set from both sentences,
# keeping the remaining tokens in their original order.
sentence_obama, sentence_president = (
    [token for token in sentence if token not in STOPWORDS]
    for sentence in (sentence_obama, sentence_president)
)

In [23]:
# `print` was rebound to pprint.pprint earlier in this notebook; pprint shows
# the repr of a string, i.e. the message would appear wrapped in quotes.
# Use the builtin print so the sentence is displayed as plain text.
import builtins

# Word Mover's Distance between the two stop-word-filtered token lists.
distance = wv.wmdistance(sentence_obama, sentence_president)
builtins.print(f"Word Movers Distance is {distance} (lower means closer)")

2023-04-04 15:14:40,124 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2023-04-04 15:14:40,125 : INFO : built Dictionary<8 unique tokens: ['illinois', 'media', 'obama', 'speaks', 'chicago']...> from 2 documents (total 8 corpus positions)
2023-04-04 15:14:40,127 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<8 unique tokens: ['illinois', 'media', 'obama', 'speaks', 'chicago']...> from 2 documents (total 8 corpus positions)", 'datetime': '2023-04-04T15:14:40.127118', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'created'}


'Word Movers Distance is 0.01587736816265295 (lower means closer)'
