In [1]:
# character-based n-gram approach known as fastText, which is used to build word-level embeddings that outperform 
# Word2Vec in most use cases

In [2]:
# Word2Vec relies heavily on the vocabulary it has been trained to represent.
# we will use fastText to capture the information contained in the sub-words:

# Word representations are actually a result of the summation of their character n-grams:
#  two- and three-character n-grams for the word language:
# la, lan, an, ang, ng, ngu, gu, gua, ua, uag, ag, age, ge

# when certain words are missing from the training vocabulary or rarely occur, we can still have a representation 
# for them if their n-grams are present as part of other words.


In [6]:
from gensim.models import FastText
from gensim.test.utils import common_texts

In [7]:
model = FastText(vector_size=5,window=3,min_count=1)

In [8]:
model.build_vocab(corpus_iterable=common_texts)
model.train(corpus_iterable=common_texts,total_examples=len(common_texts),epochs=10)

(36, 290)

In [9]:
model.wv.key_to_index

{'system': 0,
 'graph': 1,
 'trees': 2,
 'user': 3,
 'minors': 4,
 'eps': 5,
 'time': 6,
 'response': 7,
 'survey': 8,
 'computer': 9,
 'interface': 10,
 'human': 11}

In [10]:
model.wv['human']

array([-0.03166138,  0.0232673 ,  0.01241681,  0.00036033,  0.02841444],
      dtype=float32)

In [11]:
model.wv.most_similar(positive=['computer','interface'],negative=['human'])

[('user', 0.7968782186508179),
 ('system', 0.17462214827537537),
 ('response', 0.10433418303728104),
 ('survey', 0.009605277329683304),
 ('trees', -0.076405368745327),
 ('time', -0.13300471007823944),
 ('minors', -0.1392730176448822),
 ('eps', -0.2409365177154541),
 ('graph', -0.29175299406051636)]

In [12]:
# helps us by setting the minimum and maximum lengths of the character n-grams so that we can build representations

In [13]:
model = FastText(vector_size=5,window=3,min_count=1,min_n=1,max_n=3)
model.build_vocab(corpus_iterable=common_texts)
model.train(corpus_iterable=common_texts,total_examples=len(common_texts),epochs=10)

(36, 290)

In [14]:
# we will try and build a representation of a word that does not occur in our vocabulary

In [15]:
model.wv['rubber']

array([ 0.03171583, -0.01782527, -0.00762408, -0.01099192, -0.01437307],
      dtype=float32)

In [16]:
model.wv.most_similar(positive=['computer','human'],negative=['rubber'])

[('eps', 0.9116687774658203),
 ('survey', 0.19021351635456085),
 ('time', 0.01102948747575283),
 ('trees', -0.04783657565712929),
 ('response', -0.12519775331020355),
 ('graph', -0.13138628005981445),
 ('system', -0.3514636158943176),
 ('user', -0.4300253093242645),
 ('interface', -0.4703499674797058),
 ('minors', -0.5255736112594604)]

In [17]:
## extending the model so that it incorporates new sentences and vocabulary

In [18]:
sentences_to_be_added =[['I','am','learning','Natural','Language','Processing'],
                       ['Natural','Language','Processing','is','cool']]

In [19]:
model.build_vocab(sentences_to_be_added,update=True)

In [20]:
model.train(corpus_iterable=common_texts,total_examples=len(sentences_to_be_added),epochs=10)

(43, 290)

In [21]:
model.wv.key_to_index

{'system': 0,
 'graph': 1,
 'trees': 2,
 'user': 3,
 'minors': 4,
 'eps': 5,
 'time': 6,
 'response': 7,
 'survey': 8,
 'computer': 9,
 'interface': 10,
 'human': 11,
 'I': 12,
 'am': 13,
 'learning': 14,
 'Natural': 15,
 'Language': 16,
 'Processing': 17,
 'is': 18,
 'cool': 19}

In [22]:
# the model was updated to incorporate the new vocabulary terms.

In [23]:
# The original fastText research paper extended on the 
# Skip-gram approach for Word2Vec, but today, both the Skip-gram and continuous bag-of-words approach can be used.

In [24]:
# fastText can be applied to solve a plethora of problems such as spelling correction, 
# auto suggestions, and so on since it is based on sub-word representation. 