I) Loading text and preprocessing it

In [11]:
from gensim.utils import simple_preprocess

# Read the whole corpus into memory. The encoding is passed explicitly because
# open() otherwise uses the platform's locale encoding, which can mis-decode
# (or fail on) non-ASCII characters depending on the machine.
# NOTE(review): assumes './text' is stored as UTF-8 -- confirm.
with open('./text', 'r', encoding='utf-8') as file:
    content = file.read()

# simple_preprocess lower-cases the text and splits it into word tokens. With its
# default settings it also discards tokens shorter than 2 or longer than 15
# characters, which is why one-letter words such as "a" do not appear below.
tokens = simple_preprocess(content)

print(tokens)

['morocco', 'and', 'marrakech', 'tapestry', 'of', 'tradition', 'and', 'modernity', 'morocco', 'located', 'at', 'the', 'crossroads', 'of', 'europe', 'and', 'africa', 'is', 'country', 'drenched', 'in', 'history', 'mystery', 'and', 'cultural', 'richness', 'testament', 'to', 'the', 'ancient', 'civilizations', 'that', 'once', 'flourished', 'here', 'this', 'north', 'african', 'kingdom', 'boasts', 'unique', 'blend', 'of', 'arab', 'berber', 'and', 'european', 'influences', 'at', 'the', 'heart', 'of', 'morocco', 'rich', 'tapestry', 'lies', 'marrakech', 'one', 'of', 'its', 'four', 'imperial', 'cities', 'and', 'vibrant', 'epicenter', 'of', 'tradition', 'and', 'modernity', 'geographical', 'significance', 'morocco', 'is', 'bordered', 'by', 'the', 'atlantic', 'ocean', 'to', 'the', 'west', 'the', 'mediterranean', 'sea', 'to', 'the', 'north', 'algeria', 'to', 'the', 'east', 'and', 'southeast', 'and', 'the', 'vast', 'sahara', 'desert', 'to', 'the', 'south', 'its', 'strategic', 'location', 'has', 'histo

II) Word2Vec model creation & training

III) Extracting the vector representation of a word

In [12]:
from gensim.models import Word2Vec

# II)***************************************************************************
# Train a skip-gram Word2Vec model. The corpus handed to gensim is a list holding
# a single "sentence": the full token list from the preprocessing cell.
# NOTE(review): per the gensim docs the optimized routines truncate any single
# text at 10,000 words; split the corpus into sentences if it grows beyond that.
model = Word2Vec(
    [tokens],          # input data for training the model (one long sentence)
    vector_size=100,   # d (hyper-param): dimensionality of each word vector
    window=5,          # m (hyper-param): context words considered on each side of the center word
    min_count=1,       # minimum frequency for a word to be kept in the vocabulary
    sg=1,              # training algorithm: 1 = skip-gram, 0 = CBOW
    seed=1,            # explicit RNG seed (same value as gensim's default)
    workers=1,         # single worker thread: removes thread-scheduling jitter between runs
)
# For bit-identical vectors across interpreter launches, gensim additionally
# requires PYTHONHASHSEED to be fixed in the environment.

# III)***************************************************************************
# Look up the learned embedding (a vector of length vector_size) for one word.
vector = model.wv["flourished"]
print(vector)

[ 7.15501700e-03 -2.26053846e-04  5.81676885e-03 -9.00832005e-03
  7.31976004e-04 -4.65204148e-03  1.04327640e-03  2.66102399e-03
  9.45678726e-03  5.73837385e-03  3.07697197e-03 -6.83627697e-03
  6.86119683e-03 -5.25413174e-03 -3.02791665e-03 -1.98666076e-03
 -2.22545234e-03 -5.09377383e-03  6.93062413e-03  4.88746678e-03
  7.33957640e-05  1.21252076e-03 -3.19570513e-03 -5.77849150e-03
 -4.99767577e-03 -6.76846030e-05 -8.37225281e-03 -6.07777583e-05
  2.78886687e-03 -4.50139074e-03  4.35087224e-03  2.13767332e-03
  1.11611711e-03  5.15793031e-03  5.28235687e-04 -4.74359421e-03
  4.10419656e-03 -4.75682830e-03  8.40304513e-03 -8.02611187e-03
  3.53088696e-03 -4.32773074e-03 -9.04276792e-04  5.59440441e-03
  9.18012578e-03  4.92417067e-03  5.84045053e-03  8.48361943e-03
  3.96723952e-03 -1.77820539e-03 -2.02591415e-03 -8.23051855e-03
  8.22846312e-04 -6.70622010e-03  3.98282567e-03  9.79093020e-04
  7.02203810e-03 -7.95924291e-03 -7.18547963e-03  4.34139417e-03
  1.83731481e-03 -4.09613

IV) Calculate the similarity between two words

In [13]:
# Cosine similarity between the embeddings of two words; the score lies in [-1, 1]:
#   close to  1 -> similar
#   close to  0 -> no similarity
#   close to -1 -> dissimilar
word_pair = ("morocco", "tradition")
similarity = model.wv.similarity(*word_pair)
print(similarity)

0.0459574


V) Extract contextual words for a central word

In [14]:
# Fetch the 5 vocabulary words whose embeddings are most similar to "morocco".
# The result is a list of (word, similarity score) pairs, highest score first.
# Note: these are nearest neighbours in embedding space, not necessarily words
# that appear inside the training window around "morocco".
context_words = model.wv.most_similar(positive=["morocco"], topn=5)
print(context_words)

[('comes', 0.35773995518684387), ('snow', 0.33594006299972534), ('mountains', 0.3202439248561859), ('moroccan', 0.2772573232650757), ('against', 0.2763691246509552)]
