In [1]:
import nltk

In [62]:
from gensim.models import Word2Vec
from nltk.corpus import stopwords
import re
import numpy as np

In [41]:
# Toy corpus: a short paragraph about artificial intelligence. It is the only text
# this notebook cleans, tokenizes and trains the Word2Vec model on.
paragraph = '''Artificial intelligence (AI) is intelligence demonstrated by machines, unlike the natural intelligence displayed by humans and animals, which involves consciousness and emotionality.
               The distinction between the former and the latter categories is often revealed by the acronym chosen.
               'Strong' AI is usually labelled as AGI (Artificial General Intelligence) while attempts to emulate 'natural' intelligence have been called ABI (Artificial Biological Intelligence).
               Leading AI textbooks define the field as the study of "intelligent agents": any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals.
               Colloquially, the term "artificial intelligence" is often used to describe machines (or computers) that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem solving".'''

### Preprocess the data

In [42]:
# Normalise the raw paragraph into lowercase text. Sentence-ending periods are kept
# so the text can still be split into sentences afterwards.
text = re.sub(r'\[[0-9]*\]', ' ', paragraph)  # drop bracketed numeric markers such as [12]
text = re.sub(r'\s+', ' ', text).lower()       # collapse newlines/indentation, then lowercase
text = re.sub(r'\d', ' ', text)                # blank out any remaining digits
text = re.sub(r'\s+', ' ', text)               # collapse the gaps the digit removal left

# Every match of these patterns is replaced by one space. Whitespace is NOT collapsed
# again afterwards, so `text` can contain runs of consecutive spaces; the tokenizers
# used later are insensitive to that.
PUNCTUATION_PATTERNS = (r'"', r'\+', r'\(+', r'\)+', r'\'+', r'\,+', r'\:+')
for pattern in PUNCTUATION_PATTERNS:
    text = re.sub(pattern, ' ', text)

In [43]:
text

'artificial intelligence  ai  is intelligence demonstrated by machines  unlike the natural intelligence displayed by humans and animals  which involves consciousness and emotionality. the distinction between the former and the latter categories is often revealed by the acronym chosen.  strong  ai is usually labelled as agi  artificial general intelligence  while attempts to emulate  natural  intelligence have been called abi  artificial biological intelligence . leading ai textbooks define the field as the study of  intelligent agents   any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals. colloquially  the term  artificial intelligence  is often used to describe machines  or computers  that mimic  cognitive  functions that humans associate with the human mind  such as  learning  and  problem solving .'

### Prepare the dataset

In [44]:
# Split the cleaned text into sentence strings; the periods kept during
# preprocessing are what mark the sentence boundaries.
sentences = nltk.sent_tokenize(text, language='english')

In [46]:
sentences, len(sentences)

(['artificial intelligence  ai  is intelligence demonstrated by machines  unlike the natural intelligence displayed by humans and animals  which involves consciousness and emotionality.',
  'the distinction between the former and the latter categories is often revealed by the acronym chosen.',
  'strong  ai is usually labelled as agi  artificial general intelligence  while attempts to emulate  natural  intelligence have been called abi  artificial biological intelligence .',
  'leading ai textbooks define the field as the study of  intelligent agents   any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals.',
  'colloquially  the term  artificial intelligence  is often used to describe machines  or computers  that mimic  cognitive  functions that humans associate with the human mind  such as  learning  and  problem solving .'],
 5)

In [47]:
# Turn each sentence string into a list of word tokens. The loop variable has its
# own name so it does not shadow the `sentences` list that is being iterated.
# NOTE: this rebinds `sentences` from a list of strings to a list of token lists,
# so the cell must be run exactly once after the sentence-splitting cell.
sentences = [nltk.word_tokenize(sentence) for sentence in sentences]

In [50]:
sentences[0]

['artificial',
 'intelligence',
 'ai',
 'is',
 'intelligence',
 'demonstrated',
 'by',
 'machines',
 'unlike',
 'the',
 'natural',
 'intelligence',
 'displayed',
 'by',
 'humans',
 'and',
 'animals',
 'which',
 'involves',
 'consciousness',
 'and',
 'emotionality',
 '.']

In [51]:
# Load the English stop-word list once and keep it as a set, instead of fetching
# the list again for every token and scanning it linearly.
english_stopwords = set(stopwords.words('english'))

# Filter each tokenised sentence in place. Punctuation tokens such as '.' are not
# stop words, so they remain in the token lists.
for i, tokens in enumerate(sentences):
    sentences[i] = [word for word in tokens if word not in english_stopwords]

In [56]:
sentences[0]

['artificial',
 'intelligence',
 'ai',
 'intelligence',
 'demonstrated',
 'machines',
 'unlike',
 'natural',
 'intelligence',
 'displayed',
 'humans',
 'animals',
 'involves',
 'consciousness',
 'emotionality',
 '.']

In [57]:
# Train Word2Vec on the tokenised sentences. min_count=1 keeps every token in the
# vocabulary, including words that occur only once in this tiny corpus.
# A fixed seed together with workers=1 removes the run-to-run jitter caused by
# multi-threaded training; for identical vectors across interpreter launches,
# PYTHONHASHSEED must be fixed as well.
model = Word2Vec(sentences, min_count=1, seed=1, workers=1)

In [53]:
# Vocabulary of the trained model as a dict keyed by word.
# gensim >= 4.0 exposes it as `key_to_index` (word -> integer index) and no longer
# supports `vocab`; older releases only have `vocab` (word -> Vocab object).
keyed_vectors = model.wv
words = (
    keyed_vectors.key_to_index
    if hasattr(keyed_vectors, 'key_to_index')
    else keyed_vectors.vocab
)

In [59]:
words

{'artificial': <gensim.models.keyedvectors.Vocab at 0x2311bf465e0>,
 'intelligence': <gensim.models.keyedvectors.Vocab at 0x2311bf46790>,
 'ai': <gensim.models.keyedvectors.Vocab at 0x2311bf46b50>,
 'demonstrated': <gensim.models.keyedvectors.Vocab at 0x2311bf46fd0>,
 'machines': <gensim.models.keyedvectors.Vocab at 0x2311bf4e880>,
 'unlike': <gensim.models.keyedvectors.Vocab at 0x2311bf4ed60>,
 'natural': <gensim.models.keyedvectors.Vocab at 0x2311bf4e610>,
 'displayed': <gensim.models.keyedvectors.Vocab at 0x2311bf4e1c0>,
 'humans': <gensim.models.keyedvectors.Vocab at 0x2311bf4e0a0>,
 'animals': <gensim.models.keyedvectors.Vocab at 0x2311bf4ecd0>,
 'involves': <gensim.models.keyedvectors.Vocab at 0x2311bf4eeb0>,
 'consciousness': <gensim.models.keyedvectors.Vocab at 0x2311bf4e6a0>,
 'emotionality': <gensim.models.keyedvectors.Vocab at 0x2311bf4e6d0>,
 '.': <gensim.models.keyedvectors.Vocab at 0x2311bf4e820>,
 'distinction': <gensim.models.keyedvectors.Vocab at 0x2311bf4eaf0>,
 'form

In [60]:
# Fetch the learned embedding for the token 'device' from the trained model.
vector = model.wv.get_vector('device')

In [64]:
vector

array([-3.2746969e-03,  4.8221359e-03,  4.1950741e-03,  2.9707656e-03,
       -8.2473678e-05,  1.6816205e-03, -3.6026742e-03,  2.2184183e-03,
        4.6044099e-03, -4.0549277e-03,  6.9225190e-04, -1.4797857e-03,
        1.4818120e-03,  4.4385483e-03, -2.3883611e-03,  4.9595948e-04,
       -4.7651469e-03, -8.0748176e-04, -3.5031573e-03,  1.9044990e-03,
       -6.3943886e-04, -3.2370435e-03,  4.8401910e-03, -4.3179858e-03,
       -3.2364435e-03,  4.2531341e-03, -1.4921257e-03, -2.4486333e-03,
        1.4367121e-03,  2.8324567e-03, -1.2400742e-04,  4.8365816e-03,
        3.1353519e-04,  5.5915327e-04, -3.4864524e-03,  4.7028223e-03,
        4.2276019e-03, -3.6769784e-03,  2.2585997e-03,  2.6589760e-03,
        1.3664982e-03,  3.0422374e-03, -1.7995614e-03, -3.4429505e-04,
        2.2470097e-03, -4.9894833e-04, -4.4592032e-03, -4.6731746e-03,
        1.0138962e-03, -3.8902722e-03, -2.3352471e-03,  5.7637302e-04,
        4.2800736e-03, -2.0617864e-03,  3.4770539e-03,  4.1199974e-03,
      

In [65]:
vector.shape

(100,)

In [69]:
# Ten nearest vocabulary entries to 'artificial', returned as (word, similarity) pairs.
similar = model.wv.most_similar(positive=['artificial'], topn=10)

In [70]:
similar

[('leading', 0.2593465745449066),
 ('natural', 0.23379497230052948),
 ('actions', 0.17415376007556915),
 ('successfully', 0.1713218241930008),
 ('computers', 0.17061316967010498),
 ('mind', 0.15536904335021973),
 ('associate', 0.14930176734924316),
 ('maximize', 0.14711712300777435),
 ('define', 0.14273163676261902),
 ('achieving', 0.1271485686302185)]