In [None]:
import nltk
# Fetch only the NLTK resources this notebook actually uses (sentence/word
# tokenizer models and the stopword corpus) instead of the entire 'all'
# collection, which also made the separate 'punkt' download redundant.
# 'punkt_tab' is the tokenizer package that newer NLTK releases load; older
# releases use 'punkt', so both are requested.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
  nltk.download(resource)

In [None]:
import re
import requests
from bs4 import BeautifulSoup as bs
import nltk

## Getting Data

In [None]:
# Download the article. The timeout keeps a stalled connection from hanging the
# kernel, and raise_for_status() stops the notebook on an HTTP error instead of
# silently parsing an error page as if it were the article.
resp = requests.get('https://en.wikipedia.org/wiki/Artificial_intelligence', timeout=30)
resp.raise_for_status()

parsed_article = bs(resp.text, 'lxml')
paragraphs = parsed_article.find_all('p')

# Concatenate the text of every <p> element in a single pass.
article_text = "".join(p.text for p in paragraphs)

## Cleaning Data

In [None]:
def clean_data(text):
  """Lower-case `text` and reduce it to ASCII letters separated by single spaces.

  Every character that is not a-z/A-Z (digits, punctuation, whitespace,
  non-ASCII letters) is replaced by a space, then each run of whitespace is
  collapsed to one space. Leading/trailing spaces are not stripped.
  """
  letters_only = re.sub('[^a-zA-Z]', ' ', text.lower())
  return re.sub(r'\s+', ' ', letters_only)


def tokenize_data(text):
  """Split `text` into sentences and each sentence into word tokens.

  Returns a list with one entry per sentence found by nltk.sent_tokenize;
  each entry is the list produced by nltk.word_tokenize for that sentence.
  """
  sentences = nltk.sent_tokenize(text)
  return list(map(nltk.word_tokenize, sentences))


def removing_stopwords(all_words):
  """Remove English stopwords from every tokenised sentence.

  Args:
    all_words: list of sentences, each a list of word tokens. It is updated
      in place: every inner list is replaced by its filtered copy.

  Returns:
    The same `all_words` list object, now holding the filtered sentences.
  """
  from nltk.corpus import stopwords

  # Load the stopword list once and keep it as a set. Calling
  # stopwords.words('english') inside the comprehension re-evaluated it for
  # every single token and did a linear list search each time.
  english_stopwords = set(stopwords.words('english'))
  for i, sentence in enumerate(all_words):
    all_words[i] = [w for w in sentence if w not in english_stopwords]
  return all_words


def prepare_data(text):
  """Turn raw text into a list of stopword-free token lists, one per sentence.

  Args:
    text: raw text, punctuation included.

  Returns:
    A list of sentences, each a list of lower-case word tokens with English
    stopwords removed. Sentences left with no tokens are dropped.
  """
  # Split into sentences BEFORE cleaning. clean_data replaces every non-letter
  # (including '.', '?' and '!') with a space, so cleaning the whole text first
  # leaves the sentence tokenizer nothing to split on and the entire text comes
  # back as a single giant "sentence".
  tokens = []
  for sentence in nltk.sent_tokenize(text):
    tokens.extend(tokenize_data(clean_data(sentence)))
  processed_tokens = removing_stopwords(tokens)
  # A sentence made only of digits/punctuation/stopwords ends up empty.
  return [sentence for sentence in processed_tokens if sentence]

In [None]:
# Preprocess the scraped article text into per-sentence token lists.
all_words = prepare_data(article_text)
# Number of tokens in the first sentence returned by the pipeline.
len(all_words[0])

5629

## Building word2vec model

In [None]:
from gensim.models import Word2Vec

# Train Word2Vec on the tokenised sentences.
# A fixed `seed` is not enough for repeatable vectors: gensim trains with
# several worker threads by default and their scheduling order varies between
# runs, so training is pinned to a single worker. (gensim's documentation also
# calls for a fixed PYTHONHASHSEED for full run-to-run determinism.)
# Optional: pass negative=10 to set the number of negative samples.
model = Word2Vec(
  all_words,
  min_count=2,  # ignore words that occur fewer than 2 times
  window=8,     # context window size around each target word
  seed=14,
  workers=1,
)

In [None]:
# Print the model's one-line summary string.
print(model)

Word2Vec(vocab=934, size=100, alpha=0.025)


In [None]:
# gensim < 4 exposes the vocabulary mapping as `wv.vocab`; gensim >= 4 removed
# that attribute (accessing it raises AttributeError) in favour of
# `wv.key_to_index`. Either way `vocabulary` is a mapping keyed by word.
try:
  vocabulary = model.wv.vocab
except AttributeError:
  vocabulary = model.wv.key_to_index

# Show the size and a small sample instead of dumping the entire mapping.
print(len(vocabulary), 'words; first 20:', list(vocabulary)[:20])

{'artificial': <gensim.models.keyedvectors.Vocab object at 0x7ff1618f36d8>, 'intelligence': <gensim.models.keyedvectors.Vocab object at 0x7ff1871bca90>, 'ai': <gensim.models.keyedvectors.Vocab object at 0x7ff1871bcb00>, 'machines': <gensim.models.keyedvectors.Vocab object at 0x7ff1618f61d0>, 'unlike': <gensim.models.keyedvectors.Vocab object at 0x7ff160060588>, 'natural': <gensim.models.keyedvectors.Vocab object at 0x7ff15f2cce48>, 'displayed': <gensim.models.keyedvectors.Vocab object at 0x7ff15f2da198>, 'humans': <gensim.models.keyedvectors.Vocab object at 0x7ff15ee16320>, 'animals': <gensim.models.keyedvectors.Vocab object at 0x7ff15ee2ce10>, 'involves': <gensim.models.keyedvectors.Vocab object at 0x7ff15ee2ce48>, 'consciousness': <gensim.models.keyedvectors.Vocab object at 0x7ff15ee2ce80>, 'categories': <gensim.models.keyedvectors.Vocab object at 0x7ff15ee2ceb8>, 'often': <gensim.models.keyedvectors.Vocab object at 0x7ff15ee2cef0>, 'strong': <gensim.models.keyedvectors.Vocab object 

In [None]:
# Finding the vector for a word: index the trained model's keyed vectors
# (`model.wv`) by the word itself.
v1 = model.wv['artificial']
# Display the looked-up vector.
v1

array([-0.00557132, -0.00793488,  0.00342524,  0.00441834, -0.00482484,
       -0.00990499, -0.00308784, -0.00435968,  0.0031534 , -0.00265124,
       -0.00275894,  0.00209266,  0.01058484,  0.00320443,  0.00084767,
       -0.0049696 , -0.00366577, -0.01230477, -0.00207417, -0.00190521,
       -0.00556347, -0.00199152,  0.00249953,  0.01387168,  0.00660826,
       -0.00780312,  0.0066384 , -0.00465613,  0.01214761, -0.00189341,
       -0.00023113,  0.00194597, -0.00119612,  0.00150809, -0.00601039,
        0.00023221, -0.00884125,  0.00012608,  0.00503777,  0.00209334,
       -0.0034991 ,  0.00436607, -0.0051123 ,  0.01044457,  0.00464531,
        0.0016968 ,  0.0036553 ,  0.00337477, -0.00025673, -0.01324162,
       -0.00347346, -0.01312127,  0.00224645, -0.00334488, -0.00233944,
        0.00120059, -0.00167149, -0.00768431,  0.00053615, -0.00534271,
       -0.00702318,  0.00705662, -0.00282644, -0.00118589, -0.00876798,
        0.00023598, -0.00043296,  0.00614347,  0.00884653,  0.00

## Finding Similar Words

In [None]:
# Ask the trained model for the words most similar to 'intelligence'.
sim_words = model.wv.most_similar('intelligence')
# Display the result.
sim_words

[('ai', 0.7138211727142334),
 ('also', 0.6824641823768616),
 ('human', 0.6631876826286316),
 ('system', 0.6473186016082764),
 ('artificial', 0.6468836069107056),
 ('research', 0.635960578918457),
 ('humans', 0.6194682121276855),
 ('example', 0.6006100177764893),
 ('people', 0.5797998309135437),
 ('neural', 0.5713820457458496)]