Downloading "War and Peace"

In [None]:
import requests

# Plain-text edition of "War and Peace" served by gutenberg.org.
url = "http://www.gutenberg.org/files/2600/2600-0.txt"
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from silently becoming the corpus.
response = requests.get(url, timeout=60)
response.raise_for_status()
# bytes.decode() defaults to UTF-8; the codec is spelled out here for clarity.
text = response.content.decode('utf-8')

In [None]:
# Keep a local UTF-8 copy of the downloaded book.
with open('war_and_peace.txt', 'w', encoding='utf-8') as book_file:
    book_file.write(text)

In [None]:
# Read the book back from the local UTF-8 copy ('r' is open()'s default mode).
with open('war_and_peace.txt', encoding='utf-8') as book_file:
    text = book_file.read()

In [None]:
# Length of the book text, in characters.
len(text)

In [None]:
# Peek at the first 1000 characters of the book text.
text[:1000]

In [None]:
# Number of leading characters to drop: the meta info and the index that come
# before the actual book.
# NOTE(review): this offset presumably matches one specific version of the
# downloaded file - verify it if the file changes. Re-running this cell trims
# `text` again, because the cell overwrites its own input.
HEADER_LENGTH = 7285
text = text[HEADER_LENGTH:]
# Show the beginning of what is left.
print(text[:400])

In [None]:
# Delete the literal word 'CHAPTER' (whatever follows it on the heading line is
# kept) and turn every line feed and carriage return into a single space.
text = (
    text
    .replace('CHAPTER', '')
    .replace('\n', ' ')
    .replace('\r', ' ')
)

Tokenization

In [None]:
import nltk
# Fetch NLTK's 'punkt' resource before the tokenizers are used in the next cells.
# NOTE(review): newer NLTK releases may ask for 'punkt_tab' instead - confirm
# against the installed version.
nltk.download('punkt')

In [None]:
from nltk import word_tokenize
# Tokenize the whole book at once into one flat list of tokens.
tokens = word_tokenize(text)

In [None]:
# Total number of tokens in the book.
len(tokens)

In [None]:
# Sample of 50 consecutive tokens taken from inside the tokenized text.
print(tokens[2000:2050])

Sentence splitting

In [None]:
from nltk import sent_tokenize

In [None]:
# Split the book into a list of sentences.
sentences = sent_tokenize(text)

In [None]:
# Number of sentences found.
len(sentences)

In [None]:
# Show one example sentence (the one at index 6).
print(sentences[6])

Let's tokenize every sentence.
This is the expected input for word2vec.

In [None]:
# One token list per sentence; this list of lists is what the embedding models
# below are trained on.
tok_sents = list(map(word_tokenize, sentences))

In [None]:
# Inspect the first two tokenized sentences.
tok_sents[0:2]

A few statistics

In [None]:
# Vocabulary = the set of distinct tokens; the cell displays its size.
vocab = set(tokens)
len(vocab)

In [None]:
from collections import Counter

# Count the occurrences of every distinct token and keep the (token, count)
# pairs in a list so that they can be sorted in place afterwards.
token_counts = Counter(tokens)
frequencies = list(token_counts.items())
# Peek at the first 30 pairs; nothing has sorted them by count yet.
frequencies[:30]

In [None]:
# Sort the (token, count) pairs in place by count, most frequent first.
frequencies.sort(key=lambda x: x[1], reverse=True)

In [None]:
# Print the first 30 entries of `frequencies`, one "token <tab> count" per line.
for token, count in frequencies[:30]:
    print(token, '\t', count)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.loglog([i+1 for i,_ in enumerate(frequencies)],[f for w,f in frequencies])
plt.show()

### Fitting a Word2Vec model

A tutorial from gensim's author is available at https://rare-technologies.com/word2vec-tutorial/. The tutorial contains details and links on how gensim implements word2vec.

This logging-related command enables printed feedback during the fitting process.

In [None]:
import logging

# Every log record is rendered as "time : level : message"; the INFO level is
# what makes progress messages visible.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

The creation of the Word2Vec object coincides with the fitting of the model.
It thus takes some time.

In [None]:
from gensim.models import Word2Vec
# Train skip-gram embeddings (sg=1) with negative sampling (negative=10):
# 100-dimensional vectors, a context window of 10, 20 passes over the corpus,
# and tokens that occur fewer than 5 times are ignored (min_count=5).
# NOTE(review): `size` and `iter` are the gensim 3.x parameter names; gensim 4
# renamed them to `vector_size` and `epochs` - confirm the installed version.
w2v_model = Word2Vec(tok_sents, size=100, window=10, min_count=5, sg=1, iter=20, negative=10)

In [None]:
# Nearest neighbours of 'said' in the trained embedding space.
w2v_model.wv.most_similar(['said'])

In [None]:
# Analogy query: positive words 'sir' and 'she', negative word 'he'
# (i.e. "he is to sir as she is to ...?").
w2v_model.wv.most_similar(['sir','she'],['he'])

In [None]:
# Look at one raw embedding: the row at index 1 of the model's vector matrix.
w2v_model.wv.vectors[1]

In [None]:
# Nearest neighbours of 'father'.
w2v_model.wv.most_similar('father')

In [None]:
import requests

# Word-analogy test file; it is saved locally because the evaluation cells
# below pass the file name to evaluate_word_analogies().
url = "https://raw.githubusercontent.com/nicholas-leonard/word2vec/master/questions-words.txt"
test_file = 'questions-words.txt'
# The timeout avoids hanging on a stalled connection, and raise_for_status()
# prevents an HTTP error page from being saved as the test file.
response = requests.get(url, timeout=60)
response.raise_for_status()
questions = response.content.decode('utf-8')
with open(test_file, mode='w', encoding='utf-8') as outputfile:
    outputfile.write(questions)
# Preview the beginning of the file to show its format.
print(questions[:1000])

In [None]:
# Evaluate the War and Peace word2vec model on the analogy test file.
# With the default dummy4unknown=False, analogies that contain an
# out-of-vocabulary word are skipped rather than counted as errors.
w2v_wAp_analogy = w2v_model.wv.evaluate_word_analogies(test_file)

In [None]:
# Same evaluation, but with dummy4unknown=True analogies that contain an
# out-of-vocabulary word count as failures instead of being skipped.
# Note that this assignment replaces the result stored by the previous cell.
w2v_wAp_analogy = w2v_model.wv.evaluate_word_analogies(test_file, dummy4unknown=True)

Same data, using a FastText model (sub-word character n-grams get their own embeddings).

In [None]:
from gensim.models import FastText
# Train a FastText model on the same tokenized sentences, with the same
# hyper-parameters as the Word2Vec model, so the two can be compared.
# NOTE(review): `size` and `iter` are the gensim 3.x parameter names; gensim 4
# renamed them to `vector_size` and `epochs` - confirm the installed version.
fast_model = FastText(tok_sents,size=100, window=10, min_count=5, sg=1, iter=20, negative=10)

In [None]:
# Nearest neighbours of 'said' according to the FastText model.
fast_model.wv.most_similar(['said'])

In [None]:
# word2vec neighbours of the correctly spelled 'father', for comparison with
# the misspelled query in the next cell.
w2v_model.wv.most_similar('father')

In [None]:
# 'fathher' is a misspelling of 'father' that is presumably absent from the
# training vocabulary. word2vec stores vectors for whole words only, so the
# lookup raises KeyError for an unknown word. The error is caught and shown as
# the cell result, so that "Restart & Run All" can continue past this cell.
try:
    typo_neighbours = w2v_model.wv.most_similar('fathher')
except KeyError as error:
    typo_neighbours = f'KeyError: {error}'
typo_neighbours

In [None]:
# FastText neighbours of the correctly spelled 'father'.
fast_model.wv.most_similar('father')

In [None]:
# FastText neighbours of the misspelled 'fathher'; the sub-word n-gram
# embeddings should let the model build a vector even for this unseen spelling.
fast_model.wv.most_similar('fathher')

In [None]:
# Evaluate the War and Peace FastText model on the analogy test file
# (default settings, i.e. dummy4unknown=False).
fasttext_wAp_analogy = fast_model.wv.evaluate_word_analogies(test_file)

In [None]:
# Same evaluation with dummy4unknown=True; this assignment replaces the result
# stored by the previous cell.
fasttext_wAp_analogy = fast_model.wv.evaluate_word_analogies(test_file, dummy4unknown=True)

Using Google's precomputed word embeddings

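This section works only after Google's precomputed word embeddings have been downloaded. The cell below downloads the compressed file into `/root/input/`; the following cell loads the `.bin.gz` file from that path as it is, without uncompressing it first.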

In [None]:
# Path of the compressed GoogleNews vectors file.
# NOTE: the directory part must stay in sync with the -P target of the wget
# command below, which is written out separately.
EMBEDDING_FILE = '/root/input/GoogleNews-vectors-negative300.bin.gz'
# -P selects the download directory; -c resumes a partially downloaded file
# instead of starting over.
!wget -P /root/input/ -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

In [None]:
from gensim.models import KeyedVectors
# Load the pretrained vectors from the word2vec binary format (binary=True);
# the path points at the still-compressed .bin.gz file.
google_model = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

In [None]:
# Analogy query: positive words 'king' and 'female', negative word 'male'.
google_model.most_similar(['king','female'],['male'])

In [None]:
# Analogy query: positive words 'summer' and 'cold', negative word 'warm'.
google_model.most_similar(['summer','cold'],['warm'])

In [None]:
# `google_model` is already a KeyedVectors object (it comes from
# KeyedVectors.load_word2vec_format), so the evaluation is called on it
# directly, exactly like most_similar() and doesnt_match() in the neighbouring
# cells. Going through `.wv` relies on a deprecated alias that newer gensim
# versions no longer provide.
# dummy4unknown=True counts analogies with out-of-vocabulary words as failures.
w2v_large_analogy = google_model.evaluate_word_analogies(test_file, dummy4unknown=True)

In [None]:
# Ask the model which word of the list fits least well with the others.
google_model.doesnt_match(['sun','moon','sand','jupiter'])

Using fastText's precomputed word embeddings

https://fasttext.cc/docs/en/english-vectors.html

In [None]:
# Drop the reference to the GoogleNews vectors so that their memory can be
# reclaimed before the next pretrained model is loaded.
del google_model

In [None]:
# Download the compressed pretrained fastText model (cc.en.300) into the
# current directory; -c resumes a partially downloaded file.
!wget -c 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz'

In [None]:
# List the current directory to check that the archive has been downloaded.
%ls

In [None]:
# Decompress the archive to cc.en.300.bin.
# NOTE: by default gunzip removes the .gz file once it has been decompressed,
# so running the download cell again afterwards fetches the archive anew.
!gunzip cc.en.300.bin.gz

In [None]:
# List the directory again to check that the decompressed file is present.
%ls

In [None]:
from gensim.models.fasttext import FastText
# Load the pretrained model from the decompressed fastText binary file.
# NOTE(review): load_fasttext_format is deprecated in later gensim 3.x releases
# in favour of load_facebook_model / load_facebook_vectors - confirm against
# the installed version.
fasttext_model = FastText.load_fasttext_format('cc.en.300.bin')

In [None]:
# Neighbours of the correctly spelled 'father' in the pretrained fastText model.
fasttext_model.wv.most_similar('father')

In [None]:
# Neighbours of the misspelled 'fathher' in the pretrained fastText model.
fasttext_model.wv.most_similar('fathher')

In [None]:
# Evaluate the pretrained fastText model on the analogy test file;
# dummy4unknown=True counts analogies with out-of-vocabulary words as failures.
fasttext_large_analogy = fasttext_model.wv.evaluate_word_analogies(test_file, dummy4unknown=True)