In [1]:
import numpy as np
import pandas as pd

## WordNet
- Represent the meaning of words with a lot of "categiries" ("tags") based on taxionomy. 
- Need to be annotated by human, and some language don't have many WordNet.

In [40]:
from nltk.corpus import wordnet as wn
panda = wn.synset('panda.n.01')
hyper = lambda s: s.hypernyms()
list(panda.closure(hyper))

panda = wn.synset('dog.n.01')
hyper = lambda s: s.hypernyms()
list(panda.closure(hyper))

[Synset('canine.n.02'),
 Synset('domestic_animal.n.01'),
 Synset('carnivore.n.01'),
 Synset('animal.n.01'),
 Synset('placental.n.01'),
 Synset('organism.n.01'),
 Synset('mammal.n.01'),
 Synset('living_thing.n.01'),
 Synset('vertebrate.n.01'),
 Synset('whole.n.02'),
 Synset('chordate.n.01'),
 Synset('object.n.01'),
 Synset('physical_entity.n.01'),
 Synset('entity.n.01')]

## Bugs of Words (BOW) represetation of words
- Just counting the occurence of words in sentences. 
- 3 Step 
   1. Tokenize -> NLTK.tokenizer(), for english sentences, it's not that difficult. 
   2. Counting -> Count the occurence of tokens in each sentence. 
   3. Normalizing and weighting -> One of the most common idea "TF-IDF"
- The Probelm : Sparse, also not robust to new words. <br>
    > What if new words come in which doesn't show up in training data set? 

In [34]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = ['This is the first document.','This is the second second document.', \
          'And the third one.','Is this the first document?']
vectorizer = CountVectorizer()
vectorizer.fit_transform(corpus)
vectorized_sentence = vectorizer.transform(["This is the third document written by John."]).toarray()
print(vectorized_sentence)
print(vectorizer.get_feature_names())
# New words cannot be captured.

[[0 1 0 1 0 0 1 1 1]]
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfvectorizer = TfidfVectorizer()
tfidfvectorizer.fit_transform(corpus)
tfidfvectorized_sentence = tfidfvectorizer.transform(["This is the third document written by John."]).toarray()
print(tfidfvectorized_sentence)
print(tfidfvectorizer.get_feature_names())

[[ 0.          0.40412895  0.          0.40412895  0.          0.
   0.33040189  0.63314609  0.40412895]]
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


## Methods of window based co-occurence matrix 
- Capture the meaning by checking left/right neighbors of the target words, assuming the words co-occur together have similar topics. 
- Still sparse, not robust to new data.

In [36]:
# Easy example 
sentence = "A D G C E A D G F E B A C E D".split(" ")
m = 2
#Window size will be 5, 2 from l3ft, 2 from right and the center word itself.
distinct_word = len(set(sentence))
word_list = list(set(sentence))
print(word_list)
context_vector = [[0 for j in range(distinct_word)] for i in range(distinct_word)]

for word in set(sentence):
    indices = [i for i, x in enumerate(sentence) if x == word]
    for index in indices:
        for i in range(1, m+1):
            if index + i < len(sentence) and word != sentence[index + i]:
                context_vector[word_list.index(word)][word_list.index(sentence[index + i])] += 1
            if index - i >= 0 and word != sentence[index - i]:
                context_vector[word_list.index(word)][word_list.index(sentence[index - i])] += 1

print(context_vector)

['A', 'D', 'C', 'F', 'E', 'B', 'G']
[[0, 2, 2, 0, 3, 1, 2], [2, 0, 2, 1, 2, 0, 2], [2, 2, 0, 0, 2, 1, 1], [0, 1, 0, 0, 1, 1, 1], [3, 2, 2, 1, 0, 1, 2], [1, 0, 1, 1, 1, 0, 0], [2, 2, 1, 1, 2, 0, 0]]


## Word2Vec fastText 
- Main idea : Skipgram and CBOW(Lecture#2 was mainly talking about Skip-grams)

In [37]:
import gensim
model = gensim.models.KeyedVectors.load_word2vec_format("/Users/akariasai/Downloads/wiki.en.vec")
print(model.similarity('france', 'spain'))
print(model.similarity('france', 'japan'))
print(model.similarity('france', 'paris'))
print(model.similarity('japan', 'tokyo'))

0.62924441029
0.452758348183
0.615376118122
0.687043852834
