# Word-vectors

In [1]:
import spacy
import numpy as np

from collections import defaultdict

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline


def tokenize(text, nlp):
    """Return the lemmas of the non-stop-word tokens in *text*.

    text: the raw string to process.
    nlp:  a callable that turns a string into an iterable of tokens
          exposing ``lemma_`` and ``is_stop`` (e.g. a loaded spaCy pipeline).
    """
    lemmas = []
    for tok in nlp(text):
        if tok.is_stop:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

class MeanEmbeddingVectorizer(object):
    """Embed each text as the mean of its non-stop-word token vectors.

    Exposes ``fit``/``transform`` so it can be used as a step in a
    scikit-learn ``Pipeline``.

    nlp: a loaded spaCy pipeline; ``nlp.vocab.vectors.shape[1]`` is taken
         as the embedding width (``self.dim``).
    """

    def __init__(self, nlp):
        self.nlp = nlp
        self.dim = nlp.vocab.vectors.shape[1]

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return one mean vector per text.

        X: iterable of texts.

        Returns an array with one row per text. A text that has no
        non-stop-word tokens (empty string, or stop words only) maps to a
        zero vector of length ``self.dim``.
        """
        return np.array([self._embed(text) for text in X])

    def _embed(self, text):
        """Average the vectors of the non-stop-word tokens of one text."""
        # spacy will default to vector of zeros if word has no embedding
        vectors = [token.vector
                   for token in self.nlp(text)
                   if not token.is_stop]
        if not vectors:
            # np.mean([]) is a scalar NaN, which would make the stacked
            # result ragged; fall back to an all-zero row instead.
            # float32 matches spaCy's vector dtype, so this row does not
            # upcast the rest of the batch.
            return np.zeros(self.dim, dtype=np.float32)
        return np.mean(vectors, axis=0)
    
class TfidfEmbeddingVectorizer(object):
    """Embed each text as the mean of its IDF-weighted token vectors.

    ``fit`` learns one IDF weight per lemma from the training texts;
    ``transform`` multiplies every non-stop-word token vector by the weight
    of its lemma and averages the results.

    nlp: a loaded spaCy pipeline; ``nlp.vocab.vectors.shape[1]`` is taken
         as the embedding width (``self.dim``).
    """

    def __init__(self, nlp):
        self.nlp = nlp
        self.dim = nlp.vocab.vectors.shape[1]
        # lemma -> IDF weight; populated by fit()
        self.word2weight = None

    def fit(self, X, y=None):
        """Learn per-lemma IDF weights from the texts in X; return ``self``."""
        tfidf = TfidfVectorizer(analyzer=lambda text: tokenize(text, self.nlp))
        tfidf.fit(X)
        idf = tfidf.idf_
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = idf.max()
        self.word2weight = defaultdict(
            lambda: max_idf,
            ((word, idf[index]) for word, index in tfidf.vocabulary_.items()))

        return self

    def transform(self, X):
        """Return one IDF-weighted mean vector per text.

        X: iterable of texts.

        Returns an array with one row per text. A text that has no
        non-stop-word tokens (empty string, or stop words only) maps to a
        zero vector of length ``self.dim``.
        """
        return np.array([self._embed(text) for text in X])

    def _embed(self, text):
        """Average the weighted vectors of the non-stop-word tokens of one text."""
        weighted = [token.vector * self.word2weight[token.lemma_]
                    for token in self.nlp(text)
                    if not token.is_stop]
        if not weighted:
            # np.mean([]) is a scalar NaN, which would make the stacked
            # result ragged; fall back to an all-zero row instead.
            return np.zeros(self.dim, dtype=np.float32)
        return np.mean(weighted, axis=0)

In [2]:
# Load the 'en_core_web_md' spaCy model; both vectorizers below read their
# token vectors from it.
nlp = spacy.load('en_core_web_md')

# Extra-trees classifier on plain mean-of-token-vector features.
trees_on_w2v = Pipeline(steps=[
    ("word2vec vectorizer", MeanEmbeddingVectorizer(nlp)),
    ("extra trees", ExtraTreesClassifier(n_estimators=200)),
])

# Same classifier, but on IDF-weighted mean-of-token-vector features.
trees_on_w2v_tfidf = Pipeline(steps=[
    ("word2vec vectorizer", TfidfEmbeddingVectorizer(nlp)),
    ("extra trees", ExtraTreesClassifier(n_estimators=200)),
])

In [3]:
# Tiny toy training set: one example text per sentiment label.
X = [
    "This is awsome",
    "This sucks",
    "I don't know how to feel about this",
]
y = ["positive", "negative", "neutral"]

# Fit each pipeline in turn and print its prediction for the same unseen text.
for model in (trees_on_w2v, trees_on_w2v_tfidf):
    model.fit(X, y)
    print(model.predict(["This is super"]))

['positive']
['positive']


## Dependency trees

In [4]:
# Built-in dependency visualizer
from spacy import displacy

# Parse one sentence and draw its dependency tree inline in the notebook.
doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers")
displacy.render(doc, jupyter=True, style='dep')

In [5]:
# One row per token: text, dependency label, head text, head POS tag, children.
# Each of the five columns is left-aligned and padded to 15 characters.
row_format = '%-15s' * 5
for token in doc:
    children = list(token.children)
    print(row_format % (
        token.text, token.dep_, token.head.text, token.head.pos_, children))

Autonomous     amod           cars           NOUN           []             
cars           nsubj          shift          VERB           [Autonomous]   
shift          ROOT           shift          VERB           [cars, liability]
insurance      compound       liability      NOUN           []             
liability      dobj           shift          VERB           [insurance, toward]
toward         prep           liability      NOUN           [manufacturers]
manufacturers  pobj           toward         ADP            []             
