In [24]:
import gensim
import os, math
import numpy as np
import pandas as pd
import seaborn as sns
import helpers
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
X_train, X_test, y_train, y_test = helpers.train_test_split('./data/training_variants', './data/training_text')

Will mostly follow examples from
http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/
and
https://codesachin.wordpress.com/2015/10/09/generating-a-word2vec-model-from-a-block-of-text-using-gensim-python/

In [26]:
unique_text = X_train.Text.unique()

In [27]:
def build_sentences(text):
    sentences = []
    for document in text:
        sts = document.split('.') # list of sentences in the document
        for s in sts:
            sentences.append(s.split(' '))
    return sentences

Train the w2v model on all the unique text in the training set

In [28]:
k = build_sentences(unique_text)
model = gensim.models.Word2Vec(k, size=100)
#model.wv.vocab

In [48]:
print(model.similarity('egfr', 'gene'))
print(model.similarity('tumor', 'tumor'))
print(model.similarity('mutation', 'gene'))

0.0288922343557
1.0
0.277487495915


In [37]:
X_test.head(10)
y_test[y_test['Gene'] == 'BRCA1']

Unnamed: 0,ID,Gene,Variation,Class
2672,2672,BRCA1,K45T,6
2513,2513,BRCA1,S1841A,1
2469,2469,BRCA1,R170Q,3
2440,2440,BRCA1,P1856S,5
2533,2533,BRCA1,Q1785H,5
2607,2607,BRCA1,M1775V,5
2499,2499,BRCA1,S153R,3
2486,2486,BRCA1,S1841R,1
2634,2634,BRCA1,W1718L,1
2573,2573,BRCA1,C1697R,4


Take the mean of all the word vectors in a document

In [47]:
def doc2vec(doc):
    words = doc.split(' ')
    N = 0
    v = np.zeros(100)
    for w in words:
        try:
            v+=model[w.rstrip('.')]
            N +=1
        except KeyError:
            pass
    v = (1./N)*v
    return v

Divide training set into training + validation

In [48]:
Ntotal = len(X_train.Text)
Ntr = 2124 # 80% of training set
X_tr = X_train.Text[0:Ntr]
X_validate = X_train.Text[Ntr:]
y_tr = y_train.Class[0:Ntr]
y_validate = y_train.Class[Ntr:]

Convert every document in the training set to a vector. This takes a few minutes to run.

In [49]:
Xtr = np.zeros((Ntr, 100))
i = 0
for doc in X_tr:
    Xtr[i,:] = doc2vec(doc)
    i += 1
    if i%500 == 0: print(i)
    

500
1000
1500
2000


Prediction method:
1. Get the centroid vectors from the training set.
2. Get distance of new vector from each centroid. Probabilities are the normalized distances from each centroid.
3. Return list of probabilities.

Evaluation metric: log-loss. For some reason the sklearn log-loss gives me something different from when I compute it explicitly, so I'm probably using it wrong.

Comparison to random chance where I set the probabilities to be just the underlying frequency in the training set.

In [169]:
from sklearn.neighbors.nearest_centroid import NearestCentroid
clf = NearestCentroid()
clf.fit(Xtr, y_tr)
centroids = clf.centroids_

def predict(X, ctr):
    p = np.zeros((X.shape[0], 9))
    for n in range(X.shape[0]):
        d = np.zeros(9)
        for i in range(9):
            d[i] = np.linalg.norm(X[n,:] - ctr[i,:])
        d = d/np.linalg.norm(d)
        p[n,:] = d
    return p

def onehot(y):
    output = np.zeros((len(y), 9))
    for i,n in enumerate(y):
        output[i,n-1]=1
    return output

def logloss(y, X):
    N = X.shape[0]
    x = 0
    for i in range(N):
        x+= np.dot(y[i,:], np.log2(X[i,:]))
    x = (-1./N)*x
    return x

print('Log-loss of w2v prediction')
print(logloss(onehot(y_tr), output))

output = predict(Xtr, centroids)

class_prob = np.sum(onehot(y_tr), axis=0)/len(y_tr)
random_chance = np.zeros_like(output)
for i in range(random_chance.shape[0]):
    random_chance[i,:] = class_prob
print('Log-loss of random chance prediction')
print(logloss(onehot(y_tr), random_chance)) # random chance prediction

#uniform guessing
print('Log-loss of uniform guessing')
print(logloss(onehot(y_tr), (1/9.)*np.ones_like(output)))

Log-loss of w2v prediction
1.78894351607
Log-loss of random chance prediction
2.63876002514
Log-loss of uniform guessing
3.16992500144


Persist the w2v model to disk. It turns out to be 119M.

In [76]:
#model.save('firstw2vmodel')