In [1]:
import sys
import itertools
import os
import json

import numpy as np
from scipy.spatial import distance
from nltk import word_tokenize, sent_tokenize
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Put the parent directory on the import path (only once, so re-running the
# cell does not add duplicates) so that the `src` package can be imported.
# NOTE(review): presumably '..' is the repository root relative to the
# notebook's working directory -- confirm.
if '..' not in sys.path:
    sys.path.append('..')

from src.document import Document

In [3]:
def get_data(path):
    """Load consecutively numbered problems from a directory.

    Reads ``problem-1.txt`` / ``problem-1.truth``, ``problem-2.txt`` /
    ``problem-2.truth``, ... and stops at the first index for which either
    file is missing.

    Args:
        path: Directory containing the ``problem-<i>.txt`` text files and the
            matching ``problem-<i>.truth`` JSON files.

    Returns:
        A list of ``Document`` objects, one per problem, in index order. Each
        is built as ``Document(text, **truth)`` where ``truth`` is the decoded
        JSON object. The list is empty if ``problem-1`` does not exist.
    """
    data = []
    for i in itertools.count(start=1):
        base = os.path.join(path, 'problem-' + str(i))
        try:
            # Context managers close the handles deterministically; the
            # original left both files open until garbage collection.
            with open(base + '.txt', 'r') as text_file:
                text = text_file.read()
            with open(base + '.truth') as truth_file:
                changes = json.load(truth_file)
        except FileNotFoundError:
            # A missing file marks the end of the numbered sequence.
            break
        # Built outside the try block so that a FileNotFoundError raised by
        # Document itself is not mistaken for "no more problems".
        data.append(Document(text, **changes))
    return data

In [4]:
# Load the training and validation problems as lists of Document objects.
train = get_data('../data/train_raw')
validation = get_data('../data/validation_raw')

In [5]:
# Dimensionality of the GloVe vectors; also selects which file is read.
embedding_size = 300
# Maps each token in the GloVe file to its float32 embedding vector.
embeddings_index = {}
# GloVe files are UTF-8 text. Passing the encoding explicitly avoids decode
# errors or garbled tokens on systems whose default locale encoding differs.
with open(os.path.expanduser('~/Downloads/glove.6B/glove.6B.{}d.txt'.format(embedding_size)),
          encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines (e.g. a trailing newline at end of file)
            # instead of failing on values[0].
            continue
        # Each line is "<token> <v1> <v2> ... <vN>".
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [6]:
def word2vec(w):
    """Return the embedding vector for a word, ignoring case.

    The word is lower-cased before the lookup in ``embeddings_index``.
    Unknown words map to an all-zero float32 vector of length
    ``embedding_size``. For known words the stored array itself is returned
    (not a copy).
    """
    vector = embeddings_index.get(w.lower())
    if vector is None:
        return np.zeros(embedding_size, dtype='float32')
    return vector

In [7]:
def docs2X_y(documents):
    """Turn documents into pairwise segment-similarity features and labels.

    Each document is cut into consecutive word segments:

    * if ``d.has_changes`` is true, at the indices in ``d.word_positions``;
    * otherwise into three parts, at ``len(d.words) // 3`` and twice that.

    Every segment is represented by the sum of the ``word2vec`` vectors of
    its words. For each pair of adjacent segments one feature row is emitted
    with five values, in this order: min/max ratio, cosine distance,
    Bray-Curtis distance, Canberra distance and city-block distance. The
    label of every row is the document's ``has_changes`` value.

    Args:
        documents: Iterable of objects exposing ``has_changes``, ``words``
            and (when ``has_changes`` is true) ``word_positions``.

    Returns:
        A tuple ``(X, y)`` where ``X`` is a NumPy array of feature rows and
        ``y`` is a list of labels of the same length.
    """
    def minmax(u, v):
        # Ratio of summed element-wise minima to summed element-wise maxima.
        numerator = sum(np.minimum(u, v))
        denominator = sum(np.maximum(u, v))
        return numerator / denominator

    features, labels = [], []

    for doc in documents:
        # Segment boundaries; the None sentinels make the first and last
        # slices open-ended.
        if doc.has_changes:
            bounds = [None] + doc.word_positions + [None]
        else:
            third = len(doc.words) // 3
            bounds = [None, third, third * 2, None]

        # One summed embedding per consecutive [lo:hi] word slice.
        segment_vectors = [
            np.sum([word2vec(token) for token in doc.words[lo:hi]], axis=0)
            for lo, hi in zip(bounds[:-1], bounds[1:])
        ]

        # Compare every segment with its right-hand neighbour.
        for left, right in zip(segment_vectors[:-1], segment_vectors[1:]):
            row = np.array([
                minmax(left, right),
                distance.cosine(left, right),
                distance.braycurtis(left, right),
                distance.canberra(left, right),
                distance.cityblock(left, right)
            ])
            features.append(row)
            labels.append(doc.has_changes)

    return np.asarray(features), labels

In [8]:
# Build the feature matrix and labels for the training documents.
X_train, y_train = docs2X_y(train)

In [9]:
# Build the feature matrix and labels for the validation documents.
X_val, y_val = docs2X_y(validation)

In [10]:
# Standardize the features, then classify with an SVC using its default
# settings (the recorded fit output shows an RBF kernel with C=1.0).
model = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', SVC()),
])

In [11]:
# Fit the scaler and the classifier on the training features only.
model.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [12]:
# Score on the training set (for a scikit-learn classifier, `score` reports
# mean accuracy).
model.score(X_train, y_train)

0.9029800203183204

In [13]:
# Score on the held-out validation set (for a scikit-learn classifier,
# `score` reports mean accuracy).
model.score(X_val, y_val)

0.8939965694682676