In [1]:
import json
import sys
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook, tnrange
from collections import Counter, defaultdict
import re
from gensim import models
from os import listdir
from os.path import isfile, join
import pymorphy2

In [2]:
morph = pymorphy2.MorphAnalyzer()

In [12]:
folder = '/home/v.gulin/code/questions_data/json1/'
onlyfiles = [f for f in listdir(folder) if isfile(join(folder, f))]

In [13]:
questions_all = []
for f in onlyfiles:
    questions_all += json.loads(json.load(open(folder + f)))

In [17]:
questions_train = questions_all

In [14]:
morphDict = {}
def text_to_tokens(txt):
    txt = txt.lower()
    txt = re.subn('<[a-z\s]*>', '', txt)[0]
    #txt = re.sub('http\w*\s', '', txt)
    r = re.split('(\s|\-|\.|\,)', txt)
    r = (re.subn('[^А-Яа-яA-Za-z\-\.\,\s]', '', _)[0] for _ in r)
    r = [_ for _ in r if len(_)>1 and 'http' not in _]
    result = []
    for w in r:
        if (w not in morphDict):
            morphDict[w] = morph.parse(w)[0].normal_form
        result.append(morphDict[w])
    return result

In [15]:
text_to_tokens('Мама мыла раму!')

['мама', 'мыло', 'рам']

In [18]:
corpus = []
token_counts = Counter()
for q in tqdm_notebook(questions_train):
    corpus.append(text_to_tokens(q['question']))
    for a in q['answers']:   
        r = text_to_tokens(a)
        corpus.append(r)
        token_counts.update(r)

HBox(children=(IntProgress(value=0, max=52143), HTML(value='')))




In [19]:
min_count = 4

tokens = [w for w, c in token_counts.items() if c > min_count] 
tokens = ["_PAD_", "_UNK_"] + tokens

token_to_id = {t : i for i, t in enumerate(tokens)}

In [20]:
corpus

[['какой',
  'документальный',
  'фильм',
  'вы',
  'считать',
  'самый',
  'интересный',
  'качественный'],
 ['фильм',
  'дом',
  'который',
  'многие',
  'слышать',
  'часть',
  'этот',
  'многий',
  'уже',
  'писать',
  'да',
  'он',
  'запретить',
  'да',
  'он',
  'красить',
  'потому',
  'что',
  'люк',
  'бессон',
  'потому',
  'что',
  'фотограф',
  'артюс',
  'бертран',
  'потому',
  'что',
  'мы',
  'мочь',
  'увидеть',
  'свой',
  'планет',
  'сидеть',
  'маленькая',
  'комнатушка',
  'отделать',
  'по',
  'европейск',
  'этот',
  'фильм',
  'также',
  'отделать',
  'сознание',
  'заставить',
  'задуматься',
  'что',
  'же',
  'мы',
  'делать',
  'собственный',
  'дом',
  'он',
  'интересный',
  'свой',
  'интересный',
  'ракурс',
  'собственный',
  'правдивость',
  'не',
  'менее',
  'прекрасный',
  'занимательный'],
 ['манхеттен', 'manhatta', 'сша', 'чарльз', 'шилера', 'половина', 'страндт'],
 ['зачем',
  'samsung',
  'htc',
  'возвращать',
  'на',
  'рынок',
  'смартфон',

In [21]:
word2vec_size=128

In [22]:
w2v = models.Word2Vec(corpus, min_count=1, size=word2vec_size, workers=10)

In [23]:
PAD_ix = token_to_id["_PAD_"]
UNK_ix = token_to_id['_UNK_']

def as_matrix(sequences, max_len=None):
    if isinstance(sequences[0], (str, bytes)):
        sequences = [text_to_tokens(s) for s in sequences]
        
    max_len = max_len or max(map(len,sequences))
    
    matrix = np.zeros((len(sequences), max_len), dtype='int32') + PAD_ix
    for i, seq in enumerate(sequences):
        row_ix = [token_to_id.get(word, UNK_ix) for word in seq[:max_len]]
        matrix[i, :len(row_ix)] = row_ix
    
    return matrix

In [24]:
w2v.wv.vocab

{'какой': <gensim.models.keyedvectors.Vocab at 0x7fa83549e9e8>,
 'документальный': <gensim.models.keyedvectors.Vocab at 0x7fa841369588>,
 'фильм': <gensim.models.keyedvectors.Vocab at 0x7fa7fee3d128>,
 'вы': <gensim.models.keyedvectors.Vocab at 0x7fa7fee3def0>,
 'считать': <gensim.models.keyedvectors.Vocab at 0x7fa7fee3d780>,
 'самый': <gensim.models.keyedvectors.Vocab at 0x7fa7fee3d3c8>,
 'интересный': <gensim.models.keyedvectors.Vocab at 0x7fa7fee3d2e8>,
 'качественный': <gensim.models.keyedvectors.Vocab at 0x7fa7fee3d4e0>,
 'дом': <gensim.models.keyedvectors.Vocab at 0x7fa7fee3d6a0>,
 'который': <gensim.models.keyedvectors.Vocab at 0x7fa7fee3d588>,
 'многие': <gensim.models.keyedvectors.Vocab at 0x7fa7fee3d668>,
 'слышать': <gensim.models.keyedvectors.Vocab at 0x7fa7fee3d8d0>,
 'часть': <gensim.models.keyedvectors.Vocab at 0x7fa7fee3d208>,
 'этот': <gensim.models.keyedvectors.Vocab at 0x7fa7fee3d160>,
 'многий': <gensim.models.keyedvectors.Vocab at 0x7fa7fee3d240>,
 'уже': <gensim.m

In [25]:
w2v.most_similar(['король', 'женщина'], ['мужчина'])

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('князь', 0.8170202970504761),
 ('де', 0.7870656251907349),
 ('ii', 0.7804590463638306),
 ('iii', 0.7776327133178711),
 ('грозный', 0.7769745588302612),
 ('ленин', 0.775465190410614),
 ('николай', 0.7737562656402588),
 ('борис', 0.7711115479469299),
 ('джон', 0.7696213722229004),
 ('королева', 0.7678171992301941)]

In [26]:
import torch, torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

In [None]:
class QuestionVectorizer(nn.Module):
    def __init__(self, n_tokens=len(tokens), out_size=64, use_global_emb=True):
        """ 
        A simple sequential encoder for questions.
        Use any combination of layers you want to encode a variable-length input 
        to a fixed-size output vector
        """
        super(self.__class__, self).__init__()
        
        """<PUT YOUR CODE HERE>"""
        
    def forward(self, text_ix):
        """
        :param text_ix: int64 Variable of shape [batch_size, max_len]
        :returns: float32 Variable of shape [batch_size, out_size]
        """
        """<PUT YOUR CODE HERE>"""

In [None]:
class AnswerVectorizer(nn.Module):
    def __init__(self, n_tokens=len(tokens), out_size=64, use_global_emb=True):
        """ 
        A simple sequential encoder for answers.
        x -> emb -> conv -> global_max -> relu -> dense
        
        If use_global_emb is True, use GLOBAL_EMB as your embedding layer
        """
        super(self.__class__, self).__init__()
        """<PUT YOUR CODE HERE>"""
        
    def forward(self, text_ix):
        """
        :param text_ix: int64 Variable of shape [batch_size, max_len]
        :returns: float32 Variable of shape [batch_size, out_size]
        """
        """<PUT YOUR CODE HERE>"""

In [27]:
test = as_matrix(["В какой стране статистически больше всего левшей? Существует ли страна/город, в котором левшей больше чем правшей?", "Говорят, в ЮАР около 50% населения - левши, возможно и больше 50%, а возможно и меньше."])
print(test)

[[   93    63 19241   196   456 16748  2413  1027     1     4 16748   196
   1004 10236]
 [ 1422  4196  1921  1769 16748   704   196   704   283     0     0     0
      0     0]]


In [None]:
for vectorizer in [QuestionVectorizer(out_size=100), AnswerVectorizer(out_size=100)]:
    print("Testing %s ..." % vectorizer.__class__.__name__)
    dummy_x = Variable(torch.LongTensor(test))
    dummy_v = vectorizer(dummy_x)

    assert isinstance(dummy_v, Variable)
    assert tuple(dummy_v.shape) == (dummy_x.shape[0], 100)

    del vectorizer
    print("Seems fine")

In [31]:
from itertools import chain

question_vectorizer = QuestionVectorizer()
answer_vectorizer = AnswerVectorizer()

opt = torch.optim.Adam(chain(question_vectorizer.parameters(),
                             answer_vectorizer.parameters()))

1


In [29]:
def generate_batch(data, batch_size=None, replace=False, volatile=False, max_len=None):
    """ Samples training/validation batch with random negatives """
    if batch_size is not None:
        batch_ix = np.random.choice(len(data), batch_size, replace=replace)
        negative_ix = np.random.choice(len(data), batch_size, replace=True)
    else:
        batch_ix = range(len(data))
        negative_ix = np.random.permutation(np.arange(len(data)))

    
    anchors, positives = zip(*[data[i] for i in batch_ix])
    
    # sample random rows as negatives.
    # Note: you can do better by sampling "hard" negatives
    negatives = [data[i][1] for i in negative_ix]
    
    anchors, positives, negatives = map(lambda x: Variable(torch.LongTensor(as_matrix(x, max_len=max_len)),
                                                           volatile=volatile), 
                                        [anchors, positives, negatives])
    return anchors, positives, negatives

In [28]:
def build_dataset(train_data):
    '''Takes data
    Returns a list of tuples - a set of pairs (q, a_+)
    '''
    dataset = []
    for row in tqdm_notebook(train_data):
        question, answer = row['question'], row['answers'][0]
        dataset.append((question, answer))
        
    return dataset

In [32]:
train_data = build_dataset(questions_train)

HBox(children=(IntProgress(value=0, max=52143), HTML(value='')))




In [34]:
_dummy_anchors, _dummy_positives, _dummy_negatives = generate_batch(train_data, 2)

In [None]:
def compute_loss(anchors, positives, negatives, delta=3):
    """ 
    Compute the triplet loss:
    
    max(0, delta + sim(anchors, negatives) - sim(anchors, positives))
    
    where sim is a dot-product between vectorized inputs
    
    """
    
    """<PUT YOUR CODE HERE>"""

In [None]:
print(compute_loss(_dummy_anchors, _dummy_positives, _dummy_negatives))

In [35]:
num_epochs = 100
max_len = 100
batch_size = 32
batches_per_epoch = 100

In [36]:
def iterate_minibatches(data, batch_size=32, max_len=None,
                        max_batches=None, shuffle=True, verbose=True):
    indices = np.arange(len(data))
    if shuffle:
        indices = np.random.permutation(indices)
    if max_batches is not None:
        indices = indices[: batch_size * max_batches]
        
    irange = tnrange if verbose else range
    
    for start in irange(0, len(indices), batch_size):
        yield generate_batch([data[i] for i in indices[start : start + batch_size]], max_len=max_len)

In [None]:
for epoch_i in range(num_epochs):
    
    print("Training:")
    train_loss = train_mae = train_batches = 0    
    
    for batch in iterate_minibatches(train_data, max_batches=batches_per_epoch, batch_size=512):
        

        loss = compute_loss(*batch)
        loss.backward()
        opt.step()
        opt.zero_grad()
        

        train_loss += loss.data.cpu().numpy()
        train_batches += 1
    
    print("\tLoss:\t%.5f" % (train_loss / train_batches))
    print('\n\n')
