# Assignment 8
Develop a model for 20 news groups dataset from scikit-learn. Select 20% of data for test set.

Develop metric learning model with siamese network [3 points] and softmax loss or triplet loss [3 points] (from seminar). Use KNN and LSH (any library for approximate nearest neighbor search) for final prediction after the network was trained. [2 points]

! Remember, that LSH gives you a set of neighbor candidates, for which you have to calculate distances to choose top-k nearest neighbors.

Your quality metric = accuracy score [2 points if acc > 0.8 ]

In [16]:
import gensim
import gensim.downloader as api
import matplotlib.pyplot as plt
import numpy as np
import nltk
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader, TensorDataset

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from scipy.ndimage.filters import gaussian_filter1d
from tqdm import tqdm, tqdm_notebook

In [17]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/Asalamatina/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [18]:
batch_size = 256
random_state = 42

In [19]:
DEVICE = torch.device('cpu') 

In [20]:
newsgroups = fetch_20newsgroups()

In [21]:
corpus = pd.DataFrame(columns=['text', 'target'])
corpus['text'] = newsgroups.data
corpus['text']  = corpus['text'].apply(lambda sent: nltk.word_tokenize(sent))
corpus['target'] = newsgroups.target
corpus['target'] = corpus['target'].apply(float)

In [22]:
corpus.keys()

Index(['text', 'target'], dtype='object')

In [23]:
print(corpus)

                                                    text  target
0      [From, :, lerxst, @, wam.umd.edu, (, where, 's...     7.0
1      [From, :, guykuo, @, carson.u.washington.edu, ...     4.0
2      [From, :, twillis, @, ec.ecn.purdue.edu, (, Th...     4.0
3      [From, :, jgreen, @, amber, (, Joe, Green, ), ...     1.0
4      [From, :, jcm, @, head-cfa.harvard.edu, (, Jon...    14.0
...                                                  ...     ...
11309  [From, :, jim.zisfein, @, factory.com, (, Jim,...    13.0
11310  [From, :, ebodin, @, pearl.tufts.edu, Subject,...     4.0
11311  [From, :, westes, @, netcom.com, (, Will, Este...     3.0
11312  [From, :, steve, @, hcrlgw, (, Steven, Collins...     1.0
11313  [From, :, gunning, @, cco.caltech.edu, (, Kevi...     8.0

[11314 rows x 2 columns]


In [8]:
wv = api.load('word2vec-google-news-300')
wv['king'][:10]

array([ 0.12597656,  0.02978516,  0.00860596,  0.13964844, -0.02563477,
       -0.03613281,  0.11181641, -0.19824219,  0.05126953,  0.36328125],
      dtype=float32)

In [9]:
emb_dim = wv['king'].size

In [10]:
def vectorize_sent(sent, wv):
  sent_vec = []
  for w in sent:
    try:
      vec = wv[w]
      sent_vec.append(vec)
    except KeyError:
      continue
  return np.mean(np.array(sent_vec), axis=0)

In [12]:
corpus['vec'] = corpus['text'].apply(lambda sent: vectorize_sent(sent, wv))

In [13]:
corpus.head()

Unnamed: 0,text,target,vec
0,"[From, :, lerxst, @, wam.umd.edu, (, where, 's...",7.0,"[0.032288477, 0.03263666, 0.07366503, 0.066889..."
1,"[From, :, guykuo, @, carson.u.washington.edu, ...",4.0,"[-0.020951407, 0.072069034, 0.043153763, 0.052..."
2,"[From, :, twillis, @, ec.ecn.purdue.edu, (, Th...",4.0,"[0.02607478, 0.023831822, 0.02035976, 0.102583..."
3,"[From, :, jgreen, @, amber, (, Joe, Green, ), ...",1.0,"[-0.005437399, -0.0005287288, 0.017397419, 0.0..."
4,"[From, :, jcm, @, head-cfa.harvard.edu, (, Jon...",14.0,"[-0.07343274, 0.00018738056, 0.033099294, 0.09..."


In [14]:
def add_positive(row, corpus):
  target = row['target'] 
  vec = row['vec']
  positive = corpus[corpus['target'] == target]['vec']
  selected = np.random.choice(positive)
  while selected.tolist() == vec.tolist():
    selected = np.random.choice(positive)
  return selected

In [15]:
def add_negative(row, corpus):
  target = row['target'] 
  vec = row['vec']
  negative = corpus[corpus['target'] != target]['vec']
  return np.random.choice(negative)

In [0]:
positives = []
negatives = []
for i, row in corpus.iterrows():
  positives.append(add_positive(row, corpus))
  negatives.append(add_negative(row, corpus))
len(positives)

11314

In [0]:
corpus['positive'] = positives
corpus['negative'] = negatives

In [0]:
corpus.head()

Unnamed: 0,text,target,vec,positive,negative
0,"[From, :, lerxst, @, wam.umd.edu, (, where, 's...",7.0,"[0.032288477, 0.03263666, 0.07366503, 0.066889...","[-0.016801193, 0.043750726, 0.02573403, 0.0982...","[-0.023520788, -0.037075657, 0.04633649, 0.046..."
1,"[From, :, guykuo, @, carson.u.washington.edu, ...",4.0,"[-0.020951407, 0.072069034, 0.043153763, 0.052...","[0.019448416, 0.012540248, 0.03226182, 0.10422...","[-0.05689256, -1.608861e-05, 0.023813125, 0.09..."
2,"[From, :, twillis, @, ec.ecn.purdue.edu, (, Th...",4.0,"[0.02607478, 0.023831822, 0.02035976, 0.102583...","[0.0005824312, 0.012559735, 0.02057867, 0.0746...","[-0.019356176, 0.002154373, 0.07111602, 0.0900..."
3,"[From, :, jgreen, @, amber, (, Joe, Green, ), ...",1.0,"[-0.005437399, -0.0005287288, 0.017397419, 0.0...","[0.010827697, 0.014452622, 0.03526863, 0.06302...","[-0.027866324, 0.02060047, 0.061406907, 0.1130..."
4,"[From, :, jcm, @, head-cfa.harvard.edu, (, Jon...",14.0,"[-0.07343274, 0.00018738056, 0.033099294, 0.09...","[0.012298131, 0.018696018, 0.05699073, 0.10540...","[-0.015447801, 0.018567657, 0.042310752, 0.079..."


### Модель

In [0]:
train_corpus, test_corpus = train_test_split(corpus, test_size=0.2, random_state=random_state, shuffle=True)
train_corpus, val_corpus = train_test_split(train_corpus, test_size=0.1, random_state=random_state, shuffle=True)

In [0]:
class Batch:
  def __init__(self, anchors, positives, negatives, targets):
    self.anc = anchors
    self.pos = positives
    self.neg = negatives
    self.tgt = targets

In [0]:
class Iterator:
  def __init__(self, corpus, batch_size=128, device='cpu'):
    self.batches = self.make_batches(corpus, batch_size, device)

  @staticmethod
  def make_batches(corpus, batch_size, device):
    batches = []
    shuffled = corpus.sample(frac=1)
    i = 0
    while i < len(corpus):
      b = shuffled[i: i+batch_size]
      if len(b) == batch_size:
        anc = torch.tensor(list(b.vec.values)).to(device)
        pos = torch.tensor(list(b.positive.values)).to(device)
        neg = torch.tensor(list(b.negative.values)).to(device)
        tgt = torch.tensor(list(b.target.values)).to(device)
        batch = Batch(anc, pos, neg, tgt)
        batches.append(batch)
      i += batch_size
    return batches

In [0]:
Iterator(corpus, batch_size=batch_size, device=DEVICE).batches[0].pos

tensor([[-0.0677, -0.0115,  0.0479,  ..., -0.0248,  0.0321, -0.0339],
        [-0.0020,  0.0125,  0.0552,  ..., -0.0196,  0.0397, -0.0138],
        [-0.0180,  0.0081,  0.0297,  ..., -0.0602,  0.0237, -0.0415],
        ...,
        [-0.0393,  0.0270,  0.0303,  ..., -0.0027,  0.0395, -0.0127],
        [ 0.0606,  0.0232,  0.0271,  ..., -0.0236, -0.0061, -0.0570],
        [ 0.0296, -0.0002,  0.0269,  ..., -0.0504,  0.0171, -0.0089]],
       device='cuda:0')

In [0]:
trn_itr = Iterator(train_corpus, batch_size=batch_size, device=DEVICE).batches
vld_itr = Iterator(val_corpus, batch_size=batch_size, device=DEVICE).batches
tst_itr = Iterator(test_corpus, batch_size=batch_size, device=DEVICE).batches

In [0]:
len_train = len(trn_itr)
len_valid = len(vld_itr)
len_test = len(tst_itr)

In [0]:
class MyModel(nn.Module):
    
    def __init__(self, inp_dim=300, lin_dim=128):
        super(MyModel, self).__init__()
        self.fc = nn.Linear(inp_dim, lin_dim)
        
    def branch(self, x):
        x = self.fc(x)
        return x
        
    def forward(self, batch):
        anc = batch.anc
        pos = batch.pos
        neg = batch.neg
        
        anc = self.branch(anc)
        pos = self.branch(pos)
        neg = self.branch(neg)
        
        return anc, pos, neg


In [0]:
def train_epoch(data_iter, len_iter, n_epoch, model, criterion, optimizer=None):
    train_losses = []
    total_loss = 0
    data_iter = tqdm_notebook(data_iter, total=len_iter, desc=f"Epoch {n_epoch + 1}", leave=True)
    counter = 0
    for batch in data_iter:
        if optimizer:
          optimizer.zero_grad()
        anc, pos, neg = model.forward(batch)
        loss = criterion(anc, pos, neg)
        loss.backward()
        if optimizer:
          optimizer.step()
        loss_value = loss.detach().item()
        total_loss += loss_value
        train_losses.append(loss_value)
        data_iter.set_postfix(loss = loss_value)
        counter += 1
        
    total_loss /= counter
    return total_loss, train_losses


def valid_epoch(data_iter, len_iter, n_epoch, model, criterion):
    valid_losses = []
    total_loss = 0
    data_iter = tqdm_notebook(data_iter, total=len_iter, desc=f"Eval epoch {n_epoch + 1}", leave=True)
    counter = 0
    for batch in data_iter:
        with torch.no_grad():
            anc, pos, neg = model.forward(batch)
            loss = criterion(anc, pos, neg)
            loss_value = loss.detach().item()
            total_loss += loss_value
            valid_losses.append(loss_value)
            data_iter.set_postfix(loss = loss_value)
            counter +=1
        
    total_loss /= counter
    return total_loss, valid_losses

In [0]:
class TripletLoss(nn.L1Loss):
  def __init__(self, margin=0.1):
    super(TripletLoss, self).__init__()
    self.margin = margin
  
  def forward(self, anc, pos, neg):
    return torch.mean(F.cosine_similarity(anc, neg, dim=-1) - F.cosine_similarity(anc, pos, dim=-1) + self.margin)

In [0]:
criterion = TripletLoss()
criterion(torch.tensor([[1.0,2.0,3.0], [1.0,1.0,2.0]]), torch.tensor([[1.0,1.0,1.0], [1.0,0.0,2.0]]), torch.tensor([[0.0,0.0,0.0], [0.0,2.0,10.0]]))

tensor(-0.3790)

In [0]:
criterion = TripletLoss()
criterion = criterion.to(DEVICE)
model = MyModel(inp_dim=emb_dim, lin_dim=512)
model = model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters())
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2)

In [0]:
num_epochs = 15

In [0]:
total_train_losses = []
total_valid_losses = []
for epoch in range(num_epochs):
    model.train()
    loss, train_losses = train_epoch(trn_itr, len_train, epoch, model, criterion, optimizer)
    total_train_losses += train_losses
    print('train', loss)
    
    model.eval()
    with torch.no_grad():
        loss, valid_losses = valid_epoch(vld_itr, len_valid, epoch, model, criterion)
        total_valid_losses += valid_losses
        if scheduler:
          if type(scheduler) == torch.optim.lr_scheduler.ReduceLROnPlateau:
            scheduler.step(loss)
          else:
            scheduler.step() 
        print('valid', loss)

HBox(children=(IntProgress(value=0, description='Epoch 1', max=31, style=ProgressStyle(description_width='init…


train -0.25997267507257


HBox(children=(IntProgress(value=0, description='Eval epoch 1', max=3, style=ProgressStyle(description_width='…


valid -0.46416090925534564


HBox(children=(IntProgress(value=0, description='Epoch 2', max=31, style=ProgressStyle(description_width='init…


train -0.5361356754456797


HBox(children=(IntProgress(value=0, description='Eval epoch 2', max=3, style=ProgressStyle(description_width='…


valid -0.5526860952377319


HBox(children=(IntProgress(value=0, description='Epoch 3', max=31, style=ProgressStyle(description_width='init…


train -0.6025515544799066


HBox(children=(IntProgress(value=0, description='Eval epoch 3', max=3, style=ProgressStyle(description_width='…


valid -0.5841647585233053


HBox(children=(IntProgress(value=0, description='Epoch 4', max=31, style=ProgressStyle(description_width='init…


train -0.637180414892012


HBox(children=(IntProgress(value=0, description='Eval epoch 4', max=3, style=ProgressStyle(description_width='…


valid -0.6025231877962748


HBox(children=(IntProgress(value=0, description='Epoch 5', max=31, style=ProgressStyle(description_width='init…


train -0.6608054022635182


HBox(children=(IntProgress(value=0, description='Eval epoch 5', max=3, style=ProgressStyle(description_width='…


valid -0.6152382294336954


HBox(children=(IntProgress(value=0, description='Epoch 6', max=31, style=ProgressStyle(description_width='init…


train -0.6787427375393529


HBox(children=(IntProgress(value=0, description='Eval epoch 6', max=3, style=ProgressStyle(description_width='…


valid -0.6251387000083923


HBox(children=(IntProgress(value=0, description='Epoch 7', max=31, style=ProgressStyle(description_width='init…


train -0.6931519931362521


HBox(children=(IntProgress(value=0, description='Eval epoch 7', max=3, style=ProgressStyle(description_width='…


valid -0.6335259278615316


HBox(children=(IntProgress(value=0, description='Epoch 8', max=31, style=ProgressStyle(description_width='init…


train -0.7052004952584544


HBox(children=(IntProgress(value=0, description='Eval epoch 8', max=3, style=ProgressStyle(description_width='…


valid -0.6409406463305155


HBox(children=(IntProgress(value=0, description='Epoch 9', max=31, style=ProgressStyle(description_width='init…


train -0.7154769436005624


HBox(children=(IntProgress(value=0, description='Eval epoch 9', max=3, style=ProgressStyle(description_width='…


valid -0.6474945743878683


HBox(children=(IntProgress(value=0, description='Epoch 10', max=31, style=ProgressStyle(description_width='ini…


train -0.724289471103299


HBox(children=(IntProgress(value=0, description='Eval epoch 10', max=3, style=ProgressStyle(description_width=…


valid -0.6531778971354166


HBox(children=(IntProgress(value=0, description='Epoch 11', max=31, style=ProgressStyle(description_width='ini…


train -0.73187481780206


HBox(children=(IntProgress(value=0, description='Eval epoch 11', max=3, style=ProgressStyle(description_width=…


valid -0.6580074628194174


HBox(children=(IntProgress(value=0, description='Epoch 12', max=31, style=ProgressStyle(description_width='ini…


train -0.738449260111778


HBox(children=(IntProgress(value=0, description='Eval epoch 12', max=3, style=ProgressStyle(description_width=…


valid -0.6620633602142334


HBox(children=(IntProgress(value=0, description='Epoch 13', max=31, style=ProgressStyle(description_width='ini…


train -0.7442087954090487


HBox(children=(IntProgress(value=0, description='Eval epoch 13', max=3, style=ProgressStyle(description_width=…


valid -0.6654746929804484


HBox(children=(IntProgress(value=0, description='Epoch 14', max=31, style=ProgressStyle(description_width='ini…


train -0.7493244371106548


HBox(children=(IntProgress(value=0, description='Eval epoch 14', max=3, style=ProgressStyle(description_width=…


valid -0.6683491468429565


HBox(children=(IntProgress(value=0, description='Epoch 15', max=31, style=ProgressStyle(description_width='ini…


train -0.7539254503865396


HBox(children=(IntProgress(value=0, description='Eval epoch 15', max=3, style=ProgressStyle(description_width=…


valid -0.6707558830579122


In [0]:
predict_vecs = []
predict_tgts = []
model.eval()
with torch.no_grad():
  data_iter = tqdm_notebook(tst_itr, total=len_test, desc=f"Test", leave=True)
  for batch in data_iter:
    anc, pos, neg = model.forward(batch)
    tgt = batch.tgt
    predict_vecs.append(anc.cpu().detach().numpy())
    predict_tgts += tgt.cpu().detach()

HBox(children=(IntProgress(value=0, description='Test', max=8, style=ProgressStyle(description_width='initial'…

In [0]:
predict_corpus = pd.DataFrame(columns=['target', 'vec'])
predict_corpus['target'] = [x.item() for x in predict_tgts]
predict_corpus['vec'] = list(np.concatenate(predict_vecs))
predict_corpus.tail()

Unnamed: 0,target,vec
2043,10.0,"[0.025157703, 0.036540046, -0.04017117, -0.050..."
2044,2.0,"[-0.07610571, -0.042743195, -0.035869353, 0.06..."
2045,15.0,"[0.056401584, 0.023858132, 0.039060943, -0.049..."
2046,13.0,"[0.0006971962, -0.00037616864, 0.01601189, 0.0..."
2047,7.0,"[-0.02683389, -0.013489235, -0.044622414, 0.02..."


## Оценка качества

In [0]:
X = np.array([[-1, -1, 3], [-2, -1, 12], [-3, -2, 1], [1, 1, 1], [1, 2, 1], [1, 3, 2]])
nbrs = NearestNeighbors(n_neighbors=3, algorithm='ball_tree').fit(X)
x_distances, x_indices = nbrs.kneighbors(X)
x_indices

array([[0, 2, 3],
       [1, 0, 2],
       [2, 0, 3],
       [3, 4, 5],
       [4, 3, 5],
       [5, 4, 3]])

In [0]:
x_distances

array([[ 0.        ,  3.        ,  3.46410162],
       [ 0.        ,  9.05538514, 11.09053651],
       [ 0.        ,  3.        ,  5.        ],
       [ 0.        ,  1.        ,  2.23606798],
       [ 0.        ,  1.        ,  1.41421356],
       [ 0.        ,  1.41421356,  2.23606798]])

In [0]:
predicted = np.concatenate(predict_vecs)
nbrs = NearestNeighbors(n_neighbors=3, algorithm='ball_tree').fit(predicted)
distances, indices = nbrs.kneighbors(predicted)

In [0]:
predicted_classes = []
for i, row in predict_corpus.iterrows():
  distances_to_classes = [100]*20
  distances_i = distances[i]
  indices_i = indices[i]
  for n_i, n_d in enumerate(distances_i):
    class_n = predict_corpus['target'][indices_i[n_i]]
    if distances_to_classes[int(class_n)] == 100:
      distances_to_classes[int(class_n)] = n_d
    else:
      distances_to_classes[int(class_n)] += n_d
  class_i = distances_to_classes.index(min(distances_to_classes))

  predicted_classes.append(class_i)

In [0]:
accuracy_score(y_true = list(predict_corpus.target.values), y_pred=predicted_classes)

0.8583984375