<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Student-Model" data-toc-modified-id="Student-Model-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Student Model</a></span><ul class="toc-item"><li><span><a href="#Data-processing" data-toc-modified-id="Data-processing-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Data processing</a></span></li><li><span><a href="#Model" data-toc-modified-id="Model-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Model</a></span><ul class="toc-item"><li><span><a href="#Define-Model" data-toc-modified-id="Define-Model-1.2.1"><span class="toc-item-num">1.2.1&nbsp;&nbsp;</span>Define Model</a></span></li><li><span><a href="#Train" data-toc-modified-id="Train-1.2.2"><span class="toc-item-num">1.2.2&nbsp;&nbsp;</span>Train</a></span></li></ul></li><li><span><a href="#Evaluation" data-toc-modified-id="Evaluation-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Evaluation</a></span><ul class="toc-item"><li><span><a href="#ROC-AUC" data-toc-modified-id="ROC-AUC-1.3.1"><span class="toc-item-num">1.3.1&nbsp;&nbsp;</span>ROC AUC</a></span></li><li><span><a href="#Compression-rate" data-toc-modified-id="Compression-rate-1.3.2"><span class="toc-item-num">1.3.2&nbsp;&nbsp;</span>Compression rate</a></span></li></ul></li></ul></li></ul></div>

# Student Model


Нужно обучить небольшую модель на [soft таргетах](https://drive.google.com/file/d/1tBbPOUT-Ow9f3zTDApykGXYwt-KslYle/view?usp=sharing) модели учителя, которая не сильно уступала бы учителю по качеству.

In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.metrics import log_loss, roc_auc_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

from deepctr.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr.models.dcn import DCN

from collections import defaultdict
from keras.callbacks.callbacks import *
from keras.backend import clear_session

Using TensorFlow backend.


In [2]:
# Directory with the Criteo data and the training CSV stored inside it.
DATA_PATH = './criteo'
TRAIN_PATH = os.path.join(DATA_PATH, 'train.csv')
# CSV file that is loaded into `soft_target` (the teacher's soft targets).
RETAIN_PATH = 'soft_targets_full.csv'

In [3]:
# Load the soft-target table; later cells read its `id` and `prob` columns.
soft_target = pd.read_csv(RETAIN_PATH)

In [4]:
# Preview the first ten rows of the soft-target table.
soft_target[:10]

Unnamed: 0,id,prob
0,12,0.532062
1,26,0.483268
2,39,0.126496
3,41,0.750299
4,85,0.784883
5,108,0.010671
6,117,0.466868
7,121,0.554683
8,135,0.110773
9,139,0.262256


In [5]:
# Print the first ten raw lines of the training file and remember the index
# of the last line read (zero-based, so the header is line 0).
max_k = 0
with open(TRAIN_PATH) as f:
    for k, line in enumerate(f):
        if k < 10:
            print(line)
        max_k = k
# Report the tracked value: the loop variable `k` is unbound for an empty file.
print(max_k)

_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8,_c9,_c10,_c11,_c12,_c13,_c14,_c15,_c16,_c17,_c18,_c19,_c20,_c21,_c22,_c23,_c24,_c25,_c26,_c27,_c28,_c29,_c30,_c31,_c32,_c33,_c34,_c35,_c36,_c37,_c38,_c39,id

1,0,-1,"","",1465,0,17,0,4,0,4,"","",241546e0,38a947a1,fa673455,6a14f9b9,25c83c98,fe6b92e5,1c86e0eb,1f89b562,a73ee510,e7ba2569,755e4a50,208d9687,5978055e,07d13a8f,5182f694,f8b34416,e5ba7672,e5f8f18f,"","",f3ddd519,"",32c7478e,b34f3128,"","",12

1,0,1,20,16,1548,93,42,32,912,0,15,1,16,8cf07265,942f9a8d,a8e40bcf,0365276a,25c83c98,7e0ccccf,3f4ec687,1f89b562,a73ee510,726f00fd,c4adf918,27c604a6,85dbe138,07d13a8f,a8e962af,c449f783,27c07bd6,1f868fdd,21ddcdc9,a458ea53,7eee76d1,"",32c7478e,9af06ad9,9d93af03,cdfe5ab7,26

0,8,0,15,20,115,24,8,23,24,2,2,"",20,5a9ed9b0,c66fca21,78171040,373c404a,25c83c98,"",8ff6f5af,0b153874,a73ee510,5ba575e7,b5a9f90e,6766a7f0,949ea585,1adce6ef,8736735c,59974c9c,8efede7f,1304f63b,21ddcdc9,b1252a9d,07b2853e,"",32c7478e,94bde4f2,010f6491,09b76f8d,39

1,88,319,"",4,5,4,89,40,88

## Data processing

Данные на Train/Validation/Test нужно разбить как 80/10/10

In [6]:
# Positions inside a raw CSV row: 1..13 are handled as dense features and
# 14..39 as sparse ones.
dense_features_indices = list(range(1, 14))
sparse_features_indices = list(range(14, 40))

# Every feature is named 'c<position>'.
dense_features = [f'c{idx}' for idx in dense_features_indices]
sparse_features = [f'c{idx}' for idx in sparse_features_indices]

len(dense_features_indices), len(sparse_features_indices)

# Three independent 40-slot lists of per-column statistics, indexed by column
# position; they are filled in by calculate_minmax.
min_arr, max_arr, range_arr = ([0] * 40 for _ in range(3))

In [7]:
def calculate_minmax(filename, retain):
    """Collect min, max and range of every dense column over the retained rows.

    The first line of `filename` is skipped as a header. A row is used only
    when its last comma-separated field, parsed as an int, is in `retain`.
    A dense field equal to '""' (a missing value) counts as 0.

    Results are written into the module-level `min_arr`, `max_arr` and
    `range_arr` at the positions listed in `dense_features_indices`; nothing
    is returned. When no row is retained the arrays are left untouched.

    The statistics are seeded from the first retained row rather than from
    the arrays' initial zeros, so a column whose values are all positive no
    longer ends up with a minimum of 0 (nor an all-negative one with a
    maximum of 0). A constant column gets a range of 1 instead of 0, so that
    scaling with `(value - min) / range` gives 0 instead of dividing by zero.
    """
    lows = {}
    highs = {}

    with open(filename) as f:
        next(f, None)  # skip the header line
        for line in f:
            features = line.rstrip('\n').split(',')
            if int(features[-1]) not in retain:
                continue
            for i in dense_features_indices:
                raw = features[i]
                val = 0.0 if raw == '""' else float(raw)
                if i in lows:
                    lows[i] = min(lows[i], val)
                    highs[i] = max(highs[i], val)
                else:
                    lows[i] = highs[i] = val

    # Publish once at the end; the range only depends on the final min/max.
    for i, low in lows.items():
        spread = highs[i] - low
        min_arr[i] = low
        max_arr[i] = highs[i]
        range_arr[i] = spread if spread != 0 else 1


def prepare_data_dict(batch):
    """Wrap each feature column of `batch` in a pandas Series.

    `batch` is indexed by feature name. The result has one entry per name in
    `dense_features`, followed by one per name in `sparse_features`.
    """
    ordered_names = [*dense_features, *sparse_features]
    return {name: pd.Series(batch[name]) for name in ordered_names}
                

def generate_data(filename, retain, targets, batch_size=128):
    """Yield (features, soft targets) mini-batches from `filename`, endlessly.

    The file is re-read from the start each time its end is reached. The
    first line is skipped as a header, and a row is used only when its last
    field, parsed as an int, is in `retain`.

    Args:
        filename: path of the comma-separated training file.
        retain: container of row ids to keep.
        targets: soft targets, indexed by the running count of retained rows
            modulo `len(targets)`.
            NOTE(review): this assumes `targets` is ordered like the retained
            rows in the file - confirm against the caller.
        batch_size: number of retained rows per yielded batch.

    Yields:
        A pair (data_dict, target): `data_dict` is the output of
        `prepare_data_dict` for the batch, `target` is a pandas Series of
        float32 soft targets. Dense values are scaled with the module-level
        `min_arr` and `range_arr`; a sparse field equal to '""' becomes '-1'.
        Rows left over at the end of a pass stay buffered and are completed
        by the next pass.

    Raises:
        ValueError: if `targets` is empty, or if a whole pass over the file
            finds no retained row (the loop would otherwise never yield).
    """
    if len(targets) == 0:
        raise ValueError("generate_data: `targets` is empty")

    data_teacher = defaultdict(list)
    batch_targets = []
    m = 0  # retained rows consumed so far, across all passes

    while True:
        rows_this_pass = 0
        with open(filename) as f:
            for k, line in enumerate(f):
                if k == 0:
                    continue  # header line
                features = line.split('\n')[0].split(',')
                if int(features[-1]) not in retain:
                    continue

                rows_this_pass += 1
                batch_targets.append(np.float32(targets[m % len(targets)]))

                for i, f_name in zip(dense_features_indices, dense_features):
                    val = features[i] if features[i] != '""' else 0
                    val = (np.float32(val) - min_arr[i]) / range_arr[i]
                    data_teacher[f_name].append(val)

                for i, f_name in zip(sparse_features_indices, sparse_features):
                    val = features[i] if features[i] != '""' else '-1'
                    data_teacher[f_name].append(val)

                m += 1

                if m % batch_size == 0:
                    yield prepare_data_dict(data_teacher), pd.Series(batch_targets)

                    # Start a fresh batch only after a full one was handed out.
                    data_teacher = defaultdict(list)
                    batch_targets = []

        if rows_this_pass == 0:
            raise ValueError(
                "generate_data: no row of {!r} has an id in `retain`".format(filename))
    
    
def get_data(filename, retain, targets):
    """Read all retained rows of `filename` into memory in a single pass.

    The first line is skipped as a header; a row is kept only when its last
    field, parsed as an int, is in `retain`. The n-th kept row is paired with
    `targets[n]`. Dense values are scaled with the module-level `min_arr` and
    `range_arr` (a '""' field counts as 0); a sparse '""' field becomes '-1'.

    Returns:
        A triple (labels, target, data_dict): the list of np.int32 values
        taken from the first field of each kept row, a numpy array of float32
        soft targets, and the output of `prepare_data_dict` for the features.
    """
    columns = defaultdict(list)
    soft_values = []
    labels = []
    kept = 0

    with open(filename) as f:
        next(f, None)  # header line
        for line in f:
            fields = line.split('\n')[0].split(',')
            if int(fields[-1]) not in retain:
                continue

            labels.append(np.int32(fields[0]))
            soft_values.append(np.float32(targets[kept]))

            for pos, name in zip(dense_features_indices, dense_features):
                raw = 0 if fields[pos] == '""' else fields[pos]
                columns[name].append((float(raw) - min_arr[pos]) / range_arr[pos])

            for pos, name in zip(sparse_features_indices, sparse_features):
                raw = '-1' if fields[pos] == '""' else fields[pos]
                columns[name].append(raw)

            kept += 1

    data_dict = prepare_data_dict(columns)
    return labels, np.array(soft_values), data_dict

In [8]:
# Unshuffled split: the first 80% of rows go to train, the remaining 20% are
# halved into test and validate.
train, test = train_test_split(soft_target, test_size=0.2, shuffle=False)
test, validate = train_test_split(test, test_size=0.5, shuffle=False)

# Id sets select rows of the training file; prob arrays hold the soft targets.
all_retain = set(soft_target['id'].to_list())

train_retain = set(train['id'].to_list())
train_target = train['prob'].to_numpy()

test_retain = set(test['id'].to_list())
test_target = test['prob'].to_numpy()

validate_retain = set(validate['id'].to_list())
validate_target = validate['prob'].to_numpy()

In [9]:
# Fill min_arr / max_arr / range_arr from every row whose id has a soft target.
calculate_minmax(TRAIN_PATH, all_retain)

In [10]:
# Load the validation and test rows into memory. `validate_target` and
# `test_target` are rebound to the arrays returned by get_data.
validate_labels, validate_target, validate_data = get_data(TRAIN_PATH, validate_retain, validate_target)
test_labels, test_target, test_data = get_data(TRAIN_PATH, test_retain, test_target) 

## Model

Можно также использовать Pruning и/или Quantization.

In [11]:
# Size associated with each sparse feature; get_model_stat uses it (capped) as
# the vocabulary size and derives the embedding width from it.
sparse_features_dims = {
    'c14': 1445,
    'c15': 556,
    'c16': 1130758,
    'c17': 360209,
    'c18': 304,
    'c19': 21,
    'c20': 11845,
    'c21': 631,
    'c22': 3,
    'c23': 49223,
    'c24': 5194,
    'c25': 985420,
    'c26': 3157,
    'c27': 26,
    'c28': 11588,
    'c29': 715441,
    'c30': 10,
    'c31': 4681,
    'c32': 2029,
    'c33': 4,
    'c34': 870796,
    'c35': 17,
    'c36': 15,
    'c37': 87605,
    'c38': 84,
    'c39': 58187,
}

### Define Model

In [12]:
def get_model_stat(max_vocab_size=20000, max_embedding_dim=75, hidden_size=(128, 128)):
    """Train a DCN student on the soft targets and report its size and quality.

    Args:
        max_vocab_size: cap on the vocabulary size given to each hashed
            sparse feature.
        max_embedding_dim: cap on each sparse feature's embedding width; the
            uncapped width is int(6 * vocab_size ** 0.25).
        hidden_size: passed to DCN as `dnn_hidden_units`.

    Returns:
        A triple (compression_rate, validation_rocauc, test_rocauc), where
        compression_rate is 168 MiB divided by the size of the saved weights
        file, and the ROC AUC values compare the model's predictions with
        `validate_labels` and `test_labels`.

    Side effects: writes the trained weights to './tmp.h5' and clears the
    Keras session before returning.
    """
    # Training setup, named once so that the step formula, `fit` and the
    # size computation cannot drift apart.
    BATCH_SIZE = 3000
    EPOCHS = 6
    DATA_PASSES = 3  # total passes over the training ids, spread over EPOCHS
    WEIGHTS_PATH = './tmp.h5'
    TEACHER_SIZE_BYTES = 1024 * 1024 * 168

    fixlen_feature_columns = [SparseFeat(feat,
                                         vocabulary_size=min(vocab_size, max_vocab_size),
                                         embedding_dim=min(int(6 * (vocab_size) ** (0.25)), max_embedding_dim),
                                         use_hash=True, dtype='string')
                              for feat, vocab_size in sparse_features_dims.items()] + \
                            [DenseFeat(feat, 1,) for feat in dense_features]

    # The same columns feed both the linear and the deep part.
    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns
    # NOTE(review): the result is not used below; the call is kept in case it
    # has side effects inside deepctr - confirm before removing.
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns, )

    model = DCN(linear_feature_columns, dnn_feature_columns, cross_num=2,
            dnn_hidden_units=hidden_size, l2_reg_linear=0, l2_reg_embedding=0,
            l2_reg_cross=0, l2_reg_dnn=0, init_std=0.0001, seed=1024,
            dnn_use_bn=True, dnn_activation='relu', task='binary')

    # Regress on the soft targets with MSE.
    model.compile("adam", 'mean_squared_error',
                  metrics=['mean_squared_error'], )

    steps_per_epoch = DATA_PASSES * len(train_retain) // (BATCH_SIZE * EPOCHS)
    model.fit(generate_data(TRAIN_PATH, train_retain, train_target, BATCH_SIZE),
              use_multiprocessing=True, steps_per_epoch=steps_per_epoch,
              epochs=EPOCHS, verbose=1)
    model.save_weights(WEIGHTS_PATH)

    compression_rate = TEACHER_SIZE_BYTES / os.path.getsize(WEIGHTS_PATH)

    pred_proba = model.predict(validate_data)
    validation_rocauc = roc_auc_score(validate_labels, pred_proba)
    pred_proba = model.predict(test_data)
    test_rocauc = roc_auc_score(test_labels, pred_proba)

    clear_session()

    return compression_rate, validation_rocauc, test_rocauc

### Train

In [13]:
# Student with vocabularies capped at 40000 and embedding widths capped at 75.
cr, v_rocauc, t_rocauc = get_model_stat(max_vocab_size=40000, max_embedding_dim=75)

# Three blank lines, then the summary.
print("\n\n")
print("Compression rate:", cr)
print("Validate rocauc", v_rocauc)
print("Test rocauc", t_rocauc)

  ...
    to  
  ['...']
Train for 488 steps
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6



Compression rate: 1.639958326472275
Validate rocauc 0.7888861936680214
Test rocauc 0.7911934366991362


Process Keras_worker_ForkPoolWorker-1:
Traceback (most recent call last):
  File "/home/argentumwalker/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/argentumwalker/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/argentumwalker/anaconda3/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/home/argentumwalker/anaconda3/lib/python3.7/multiprocessing/queues.py", line 352, in get
    res = self._reader.recv_bytes()
  File "/home/argentumwalker/anaconda3/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/home/argentumwalker/anaconda3/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
  File "/home/argentumwalker/anaconda3/lib/python3.7/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, r

In [14]:
# Student with vocabularies capped at 10000 and embedding widths capped at 75.
cr, v_rocauc, t_rocauc = get_model_stat(max_vocab_size=10000, max_embedding_dim=75)

# Three blank lines, then the summary.
print("\n\n")
print("Compression rate:", cr)
print("Validate rocauc", v_rocauc)
print("Test rocauc", t_rocauc)

  ...
    to  
  ['...']
Train for 488 steps
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6



Compression rate: 5.24401682793377
Validate rocauc 0.787183258301624
Test rocauc 0.7894640907015611


Process Keras_worker_ForkPoolWorker-2:
Traceback (most recent call last):
  File "/home/argentumwalker/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/argentumwalker/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/argentumwalker/anaconda3/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/home/argentumwalker/anaconda3/lib/python3.7/multiprocessing/queues.py", line 352, in get
    res = self._reader.recv_bytes()
  File "/home/argentumwalker/anaconda3/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/home/argentumwalker/anaconda3/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
  File "/home/argentumwalker/anaconda3/lib/python3.7/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, r

In [13]:
# Student with vocabularies capped at 10000 and embedding widths capped at 25.
cr, v_rocauc, t_rocauc = get_model_stat(max_vocab_size=10000, max_embedding_dim=25)

# Three blank lines, then the summary.
print("\n\n")
print("Compression rate:", cr)
print("Validate rocauc", v_rocauc)
print("Test rocauc", t_rocauc)

  ...
    to  
  ['...']
Train for 488 steps
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6



Compression rate: 13.64507589653087
Validate rocauc 0.7879183896715284
Test rocauc 0.7904537542318204


In [16]:
# Code points bounding the two character ranges: 'a'..'z' and '0'..'9'.
print(*(ord(ch) for ch in 'az09'))

97 122 48 57


In [14]:
# Student with vocabularies capped at 2000 and embedding widths capped at 25.
cr, v_rocauc, t_rocauc = get_model_stat(max_vocab_size=2000, max_embedding_dim=25)

# Three blank lines, then the summary.
print("\n\n")
print("Compression rate:", cr)
print("Validate rocauc", v_rocauc)
print("Test rocauc", t_rocauc)

  ...
    to  
  ['...']
Train for 488 steps
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6



Compression rate: 45.686457821126474
Validate rocauc 0.784907492798154
Test rocauc 0.7873023644509987


Now, let's try something new

In [10]:
import torch
from torch import nn
from torch.nn import functional as F

In [53]:
# Simple one-hot encoding: the code points of 'a'-'z' map to 0..25 and those
# of '0'-'9' to 26..35.
char_encodings = {
    code: position
    for position, code in enumerate([*range(97, 123), *range(48, 58)])
}
# Row i of the identity matrix is the one-hot vector of symbol i.
eye = torch.eye(len(char_encodings)).cuda()
# All-zero 8x36 block that encode_object substitutes for a '""' field.
zero_word = torch.zeros((8, 36)).cuda()

def encode_word(w):
    """One-hot encode `w` character by character.

    Each character is looked up in `char_encodings` by its code point and
    replaced by the matching row of `eye`; the rows are stacked in order, so
    the result has one row per character. A character that is not in
    `char_encodings` raises KeyError.
    """
    return torch.stack([eye[char_encodings[ord(ch)]] for ch in w], dim=0)

def encode_object(features):
    """Turn one split CSV row into a list of tensors with a leading axis of 1.

    `features` is indexed with the positions in `dense_features_indices` and
    `sparse_features_indices`. The first returned tensor holds all dense
    values, scaled with the module-level `min_arr` and `range_arr` (a '""'
    field counts as 0). It is followed by one tensor per sparse position: the
    `encode_word` output for the field, or `zero_word` when the field is '""'.
    """
    scaled = []
    for pos in dense_features_indices:
        raw = 0 if features[pos] == '""' else features[pos]
        scaled.append((np.float32(raw) - min_arr[pos]) / range_arr[pos])

    encoded = [torch.Tensor(scaled).unsqueeze(0).cuda()]

    for pos in sparse_features_indices:
        token = features[pos]
        word = zero_word if token == '""' else encode_word(token)
        encoded.append(word.unsqueeze(0))

    return encoded
    

def encode_batch(batch_features, batch_targets):
    """Encode a list of rows and their targets as CUDA tensors.

    Returns (feature_encodings, target). `feature_encodings` has one tensor
    per entry of an `encode_object` result, built by concatenating that entry
    across all rows along axis 0. `target` is `batch_targets` as a tensor
    with a trailing axis of size 1.
    """
    target = torch.Tensor(batch_targets).unsqueeze(-1).cuda()
    per_row = [encode_object(row) for row in batch_features]
    # Transpose "rows of features" into "features of rows" before joining.
    feature_encodings = [torch.cat(column, dim=0) for column in zip(*per_row)]
    return feature_encodings, target


def load_full_data(filename, retain, targets):
    """Encode every retained row of `filename` as one big batch.

    The first line is skipped as a header; a row is kept only when its last
    field, parsed as an int, is in `retain`. Targets are taken from `targets`
    by the running count of kept rows, modulo `len(targets)`.

    This is a generator that yields exactly once: a triple
    (batch, target, labels) with the `encode_batch` output for all kept rows
    and the list of np.int32 values taken from the first field of each row.

    The complete row is handed to `encode_batch`, because `encode_object`
    indexes fields by their position in the full row (the same layout that
    `load_batch_data` passes). Dropping the first field here shifted every
    feature by one column.
    """
    batch = []
    labels = []
    target = []
    m = 0

    with open(filename) as f:
        for k, line in enumerate(f):
            if k == 0:
                continue  # header line
            features = line.split('\n')[0].split(',')
            if int(features[-1]) not in retain:
                continue

            labels.append(np.int32(features[0]))
            batch.append(features)
            target.append(np.float32(targets[m % len(targets)]))
            m += 1
    batch, target = encode_batch(batch, target)
    yield batch, target, labels


def load_batch_data(filename, retain, targets, batch_size=128):
    """Yield encoded mini-batches from `filename`, endlessly.

    The file is re-read from the start each time its end is reached. The
    first line is skipped as a header; a row is kept only when its last
    field, parsed as an int, is in `retain`. Targets are taken from `targets`
    by the running count of kept rows, modulo `len(targets)`.

    Yields:
        The `encode_batch` output, (feature_encodings, target), for every
        `batch_size` kept rows. Rows left over at the end of a pass stay
        buffered and are completed by the next pass.

    Raises:
        ValueError: if `targets` is empty, or if a whole pass over the file
            finds no kept row (the loop would otherwise never yield).
    """
    if len(targets) == 0:
        raise ValueError("load_batch_data: `targets` is empty")

    batch = []
    target = []
    m = 0  # kept rows consumed so far, across all passes

    while True:
        rows_this_pass = 0
        with open(filename) as f:
            for k, line in enumerate(f):
                if k == 0:
                    continue  # header line
                features = line.split('\n')[0].split(',')
                if int(features[-1]) not in retain:
                    continue

                rows_this_pass += 1
                batch.append(features)
                target.append(np.float32(targets[m % len(targets)]))
                m += 1

                if m % batch_size == 0:
                    yield encode_batch(batch, target)

                    batch = []
                    target = []

        if rows_this_pass == 0:
            raise ValueError(
                "load_batch_data: no row of {!r} has an id in `retain`".format(filename))

In [51]:
class Embedding(nn.Module):
    """Attention-pooled embedding of a one-hot encoded word.

    Two linear maps project each character to `emb_size` values: one feeds an
    LSTM whose outputs become attention weights, the other produces the
    values that are averaged with those weights.
    """

    def __init__(self, emb_size, word_len=8, chars=36):
        super().__init__()
        # Submodules are created in a fixed order: key, emb, attention_rnn.
        self.key = nn.Linear(chars, emb_size)
        self.emb = nn.Linear(chars, emb_size)
        self.attention_rnn = nn.LSTM(emb_size, emb_size, batch_first=True)
        self.emb_size = emb_size
        self.word_len = word_len  # stored only; forward does not read it

    def forward(self, x):
        """Pool `x` over its second-to-last axis.

        NOTE(review): `x` is presumably (batch, word_len, chars) - confirm
        against the caller.
        """
        values = self.emb(x)
        scores, _ = self.attention_rnn(self.key(x))
        # Normalise over the second-to-last axis, separately per channel.
        weights = F.softmax(scores, dim=-2)
        pooled = (weights * values).sum(dim=-2)
        return pooled
    
class MimicModel(nn.Module):
    """Student network: one Embedding per sparse feature plus an MLP head.

    The pooled word embeddings are concatenated with the dense features and
    passed through two hidden layers of 128 units (ELU) and a sigmoid output.
    """

    def __init__(self, emb_size=100):
        super().__init__()
        n_sparse = len(sparse_features)
        n_dense = len(dense_features)

        self.word_encoders = nn.ModuleList(
            [Embedding(emb_size) for _ in range(n_sparse)]
        )

        head_layers = [
            nn.Linear(emb_size * n_sparse + n_dense, 128),
            nn.ELU(),
            nn.Linear(128, 128),
            nn.ELU(),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        ]
        self.output = nn.Sequential(*head_layers)

    def forward(self, xs):
        """Run the model on `xs`: the dense tensor first, then one tensor per
        word encoder."""
        pooled = [encoder(word) for encoder, word in zip(self.word_encoders, xs[1:])]
        joined = torch.cat([xs[0], *pooled], dim=-1)
        return self.output(joined)

    def predict_proba(self, xs):
        """Return the forward output as a numpy array, computed without
        gradient tracking."""
        with torch.no_grad():
            scores = self.forward(xs)
        return scores.cpu().numpy()

In [None]:
from tqdm.notebook import tqdm
BATCH_SIZE = 2048

# Student that encodes categorical values character by character.
model = MimicModel()
model.cuda()
optimizer = torch.optim.Adam(model.parameters())

# The batch generator never ends, so the number of steps is fixed up front:
# one step per full batch of training ids.
generator = load_batch_data(TRAIN_PATH, train_retain, train_target, BATCH_SIZE)
loss = None  # stays None when there are fewer than BATCH_SIZE training ids
for i in tqdm(range(len(train_retain) // BATCH_SIZE)):
    batch, target = next(generator)
    pred = model(batch)
    # F.mse_loss takes (input, target); the value is the same either way.
    loss = F.mse_loss(pred, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if i % 20 == 0:
        print(i, "|", loss.detach().item())

# Without this guard the report raised NameError when no step had run.
if loss is not None:
    print("Final loss:", loss.detach().item())

HBox(children=(IntProgress(value=0, max=1431), HTML(value='')))

0 | 0.13187697529792786


## Evaluation

Наша основная задача получить модель, которая 
* в терминах ROC AUC не намного хуже модели учителя, и в то же время 
* сильно меньше по размеру

### ROC AUC

Сравним ROC AUC модели ученика с показателем для учителя.

ROC AUC учителя: 0.802

### Compression rate

Пусть 
* $a$ - \# of the parameters in the original model $M$
* $a^{*}$ - \# of the parameters in compressed model $M^{*}$

тогда compression rate is $$\alpha(M,M^{*}) = \frac{a}{a^{*}}$$

Можно также посчитать compression rate просто как отношение фактических размеров моделей.

Размер модели учителя - 168MB
