Create smaller batches, make stratified splits


In [None]:
!pip install -U sentence-transformers

Collecting sentence-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/f5/5a/6e41e8383913dd2ba923cdcd02be2e03911595f4d2f9de559ecbed80d2d3/sentence-transformers-0.3.9.tar.gz (64kB)
[K     |████████████████████████████████| 71kB 3.3MB/s 
[?25hCollecting transformers<3.6.0,>=3.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/3a/83/e74092e7f24a08d751aa59b37a9fc572b2e4af3918cb66f7766c3affb1b4/transformers-3.5.1-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 8.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 41.6MB/s 
[?25hCollecting sentencepiece==0.1.91
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)

In [None]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers, Sequential
from tqdm import tqdm_notebook
from tqdm.notebook import tqdm

# Show up to 100 characters per cell when DataFrames are displayed.
pd.set_option('display.max_colwidth', 100)

In [None]:
# The relative paths used in this notebook (Data/, embedded_data/, trained_models/)
# are resolved against this folder.
project_dir = "/content/drive/MyDrive/SKBKonturTest"
os.chdir(project_dir)

In [None]:
# Both files are tab-separated and use their first column as the index.
tsv_options = dict(sep='\t', index_col=0)

test = pd.read_csv('Data/test_data.tsv', **tsv_options)
train = pd.read_csv('Data/train_data.tsv', **tsv_options)

  mask |= (ar1 == a)


In [None]:
# 80/20 split that keeps the distribution of `answer` equal in both parts;
# the fixed seed makes the split repeatable.
train_data, valid_data = train_test_split(
    train,
    test_size=0.2,
    shuffle=True,
    stratify=train['answer'],
    random_state=42,
)

In [None]:
# Name of the sentence-transformers model used to embed both name columns.
model_trans_name = 'LaBSE'

model_trans = SentenceTransformer(
    model_trans_name,
)

In [None]:
def get_embeddings_and_save(data, embed_model, dir_name, batch_size, chunk_size=500_000):
  """Embed the name pairs in ``data`` and save them as fixed-size ``.npy`` batches.

  Rows are embedded one chunk at a time. For every chunk the ``ru_name`` and
  ``eng_name`` embeddings are concatenated column-wise (Russian first) and the
  result is written, together with the boolean ``answer`` labels, to
  ``embedded_data/<dir_name>/X_batch_<i>.npy`` / ``y_batch_<i>.npy`` where
  ``i`` is a batch counter running over the whole of ``data``.

  Args:
    data: DataFrame with ``eng_name``, ``ru_name`` and ``answer`` columns.
    embed_model: object whose ``encode(list_of_str, ...)`` returns a 2-D array
      with one row per input string.
    dir_name: sub-directory of ``embedded_data`` to write into; created if missing.
    batch_size: number of rows per saved batch file (the last file may be shorter).
    chunk_size: upper bound on the number of rows embedded in one go; it is
      rounded down to a multiple of ``batch_size`` (but never below one batch).

  Raises:
    ValueError: if ``batch_size`` is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError("batch_size must be a positive integer")

  # A chunk always holds a whole number of batches (at least one), so no batch
  # straddles two chunks and the global batch numbering has no gaps.
  n_batches_per_iter = max(1, chunk_size // batch_size)
  rows_per_iter = n_batches_per_iter * batch_size
  n_iters = -(-len(data) // rows_per_iter)  # integer ceil division

  out_dir = f'embedded_data/{dir_name}'
  os.makedirs(out_dir, exist_ok=True)

  for n_iter in tqdm(range(n_iters)):
    # .iloc makes the positional slicing explicit regardless of the index labels.
    data_process = data.iloc[n_iter * rows_per_iter : (n_iter + 1) * rows_per_iter]

    eng_embeddings = embed_model.encode(data_process['eng_name'].to_list(), show_progress_bar=False,
                                        batch_size=256, device='cuda:0', num_workers=4)
    ru_embeddings = embed_model.encode(data_process['ru_name'].to_list(), show_progress_bar=False,
                                       batch_size=256, device='cuda:0', num_workers=4)

    X_process = np.concatenate((ru_embeddings, eng_embeddings), axis=1)
    # Builtin bool replaces the removed np.bool alias.
    y_process = data_process['answer'].to_numpy().astype(bool)

    n_batches = -(-len(data_process) // batch_size)
    first_batch_id = n_iter * n_batches_per_iter

    for batch in range(n_batches):
      rows = slice(batch * batch_size, (batch + 1) * batch_size)
      np.save(f'{out_dir}/X_batch_{first_batch_id + batch}.npy', X_process[rows])
      np.save(f'{out_dir}/y_batch_{first_batch_id + batch}.npy', y_process[rows])

In [None]:
# Number of rows written to each saved .npy batch file.
batch_size = 1024

get_embeddings_and_save(
    data=train_data,
    embed_model=model_trans,
    dir_name='train',
    batch_size=batch_size,
)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




In [None]:
# Same procedure for the validation split; its batches go to embedded_data/val.
get_embeddings_and_save(
    data=valid_data,
    embed_model=model_trans,
    dir_name='val',
    batch_size=batch_size,
)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




## Modeling

In [None]:
class My_Custom_Generator(keras.utils.Sequence) :
  """Keras ``Sequence`` serving pre-computed batches stored as ``.npy`` files.

  Batch ``i`` is loaded from ``embedded_data/<dir_name>/X_batch_<i>.npy``
  (features) and ``embedded_data/<dir_name>/y_batch_<i>.npy`` (labels).
  """

  def __init__(self, batch_size, dir_name) :
    """Remember which directory to read from.

    Args:
      batch_size: stored on the instance only; the number of rows per batch is
        whatever the files on disk contain.
      dir_name: sub-directory of ``embedded_data`` holding the batch files.
    """
    super().__init__()
    self.batch_size = batch_size
    self.dir_name = dir_name
    self._n_batches = None  # filled lazily by __len__


  def __len__(self) :
    """Return the number of batches found on disk.

    Counts the ``X_batch_*.npy`` files instead of returning a fixed number, so
    an epoch covers every saved batch. The count is cached after the first
    call to avoid listing the directory repeatedly.
    """
    if self._n_batches is None:
      self._n_batches = sum(
          1 for file_name in os.listdir(f"embedded_data/{self.dir_name}")
          if file_name.startswith('X_batch_') and file_name.endswith('.npy')
      )
    return self._n_batches


  def __getitem__(self, batch) :
    """Load and return the ``(features, labels)`` pair for batch number ``batch``."""
    batch_x = np.load(f'embedded_data/{self.dir_name}/X_batch_{batch}.npy')
    batch_y = np.load(f'embedded_data/{self.dir_name}/y_batch_{batch}.npy')
    return batch_x, batch_y

In [None]:
# Passed to My_Custom_Generator when the train/val generators are created below.
batch_size = 1024

In [None]:
# Binary classifier over the 1536-wide concatenated embedding vector:
# two ReLU hidden layers followed by a single sigmoid output unit.
model = keras.Sequential()
model.add(layers.Dense(128, input_shape=(1536,), activation="relu", name="Layer1"))
model.add(layers.Dense(32, activation="relu", name="Layer2"))
model.add(layers.Dense(1, activation='sigmoid', name="SigmoidLayer"))

tracked_metrics = [
    keras.metrics.BinaryAccuracy(),
    keras.metrics.Precision(),
    keras.metrics.Recall(),
]

model.compile(
    loss=keras.losses.BinaryCrossentropy(),
    optimizer=keras.optimizers.Adam(),
    metrics=tracked_metrics,
)

In [None]:
# One generator per split; each reads the .npy batches saved under embedded_data/<split>.
training_batch_generator = My_Custom_Generator(batch_size=batch_size, dir_name='train')
validation_batch_generator = My_Custom_Generator(batch_size=batch_size, dir_name='val')

In [None]:
# Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode="min",
    patience=5,
    restore_best_weights=True,
)

# Write the model to disk each time val_loss reaches a new minimum.
checkpoint = keras.callbacks.ModelCheckpoint(
    'trained_models/model_best.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

In [None]:
# Train on the saved training batches and evaluate on the validation batches
# after every epoch; the callbacks handle early stopping and checkpointing.
training_callbacks = [early_stopping, checkpoint]

history = model.fit(
    x=training_batch_generator,
    validation_data=validation_batch_generator,
    epochs=30,
    callbacks=training_callbacks,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
 610/3112 [====>.........................] - ETA: 54:26 - loss: 0.0236 - binary_accuracy: 0.9938 - precision: 0.9680 - recall: 0.8957

In [None]:
# NOTE(review): this fits on the *validation* generator with no callbacks -
# presumably a quick sanity check of the pipeline; confirm before keeping.
model.fit(
    x=validation_batch_generator,
    verbose=1,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f9486429518>

In [None]:
# NOTE(review): the validation generator is used both as training input and as
# validation data here, so val_* metrics are measured on the data being fitted.
debug_callbacks = [early_stopping, checkpoint]

history = model.fit(
    x=validation_batch_generator,
    validation_data=validation_batch_generator,
    epochs=10,
    callbacks=debug_callbacks,
    verbose=1,
)

Epoch 1/10
Epoch 00001: val_loss improved from inf to 0.14892, saving model to trained_models/model_best.h5
Epoch 2/10
Epoch 00002: val_loss improved from 0.14892 to 0.14872, saving model to trained_models/model_best.h5
Epoch 3/10
Epoch 00003: val_loss improved from 0.14872 to 0.14850, saving model to trained_models/model_best.h5
Epoch 4/10
Epoch 00004: val_loss improved from 0.14850 to 0.14825, saving model to trained_models/model_best.h5
Epoch 5/10
Epoch 00005: val_loss improved from 0.14825 to 0.14796, saving model to trained_models/model_best.h5
Epoch 6/10
Epoch 00006: val_loss improved from 0.14796 to 0.14764, saving model to trained_models/model_best.h5
Epoch 7/10
Epoch 00007: val_loss improved from 0.14764 to 0.14730, saving model to trained_models/model_best.h5
Epoch 8/10
Epoch 00008: val_loss improved from 0.14730 to 0.14696, saving model to trained_models/model_best.h5
Epoch 9/10
Epoch 00009: val_loss improved from 0.14696 to 0.14661, saving model to trained_models/model_best

In [None]:
# Per-epoch values of the loss and of every compiled metric from the last fit call.
history.history

{'binary_accuracy': [0.96533203125,
  0.9658203125,
  0.9658203125,
  0.9658203125,
  0.9658203125,
  0.9658203125,
  0.9658203125,
  0.9658203125,
  0.9658203125,
  0.9658203125],
 'loss': [0.6183249354362488,
  0.5445506572723389,
  0.4686732888221741,
  0.3950635492801666,
  0.329170823097229,
  0.2730850577354431,
  0.22949877381324768,
  0.1969877928495407,
  0.17555108666419983,
  0.1627693772315979],
 'precision_1': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 'recall_1': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 'val_binary_accuracy': [0.9658203125,
  0.9658203125,
  0.9658203125,
  0.9658203125,
  0.9658203125,
  0.9658203125,
  0.9658203125,
  0.9658203125,
  0.9658203125,
  0.9658203125],
 'val_loss': [0.5635513067245483,
  0.48754847049713135,
  0.41279375553131104,
  0.3443722426891327,
  0.28574615716934204,
  0.23878489434719086,
  0.20379427075386047,
  0.17992278933525085,
  0.16525551676750183,
  0.15751402080059052],
 'val_precision_1': [0.0, 0.0, 

## Check on valid data

In [None]:
# Embed the whole validation split in memory (no .npy batches this time).
encode_options = dict(show_progress_bar=True, batch_size=256, device='cuda:0', num_workers=4)

ru_embedings = model_trans.encode(valid_data['ru_name'].to_list(), **encode_options)
eng_embedings = model_trans.encode(valid_data['eng_name'].to_list(), **encode_options)

# Features: Russian embedding followed by English embedding; labels as int8.
embedding_pair = (ru_embedings, eng_embedings)
X_val = np.concatenate(embedding_pair, axis=1)
y_val = valid_data['answer'].astype(np.int8)

HBox(children=(FloatProgress(value=0.0, description='Batches', max=3112.0, style=ProgressStyle(description_wid…

Exception ignored in: <bound method _MultiProcessingDataLoaderIter.__del__ of <torch.utils.data.dataloader._MultiProcessingDataLoaderIter object at 0x7f9fe05f65c0>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py", line 1203, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py", line 1174, in _shutdown_workers
    if self._persistent_workers or self._workers_status[worker_id]:
AttributeError: '_MultiProcessingDataLoaderIter' object has no attribute '_workers_status'
Exception ignored in: <bound method _MultiProcessingDataLoaderIter.__del__ of <torch.utils.data.dataloader._MultiProcessingDataLoaderIter object at 0x7f9fe05f65c0>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py", line 1203, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/dat




HBox(children=(FloatProgress(value=0.0, description='Batches', max=3112.0, style=ProgressStyle(description_wid…




In [None]:
def get_model(input_shape):
  """Build and compile a new dense binary classifier.

  Args:
    input_shape: shape tuple of one input sample, passed to the first layer.

  Returns:
    A compiled ``keras.Sequential`` with Dense(128, relu) -> Dense(32, relu) ->
    Dense(1, sigmoid), Adam optimizer, binary cross-entropy loss and
    binary-accuracy / precision / recall metrics.
  """
  network = keras.Sequential()
  network.add(layers.Dense(128, input_shape=input_shape, activation="relu", name="Layer1"))
  network.add(layers.Dense(32, activation="relu", name="Layer2"))
  network.add(layers.Dense(1, activation='sigmoid', name="SigmoidLayer"))

  tracked_metrics = [
      keras.metrics.BinaryAccuracy(),
      keras.metrics.Precision(),
      keras.metrics.Recall(),
  ]
  network.compile(
      loss=keras.losses.BinaryCrossentropy(),
      optimizer=keras.optimizers.Adam(),
      metrics=tracked_metrics,
  )
  return network

In [None]:
# Batch-size experiment: newly built model, 5 epochs with batch_size=512.
model = get_model(input_shape=(1536,))
model.fit(x=X_val, y=y_val, epochs=5, batch_size=512)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f9fe0b1e6d8>

In [None]:
# Batch-size experiment: newly built model, 5 epochs with batch_size=1024.
model = get_model(input_shape=(1536,))
model.fit(x=X_val, y=y_val, epochs=5, batch_size=1024)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f9ea672a5c0>

In [None]:
# Batch-size experiment: newly built model, 5 epochs with batch_size=2048.
model = get_model(input_shape=(1536,))
model.fit(x=X_val, y=y_val, epochs=5, batch_size=2048)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f9fe0955d68>

In [None]:
# Batch-size experiment: newly built model, 5 epochs with batch_size=4096.
model = get_model(input_shape=(1536,))
model.fit(x=X_val, y=y_val, epochs=5, batch_size=4096)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f9fe01bf6d8>

In [None]:
# Batch-size experiment: newly built model, 5 epochs with batch_size=10_000.
model = get_model(input_shape=(1536,))
model.fit(x=X_val, y=y_val, epochs=5, batch_size=10_000)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f9fe070ff60>

In [None]:
# Batch-size experiment: newly built model, 5 epochs with batch_size=50_000.
model = get_model(input_shape=(1536,))
model.fit(x=X_val, y=y_val, epochs=5, batch_size=50_000)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f9fe044a550>