In [18]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

from tensorflow.keras import layers, Sequential
from tensorflow import keras
import tensorflow as tf

from glob import glob
import pandas as pd
import numpy as np
import os

pd.options.display.max_colwidth = 100

In [2]:
os.chdir("/content/drive/MyDrive/SKBKonturTest")

## Train Model

In [3]:
def init_model(input_shape):
  model = keras.Sequential(
      [
          layers.Dense(128, input_shape=input_shape , activation="relu", name="Layer1"),
          layers.Dense(32, activation="relu", name="Layer2"),
          layers.Dense(1, activation='sigmoid' ,name="SigmoidLayer"),
      ]
  )

  model.compile(
      optimizer=keras.optimizers.Adam(),
      loss=keras.losses.BinaryCrossentropy(),
      metrics=[keras.metrics.BinaryAccuracy(), keras.metrics.Precision(), keras.metrics.Recall()],
  )
  return model

def get_model_and_parameters(models_train_dir, best_model_dir, patience):
  n_epochs_trained = len(os.listdir(models_train_dir))

  if n_epochs_trained == 0:
    model = init_model(input_shape=(1536,))
    fscore_prev = 0
    print('Initialized new model.')
  else:
    last_model_path = glob(f'{models_dir}/model_epoch-{n_epochs_trained}*')[0]
    
    # Update patience
    best_model = glob(f'{best_model_dir}/model_epoch-*')[0]
    best_epoch = int(best_model.split('model_epoch-')[1].split('_valfscore-')[0])
    patience = patience - (n_epochs_trained - best_epoch)
    print(f'Patience: {patience}')

    # Update fscore_prev
    fscore_prev = float(best_model.split('valfscore-')[1].split('.h5')[0])
    print(f'Best F1 Score: {fscore_prev}')

    # Load model
    model = keras.models.load_model(last_model_path)
    print(f"Loaded model: {last_model_path}")

    return model, fscore_prev, patience

def train_iter(model, fscore_prev, patience, n_iter):
  train_X = np.load(f'processed_data/train/X_processed_{n_iter}.npy')
  train_y = np.load(f'processed_data/train/y_processed_{n_iter}.npy')

  model.fit(x=train_X,
            y=train_y,
            batch_size=batch_size,
            epochs=1,
            verbose=1
            )
  
  del train_X, train_y
    
  return model, fscore_prev, patience

def train_epoch(model, fscore_prev, patience):
  for n_iter in range(n_processed):
    model, fscore_prev, patience = train_iter(model, fscore_prev, patience, n_iter)
    
  return model, fscore_prev, patience


In [15]:
batch_size = 1024
epochs = 30

In [16]:
model, fscore_prev, patience = get_model_and_parameters(models_train_dir='trained_models/sample_train',
                                                        best_model_dir='best_models/sample_train', patience=5)

Patience: 4
Best F1 Score: 0.9217171358333452
Loaded model: trained_models/model_epoch-10_valfscore-0.9213.h5


In [7]:
val_X = np.load('processed_data/val/X_processed_0.npy')
val_y = np.load('processed_data/val/y_processed_0.npy')

In [17]:
n_processed = len(os.listdir('processed_data/train')) // 2
n_epochs_trained = len(os.listdir('trained_models/sample_train'))

for epoch in range(n_epochs_trained, epochs):
  print(f'Epoch: {epoch}')

  model, fscore_prev, patience = train_epoch(model, fscore_prev, patience)

  pred_y = model.predict(val_X) > 0.5

  precision, recall, fscore, _ = precision_recall_fscore_support(val_y, pred_y, average='binary')
  accuracy = accuracy_score(val_y, pred_y)
  print(f"val_accuracy: {accuracy}, val_precision: {precision}, val_recall: {recall}, val_fscore: {fscore}")

  if fscore > fscore_prev:
    patience = 5
    fscore_prev = fscore
    os.remove(glob('best_models/sample_train/model_epoch*')[0])
    model.save(f'best_models/sample_train/model_epoch-{epoch + 1}_valfscore-{fscore:.4f}.h5')
  else:
    patience -= 1

  model.save(f'trained_models/sample_train/model_epoch-{epoch + 1}_valfscore-{fscore:.4f}.h5')
  if patience == 0:
    print('Training stoped by early stopping')
    break
  print('-' * 40)

Epoch: 10
val_accuracy: 0.9926039458175012, val_precision: 0.9333145036011863, val_recall: 0.9034586466165414, val_fscore: 0.9181439288691303
----------------------------------------
Epoch: 11
val_accuracy: 0.9929340819766996, val_precision: 0.9538068981698733, val_recall: 0.8891592617908407, val_fscore: 0.9203492238683476
----------------------------------------
Epoch: 12
val_accuracy: 0.9930269719986795, val_precision: 0.9592561885585692, val_recall: 0.8857416267942584, val_fscore: 0.9210343013916726
----------------------------------------
Epoch: 13
val_accuracy: 0.9925323953951655, val_precision: 0.9311820690431942, val_recall: 0.9041695146958305, val_fscore: 0.9174770075878428
Training stoped by early stopping


## Check wrong predictions

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
train = pd.read_csv('Data/train_data.tsv', sep='\t', index_col=0)
train_data, valid_data = train_test_split(train, test_size=0.2, stratify=train['answer'], random_state=42, shuffle=True)

val_X = np.load('processed_data/val/X_processed_0.npy')
val_y = np.load('processed_data/val/y_processed_0.npy')

  mask |= (ar1 == a)


In [3]:
model = keras.models.load_model(glob('best_model/model_*.h5')[0])

In [7]:
pred_y = model.predict(val_X) > 0.5

In [25]:
valid_data[pred_y.reshape(-1) != val_y.reshape(-1)].head(10)

Unnamed: 0,ru_name,eng_name,answer
760498,"Общество с ограниченной ответственностью ""ПРОСТОР""",SPACE Ltd,True
1095640,"Закрытое акционерное общество ""Экопрофиль""",URFKK,False
1813241,"ООО ""Лидер""",Ltd,False
2365385,"ООО ""ДЖМИНА ПЛЭЙ""",Jmina Play,True
1629797,"Общество с ограниченной ответственностью ""Партнер-технология""","""Partec""",True
2531811,"Общество с ограниченной ответственностью ""Финагролес""",FAL Ipc,True
667180,"Общество с ограниченной ответственностью ""ТриМед""","""AWT Global STC"" Limited Liability Company",True
1381680,"ООО ""ИНЖЕНЕР-УРАЛ""","ООО ""Trek-Ural""",True
2910821,"Общество с ограниченной ответственностью ""Сапротек""",***,True
1783946,"ООО ""КСБ""","""Security Systems Company"" LTD",True


In [26]:
valid_data[pred_y.reshape(-1) != val_y.reshape(-1)]['answer'].value_counts()

True     4143
False    1366
Name: answer, dtype: int64

## Generate predictions for test

In [None]:
model = keras.models.load_model(glob('best_model/model_*.h5')[0])

In [29]:
test_X = np.load('processed_data/test/X_processed_0.npy')

In [30]:
y_pred_test = (model.predict(test_X) > 0.5).reshape(-1)
y_pred_test = pd.Series(y_pred_test)
y_pred_test.name = 'answer'

y_pred_test.to_csv('result.tsv', index=False)

In [31]:
y_pred_test.mean()

0.042335139129282405