In [1]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

import tensorflow as tf
import keras
from keras.layers import LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, concatenate
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
 
from tensorflow.keras.optimizers import Adam
from keras.models import Model
from keras import backend as K
from tensorflow.keras.layers import Layer
from keras import initializers, regularizers, constraints, optimizers, layers

In [2]:
import pickle

In [3]:
# Read the raw training table from the mounted Google Drive folder and report its dimensions.
train_df = pd.read_csv(filepath_or_buffer="drive/MyDrive/train.csv")
print("Train shape : ", train_df.shape)

Train shape :  (1306122, 3)


In [4]:
## Hold out 8% of the rows as a validation set (fixed seed for a reproducible split).
## NOTE: train_df is rebound here, so re-running this cell without reloading the CSV
## splits the already-reduced training frame a second time.
train_df, val_df = train_test_split(train_df, test_size=0.08, random_state=2022)

## Hyperparameters
embed_size = 300      # embedding vector length (300 because the ".300" embedding files are used)
max_features = 95000  # how many unique words to use (picked arbitrarily)
maxlen = 70           # maximum number of words kept per sentence

## Target labels
train_y = train_df['target'].values
val_y = val_df['target'].values

## Question texts, with missing values replaced by a placeholder token
train_X = train_df["question_text"].fillna("_##_").values
val_X = val_df["question_text"].fillna("_##_").values

## Tokenization: the vocabulary is fitted on the training texts only
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))

## Convert texts to integer sequences and pad each one to maxlen
train_X = pad_sequences(tokenizer.texts_to_sequences(train_X), maxlen=maxlen)
val_X = pad_sequences(tokenizer.texts_to_sequences(val_X), maxlen=maxlen)

In [5]:
# Shuffle both splits; the fixed seed makes the resulting order reproducible.
np.random.seed(2022)
trn_idx = np.random.permutation(len(train_X))
val_idx = np.random.permutation(len(val_X))

# Index features and labels with the same permutation so every row keeps its own target.
train_X, train_y = train_X[trn_idx], train_y[trn_idx]
val_X, val_y = val_X[val_idx], val_y[val_idx]

In [6]:
# Load the "reg" model's saved validation predictions and the labels stored with them.
# The context managers close the file handles deterministically instead of leaking them.
# NOTE: pickle.load can execute arbitrary code — only load files you produced yourself.
with open("drive/MyDrive/save.p", "rb") as fh:
    pred_reg_val_y = pickle.load(fh)
with open("drive/MyDrive/save_vay.p", "rb") as fh:
    reg_val_y = pickle.load(fh)

In [7]:
# Load the CNN model's saved validation predictions and the labels stored with them.
# The context managers close the file handles deterministically instead of leaking them.
# NOTE: pickle.load can execute arbitrary code — only load files you produced yourself.
with open("drive/MyDrive/cnn_save.p", "rb") as fh:
    pred_cnn_val_y = pickle.load(fh)
with open("drive/MyDrive/cnn_save_vay.p", "rb") as fh:
    cnn_val_y = pickle.load(fh)

In [8]:
# Load the LSTM model's saved validation predictions and the labels stored with them.
# The context managers close the file handles deterministically instead of leaking them.
# NOTE: pickle.load can execute arbitrary code — only load files you produced yourself.
with open("drive/MyDrive/lstm_save.p", "rb") as fh:
    pred_lstm_val_y = pickle.load(fh)
with open("drive/MyDrive/lstm_save_vay.p", "rb") as fh:
    lstm_val_y = pickle.load(fh)

In [9]:
# Load the fastText model's saved validation predictions and the labels stored with them.
# The context managers close the file handles deterministically instead of leaking them.
# NOTE: pickle.load can execute arbitrary code — only load files you produced yourself.
with open("drive/MyDrive/fasttext_save.p", "rb") as fh:
    pred_fasttext_val_y = pickle.load(fh)
with open("drive/MyDrive/fasttext_save_vay.p", "rb") as fh:
    fasttext_val_y = pickle.load(fh)

In [10]:
# Ensemble prediction: unweighted mean of the CNN, LSTM and fastText validation predictions.
# NOTE(review): assumes the three pickled arrays are row-aligned with each other and with
# val_y from the shuffle cell — confirm against the notebooks that produced them.
pred_val_y = (pred_cnn_val_y + pred_lstm_val_y + pred_fasttext_val_y) / 3.0

# Sweep decision thresholds from 0.10 to 0.50 in steps of 0.01, recording [threshold, F1] pairs.
thresholds = []
for thresh in np.round(np.arange(0.1, 0.501, 0.01), 2):
    res = metrics.f1_score(val_y, (pred_val_y > thresh).astype(int))
    thresholds.append([thresh, res])
    print("F1 score at threshold {0} is {1}".format(thresh, res))

# Order the pairs by descending F1; the first entry then holds the best threshold.
thresholds.sort(key=lambda pair: pair[1], reverse=True)
best_thresh = thresholds[0][0]
print("Best threshold: ", best_thresh)

F1 score at threshold 0.1 is 0.6132654122050109
F1 score at threshold 0.11 is 0.6223571390530968
F1 score at threshold 0.12 is 0.6295491580662683
F1 score at threshold 0.13 is 0.6366904485371384
F1 score at threshold 0.14 is 0.6417575008427914
F1 score at threshold 0.15 is 0.6481766820749871
F1 score at threshold 0.16 is 0.6537948628747028
F1 score at threshold 0.17 is 0.6579179662213853
F1 score at threshold 0.18 is 0.6605427974947807
F1 score at threshold 0.19 is 0.664210208396255
F1 score at threshold 0.2 is 0.6663405686334455
F1 score at threshold 0.21 is 0.6688925987757374
F1 score at threshold 0.22 is 0.6726794438181135
F1 score at threshold 0.23 is 0.6749841671944269
F1 score at threshold 0.24 is 0.6775060811675842
F1 score at threshold 0.25 is 0.6789987711014811
F1 score at threshold 0.26 is 0.6791391378295284
F1 score at threshold 0.27 is 0.6799630167745344
F1 score at threshold 0.28 is 0.6792503168145134
F1 score at threshold 0.29 is 0.6799004104703587
F1 score at threshold 0

In [11]:
# Precision of the ensemble on the validation set.
# Uses best_thresh from the threshold sweep rather than a hard-coded literal, so the
# reported metric cannot drift out of sync when the predictions change.
pred_precision = tf.keras.metrics.Precision(thresholds=float(best_thresh))
pred_precision.update_state(val_y, pred_val_y)
print('pred_precision : ', pred_precision.result().numpy())

pred_precision :  0.64337796


In [12]:
# Recall of the ensemble on the validation set.
# Uses best_thresh from the threshold sweep rather than a hard-coded literal, so the
# reported metric cannot drift out of sync when the predictions change.
pred_recall = tf.keras.metrics.Recall(thresholds=float(best_thresh))
pred_recall.update_state(val_y, pred_val_y)
print('pred_recall : ', pred_recall.result().numpy())

pred_recall :  0.72856927


In [13]:
# Binary accuracy of the ensemble on the validation set.
# Uses best_thresh from the threshold sweep rather than a hard-coded literal, so the
# reported metric cannot drift out of sync when the predictions change.
pred_accuracy = tf.keras.metrics.BinaryAccuracy(threshold=float(best_thresh))
pred_accuracy.update_state(val_y, pred_val_y)
print('pred_accuracy : ', pred_accuracy.result().numpy())

pred_accuracy :  0.9569528


In [14]:
from google.colab import files

# Persist the ensemble predictions and the matching labels, then download them.
# Each file is written inside a context manager so it is flushed and closed before
# files.download reads it; the original left the handles open.
with open("pred_save.p", "wb") as fh:
    pickle.dump(pred_val_y, fh)
with open("pred_save_vay.p", "wb") as fh:
    pickle.dump(val_y, fh)

files.download("pred_save.p")
files.download("pred_save_vay.p")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from matplotlib import pyplot as plt
from sklearn.metrics import roc_curve

# ROC curves of every individual model and of the averaged ensemble on the validation set.
# roc_curve expects (y_true, y_score) in that order. Each curve gets a label so that
# plt.legend below has entries to show.
fpr1, tpr1, treshold1 = roc_curve(reg_val_y, pred_reg_val_y)
plt.plot(fpr1, tpr1, label="reg")

fpr2, tpr2, treshold2 = roc_curve(cnn_val_y, pred_cnn_val_y)
plt.plot(fpr2, tpr2, label="CNN")

fpr3, tpr3, treshold3 = roc_curve(lstm_val_y, pred_lstm_val_y)
plt.plot(fpr3, tpr3, label="LSTM")

fpr4, tpr4, treshold4 = roc_curve(fasttext_val_y, pred_fasttext_val_y)
plt.plot(fpr4, tpr4, label="fastText")

# Ensemble: the true labels are val_y (the same labels the F1 threshold sweep uses) and the
# scores are the averaged predictions. The original passed an undefined name with the
# arguments swapped, then re-plotted the first model's curve instead of this one.
fpr5, tpr5, treshold5 = roc_curve(val_y, pred_val_y)
plt.plot(fpr5, tpr5, label="ensemble (CNN + LSTM + fastText)")

# Diagonal reference line where the true positive rate equals the false positive rate.
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Пример ROC-кривой')
plt.legend(loc="lower right")
plt.show()