In [1]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as k
from matplotlib import pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_fscore_support, accuracy_score

from models import dataset_ops
from models import vectorization_ops

pd.set_option('display.max_columns', None)  # never truncate wide frames when displaying

# Enable memory growth on every visible GPU; when none is found, list the
# devices TensorFlow can see so the fallback is obvious in the output.
GPUs = tf.config.list_physical_devices('GPU')
if GPUs:
    for gpu in GPUs:
        tf.config.experimental.set_memory_growth(gpu, True)
        print("Initialized", gpu)
else:
    print("WARNING: No GPU, all there is is:")
    for device in tf.config.list_physical_devices():
        print(f'- {device}')

- PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')


In [6]:
# Split the data set into train / validation parts (split_ratio=0.3, fixed seed).
data_train, data_validation, data = dataset_ops.load_data(split_ratio=0.3, random_state=42)

# Per-class row counts for the full, training and validation frames, shown as a tuple.
tuple(
    frame.groupby('type').aggregate('count')
    for frame in (data, data_train, data_validation)
)

(         url
 type        
 0     344821
 1      75643,
          url
 type        
 0     241374
 1      52950,
          url
 type        
 0     103447
 1      22693)

In [7]:
# Fit the character-level vectorizer on the training URLs only.
char_vectorizer = vectorization_ops.create_char_vectorizer(data_train['url'])

# LC: number of distinct entries in the vectorizer's `word_counts`; used to size the embedding.
LC = len(char_vectorizer.word_counts)
print(f'Char vocab size: {LC}')

Char vocab size: 150


In [8]:
# Both pipelines are built the same way: create the dataset from the frame,
# shuffle with a 10000-element buffer, then prefetch. Batching is applied later,
# at fit / predict time.
dataset_train = (
    dataset_ops.create_dataset_generator(None, char_vectorizer, data_train)
    .shuffle(10000)
    .prefetch(10000)
)
dataset_validation = (
    dataset_ops.create_dataset_generator(None, char_vectorizer, data_validation)
    .shuffle(10000)
    .prefetch(10000)
)

print('Train:', dataset_train.element_spec, '\nValid:', dataset_validation.element_spec)

Train: ({'char': TensorSpec(shape=(200,), dtype=tf.float64, name=None)}, TensorSpec(shape=(), dtype=tf.int32, name=None)) 
Valid: ({'char': TensorSpec(shape=(200,), dtype=tf.float64, name=None)}, TensorSpec(shape=(), dtype=tf.int32, name=None))


In [10]:
def create_conv_subnet(input_layer, conv_kernel_sizes, prefix=''):
    """Build parallel Conv1D branches over `input_layer` and merge them into one dense feature vector.

    For every kernel size a `Conv1D` (32 filters, 'same' padding, ReLU) followed by a
    default `MaxPool1D` is applied to `input_layer`. The branch outputs are joined
    along axis 2, flattened, passed through dropout (rate 0.5) and a 512-unit ReLU
    dense layer.

    Args:
        input_layer: Keras tensor fed to every convolution branch.
        conv_kernel_sizes: iterable of kernel sizes, one branch per size. Sizes must be
            distinct because the layer names are derived from them.
        prefix: string prepended to the names of the conv, dropout and dense layers.

    Returns:
        The output tensor of the final 512-unit dense layer.

    Raises:
        ValueError: if `conv_kernel_sizes` is empty.
    """
    kernel_sizes = list(conv_kernel_sizes)
    if not kernel_sizes:
        raise ValueError('conv_kernel_sizes must contain at least one kernel size')

    convolutions = list()
    for kernel_size in kernel_sizes:
        x = k.layers.Conv1D(
            filters=32,
            kernel_size=kernel_size,
            padding='same',
            activation='relu',
            name=f'{prefix}_conv_{kernel_size}'
        )(input_layer)
        x = k.layers.MaxPool1D()(x)
        convolutions.append(x)

    # Some Keras versions reject `concatenate` on a list with fewer than two
    # tensors, so a single branch is passed through unchanged.
    if len(convolutions) == 1:
        x = convolutions[0]
    else:
        x = k.layers.concatenate(convolutions, axis=2)
    x = k.layers.Flatten()(x)
    x = k.layers.Dropout(0.5, name=f'{prefix}_dropout')(x)
    x = k.layers.Dense(512, name=f'{prefix}_dense', activation='relu')(x)
    return x

def create_url_net(input_length, emb_dim, conv_kernel_sizes, vocab_size=None):
    """Assemble the character-level URL classifier.

    The network is: `char` input -> embedding -> convolutional sub-network
    (`create_conv_subnet`) -> 128-unit ReLU dense layer -> single sigmoid output.

    Args:
        input_length: length of each input sequence.
        emb_dim: dimensionality of the character embedding.
        conv_kernel_sizes: kernel sizes forwarded to `create_conv_subnet`.
        vocab_size: number of distinct tokens known to the vectorizer. When omitted,
            the notebook-level `LC` is used, which keeps existing calls working.

    Returns:
        An uncompiled `k.models.Model` with one input named 'char' and one output.
    """
    if vocab_size is None:
        # Backward-compatible fallback to the global computed from the vectorizer.
        vocab_size = LC

    char_input = k.layers.Input(shape=[input_length], name='char')

    # The embedding table holds two more rows than the vocabulary
    # (presumably padding index 0 plus one spare/OOV index - TODO confirm against the vectorizer).
    # NOTE(review): `mask_zero=True` feeds a Conv1D stack; confirm the mask is actually
    # consumed downstream, otherwise it has no effect.
    embedded = k.layers.Embedding(2 + vocab_size, emb_dim, mask_zero=True)(char_input)

    x = create_conv_subnet(embedded, conv_kernel_sizes, 'char')

    x = k.layers.Dense(128, activation='relu', name='dense_1')(x)
    x = k.layers.Dense(1, activation='sigmoid', name='dense_comb_out')(x)

    model = k.models.Model(inputs=[char_input], outputs=[x])
    return model

# Instantiate the network for 200-character inputs with a 16-dim embedding
# and two convolution branches (kernel sizes 3 and 5).
model = create_url_net(input_length=200, emb_dim=16, conv_kernel_sizes=[3, 5])

# Binary classification set-up: Adam at 1e-4, cross-entropy loss, accuracy metric.
model.compile(
    loss='binary_crossentropy',
    optimizer=k.optimizers.Adam(learning_rate=1e-4),
    metrics=['binary_accuracy'],
)

model.summary()
k.utils.plot_model(model, show_shapes=True)

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
char (InputLayer)               [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 200, 16)      2432        char[0][0]                       
__________________________________________________________________________________________________
char_conv_3 (Conv1D)            (None, 200, 32)      1568        embedding_1[0][0]                
__________________________________________________________________________________________________
char_conv_5 (Conv1D)            (None, 200, 32)      2592        embedding_1[0][0]                
____________________________________________________________________________________________

In [18]:
# Batch size shared by the training and validation pipelines.
bs = 256 * 8

model.fit(
    dataset_train.batch(bs),
    epochs=100,
    validation_data=dataset_validation.batch(bs),
    callbacks=[
        # Stop after 3 epochs without val_loss improvement and roll the model back to
        # the best epoch; without `restore_best_weights` the in-memory model (which is
        # what gets saved afterwards) keeps the weights of the last, worse epoch.
        k.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True,
        ),
        # Keep only the best checkpoint on disk instead of overwriting it every epoch.
        k.callbacks.ModelCheckpoint(
            './checkpoints',
            monitor='val_loss',
            save_best_only=True,
            verbose=0,
        ),
    ],
)

Epoch 1/100


InvalidArgumentError:  TypeError: `generator` yielded an element that could not be converted to the expected type. The expected type was int32, but the yielded element was tf.Tensor(b'bad', shape=(), dtype=string).
Traceback (most recent call last):

  File "C:\Users\13215\.conda\envs\tens\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py", line 982, in generator_py_func
    dtype=dtype.as_numpy_dtype))

  File "C:\Users\13215\.conda\envs\tens\lib\site-packages\tensorflow\python\ops\script_ops.py", line 209, in _convert
    result = np.asarray(value, dtype=dtype, order="C")

  File "C:\Users\13215\.conda\envs\tens\lib\site-packages\numpy\core\_asarray.py", line 83, in asarray
    return array(a, dtype, copy=False, order=order)

ValueError: invalid literal for int() with base 10: b'bad'


During handling of the above exception, another exception occurred:


Traceback (most recent call last):

  File "C:\Users\13215\.conda\envs\tens\lib\site-packages\tensorflow\python\ops\script_ops.py", line 249, in __call__
    ret = func(*args)

  File "C:\Users\13215\.conda\envs\tens\lib\site-packages\tensorflow\python\autograph\impl\api.py", line 645, in wrapper
    return func(*args, **kwargs)

  File "C:\Users\13215\.conda\envs\tens\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py", line 991, in generator_py_func
    sys.exc_info()[2])

  File "C:\Users\13215\.conda\envs\tens\lib\site-packages\six.py", line 702, in reraise
    raise value.with_traceback(tb)

  File "C:\Users\13215\.conda\envs\tens\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py", line 982, in generator_py_func
    dtype=dtype.as_numpy_dtype))

  File "C:\Users\13215\.conda\envs\tens\lib\site-packages\tensorflow\python\ops\script_ops.py", line 209, in _convert
    result = np.asarray(value, dtype=dtype, order="C")

  File "C:\Users\13215\.conda\envs\tens\lib\site-packages\numpy\core\_asarray.py", line 83, in asarray
    return array(a, dtype, copy=False, order=order)

TypeError: `generator` yielded an element that could not be converted to the expected type. The expected type was int32, but the yielded element was tf.Tensor(b'bad', shape=(), dtype=string).


	 [[{{node PyFunc}}]]
	 [[IteratorGetNext]] [Op:__inference_train_function_1220]

Function call stack:
train_function


In [None]:
# Persist the trained model under the name the evaluation cells reload it from.
model.save(filepath='full_convolution')

In [None]:
# Evaluation can start from here on a fresh kernel: restore the batch size
# and reload the model written by the save cell.
bs = 2048
model = k.models.load_model(filepath='full_convolution')

In [None]:
# Materialise the validation set as NumPy arrays.
# The pipeline contains a shuffle step, so features and labels are collected in
# the SAME pass over the dataset to keep them aligned. Building two typed arrays
# directly also avoids the intermediate (N, 2) object array of (dict, label) pairs.
char_rows, labels = [], []
for features, label in dataset_validation.as_numpy_iterator():
    char_rows.append(features['char'])
    labels.append(label)

X_validation = np.array(char_rows)
y_validation = np.array(labels, dtype='int32')

In [None]:
# The network built in this notebook has a single input named 'char', so feed the
# features once, keyed by that name. (Passing an (X, X) pair supplies two tensors
# to a one-input model.)
X_validation2 = tf.data.Dataset.from_tensor_slices({'char': X_validation}).batch(bs)
y_validation = y_validation.astype('int32')

# One sigmoid score per validation sample, flattened to 1-D.
y_hat = model.predict(X_validation2).ravel()

In [None]:
# ROC curve and its area for the validation scores.
fpr, tpr, thresholds = roc_curve(y_validation, y_hat)
auc_ = auc(fpr, tpr)

# Operating point that maximises TPR - FPR.
best_threshold = thresholds[(tpr - fpr).argmax()]

In [None]:
# This notebook trains the character-level model (input 'char', char vectorizer),
# so label the plot accordingly; the name also determines the output file name and
# must not collide with the word-level model's plot.
model_name = "Fully Convolutional Char"
model_full_name = "Fully Convolutional model with Character Level Embedding"

plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
plt.plot(fpr, tpr, label=f'{model_name} (area = {auc_:.3f})')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title(f'ROC curve for {model_full_name}')
plt.legend(loc='best')
# savefig does not create missing directories.
os.makedirs('../plots', exist_ok=True)
plt.savefig(f'../plots/{model_name.replace(" ", "_").lower()}_roc.pdf')
plt.show()

In [None]:
# Hard 0/1 predictions at the chosen threshold, kept in the same dtype as the scores.
y_hat_01 = (y_hat >= best_threshold).astype(y_hat.dtype)

# Sanity check: which classes are actually predicted.
np.unique(y_hat_01)

In [None]:
# Index of the ROC point that maximises TPR - FPR (the one the threshold was taken from).
best_idx = np.argmax(tpr - fpr)

# (precision, recall, F1, support), TPR and FPR at the operating point, accuracy.
(
    precision_recall_fscore_support(y_validation, y_hat_01, beta=1, average='binary'),
    tpr[best_idx],
    fpr[best_idx],
    accuracy_score(y_validation, y_hat_01),
)

In [None]:
# Rates along the ROC curve, plotted against the index into `thresholds`.
# Legend entries name exactly what each curve contains: the third curve is
# 1 - FPR, not FPR, and the first is half the TPR/FPR difference, not a sum.
plt.plot((tpr - fpr) / 2, label='(TPR - FPR) / 2')
plt.plot(tpr, label='TPR')
plt.plot(1 - fpr, label='1 - FPR')
plt.xlabel('Threshold index')
plt.legend()

In [None]:
# Persist the ROC arrays for cross-model comparison. np.save does not create
# missing directories, so make sure the target folder exists first.
os.makedirs("fpr_tpr", exist_ok=True)
np.save("fpr_tpr/fccl-fpr", fpr)
np.save("fpr_tpr/fccl-tpr", tpr)