## Import

In [10]:
import pandas as pd
import numpy as np

# Then what you need from tensorflow.keras
from tensorflow.keras.models import load_model
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification, \
    DistilBertConfig, DistilBertTokenizerFast

import tensorflow as tf
import tensorflow_addons as tfa

from itertools import compress

In [2]:
import transformers
# Print the installed transformers version so it is recorded with the notebook output.
print(f"Transformers package version: {transformers.__version__}")

Transformers package version: 4.17.0


In [3]:
# Seed constant. NOTE(review): no visible cell passes this to a seeding call
# (numpy / tensorflow) — confirm it is actually applied, or remove it.
RANDOM_SEED = 42

# SETUP

In [4]:
# Name of the pretrained checkpoint handed to `from_pretrained()`.
MODEL_NAME = 'distilbert-base-uncased'

# Fixed sequence length in *tokens* (not words): the tokenizer truncates longer
# inputs and pads shorter ones to exactly this many tokens.
MAX_LENGTH = 50

# Label names. They are paired positionally with the model's output scores
# (see the `compress(TEGS, ...)` usage), so the order must not change.
TEGS = [
    'release_points',
    'technical_update_points',
    'partnership_points',
    'listing_points',
    'security_points',
    'from_the_project',
    'not_from_the_project',
    'staking',
]

# Functions

In [5]:
def multi_label_accuracy(y_true: tf.Tensor, y_pred: tf.Tensor) -> tf.Tensor:
    """Exact-match accuracy for multi-label classification.

    A sample counts as correct only when *every* label matches after the
    predictions have been rounded with ``tf.math.round``. A custom function
    is needed because neither tf.keras.metrics.Accuracy nor
    tf.keras.metrics.CategoricalAccuracy evaluate the number of exact
    matches.

    :param y_true: ground-truth labels; compared element-wise with the
        rounded predictions and reduced over axis 1.
    :param y_pred: predicted scores; rounded to the nearest integer before
        the comparison (``tf.math.round`` rounds halves to even, so a score
        of exactly 0.5 becomes 0).
    :return: scalar float32 tensor, the fraction of rows in which all
        labels match.

    :Example:
    >>> from tensorflow.keras import metrics
    >>> y_true = tf.convert_to_tensor([[1., 1.]])
    >>> y_pred = tf.convert_to_tensor([[1., 0.]])
    >>> metrics.Accuracy()(y_true, y_pred).numpy()
    0.5
    >>> metrics.CategoricalAccuracy()(y_true, y_pred).numpy()
    1.0
    >>> multi_label_accuracy(y_true, y_pred).numpy()
    0.0
    """
    y_pred = tf.math.round(y_pred)
    # Tensor equality requires matching dtypes; labels supplied as integers
    # would otherwise fail against float predictions when this function is
    # called directly (outside a Keras metric wrapper).
    y_true = tf.cast(y_true, y_pred.dtype)
    # A row is an exact match only if all of its labels agree.
    exact_matches = tf.math.reduce_all(y_pred == y_true, axis=1)
    exact_matches = tf.cast(exact_matches, tf.float32)
    return tf.math.reduce_mean(exact_matches)

## Data

In [6]:
# Example input text for the classifier; scored in the cells below.
sample_text = "We'd like to remind everyone that our $BONDLY token contract remains compromised by an unknown attacker and we ask you to refrain from trading our token until we have redeployed our new token. \n\nMore details here: https://t.co/WuSSNt2bsH"

In [7]:
# Second example input text for the classifier; scored in the cells below.
sample_text_2 = "@LuckyBartlett We'll be releasing details soon, including for those who held LP tokens, apologies for the delay"

# TF Bert Model

In [11]:
# NOTE(review): `config` is not read by any visible cell before a later cell
# rebinds the same name to a plain dict — consider renaming or dropping it.
config = DistilBertConfig.from_pretrained(MODEL_NAME)

# `custom_objects` maps the names stored in the saved model to their Python
# implementations so Keras can resolve them while deserializing model.h5.
model = load_model(
    "model.h5",
    custom_objects={
        "multi_label_accuracy": multi_label_accuracy,
        "RectifiedAdam": tfa.optimizers.RectifiedAdam,
    },
)

# Reuse MODEL_NAME instead of repeating the literal so the tokenizer and the
# config above always refer to the same checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)

2022-03-30 18:27:24.426711: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
2022-03-30 18:27:25.275106: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-03-30 18:27:25.275128: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-03-30 18:27:25.275139: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (Ubuntu-2004-focal-64-minimal): /proc/driver/nvidia/version does not exist
2022-03-30 18:27:25.275260: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enab

In [32]:
def score_text(text, model=model, tokenizer=tokenizer):
    """Tokenize ``text`` and return the model's output for it as a NumPy array.

    The text is truncated or padded to exactly ``MAX_LENGTH`` tokens and
    encoded as TensorFlow tensors. Only the ``input_ids`` are passed to the
    model; the attention mask and token type ids requested from the
    tokenizer are not forwarded.

    NOTE(review): confirm the saved model is meant to receive ``input_ids``
    only — padding tokens are not masked by this call.

    :param text: the string to score.
    :param model: callable model; defaults to the ``model`` object that
        existed when this function was defined.
    :param tokenizer: tokenizer providing ``encode_plus``; defaults to the
        ``tokenizer`` object that existed when this function was defined.
    :return: the result of ``model(input_ids).numpy()``.
    """
    encode_options = dict(
        max_length=MAX_LENGTH,  # truncates if len(s) > max_length
        truncation=True,
        padding='max_length',
        return_token_type_ids=True,
        return_attention_mask=True,
        return_tensors='tf',
    )
    encoded = tokenizer.encode_plus(text, **encode_options)
    input_ids = encoded["input_ids"]
    return model(input_ids).numpy()

In [33]:
# Smoke test: score a trivial input and display the raw scores.
score_text("dummy")

array([[0.00151449, 0.00014731, 0.00303701, 0.00015244, 0.00022259,
        0.00300139, 0.00018477, 0.00069699]], dtype=float32)

In [48]:
# Threshold the scores for sample_text at 0.5 to get one boolean per model output.
(score_text(sample_text) > 0.5)

array([[False, False, False, False,  True,  True, False, False]])

In [55]:
# Same 0.5 threshold, applied to the second sample text.
score_text(sample_text_2) > 0.5

array([[False,  True, False, False, False, False, False, False]])

In [73]:
from itertools import compress

# Keep the tag names whose score for sample_text exceeds 0.5; `[0]` selects the
# first row of the thresholded result, which is paired positionally with TEGS.
text_tegs = [
    teg
    for teg, is_selected in zip(TEGS, (score_text(sample_text) > 0.5)[0])
    if is_selected
]
text_tegs

['security_points', 'from_the_project']

In [81]:
import yaml
from pathlib import Path

In [78]:
# Collect the inference settings in one mapping so they can be written to YAML.
# NOTE: this rebinds `config`, which earlier held the DistilBertConfig object.
config = dict(
    MODEL_NAME=MODEL_NAME,
    MAX_LENGTH=MAX_LENGTH,
    TEGS=TEGS,
)

In [79]:
# Serialize the settings to block-style YAML text, then write it to config.yaml.
with open('config.yaml', 'w') as file:
    file.write(yaml.dump(config, default_flow_style=False))

In [86]:
# Read config.yaml back with the safe loader and show the result.
# A YAML parsing error is printed rather than raised, in which case `config`
# keeps whatever value it had before this cell ran.
with open("config.yaml", 'r') as stream:
    try:
        config = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)
    else:
        print(config)

{'MAX_LENGTH': 50, 'MODEL_NAME': 'distilbert-base-uncased', 'TEGS': ['release_points', 'technical_update_points', 'partnership_points', 'listing_points', 'security_points', 'from_the_project', 'not_from_the_project', 'staking']}
