In [None]:
pip install alibi[tensorflow]

In [None]:
pip install transformers

In [3]:
import re
import os
import numpy as np
import matplotlib as mpl
import matplotlib.cm
import tensorflow as tf
import tensorflow.keras as keras

from tqdm import tqdm
from typing import Optional, Union, List, Dict
from IPython.display import HTML
from tensorflow.keras.datasets import imdb
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from transformers import PreTrainedTokenizer
from alibi.explainers import IntegratedGradients

In [4]:
def decode_sentence(x: List[int], reverse_index: Dict[int, str], unk_token: str = '[UNK]') -> str:
    """
    Turn a tokenized sentence from the keras IMDB dataset back into plain text.

    Parameters
    ----------
    x
        List of integers to be decoded.
    reverse_index:
        Reverse index map, from `int` to `str`.
    unk_token:
        Unknown token, emitted for every id that has no entry in `reverse_index`.

    Returns
    -------
        Decoded sentence, with the words joined by single spaces.
    """
    # every id is shifted by `-3` before the lookup because of the special tokens used by keras
    # see https://stackoverflow.com/questions/42821330/restore-original-text-from-keras-s-imdb-dataset
    words = []
    for token_id in x:
        words.append(reverse_index.get(token_id - 3, unk_token))
    return " ".join(words)


def process_sentences(sentence: List[str], 
                      tokenizer: PreTrainedTokenizer, 
                      max_len: int) -> Dict[str, np.ndarray]:
    """
    Tokenize a batch of text sentences with a fixed output length.

    Parameters
    ----------
    sentence:
        Sentences to be processed.
    tokenizer:
        Tokenizer to be used. It is called with the sentences plus the options below.
    max_len:
        Length every sequence is padded or truncated to.

    Returns
    -------
        Whatever the tokenizer returns for these options; requested as NumPy tensors containing:
         - input_ids
         - attention_mask
    """
    # since we are using the model for classification, the special tokens (i.e. '[CLS]', '[SEP]') are requested
    # check the example here: https://huggingface.co/transformers/v4.4.2/quicktour.html
    tokenizer_options = {
        'add_special_tokens': True,
        'padding': 'max_length',
        'max_length': max_len,
        'truncation': True,
        'return_attention_mask': True,
        'return_tensors': 'np',
    }
    return tokenizer(sentence, **tokenizer_options)

In [5]:
def hlstr(string: str, color: str = 'white') -> str:
    """
    Return HTML markup highlighting text with the desired color.

    Parameters
    ----------
    string:
        Text to highlight. It is HTML-escaped so that `<`, `>` and `&` are shown
        literally instead of being interpreted as markup.
    color:
        CSS color used as the highlight background. Inserted verbatim.

    Returns
    -------
        A `<mark>` element wrapping the escaped text followed by a single space.
    """
    # function-scope import keeps the cell self-contained; `html` is part of the standard library
    import html

    # quotes are harmless inside element content, so only `&`, `<` and `>` are escaped
    safe_text = html.escape(str(string), quote=False)
    return f"<mark style=background-color:{color}>{safe_text} </mark>"


def colorize(attrs: np.ndarray, cmap: str = 'PiYG') -> List:
    """
    Compute hex colors based on the attributions for a single instance.
    Uses a diverging colorscale by default and normalizes and scales
    the colormap so that colors are consistent with the attributions.

    Parameters
    ----------
    attrs:
        Attributions to be visualized (one value per token).
    cmap:
        Matplotlib cmap type.

    Returns
    -------
        List of hex color strings, one per attribution. Empty input gives an empty list.
    """
    attrs = np.asarray(attrs)
    if attrs.size == 0:
        # nothing to color; also avoids calling `.max()` on an empty array
        return []

    # symmetric bounds so that an attribution of 0 always lands on the colormap midpoint
    cmap_bound = np.abs(attrs).max()
    if cmap_bound == 0:
        # all attributions are zero: a degenerate [0, 0] range would map them to the
        # lowest color, so use a unit range to keep them on the neutral midpoint
        cmap_bound = 1.0
    norm = mpl.colors.Normalize(vmin=-cmap_bound, vmax=cmap_bound)

    # `mpl.cm.get_cmap` is deprecated and absent from recent Matplotlib releases;
    # prefer the colormap registry and only fall back to the legacy accessor on old versions
    if isinstance(cmap, str) and hasattr(mpl, 'colormaps'):
        colormap = mpl.colormaps[cmap]
    elif hasattr(mpl.cm, 'get_cmap'):
        colormap = mpl.cm.get_cmap(cmap)
    else:
        # not a registered name and no legacy accessor: assume a colormap object was passed
        colormap = cmap

    return [mpl.colors.rgb2hex(colormap(norm(value))) for value in attrs]


def display(X: np.ndarray, 
            attrs: np.ndarray, 
            tokenizer: PreTrainedTokenizer,
            pred: np.ndarray,
            label_names: Optional[Dict[int, str]] = None) -> HTML:
    """
    Display the attribution of a given instance.

    Prints the predicted label and returns the tokens as color-highlighted HTML.

    Parameters
    ----------
    X:
        Instance to display the attributions for (token ids of a single sentence).
    attrs:
        Attributions values for the given instance, aligned with `X`.
    tokenizer:
        Tokenizer to be used for decoding.
    pred:
        Classification label (prediction) for the given instance.
    label_names:
        Optional map from class index to a human-readable name. Defaults to the
        two-class mapping `{1: 'Positive review', 0: 'Negative review'}`. Pass an
        explicit mapping for models with other or additional classes.

    Returns
    -------
        `HTML` object with one `<mark>` element per token.
    """
    if label_names is None:
        label_names = {1: 'Positive review', 0: 'Negative review'}

    # remove padding: everything from the first [PAD] token onwards is dropped
    pad_positions = np.where(X == tokenizer.pad_token_id)[0]
    if len(pad_positions) > 0:
        first_pad = pad_positions[0]
        X, attrs = X[:first_pad], attrs[:first_pad]

    # decode tokens one id at a time so each token gets its own color
    tokens = [tokenizer.decode([token_id]) for token_id in X]
    colors = colorize(attrs)

    # a class index without a name (e.g. a third class) is reported instead of raising KeyError
    label_text = label_names.get(int(pred), 'Unknown label')
    print('Predicted label =  {}: {}'.format(pred, label_text))
    return HTML("".join(map(hlstr, tokens, colors)))

In [6]:
# One import replaces the two overlapping `from transformers import ...` lines.
# `AutoModelForSequenceClassification` is not used in this cell; it stays imported
# in case another cell relies on the name.
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TFAutoModelForSequenceClassification,
)

# load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

# Location of the classifier checkpoint. The default is the original hard-coded path;
# set the FINBERT_MODEL_DIR environment variable to load it from somewhere else.
cl_path = os.environ.get('FINBERT_MODEL_DIR',
                         '/content/drive/MyDrive/classifier_model/finbert-sentiment')

# `from_pt=True` loads PyTorch weights into the TF model; the classifier head has 3 labels
model = TFAutoModelForSequenceClassification.from_pretrained(cl_path, num_labels=3, from_pt=True)

Downloading:   0%|          | 0.00/252 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/758 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [None]:
# from transformers import AutoTokenizer, AutoModelForSequenceClassification
# # load model and tokenizer
# model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
# tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

In [7]:
class AutoModelWrapper(keras.Model):
    """
    Keras model wrapping a transformer classifier so that a forward pass returns
    class probabilities (softmax over the transformer's logits).
    """

    def __init__(self, transformer: keras.Model, **kwargs):
        """
        Constructor.
        
        Parameters
        ----------
        transformer:
            Transformer to be wrapped.
        **kwargs:
            Extra keyword arguments forwarded to `keras.Model.__init__` (e.g. `name`).
        """
        # forward the keyword arguments so that they are honoured instead of silently dropped
        super().__init__(**kwargs)
        self.transformer = transformer

    def call(self, 
             input_ids: Union[np.ndarray, tf.Tensor], 
             attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
             training: bool = False):
        """
        Performs forward pass through the model.
        
        Parameters
        ----------
        input_ids:
            Indices of input sequence tokens in the vocabulary.
        attention_mask:
            Mask to avoid performing attention on padding token indices.
        training:
            Passed through to the wrapped transformer.
        
        Returns
        -------
            Classification probabilities (softmax over the last axis of the logits).
        """
        out = self.transformer(input_ids=input_ids, attention_mask=attention_mask, training=training)
        return tf.nn.softmax(out.logits, axis=-1)
    
    def get_config(self):
        # NOTE(review): the wrapped transformer is not part of the config, so `from_config`
        # ends up calling the constructor without the required `transformer` argument and
        # cannot rebuild the wrapper.
        return {}

    @classmethod
    def from_config(cls, config):
        return cls(**config)

In [8]:
# wrap the loaded transformer so that calling `auto_model` returns softmax probabilities
auto_model = AutoModelWrapper(model)


In [13]:
# NOTE(review): `max_features` is not referenced by any other cell shown here — confirm before removing
max_features = 10000
# sequence length handed to `process_sentences` (padding / truncation target)
max_len = 128

In [None]:
# toy sentences; lower-cased on construction since the model in use is uncased
text_samples = [
    sample.lower()
    for sample in (
        'I love you, I like you',
        'I love you, I like you, but I also kind of dislike you',
        'Everything is so nice about you',
    )
]

# run the transformer's tokenizer and keep the token ids as int32
tokenized_samples = process_sentences(text_samples, tokenizer, max_len)
X_test = tokenized_samples['input_ids'].astype(np.int32)

# only the attention mask is forwarded to the model, and its value has to be a `tf.Tensor`
# see transformers issue #14404: https://github.com/huggingface/transformers/issues/14404
kwargs = {
    name: tf.constant(values)
    for name, values in tokenized_samples.items()
    if name == 'attention_mask'
}

In [None]:
# list the sub-layers of the wrapped transformer (the wrapper's first layer)
auto_model.layers[0].layers

[<transformers.models.bert.modeling_tf_bert.TFBertMainLayer at 0x7f3907c71290>,
 <keras.layers.core.dropout.Dropout at 0x7f3907f1ba10>,
 <keras.layers.core.dense.Dense at 0x7f3907f1bd10>]

In [None]:
# embeddings of the transformer's first sub-layer (a `TFBertMainLayer` in the layer listing);
# this is the layer the attributions are computed for
layer = auto_model.layers[0].layers[0].embeddings

In [None]:
# settings handed to the IntegratedGradients explainer
method = "gausslegendre"
n_steps = 50
internal_batch_size = 5

# explainer bound to the wrapped model and the selected embedding layer
ig = IntegratedGradients(
    auto_model,
    layer=layer,
    method=method,
    n_steps=n_steps,
    internal_batch_size=internal_batch_size,
)

In [None]:
# predicted class index per sample: argmax over the probabilities returned by `auto_model`
predictions = auto_model(X_test, **kwargs).numpy().argmax(axis=1)

# Get the baselines. Note that the baseline contains special characters (e.g., [CLS], [SEP], [UNK], [PAD]) and
# the regular tokens are replaced by the [PAD] token which is a neutral token.
# By including special tokens such as [CLS], [SEP], [UNK], we ensure that the attribution for those tokens
# will be 0 if we use the embedding layer. The 0 attribution is due to integration between [x, x] which is 0.
# `mask` is True where the token id is one of the tokenizer's special ids: those positions keep
# their id, every other position is set to the [PAD] id.
mask = np.isin(X_test, tokenizer.all_special_ids)
baselines = X_test * mask + tokenizer.pad_token_id * (1 - mask)

# get explanation for the predicted class of every sample; the attention mask is passed as forward kwarg
explanation = ig.explain(X_test, 
                         forward_kwargs=kwargs,
                         baselines=baselines, 
                         target=predictions)

In [None]:
# take the first entry of the explainer's attributions and report its shape
attrs = explanation.attributions[0]
print('Attributions shape:', attrs.shape)

Attributions shape: (3, 128, 768)


In [None]:
# Sum the attributions over axis 2 (the size-768 axis in the printed shape) to get one value per token.
# The sum is taken from `explanation` rather than from `attrs` itself, so re-running this cell
# gives the same result instead of failing on an already-reduced 2-D array.
attrs = explanation.attributions[0].sum(axis=2)
print('Attributions shape:', attrs.shape)

Attributions shape: (3, 128)


In [None]:
# pick one sample and render its per-token attributions as highlighted HTML
index = 0
display(X=X_test[index], attrs=attrs[index], pred=predictions[index], tokenizer=tokenizer)

In [9]:
import pandas as pd

In [10]:
# load the dataset from a CSV file expected in the working directory
df=pd.read_csv("dataSpecial.csv", encoding="latin", header=[0])
# text to be explained
text=df['Description']
# presumably the sentiment annotation of each description — TODO confirm;
# `label` is not used by the cells shown here
label=df['Semantic']

In [11]:
# keep the first 10 descriptions and show the first of them
s=text.head(10)
# NOTE(review): `s[0]` looks up index label 0, which is the first row only with the default index
print(s[0])


Finnish Talentum reports its operating profit increased to EUR 20.5 mn in 2005 from EUR 9.3 mn in 2004 , and net sales totaled EUR 103.3 mn , up from EUR 96.4 mn .


In [14]:
# lower-case the selected descriptions before tokenization
s = [sentence.lower() for sentence in s]

# run the transformer's tokenizer and keep the token ids as int32
tokenized_samples = process_sentences(s, tokenizer, max_len)
X_test = tokenized_samples['input_ids'].astype(np.int32)

# only the attention mask is forwarded to the model, and its value has to be a `tf.Tensor`
# see transformers issue #14404: https://github.com/huggingface/transformers/issues/14404
kwargs = {
    name: tf.constant(values)
    for name, values in tokenized_samples.items()
    if name == 'attention_mask'
}

In [15]:
# show the lower-cased sentences that are going to be explained
print(s)

['finnish talentum reports its operating profit increased to eur 20.5 mn in 2005 from eur 9.3 mn in 2004 , and net sales totaled eur 103.3 mn , up from eur 96.4 mn .', 'lifetree was founded in 2000 , and its revenues have risen on an average by 40 % with margins in late 30s .', 'nokia also noted the average selling price of handsets declined during the period , though its mobile phone profit margin rose to more than 22 percent from 13 percent in the year-ago quarter .', 'calls to the switchboard and directory services have decreased significantly since our employees now have up-to-date contact information from all their colleagues and customers on their phone and can place the call directly .', 'earnings per share eps are seen at eur 0.56 , up from eur 0.38 .', 'the growth of net sales has continued favourably in the middle east and africaand in asia pacific .', 'the company slipped to an operating loss of eur 2.6 million from a profit of eur 1.3 million .', "the company 's profit befo

In [16]:
# list the sub-layers of the wrapped transformer (the wrapper's first layer)
auto_model.layers[0].layers

[<transformers.models.bert.modeling_tf_bert.TFBertMainLayer at 0x7fbff2044610>,
 <keras.layers.core.dropout.Dropout at 0x7fbff1a54790>,
 <keras.layers.core.dense.Dense at 0x7fbff1a24050>]

In [17]:
# embeddings of the transformer's first sub-layer (a `TFBertMainLayer` in the layer listing);
# this is the layer the attributions are computed for
layer = auto_model.layers[0].layers[0].embeddings

In [18]:
# settings handed to the IntegratedGradients explainer
method = "gausslegendre"
n_steps = 50
internal_batch_size = 5

# explainer bound to the wrapped model and the selected embedding layer
ig = IntegratedGradients(
    auto_model,
    layer=layer,
    method=method,
    n_steps=n_steps,
    internal_batch_size=internal_batch_size,
)

In [19]:
# predicted class index per sample: argmax over the probabilities returned by `auto_model`
predictions = auto_model(X_test, **kwargs).numpy().argmax(axis=1)

# Get the baselines. Note that the baseline contains special characters (e.g., [CLS], [SEP], [UNK], [PAD]) and
# the regular tokens are replaced by the [PAD] token which is a neutral token.
# By including special tokens such as [CLS], [SEP], [UNK], we ensure that the attribution for those tokens
# will be 0 if we use the embedding layer. The 0 attribution is due to integration between [x, x] which is 0.
# `mask` is True where the token id is one of the tokenizer's special ids: those positions keep
# their id, every other position is set to the [PAD] id.
mask = np.isin(X_test, tokenizer.all_special_ids)
baselines = X_test * mask + tokenizer.pad_token_id * (1 - mask)

# get explanation for the predicted class of every sample; the attention mask is passed as forward kwarg
explanation = ig.explain(X_test, 
                         forward_kwargs=kwargs,
                         baselines=baselines, 
                         target=predictions)

In [20]:
# take the first entry of the explainer's attributions and report its shape
attrs = explanation.attributions[0]
print('Attributions shape:', attrs.shape)

Attributions shape: (10, 128, 768)


In [21]:
# Sum the attributions over axis 2 (the size-768 axis in the printed shape) to get one value per token.
# The sum is taken from `explanation` rather than from `attrs` itself, so re-running this cell
# gives the same result instead of failing on an already-reduced 2-D array.
attrs = explanation.attributions[0].sum(axis=2)
print('Attributions shape:', attrs.shape)

Attributions shape: (10, 128)


In [23]:
# pick one sample and render its per-token attributions as highlighted HTML
# NOTE(review): the model is loaded with num_labels=3 while `display` names only labels 0 and 1 by
# default — confirm that the printed label text matches the classifier's own label order
index = 5
display(X=X_test[index], attrs=attrs[index], pred=predictions[index], tokenizer=tokenizer)

Predicted label =  1: Positive review
