In [1]:
# Imports
import numpy as np
import pandas as pd
from typing import List
from collections import OrderedDict

import torch
from torch.jit import RecursiveScriptModule
from torch.nn.functional import sigmoid


# other libraries
from typing import Final

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# own modules
from src.model_utils import set_seed
from src.model_utils import load_model
from src.model_utils import predict_single_text
from torch.nn.utils.rnn import pad_sequence
from src.model_utils import load_w2v_model

import shap

%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# static variables
DATA_PATH: Final[str] = "NLP_Data/data"
MODEL_TYPE: Final[str] = "IMDB"  # "TweepFake"

# set device
device = torch.device(
    "cuda") if torch.cuda.is_available() else torch.device("cpu")
set_seed(42)

In [3]:
# load the model
if MODEL_TYPE == "IMDB":
    model: RecursiveScriptModule = load_model("IMDB_best_model")
else:
    model: RecursiveScriptModule = load_model("best_model")
    
# Load the w2v model
w2vec_model = load_w2v_model()


Explain the model with SHAP:

In [4]:
if MODEL_TYPE == "IMDB":
    file_path = DATA_PATH + '/test.txt'
    data: pd.DataFrame = pd.read_csv(file_path, sep='\t', header=None)
    data.columns = ['text', 'tag']

else:
    file_path = DATA_PATH + '/test.csv'
    data: pd.DataFrame = pd.read_csv(file_path)

    # replace the target column with a binary representation
    data['tag'] = data['account.type'].replace('human', 0)
    data['tag'] = data['tag'].replace('bot', 1)
    # Only keep columns text and tag
    data = data[['text', 'tag']]

print(data.head())

                                                text  tag
0  I first saw The Buddy Holly Story when I was a...    1
1  There were so many things wrong with this movi...    0
2  There's a unique place in the pantheon of John...    1
3  It kicks you in the stomach. There are other f...    1
4  To start, I'm not a person to rate movies that...    0


In [5]:
# Test the model with a single text
text = data['text'][2]
print(text)
predicted = predict_single_text(text, model, device)
print(f"Predicted: {predicted}, Real: {data['tag'][0]}")

There's a unique place in the pantheon of John Ford films for Wagonmaster, Sergeant Rutledge, and The Sun Shines Bright. It was these three films with no box office names in them that Ford didn't have to tailor the film around the persona of a star being it John Wayne, Henry Fonda, or any of the others he worked with. Not surprising that Ford considered all these as favorites of one kind or another. <br /><br />Ben Johnson and Harry Carey, Jr. a couple of likable cowpokes sign on to guide a Mormon wagon train to a valley in Arizona territory. Along the way they are joined first by a group stranded players from a medicine show and then by a family of outlaws on the run named Clegg. Their stories merge and what happens is the basis of the film's plot.<br /><br />Had Wagonmaster been done even 10 years earlier on the strength of the two performances turned in by Johnson and Carey, both probably would have had substantial careers as B picture cowboys. In the case of Johnson it would have b

  return forward_call(*args, **kwargs)


In [13]:
# SHAP explects a pipeline that returns something like this:
"""
[[{'label': 'NEGATIVE', 'score': 0.0012035118415951729},
  {'label': 'POSITIVE', 'score': 0.9987965226173401}],
 [{'label': 'NEGATIVE', 'score': 0.002218781039118767},
  {'label': 'POSITIVE', 'score': 0.9977812170982361}]]
"""
def classifier_fn(tokenized_texts: List[List[int]]) -> int: 
    print(tokenized_texts)
    result = []
    for tokenized_text in tokenized_texts:
      text_padded = pad_sequence([torch.tensor(tokenized_text)], batch_first=True)
      length = torch.tensor([len(tokenized_text)])
      if length == 0:
          return 0
      #Send to device
      text_padded = text_padded.to(device)
      prediction = model(text_padded, length)
      
      prediction = sigmoid(prediction).item()
      
      result.append([{'label': 'NEGATIVE', 'score': 1-prediction},
                {'label': 'POSITIVE', 'score': prediction}])
    
    result = np.array(result)
    return result

In [11]:
import src.RNNModelTrain.data as data_utils

class Tokenizer:
    def __init__(self):
        pass

    def __call__(self, _: any, text: str) -> List[torch.Tensor]:
        if MODEL_TYPE == "IMDB":
            text = data_utils.tokenize_sentence(text)
        else:
            text = data_utils.tokenize_tweet(text)
        # Convert the reviews to bag of words representation
        texts_idx: List[torch.Tensor] = data_utils.word2idx(w2vec_model, text)
        return [texts_idx]
    
    def shape(self, text: str) -> torch.Size:
        len_in_tokens = len(data_utils.tokenize_sentence(text))
        return (1, len_in_tokens)
        

In [14]:
# Create a SHAP model explainer
explainer = shap.Explainer(classifier_fn, Tokenizer())

# Explain model predictions on 5 examples
data_selected = data['text'][2:7]
# Data to np
data_np = np.array(data_selected)
print(data_np)
shap_values = explainer(data_np, max_evals=2000)

# Visualize data
shap.plots.text(shap_values[:, :, "POSITIVE"])


["There's a unique place in the pantheon of John Ford films for Wagonmaster, Sergeant Rutledge, and The Sun Shines Bright. It was these three films with no box office names in them that Ford didn't have to tailor the film around the persona of a star being it John Wayne, Henry Fonda, or any of the others he worked with. Not surprising that Ford considered all these as favorites of one kind or another. <br /><br />Ben Johnson and Harry Carey, Jr. a couple of likable cowpokes sign on to guide a Mormon wagon train to a valley in Arizona territory. Along the way they are joined first by a group stranded players from a medicine show and then by a family of outlaws on the run named Clegg. Their stories merge and what happens is the basis of the film's plot.<br /><br />Had Wagonmaster been done even 10 years earlier on the strength of the two performances turned in by Johnson and Carey, both probably would have had substantial careers as B picture cowboys. In the case of Johnson it would have

  return forward_call(*args, **kwargs)


Args being passed to masker ("There's a unique place in the pantheon of John Ford films for Wagonmaster, Sergeant Rutledge, and The Sun Shines Bright. It was these three films with no box office names in them that Ford didn't have to tailor the film around the persona of a star being it John Wayne, Henry Fonda, or any of the others he worked with. Not surprising that Ford considered all these as favorites of one kind or another. <br /><br />Ben Johnson and Harry Carey, Jr. a couple of likable cowpokes sign on to guide a Mormon wagon train to a valley in Arizona territory. Along the way they are joined first by a group stranded players from a medicine show and then by a family of outlaws on the run named Clegg. Their stories merge and what happens is the basis of the film's plot.<br /><br />Had Wagonmaster been done even 10 years earlier on the strength of the two performances turned in by Johnson and Carey, both probably would have had substantial careers as B picture cowboys. In the c

TypingError: Failed in nopython mode pipeline (step: nopython frontend)
[1m[1mnon-precise type array(pyobject, 2d, C)[0m
[0m[1mDuring: typing of argument at c:\Users\pablo\Desktop\IMAT\Tercero\NaturalLanguageProcessing\rnn_explainability\env\lib\site-packages\shap\utils\_masked_model.py (396)[0m
[1m
File "env\lib\site-packages\shap\utils\_masked_model.py", line 396:[0m
[1mdef _build_fixed_single_output(averaged_outs, last_outs, outputs, batch_positions, varying_rows, num_varying_rows, link, linearizing_weights):
    <source elided>

[1m@njit
[0m[1m^[0m[0m
