# **Import Libraries**

In [44]:
pip install lime




In [45]:
# for data reading
import json
import pandas as pd
import numpy as np

# for plotting the data
import matplotlib.pyplot as plt
import seaborn as sns

# for preprocessing
import re
import nltk

#for bag-of-words (a method to extract features from text documents)
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing

# for explainer
from lime import lime_text

# for word embedding
import gensim
import gensim.downloader as gensim_api

# for deep learning
from tensorflow.keras import models, layers, preprocessing as kprocessing
from tensorflow.keras import backend as K



# **Data reading**

In [46]:
df = pd.read_csv("../fyp/mbti_full_pull.csv")

In [47]:
df.head()

Unnamed: 0,author_flair_text,body,subreddit
0,INTJ,Knowing you're in INTJ is a tool for you to us...,intj
1,INTJ,You are truly an enlightened mastermind.,intj
2,"INFJ, 26F",You should :) it will help if you have a down ...,infj
3,INTP,I watch a bit of everything (including hentai)...,INTP
4,INTJ,I don't know if I would count this as a pet pe...,intj


In [48]:
df.shape

(1794016, 3)

In [49]:
df.describe(include='O')

Unnamed: 0,author_flair_text,body,subreddit
count,1794016,1793961,1794016
unique,8702,1746610,520
top,INTP,Yes.,INTP
freq,365646,677,419700


In [50]:
df["subreddit"].value_counts()

INTP            419700
intj            296101
mbti            253602
entp            178379
infj            164662
                 ...  
PourPainting         1
Trophies             1
BF_Hardline          1
PKA                  1
mylittlepony         1
Name: subreddit, Length: 520, dtype: int64

In [51]:
df["author_flair_text"].value_counts()

INTP                                     365646
INTJ                                     323224
ENFP                                      88334
ENTP                                      73481
INFJ                                      69730
                                          ...  
[INTj-Ne 5w6] Ask me about my hobbies         1
5w6 SP | ISTP                                 1
M-20-ENTP                                     1
INFP/Leo/Hufflepuff                           1
35M INTP                                      1
Name: author_flair_text, Length: 8702, dtype: int64

In [52]:
# find # of missing value
df.isnull().sum()


author_flair_text     0
body                 55
subreddit             0
dtype: int64

In [53]:
# remove missing values
df = df.dropna().reset_index(drop=True)
df.isnull().sum()

author_flair_text    0
body                 0
subreddit            0
dtype: int64

In [54]:
# rename column that will be used
df = df.rename(columns={"author_flair_text" : "y" , "body" : "text"})
df.head

<bound method NDFrame.head of                  y                                               text  \
0             INTJ  Knowing you're in INTJ is a tool for you to us...   
1             INTJ           You are truly an enlightened mastermind.   
2        INFJ, 26F  You should :) it will help if you have a down ...   
3             INTP  I watch a bit of everything (including hentai)...   
4             INTJ  I don't know if I would count this as a pet pe...   
...            ...                                                ...   
1793956       INTP                                Remind me! 40 hours   
1793957       INTJ  We are seeing the start of a process that is g...   
1793958       INTJ  [NSFDL. I cry laughing at this every single ti...   
1793959       INFJ                                         Ravenclaw!   
1793960       INTP  I struggle massive with focusing, I would love...   

        subreddit  
0            intj  
1            intj  
2            infj  
3            

In [55]:
# drop the column that will not be used
df.drop("subreddit", inplace=True, axis=1)

In [56]:
df.head

<bound method NDFrame.head of                  y                                               text
0             INTJ  Knowing you're in INTJ is a tool for you to us...
1             INTJ           You are truly an enlightened mastermind.
2        INFJ, 26F  You should :) it will help if you have a down ...
3             INTP  I watch a bit of everything (including hentai)...
4             INTJ  I don't know if I would count this as a pet pe...
...            ...                                                ...
1793956       INTP                                Remind me! 40 hours
1793957       INTJ  We are seeing the start of a process that is g...
1793958       INTJ  [NSFDL. I cry laughing at this every single ti...
1793959       INFJ                                         Ravenclaw!
1793960       INTP  I struggle massive with focusing, I would love...

[1793961 rows x 2 columns]>

In [57]:
# Cleaning for y column (converting to uppercase and remove noise)
df["y"] = df["y"].str.upper()
df["y"].value_counts()

INTP                            366505
INTJ                            323717
ENFP                             88438
ENTP                             80836
INFJ                             71676
                                 ...  
AN ENTJ                              1
INFJ |19 | F                         1
INFJ | SURELY | ��                   1
ENFP: IT'S GETTING COLD OUT.         1
35M INTP                             1
Name: y, Length: 8337, dtype: int64

In [58]:
# clear out noise
import re

pattern = "[IE][SN][FT][PJ]"

matches = df["y"].apply(lambda x: re.findall("[IE][SN][FT][PJ]", x)).apply(lambda x: x[0] if x else None)

In [59]:
df["y"] = matches

df.head

<bound method NDFrame.head of             y                                               text
0        INTJ  Knowing you're in INTJ is a tool for you to us...
1        INTJ           You are truly an enlightened mastermind.
2        INFJ  You should :) it will help if you have a down ...
3        INTP  I watch a bit of everything (including hentai)...
4        INTJ  I don't know if I would count this as a pet pe...
...       ...                                                ...
1793956  INTP                                Remind me! 40 hours
1793957  INTJ  We are seeing the start of a process that is g...
1793958  INTJ  [NSFDL. I cry laughing at this every single ti...
1793959  INFJ                                         Ravenclaw!
1793960  INTP  I struggle massive with focusing, I would love...

[1793961 rows x 2 columns]>

In [60]:
df["y"].value_counts()

INTP    474174
INTJ    365450
ENTP    223000
INFJ    206966
INFP    180164
ENFP     98171
ISTP     64965
ESTP     53258
ENTJ     51067
ENFJ     21254
ISTJ     18395
ISFP     11430
ISFJ      7911
ESFP      7730
ESTJ      7232
ESFJ      2794
Name: y, dtype: int64

In [61]:
'''
  Preprocess a string.
  :parameter
    :param text: string - name of column containing text
    :param 1st_stopwords: list - list of stopwords to remove
    :param flg_stemm: bool - whether stemming is to be applied
    :param flg_lemm: bool - whether lemmentisation is to be applied
  :return
    cleaned text
'''
# Cleaning text column

def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, first_stopwords=None):
  # change to lowercase and remove punctuations and characters and then strip
  text = re.sub(r'[^\w\s]', '', str(text).lower().strip())

  # Tokenize (conver from string to list)
  first_text = text.split()

  # remove stopwords
  if first_stopwords is not None:
    first_text = [word for word in first_text if word not in first_stopwords]

  # Stemming (removing -ing, -ly, -ed, .....)
  if flg_stemm == True:
    ps = nltk.stem.porter.PorterStemmer()
    first_text = [ps.stem(word) for word in first_text]

  # Lemmatisation (convert the word into root word)
  if flg_lemm == True:
    lem = nltk.stem.wordnet.WordNetLemmatizer()
    first_text = [lem.lemmatize(word) for word in first_text]

  # back to sring from list
  text = " ".join(first_text)
  return text

In [62]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\win10\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [63]:
first_stopwords = nltk.corpus.stopwords.words("english")

first_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [64]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\win10\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\win10\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [65]:
df["text_clean"] = df["text"].apply(lambda x: utils_preprocess_text(x,flg_stemm = False, flg_lemm = True, first_stopwords = first_stopwords))

In [66]:
df.head()

Unnamed: 0,y,text,text_clean
0,INTJ,Knowing you're in INTJ is a tool for you to us...,knowing youre intj tool use interaction people...
1,INTJ,You are truly an enlightened mastermind.,truly enlightened mastermind
2,INFJ,You should :) it will help if you have a down ...,help moment hobby keep mind busy dont like loo...
3,INTP,I watch a bit of everything (including hentai)...,watch bit everything including hentai tend enj...
4,INTJ,I don't know if I would count this as a pet pe...,dont know would count pet peeze something time...


In [67]:
# split dataset
df_train, df_test = model_selection.train_test_split(df, test_size = 0.3)

#get target
y_train = df_train["y"].values
y_test = df_test["y"].values

In [68]:
nlp = gensim_api.load("word2vec-google-news-300")

In [69]:
corpus = df_train["text_clean"]

# create list of lists of unigrams
first_corpus = []
for string in corpus: 
  first_words = string.split()
  first_grams = [" ".join(first_words[i:i+1]) for i in range (0, len(first_words), 1)]
  first_corpus.append(first_grams)

# detect bigrams and trigrams
bigrams_detector = gensim.models.phrases.Phrases(first_corpus, min_count=5, threshold=10)

trigrams_detector = gensim.models.phrases.Phrases(bigrams_detector[first_corpus], min_count=5, threshold=10)


In [70]:
import multiprocessing

from gensim.models import Word2Vec

In [71]:
# fit w2v
nlp = gensim.models.word2vec.Word2Vec(first_corpus, vector_size = 300, window=8, min_count=1, sg=1, epochs=30)

In [72]:
## tokenize text
tokenizer = kprocessing.text.Tokenizer(lower=True, split=' ', 
                     oov_token="NaN", 
                     filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(first_corpus)
dic_vocabulary = tokenizer.word_index
## create sequence
first_text2seq= tokenizer.texts_to_sequences(first_corpus) 
## padding sequence
X_train = kprocessing.sequence.pad_sequences(first_text2seq, 
                    maxlen=15, padding="post", truncating="post")

In [73]:
i = 0

## list of text: ["I like this", ...]
len_txt = len(df_train["text_clean"].iloc[i].split())
print("from: ", df_train["text_clean"].iloc[i], "| len:", len_txt)

## sequence of token ids: [[1, 2, 3], ...]
len_tokens = len(X_train[i])
print("to: ", X_train[i], "| len:", len(X_train[i]))

## vocabulary: {"I":1, "like":2, "this":3, ...}
print("check: ", df_train["text_clean"].iloc[i].split()[0], 
      " -- idx in vocabulary -->", 
      dic_vocabulary[df_train["text_clean"].iloc[i].split()[0]])

print("vocabulary: ", dict(list(dic_vocabulary.items())[0:5]), "... (padding element, 0)")

from:  im opposite tend get angry openly im comfortable people closest feel brunt rage people handle people closest already skin past wall dont need find way im selfish bastard way thats way little decided protect cant figure change didnt build wall around keep people theyre keep feeling | len: 46
to:  [    3   543   181     8   792  2869     3   470     4  1437    16 16326
  2796     4   800] | len: 15
check:  im  -- idx in vocabulary --> 3
vocabulary:  {'NaN': 1, 'like': 2, 'im': 3, 'people': 4, 'dont': 5} ... (padding element, 0)


In [74]:
corpus = df_test["text_clean"]

## create list of n-grams
first_corpus = []
for string in corpus:
    first_words = string.split()
    first_grams = [" ".join(first_words[i:i+1]) for i in range(0, 
                 len(first_words), 1)]
    first_corpus.append(first_grams)
    
## detect common bigrams and trigrams using the fitted detectors
first_corpus = list(bigrams_detector[first_corpus])
first_corpus = list(trigrams_detector[first_corpus])

## text to sequence with the fitted tokenizer
first_text2seq = tokenizer.texts_to_sequences(first_corpus)

## padding sequence
X_test = kprocessing.sequence.pad_sequences(first_text2seq, maxlen=15,
             padding="post", truncating="post")

In [75]:
## start the matrix (length of vocabulary x vector size) with all 0s
embeddings = np.zeros((len(dic_vocabulary)+1, 300))
for word,idx in dic_vocabulary.items():
    ## update the row with vector
    try:
        embeddings[idx] =  nlp[word]
    ## if word not in model then skip and the row stays all 0s
    except:
        pass

In [76]:
## code attention layer
def attention_layer(inputs, neurons):
    x = layers.Permute((2,1))(inputs)
    x = layers.Dense(neurons, activation="softmax")(x)
    x = layers.Permute((2,1), name="attention")(x)
    x = layers.multiply([inputs, x])
    return x

## input
x_in = layers.Input(shape=(15,))
## embedding
x = layers.Embedding(input_dim=embeddings.shape[0],  
                     output_dim=embeddings.shape[1], 
                     weights=[embeddings],
                     input_length=15, trainable=False)(x_in)
## apply attention
x = attention_layer(x, neurons=15)
## 2 layers of bidirectional lstm
x = layers.Bidirectional(layers.LSTM(units=15, dropout=0.2, 
                         return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(units=15, dropout=0.2))(x)
## final dense layers
x = layers.Dense(64, activation='relu')(x)
y_out = layers.Dense(3, activation='softmax')(x)
## compile
model = models.Model(x_in, y_out)
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 15)]         0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 15, 300)      141800400   ['input_2[0][0]']                
                                                                                                  
 permute_1 (Permute)            (None, 300, 15)      0           ['embedding_1[0][0]']            
                                                                                                  
 dense_3 (Dense)                (None, 300, 15)      240         ['permute_1[0][0]']              
                                                                                            

In [84]:
# encode y

dic_y_mapping = {n:label for n,label in 
                 enumerate(np.unique(y_train))}
inverse_dic = {v:k for k,v in dic_y_mapping.items()}
y_train = np.array([inverse_dic[y] for y in y_train])


In [86]:
y_train.shape

(1255772,)

In [89]:
X_train.shape

(1255772, 15)

In [82]:
# import numpy as np

#X_train = np.expand_dims(X_train, axis=-1)
#this fked up the values

In [83]:
#import numpy as np

X_train = X_train.reshape((X_train.shape[0], 1))

ValueError: cannot reshape array of size 18836580 into shape (1255772,1)

In [85]:
if y_train.dtype == np.int:
    print("y_train is an array of integers")
else:
    print("y_train is not an array of integers")

y_train is an array of integers


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if y_train.dtype == np.int:


In [80]:
if X_train.dtype == np.int:
    print("X_train is an array of integers")
else:
    print("X_train is not an array of integers")

X_train is an array of integers


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if X_train.dtype == np.int:


In [88]:
## train
training = model.fit(x=X_train, y=y_train, batch_size=256, 
                     epochs=10, shuffle=True, verbose=0, 
                     validation_split=0.3)
## plot loss and accuracy
metrics = [k for k in training.history.keys() if ("loss" not in k) and ("val" not in k)]
fig, ax = plt.subplots(nrows=1, ncols=2, sharey=True)
ax[0].set(title="Training")
ax11 = ax[0].twinx()
ax[0].plot(training.history['loss'], color='black')
ax[0].set_xlabel('Epochs')
ax[0].set_ylabel('Loss', color='black')
for metric in metrics:
    ax11.plot(training.history[metric], label=metric)
ax11.set_ylabel("Score", color='steelblue')
ax11.legend()
ax[1].set(title="Validation")
ax22 = ax[1].twinx()
ax[1].plot(training.history['val_loss'], color='black')
ax[1].set_xlabel('Epochs')
ax[1].set_ylabel('Loss', color='black')
for metric in metrics:
     ax22.plot(training.history['val_'+metric], label=metric)
ax22.set_ylabel("Score", color="steelblue")
plt.show()


InvalidArgumentError: Graph execution error:

Detected at node 'sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits' defined at (most recent call last):
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
      app.launch_new_instance()
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
      app.start()
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\kernelapp.py", line 677, in start
      self.io_loop.start()
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\asyncio\base_events.py", line 596, in run_forever
      self._run_once()
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\asyncio\base_events.py", line 1890, in _run_once
      handle._run()
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\kernelbase.py", line 457, in dispatch_queue
      await self.process_one()
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\kernelbase.py", line 446, in process_one
      await dispatch(*args)
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\kernelbase.py", line 353, in dispatch_shell
      await result
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\kernelbase.py", line 648, in execute_request
      reply_content = await reply_content
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\ipkernel.py", line 353, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\zmqshell.py", line 533, in run_cell
      return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\interactiveshell.py", line 2901, in run_cell
      result = self._run_cell(
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\interactiveshell.py", line 2947, in _run_cell
      return runner(coro)
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\async_helpers.py", line 68, in _pseudo_sync_runner
      coro.send(None)
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\interactiveshell.py", line 3172, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\interactiveshell.py", line 3364, in run_ast_nodes
      if (await self.run_code(code, result,  async_=asy)):
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\interactiveshell.py", line 3444, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\win10\AppData\Local\Temp/ipykernel_4532/2442711333.py", line 2, in <module>
      training = model.fit(x=X_train, y=y_train, batch_size=256,
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1564, in fit
      tmp_logs = self.train_function(iterator)
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1160, in train_function
      return step_function(self, iterator)
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1146, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1135, in run_step
      outputs = model.train_step(data)
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 994, in train_step
      loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1052, in compute_loss
      return self.compiled_loss(
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\compile_utils.py", line 265, in __call__
      loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\losses.py", line 152, in __call__
      losses = call_fn(y_true, y_pred)
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\losses.py", line 272, in call
      return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\losses.py", line 2084, in sparse_categorical_crossentropy
      return backend.sparse_categorical_crossentropy(
    File "C:\Users\win10\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\backend.py", line 5630, in sparse_categorical_crossentropy
      res = tf.nn.sparse_softmax_cross_entropy_with_logits(
Node: 'sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits'
Received a label value of 15 which is outside the valid range of [0, 3).  Label values: 2 0 15 10 8 11 10 11 11 11 11 11 8 9 10 11 1 8 8 8 9 10 10 11 10 8 11 8 11 11 11 11 9 1 8 10 11 7 10 11 11 11 10 9 10 11 10 9 8 15 8 3 10 2 10 10 8 8 11 10 10 10 11 10 12 7 3 9 14 0 8 9 2 2 3 10 11 2 3 10 11 3 1 3 10 11 3 8 10 10 3 8 3 10 8 8 1 10 3 3 3 11 3 7 10 7 11 11 3 15 7 3 15 3 11 10 11 11 15 9 11 3 1 15 8 1 3 11 9 10 1 6 11 9 10 1 11 11 11 3 11 8 3 15 11 11 8 10 10 11 1 14 11 10 9 11 10 0 11 11 8 2 14 10 7 3 11 10 2 8 3 11 2 8 9 10 7 7 10 11 15 8 11 11 8 8 9 3 8 9 11 8 11 8 3 11 11 0 14 11 15 11 11 8 11 10 3 3 11 7 1 11 8 8 14 10 8 9 9 10 10 8 10 11 3 3 11 10 3 1 3 10 8 3 11 1 7 7 1 11 11 2 11 15 9 0 10 1 9 3 8 11 14 11 3 11
	 [[{{node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits}}]] [Op:__inference_train_function_13358]

In [None]:
## test
predicted_prob = model.predict(X_test)
predicted = [dic_y_mapping[np.argmax(pred)] for pred in 
             predicted_prob]

In [None]:
## select observation
i = 0
txt_instance = df_test["text"].iloc[i]
## check true value and predicted value
print("True:", y_test[i], "--> Pred:", predicted[i], "| Prob:", round(np.max(predicted_prob[i]),2))

## show explanation
### 1. preprocess input
first_corpus = []
for string in [re.sub(r'[^\w\s]','', txt_instance.lower().strip())]:
    first_words = string.split()
    first_grams = [" ".join(first_words[i:i+1]) for i in range(0, 
                 len(first_words), 1)]
    first_corpus.append(first_grams)
first_corpus = list(bigrams_detector[first_corpus])
first_corpus = list(trigrams_detector[first_corpus])
X_instance = kprocessing.sequence.pad_sequences(
              tokenizer.texts_to_sequences(corpus), maxlen=15, 
              padding="post", truncating="post")
### 2. get attention weights
layer = [layer for layer in model.layers if "attention" in 
         layer.name][0]
func = K.function([model.input], [layer.output])
weights = func(X_instance)[0]
weights = np.mean(weights, axis=2).flatten()
### 3. rescale weights, remove null vector, map word-weight
weights = preprocessing.MinMaxScaler(feature_range=(0,1)).fit_transform(np.array(weights).reshape(-1,1)).reshape(-1)
weights = [weights[n] for n,idx in enumerate(X_instance[0]) if idx 
           != 0]
dic_word_weigth = {word:weights[n] for n,word in 
                   enumerate(first_corpus[0]) if word in 
                   tokenizer.word_index.keys()}
### 4. barplot
if len(dic_word_weigth) > 0:
   df = pd.DataFrame.from_dict(dic_word_weigth, orient='index', 
                                columns=["score"])
   df.sort_values(by="score", 
           ascending=True).tail(3).plot(kind="barh", 
           legend=False).grid(axis='x')
   plt.show()
else:
   print("--- No word recognized ---")
### 5. produce html visualization
text = []
for word in first_corpus[0]:
    weight = dic_word_weigth.get(word)
    if weight is not None:
         text.append('<b><span style="background-color:rgba(100,149,237,' + str(weight) + ');">' + word + '</span></b>')
    else:
         text.append(word)
text = ' '.join(text)
### 6. visualize on notebook
print("\033[1m"+"Text with highlighted words")
from IPython.core.display import display, HTML
display(HTML(text))