# Imports and Setup

In [1]:
!pip install faiss-gpu
!pip install tensorflow-hub
!pip install tensorflow-datasets
!pip install tensorflow-text


import faiss
from scipy.linalg import get_lapack_funcs
import warnings

import numpy as np
from matplotlib import pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import load_model
import os

import pandas as pd
import pickle
from google.colab import drive

import tensorflow_hub as hub
import tensorflow_datasets as tfds
import tensorflow_text as text

%matplotlib inline

# Mount Google Drive (Colab only) so that the '/content/drive/My Drive/...'
# paths used by later cells resolve.
drive.mount('/content/drive')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[K     |████████████████████████████████| 85.5 MB 85 kB/s 
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-text
  Downloading tensorflow_text-2.9.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
[K     |████████████████████████████████| 4.6 MB 8.2 MB/s 
Collecting tensorflow<2.10,>=2.9.0
  Downloading tensorflow-2.9.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (511.7 MB)
[K     |█████

# Setup Dataset and Models

Using IMDb dataset to compare LSTMs and BERT

In [2]:
## Load dataset from tensorflow_datasets

# Three splits are requested: the first 90% of the 'train' split, the
# remaining 10% of 'train' (used as validation data), and the 'test' split.
# as_supervised=True is requested so that each element is a pair whose first
# item is the review text and whose second item is the label.
imdb_train_data, imdb_validation_data, imdb_test_data = tfds.load(
    name="imdb_reviews", 
    split=('train[:90%]', 'train[90%:]', 'test'),
    as_supervised=True)

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]





0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteOMCVT0/imdb_reviews-train.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteOMCVT0/imdb_reviews-test.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteOMCVT0/imdb_reviews-unsupervised.tfrecord


  0%|          | 0/50000 [00:00<?, ? examples/s]



[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [3]:
## Convert dataset from EagerTensor to List

def _dataset_to_lists(dataset):
  """Materialise a supervised text dataset into two plain Python lists.

  dataset: object exposing as_numpy_iterator() that yields (text, label)
    pairs, where text is a UTF-8 encoded bytes object.
  Returns (texts, labels): decoded review strings and their labels, in
    iteration order.
  """
  texts = []
  labels = []
  for raw_text, label in dataset.as_numpy_iterator():
    texts.append(raw_text.decode('utf-8'))
    labels.append(label)
  return texts, labels


# One helper call per split replaces three copy-pasted loops (and their
# unused `i = 0` counters).
imdb_X_test_raw, imdb_y_test = _dataset_to_lists(imdb_test_data)
imdb_X_train_raw, imdb_y_train = _dataset_to_lists(imdb_train_data)
imdb_X_val_raw, imdb_y_val = _dataset_to_lists(imdb_validation_data)

In [4]:
## Function that tokenizes the input texts and builds a GloVe embedding matrix for their vocabulary

# Location of the pre-trained GloVe vectors on the mounted Drive
# (file name indicates the Twitter-trained, 50-dimensional variant).
embeddings_file = '/content/drive/My Drive/CAIS Project/glove.twitter.27B.50d.txt'

import os
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Width of one embedding vector; has to agree with the vectors stored in
# embeddings_file ("50d").
embedding_dimension = 50

def preprocess_text(tweets, embedding_address, embedding_dim=None):
  """Tokenize texts and build an embedding matrix for their vocabulary.

  tweets: iterable of strings to tokenize (any texts, not only tweets).
  embedding_address: path of a GloVe-style text file where every line is a
    word followed by its whitespace-separated vector components.
  embedding_dim: width of the embedding vectors. Defaults to the module-level
    `embedding_dimension` so existing two-argument calls behave as before.

  Returns (X_train, word_index, embedding_matrix):
    X_train: the padded integer sequences produced by pad_sequences.
    word_index: the tokenizer's word -> integer index mapping.
    embedding_matrix: array of shape (len(word_index) + 1, embedding_dim);
      row i holds the vector of the word with index i, or zeros when the
      word does not occur in the embedding file.
  """
  if embedding_dim is None:
    embedding_dim = embedding_dimension

  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(tweets)
  sequences = tokenizer.texts_to_sequences(tweets)
  word_index = tokenizer.word_index
  X_train = pad_sequences(sequences)

  # Only vectors of words that occur in the corpus are kept; the rest of the
  # embedding file would never be looked up. The `with` block guarantees the
  # file is closed even if a line fails to parse.
  embeddings_index = {}
  with open(embedding_address, 'rb') as f:
    for line in f:
      values = line.split()
      if not values:
        continue  # skip blank lines instead of failing on values[0]
      word = values[0].decode('UTF-8')
      if word in word_index:
        embeddings_index[word] = np.asarray(values[1:], dtype='float32')

  # Row 0 stays all-zero: the matrix has one more row than there are words.
  num_words = len(word_index)+1
  embedding_matrix = np.zeros((num_words, embedding_dim))

  for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
      embedding_matrix[i] = embedding_vector

  return X_train, word_index, embedding_matrix

In [5]:
## Generate word embeddings for LSTM training
# Combining train, test and val data for processing through embeddings
# Pool the three IMDb splits (train, then test, then validation) so a single
# tokenizer / embedding matrix covers every review.
all_X_vals = imdb_X_train_raw + imdb_X_test_raw + imdb_X_val_raw
all_y_vals = imdb_y_train + imdb_y_test + imdb_y_val


LSTM_all_X, word_index, embedding_matrix = preprocess_text(all_X_vals, embeddings_file)


#Split the data into train, test and val
from sklearn.model_selection import train_test_split

# First cut: half of the pooled data for training, half held out.
LSTM_X_train, LSTM_X_test_val, LSTM_y_train, LSTM_y_test_val = train_test_split(
    LSTM_all_X, all_y_vals, test_size=0.50, random_state=42)
# Second cut: a small validation slice is carved out of the held-out half.
LSTM_X_test, LSTM_X_val, LSTM_y_test, LSTM_y_val = train_test_split(
    LSTM_X_test_val, LSTM_y_test_val, test_size=0.0032, random_state=42)

# The same two cuts (same sizes, same random_state) applied to the raw
# strings, so the raw texts are partitioned the same way as the sequences.
LSTM_X_train_raw, LSTM_X_test_val_raw, LSTM_y_train_raw, LSTM_y_test_val_raw = train_test_split(
    all_X_vals, all_y_vals, test_size=0.50, random_state=42)
LSTM_X_test_raw, LSTM_X_val_raw, LSTM_y_test, LSTM_y_val = train_test_split(
    LSTM_X_test_val_raw, LSTM_y_test_val_raw, test_size=0.0032, random_state=42)

# Convert to a compatible format
LSTM_y_train = np.array(LSTM_y_train)
LSTM_y_test = np.array(LSTM_y_test)
LSTM_y_val = np.array(LSTM_y_val)

In [None]:
## Setup Train LSTM Model

# Keras imports

# All layers come from the public `keras.layers` namespace. The internal
# module paths used before (keras.layers.merge / core / recurrent / wrappers /
# convolutional) are not part of the public API and are not importable in
# every Keras release.
from keras.models import Sequential
from keras.layers import Embedding, Input
from keras.layers import Concatenate
from keras.layers import Dense, Activation, Flatten
from keras.layers import Dropout, concatenate
from keras.layers import LSTM
from keras.layers import Bidirectional
from keras.layers import Conv1D, MaxPooling1D
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard
from keras import metrics
from keras.models import Model

#Setup and compile model
model = Sequential()

# Frozen embedding layer initialised from the pre-computed embedding matrix.
model.add(Embedding(len(word_index) + 1,
                    embedding_dimension,
                    weights=[embedding_matrix],
                    input_length=LSTM_X_train.shape[1],
                    trainable=False))

model.add(LSTM(64, return_sequences = True, activation='tanh'))
model.add(Dropout(.2))

model.add(LSTM(64, activation='tanh'))
model.add(Dropout(.2))

# 32-unit layer: once the final Dense(1) layer is popped, the model's output
# is this layer's activation passed through the following Dropout layer.
model.add(Dense(32, activation='relu'))
model.add(Dropout(.2))

model.add(Dense(1, activation = 'sigmoid'))

# summary() prints by itself and returns None, so it is not wrapped in print().
model.summary()

#Use binary crossentropy and adam
LOSS = 'binary_crossentropy' # because we're classifying between 0 and 1
OPTIMIZER = 'Adam' #Commonly used in NLP

model.compile(loss = LOSS, optimizer = OPTIMIZER, metrics = [metrics.binary_accuracy])


#Train model
EPOCHS = 5
BATCH = 250
VAL = (LSTM_X_val, LSTM_y_val)
# VAL was previously built but never used; passing it reports validation
# loss/accuracy after every epoch.
model.fit(LSTM_X_train, LSTM_y_train, epochs=EPOCHS, batch_size=BATCH, validation_data=VAL)

model.save("/content/drive/My Drive/CURVE/IMDB_LSTM")

In [6]:
## Load model (added after training was completed)

# Restores the LSTM classifier from the path the training cell saves to.
# NOTE: this rebinds `model`; a later cell calls model.pop() on it, so re-run
# this cell before re-running anything that needs the unmodified classifier.
model = keras.models.load_model("/content/drive/My Drive/CURVE/IMDB_LSTM")

## Load BERT from TF-hub
# bert_preprocess_model turns raw strings into the inputs that `bert` is
# called with (see the bert_*_pipeline functions).
bert_preprocess_model = hub.load('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')
bert = hub.load('https://tfhub.dev/google/experts/bert/wiki_books/sst2/2')

In [7]:
## Load BERT embeddings of dataset

# Note: the embeddings were generated using the BERT Sentiment Classification expert model 
# from https://tfhub.dev/google/experts/bert/wiki_books/sst2/2

# Pre-computed embeddings are read from Drive; the code that produced these
# .npy files is not part of this notebook.
BERT_train_transformed = np.load("/content/drive/My Drive/CURVE/BERT/X_train_bert.npy")
BERT_test_transformed = np.load("/content/drive/My Drive/CURVE/BERT/X_test_bert.npy")
# NOTE(review): file name lacks the "_bert" suffix used by the two files
# above - confirm this is the intended validation-embedding file.
BERT_val_transformed = np.load("/content/drive/My Drive/CURVE/BERT/X_val.npy")

In [28]:
## Load LSTM Feature Predictions

# Cached classifier outputs for the LSTM train/test sets. These are later
# thresholded at 0.5 by prob_to_pred.
# NOTE(review): presumably saved from model.predict(...) before the final
# layer was popped - the saving code is not in this notebook; confirm.
LSTM_train_pred = np.load("/content/drive/My Drive/CURVE/LSTM_NNK/LSTM_pred_train.npy")
LSTM_test_pred = np.load("/content/drive/My Drive/CURVE/LSTM_NNK/LSTM_pred_test.npy")

In [13]:
# Removes the last layer of `model` in place so that predict() returns
# feature vectors instead of class probabilities.
# WARNING: not idempotent - every execution of this cell removes one more
# layer. Reload the model before re-running it.
model.pop()

In [18]:
# Feature vectors for the NNK graph. These are only feature vectors (rather
# than class probabilities) if model.pop() has already been executed on
# `model`.
LSTM_X_train_trans = model.predict(LSTM_X_train)
LSTM_X_test_trans = model.predict(LSTM_X_test)



# Generate Non-Negative Kernel Regression (NNK) Graph

In [10]:
### NNK Code and helper Functions ###

def non_negative_qpsolver(A, b, x_init, x_tol, check_tol=-1, epsilon_low=-1, epsilon_high=-1):
    """
    Iteratively solves (1/2)x.T A x - b.T x, pruning entries that fall below a threshold.

    :param A: square matrix of the quadratic term; only A.shape[0] and row/column sub-selection are used
    :param b: vector of the linear term
    :param x_init: Initial value for solution x (reshaped to a column of length A.shape[0])
    :param x_tol: Smallest allowed non zero value for x_opt. Values below x_tol are made zero
    :param check_tol: Allowed tolerance for stopping criteria. If negative, uses x_tol value
    :param epsilon_low: minimum value of x during optimization. If negative, uses x_tol value
    :param epsilon_high: margin below 1 above which an initial entry short-circuits the solve.
        If negative, uses x_tol value
    :return: x_opt as a 1-D array (no error value is returned)
    """
    # Negative arguments are sentinels meaning "fall back to x_tol".
    epsilon_low = x_tol if epsilon_low < 0 else epsilon_low
    epsilon_high = x_tol if epsilon_high < 0 else epsilon_high
    check_tol = x_tol if check_tol < 0 else check_tol

    size = A.shape[0]
    iteration_cap = 50 * size

    solution = np.reshape(x_init, (size, 1))

    # Similarity too close to 1 (nodes collapse): keep only those entries and stop.
    collapsed = 1.0 * (solution > (1 - epsilon_high))
    if np.sum(collapsed) > 0:
        return (solution * collapsed)[:, 0]

    active = solution > epsilon_low  # entries still taking part in the solve
    residual = 1
    iteration = 0
    while (residual > check_tol) and (iteration < iteration_cap):
        rows = active[:, 0]
        # Solve the system restricted to the active entries; pruned entries stay zero.
        candidate = np.zeros((size, 1))
        candidate[active] = cholesky_solver(A[rows, :][:, rows], b[rows], tol=x_tol)
        solution = candidate
        iteration += 1

        # The largest magnitude among entries below the threshold drives the stopping test.
        below = solution < epsilon_low
        residual = np.max(np.abs(solution[below])) if np.sum(below) > 0 else 0
        # Once pruned, an entry never becomes active again.
        active = np.logical_and(solution > epsilon_low, active)

    solution[solution < x_tol] = 0
    return solution[:, 0]

def cholesky_solver(a, b, tol=1e-10, lower=False, overwrite_a=False, overwrite_b=False, clean=True):
    """Solve a x = b with LAPACK's potrf/potrs pair (modified code from SciPy LinAlg routine).

    :param a: coefficient matrix, promoted to at least 2-D
    :param b: right-hand side
    :param tol: value added to the diagonal of the factor when potrf reports info > 0
    :param lower, overwrite_a, overwrite_b, clean: forwarded to the LAPACK wrappers
    :return: the solution reported by potrs; `b` itself when `a` is empty
    """
    matrix = np.atleast_2d(a)
    if matrix.size == 0:
        # Quick return for square empty array
        return b

    (factorize,) = get_lapack_funcs(('potrf',), (matrix,))
    factor, status = factorize(matrix, lower=lower, overwrite_a=overwrite_a, clean=clean)

    if status > 0:
        # Factorization did not complete: warn and nudge the factor's diagonal by tol.
        warnings.warn("Cholesky solver encountered positive semi-definite matrix -- possible duplicates in data")
        factor = factor + tol*np.eye(b.size)

    (back_substitute,) = get_lapack_funcs(('potrs',), (factor, b))
    solution, _ = back_substitute(factor, b, lower=lower, overwrite_b=overwrite_b)
    return solution


#%%
def nnk_neighbors(train_features, queries, top_k=50, use_gpu=False):
    """
    Select NNK neighbors and weights for every query.

    Both inputs are L2-normalised row-wise, the top_k training rows with the
    largest inner product are retrieved with a faiss IndexFlatIP, and
    non_negative_qpsolver is run on the kernel 0.5 + 0.5 * inner product.

    train_features: shape [n_train, d] Feature vectors of available dataset
    queries: shape [n_queries, d] Query feature vectors for which neighbors are to be selected
    top_k: Maximum number of neighbors to select
    use_gpu: Boolean flag to signal use of GPU for neighbor search
    Returns (weight_values, indices): weight_values has shape [n_queries, top_k]
    and holds the solver output divided by the sum of its non-zero entries;
    indices is the neighbor index array returned by the faiss search.
    """
    unit_train = train_features / np.linalg.norm(train_features, axis=1, keepdims=True)
    unit_queries = queries / np.linalg.norm(queries, axis=1, keepdims=True)
    n_queries = queries.shape[0]

    index = faiss.IndexFlatIP(train_features.shape[1])
    if use_gpu:
        index = faiss.index_cpu_to_all_gpus(index)
    index.add(unit_train)
    similarities, indices = index.search(unit_queries, top_k)

    weight_values = np.zeros((n_queries, top_k))
    for row in range(n_queries):
        support = unit_train[indices[row, :]]
        # Kernel values: query vs. neighbors (g_i) and neighbors vs. neighbors (G_i).
        g_i = 0.5 + 0.5*similarities[row, :]
        G_i = 0.5 + 0.5*np.dot(support, support.T)
        x_opt = non_negative_qpsolver(G_i, g_i, g_i, x_tol=1e-10)
        # Rescale by the sum of the non-zero weights.
        x_opt = x_opt / np.sum(x_opt[np.nonzero(x_opt)])
        weight_values[row, :] = x_opt
        if row % 10000 == 0:
            print(f"{row}/{n_queries} processed...")

    return weight_values, indices

In [None]:
## Get classifications from LSTM Model (for accuracy comparisons)

# Order matters in this cell: the two predictions below are class
# probabilities only while `model` still has its final layer.
# NOTE(review): another cell also calls model.pop(); if that one ran first,
# these two lines return feature vectors instead - reload the model before
# running this cell.
LSTM_train_pred = model.predict(LSTM_X_train)
LSTM_test_pred = model.predict(LSTM_X_test)

# Pop final layer to get feature vectors

# In-place and non-idempotent: each execution removes one more layer.
model.pop()

LSTM_X_train_trans = model.predict(LSTM_X_train)
LSTM_X_test_trans = model.predict(LSTM_X_test)

In [19]:
## Generate neighbors and weights for BERT
# Inputs are cast to float32 before the search. Train embeddings are the
# neighbor pool and test embeddings are the queries; top_k keeps its default
# of 50, so each weights/indices row has 50 columns.
BERT_weights, BERT_indicies = nnk_neighbors(np.float32(BERT_train_transformed), np.float32(BERT_test_transformed))

## Get NNK data for LSTM
LSTM_weights, LSTM_indicies = nnk_neighbors(np.float32(LSTM_X_train_trans), np.float32(LSTM_X_test_trans)) 

0/25000 processed...




10000/25000 processed...
20000/25000 processed...
0/24920 processed...
10000/24920 processed...
20000/24920 processed...


# Analyze NNK Graph

In [20]:
## Find average number of NNK matches for LSTM
# Count the non-zero NNK weights in one vectorised call. This avoids rebinding
# the builtin `sum` at notebook scope, drops the hard-coded 50 columns and
# replaces the nested Python loop. Only the rows that correspond to
# LSTM_X_test_raw are counted, as before.
lstm_nnk_neighbor_total = np.count_nonzero(LSTM_weights[:len(LSTM_X_test_raw)])

print(lstm_nnk_neighbor_total/len(LSTM_X_test_raw))

5.0115569823434996


In [None]:
## Find average number of NNK Matches for BERT
# Count the non-zero NNK weights in one vectorised call. This avoids rebinding
# the builtin `sum` at notebook scope, drops the hard-coded 50 columns and
# replaces the nested Python loop. Only the rows that correspond to
# imdb_X_test_raw are counted, as before.
bert_nnk_neighbor_total = np.count_nonzero(BERT_weights[:len(imdb_X_test_raw)])

print(bert_nnk_neighbor_total/len(imdb_X_test_raw))

15.48268


In [21]:
## Find the range of NNK neighbors for LSTM

# Number of non-zero NNK weights for every test point (one count per row).
# Computed with NumPy so the builtins `min` and `max` are no longer rebound
# at notebook scope.
lstm_neighbor_counts = np.count_nonzero(LSTM_weights, axis=1)

# argmax / argmin return the first index on ties, matching the strict
# comparisons of the loop they replace.
lstm_max_index = int(np.argmax(lstm_neighbor_counts))
lstm_min_index = int(np.argmin(lstm_neighbor_counts))

print("Max: ", int(lstm_neighbor_counts[lstm_max_index]), "at index: ", lstm_max_index)    
print("Min:", int(lstm_neighbor_counts[lstm_min_index]), "at index:", lstm_min_index)

Max:  12 at index:  1925
Min: 1 at index: 335


In [None]:
## Find the range of NNK neighbors for BERT

# Number of non-zero NNK weights for every test point (one count per row).
# Computed with NumPy so the builtins `min` and `max` are no longer rebound
# at notebook scope.
bert_neighbor_counts = np.count_nonzero(BERT_weights, axis=1)

# argmax / argmin return the first index on ties, matching the strict
# comparisons of the loop they replace.
bert_max_index = int(np.argmax(bert_neighbor_counts))
bert_min_index = int(np.argmin(bert_neighbor_counts))

print("Max: ", int(bert_neighbor_counts[bert_max_index]), "at index: ", bert_max_index)    
print("Min:", int(bert_neighbor_counts[bert_min_index]), "at index:", bert_min_index)

Max:  31 at index:  17957
Min: 1 at index: 696


# Classify Using KNN and NNK

In [48]:
## Create a BERT classifier that uses KNN, NNK and Weighted NNK 

def bert_classify(idx, indicies, weights, train_labels):
  """Classify test point `idx` from its neighbors in three ways.

  idx: row of `indicies` / `weights` to classify.
  indicies: per test point, the indices of its neighbors in train_labels.
  weights: per test point, the NNK weight of each neighbor (0 = not an NNK
    neighbor); aligned with `indicies`.
  train_labels: 0/1 labels of the training points.

  Returns (labels_class, nnk_labels_class, nnk_sum_class):
    labels_class: KNN majority vote - 1 when at least half of all neighbors
      have label 1 (for 50 neighbors this is the former `>= 25` rule).
    nnk_labels_class: weighted NNK vote - 1 when the weights of the label-1
      neighbors add up to at least 0.5.
    nnk_sum_class: unweighted NNK vote - 1 when at least half of the
      neighbors with non-zero weight have label 1; 0 when no neighbor has a
      non-zero weight (previously a ZeroDivisionError).
  """
  num_neighbors = 0
  num_1 = 0
  num_nnk = 0
  nnk_label_total = 0
  nnk_weight_1 = 0

  for index, weight in zip(indicies[idx], weights[idx]):
    num_neighbors += 1
    label = train_labels[index]
    if label == 1:
      num_1 += 1
    if weight != 0:
      num_nnk += 1
      nnk_label_total += label
      if label == 1:
        nnk_weight_1 += weight

  # Guard against rows without any non-zero weight.
  nnk_avg = nnk_label_total / num_nnk if num_nnk else 0

  # Threshold is derived from the actual neighbor count, not a fixed 25.
  labels_class = 1 if (num_neighbors > 0 and 2 * num_1 >= num_neighbors) else 0
  nnk_labels_class = 1 if nnk_weight_1 >= 0.5 else 0
  nnk_sum_class = 1 if nnk_avg >= 0.5 else 0

  return labels_class, nnk_labels_class, nnk_sum_class

In [49]:
## Evaluate classification accuracy using KNN, NNK, and weighted NNK
correct_1 = 0 # KNN majority vote (1st value returned by bert_classify)
correct_2 = 0 # weight-based NNK vote (2nd value returned by bert_classify)
correct_3 = 0 # unweighted NNK label average (3rd value returned by bert_classify)
classified_correct = 0
total = 0
differing_classifications = []  # test indices where KNN and weighted NNK disagree
nnk_failures = []               # KNN right, weighted NNK wrong
knn_failures = []               # weighted NNK right, KNN wrong
for i in range(len(imdb_X_test_raw)):
  true_label = imdb_y_test[i]
  total += 1
  knn_label, nnk_label, nnk_2 = bert_classify(i, BERT_indicies, BERT_weights, imdb_y_train)

  knn_ok = knn_label == true_label
  nnk_ok = nnk_label == true_label

  if knn_ok:
    correct_1 += 1
  if nnk_ok:
    correct_2 += 1
  if nnk_2 == true_label:
    correct_3 += 1

  if nnk_label != knn_label:
    differing_classifications.append(i)
  if knn_ok and not nnk_ok:
    nnk_failures.append(i)
  if nnk_ok and not knn_ok:
    knn_failures.append(i)
print("knn-correct:", correct_1, "nnk-weights-correct:",correct_2,"nnk-label-correct: " , correct_3, "total:", total)

knn-correct: 21650 nnk-weights-correct: 21656 nnk-label-correct:  21653 total: 25000


In [44]:
## Compare LSTM Model classification with KNN, NNK and Weighted NNK

def lstm_classify(idx, indicies, weights, train_labels):
  """Classify test point `idx` from its neighbors in three ways.

  idx: row of `indicies` / `weights` to classify.
  indicies: per test point, the indices of its neighbors in train_labels.
  weights: per test point, the NNK weight of each neighbor (0 = not an NNK
    neighbor); aligned with `indicies`. The function now reads this argument;
    it used to read the global LSTM_weights for the weighted vote.
  train_labels: 0/1 labels of the training points.

  Returns (labels_class, nnk_sum_class, nnk_labels_class):
    labels_class: KNN majority vote - 1 when at least half of all neighbors
      have label 1 (for 50 neighbors this is the former `>= 25` rule).
    nnk_sum_class: weighted NNK vote - 1 when the weights of the label-1
      neighbors add up to at least 0.5.
    nnk_labels_class: unweighted NNK vote - 1 when at least half of the
      neighbors with non-zero weight have label 1; 0 when no neighbor has a
      non-zero weight (previously a ZeroDivisionError).
  """
  num_neighbors = 0
  num_1 = 0
  num_nnk = 0
  nnk_label_total = 0
  nnk_weight_1 = 0

  for index, weight in zip(indicies[idx], weights[idx]):
    num_neighbors += 1
    label = train_labels[index]
    if label == 1:
      num_1 += 1
    if weight != 0:
      num_nnk += 1
      nnk_label_total += label
      if label == 1:
        nnk_weight_1 += weight

  # Guard against rows without any non-zero weight.
  nnk_avg = nnk_label_total / num_nnk if num_nnk else 0

  # Threshold is derived from the actual neighbor count, not a fixed 25.
  labels_class = 1 if (num_neighbors > 0 and 2 * num_1 >= num_neighbors) else 0
  nnk_sum_class = 1 if nnk_weight_1 >= 0.5 else 0
  nnk_labels_class = 1 if nnk_avg >= 0.5 else 0

  return labels_class, nnk_sum_class, nnk_labels_class

In [26]:
## Helper function to provide cleaner code below

def prob_to_pred(val):
  """Threshold a probability at 0.5: return 1 when val >= 0.5, otherwise 0."""
  return 1 if val >= 0.5 else 0

In [45]:
#Evaluate accuracy of various classifiers
correct_1 = 0 # KNN majority vote (1st value returned by lstm_classify)
correct_2 = 0 # weight-based NNK vote (2nd value returned by lstm_classify)
correct_3 = 0 # unweighted NNK label average (3rd value returned by lstm_classify)
classified_correct = 0 # LSTM model output thresholded by prob_to_pred
total = 0
for i in range(len(LSTM_X_test_raw)):
  true_label = LSTM_y_test[i]
  total += 1
  knn_vote, weighted_nnk_vote, nnk_vote = lstm_classify(i, LSTM_indicies, LSTM_weights, LSTM_y_train)
  model_vote = prob_to_pred(LSTM_test_pred[i])

  if knn_vote == true_label:
    correct_1 += 1
  if model_vote == true_label:
    classified_correct += 1
  if weighted_nnk_vote == true_label:
    correct_2 += 1
  if nnk_vote == true_label:
    correct_3 += 1
print("label-correct:", correct_1, "model-correct:", classified_correct, "nnk-avg-correct:", correct_2, "nnk-label-correct:",correct_3,"total:", total)

label-correct: 20395 model-correct: 20283 nnk-avg-correct: 19671 nnk-label-correct: 19938 total: 24920


# Qualitative Analysis

In [30]:
def lstm_nnk(idx, knn=False):
  """Print LSTM test review `idx` together with its matched training reviews.

  idx: index into the LSTM test data and the rows of the NNK arrays.
  knn: when False (default) only neighbors with a non-zero NNK weight are
    printed; when True every retrieved neighbor is printed.

  Reads the notebook-level LSTM_* arrays; returns None.
  """
  print("Original Text: ", LSTM_X_test_raw[idx])
  print("Model Prediction: ", prob_to_pred(LSTM_test_pred[idx]))
  print("Actual Classification: ", LSTM_y_test[idx])
  print("\n")
  print("\n")

  # Walk the actual neighbor row instead of assuming exactly 50 neighbors.
  for train_idx, weight in zip(LSTM_indicies[idx], LSTM_weights[idx]):
    if knn or weight != 0:
      print("Matched Text: ", LSTM_X_train_raw[train_idx])
      print("\n")
      print("Model Prediction for this: ", prob_to_pred(LSTM_train_pred[train_idx]))
      print("Actual value for this: ", LSTM_y_train[train_idx])
      print("With weight value: ", weight)
      print("\n")
      print("\n")

In [31]:
def bert_nnk(idx, knn=False):
  """Print IMDb test review `idx` together with its matched training reviews.

  idx: index into the IMDb test data and the rows of the BERT NNK arrays.
  knn: when False (default) only neighbors with a non-zero NNK weight are
    printed; when True every retrieved neighbor is printed.

  Reads the notebook-level imdb_* and BERT_* arrays; returns None.
  """
  print("Original Text: ", imdb_X_test_raw[idx])
  print("Actual Classification: ", imdb_y_test[idx])
  print("\n")
  print("\n")

  # Walk the actual neighbor row instead of assuming exactly 50 neighbors.
  for train_idx, weight in zip(BERT_indicies[idx], BERT_weights[idx]):
    if knn or weight != 0:
      print("Matched Text: ", imdb_X_train_raw[train_idx])
      print("\n")
      print("Actual value for this: ", imdb_y_train[train_idx])
      print("With weight value: ", weight)
      print("\n")
      print("\n")

In [32]:
# Show the first LSTM test review with only its non-zero-weight (NNK) neighbors.
lstm_nnk(0)

Original Text:  Can you people please stop believing everything this man says. Get<br /><br />your facts straight before you start praising this liar. He's not even<br /><br />from Flint. He just says that to keep his "blue collar" look. He's <br /><br />from a rich suburb next to Flint. I mean he went to a private school.<br /><br />His parents paid for him to go to school. Wow, that sure does seem<br /><br />like they suffered a lot from Flint going to the dumps. He was also born in Canada. Oh wait, that makes him a Canadian citizen. I wish he would just move there. Instead he lives here in his 1 million dollar New York apartment. Thats working class right there. I sure can't wait for his DVD set to come out. I want to here him talking about how big corporations are bad. Where will I get those dvds? Oh yeah, at Target and Wal-Mart. The two biggest corporations in America, which were also the only two stores allowed to sell them.
Model Prediction:  0
Actual Classification:  0




Matc

In [33]:
# Show the first IMDb test review with only its non-zero-weight (NNK) neighbors.
bert_nnk(0)

Original Text:  There are films that make careers. For George Romero, it was NIGHT OF THE LIVING DEAD; for Kevin Smith, CLERKS; for Robert Rodriguez, EL MARIACHI. Add to that list Onur Tukel's absolutely amazing DING-A-LING-LESS. Flawless film-making, and as assured and as professional as any of the aforementioned movies. I haven't laughed this hard since I saw THE FULL MONTY. (And, even then, I don't think I laughed quite this hard... So to speak.) Tukel's talent is considerable: DING-A-LING-LESS is so chock full of double entendres that one would have to sit down with a copy of this script and do a line-by-line examination of it to fully appreciate the, uh, breadth and width of it. Every shot is beautifully composed (a clear sign of a sure-handed director), and the performances all around are solid (there's none of the over-the-top scenery chewing one might've expected from a film like this). DING-A-LING-LESS is a film whose time has come.
Actual Classification:  1




Matched Text: 

# Using LIME to Analyze NNK Classification

In [34]:
!pip install eli5

import eli5
from eli5.lime import TextExplainer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting eli5
  Downloading eli5-0.13.0.tar.gz (216 kB)
[K     |████████████████████████████████| 216 kB 9.4 MB/s 
Collecting jinja2>=3.0.0
  Downloading Jinja2-3.1.2-py3-none-any.whl (133 kB)
[K     |████████████████████████████████| 133 kB 32.7 MB/s 
Building wheels for collected packages: eli5
  Building wheel for eli5 (setup.py) ... [?25l[?25hdone
  Created wheel for eli5: filename=eli5-0.13.0-py2.py3-none-any.whl size=107748 sha256=19be96e513fde2b035bbb3f9c211a7af4e42562575e2cc232c20dd4476c54b9d
  Stored in directory: /root/.cache/pip/wheels/cc/3c/96/3ead31a8e6c20fc0f1a707fde2e05d49a80b1b4b30096573be
Successfully built eli5
Installing collected packages: jinja2, eli5
  Attempting uninstall: jinja2
    Found existing installation: Jinja2 2.11.3
    Uninstalling Jinja2-2.11.3:
      Successfully uninstalled Jinja2-2.11.3
[31mERROR: pip's dependency resolver does not currently ta

In [35]:
def bert_knn_pipeline(text_array):
  """Predict class probabilities for raw texts with a KNN vote over BERT embeddings.

  text_array: batch of raw strings accepted by bert_preprocess_model.

  Returns an array with one [p(label 0), p(label 1)] row per text, where
  p(label 1) is the fraction of the retrieved neighbors whose training label
  is 1. The fraction uses the actual number of neighbors per row rather than
  a hard-coded 50.
  """
  processed = bert_preprocess_model(text_array)
  embedding = bert(processed)['pooled_output']
  # Only the neighbor indices are needed for the KNN vote; the NNK weights
  # returned alongside them are ignored.
  _, indexes = nnk_neighbors(np.float32(BERT_train_transformed), np.float32(embedding))
  train_labels = np.asarray(imdb_y_train)
  results = []
  for neighbor_ids in indexes:
    positive_fraction = np.sum(train_labels[neighbor_ids]) / len(neighbor_ids)
    results.append(np.array([1 - positive_fraction, positive_fraction]))
  return np.array(results)

In [36]:
def bert_nnk_pipeline(text_array):
  """Predict class probabilities for raw texts with a weighted NNK vote.

  text_array: batch of raw strings accepted by bert_preprocess_model.

  Returns an array with one [1 - p, p] row per text, where p is the sum of
  the NNK weights of the neighbors whose training label is 1, limited to the
  range [0, 1].
  """
  processed = bert_preprocess_model(text_array)
  embedding = bert(processed)['pooled_output']
  weight, indexes = nnk_neighbors(np.float32(BERT_train_transformed), np.float32(embedding))
  results = []
  for row in range(len(indexes)):
    prob = 0
    for col, index in enumerate(indexes[row]):
      neighbor_weight = weight[row][col]
      if neighbor_weight != 0 and imdb_y_train[index] == 1:
        prob += neighbor_weight
    # Keep the accumulated weight inside [0, 1].
    prob = 0 if prob < 0 else (1 if prob > 1 else prob)
    results.append(np.array([1-prob, prob]))
  return np.array(results)

In [None]:
## Review 1 with KNN
# Fit an explainer for the first test review against the KNN pipeline.
# random_state is fixed so the run is repeatable; n_samples=750 is the
# sample-count setting passed to TextExplainer.
te = TextExplainer(random_state=42, n_samples=750)
te.fit(imdb_X_test_raw[0], bert_knn_pipeline)
# Kept as the cell's last expression so the notebook renders the explanation.
te.show_prediction()

0/750 processed...




Contribution?,Feature
8.232,Highlighted in text (sum)
-0.302,<BIAS>


In [37]:
## Review 1 with NNK 
# Same review and explainer settings as the KNN run, but explaining the
# weighted-NNK pipeline, so the two explanations can be compared.
nnkte = TextExplainer(random_state=42, n_samples=750)
nnkte.fit(imdb_X_test_raw[0], bert_nnk_pipeline)
# Kept as the cell's last expression so the notebook renders the explanation.
nnkte.show_prediction()

0/750 processed...




Contribution?,Feature
7.016,Highlighted in text (sum)
-0.221,<BIAS>


In [38]:
## Review 2 with KNN
# Fit an explainer for the second test review against the KNN pipeline,
# with the same random_state and n_samples as the first review.
te2 = TextExplainer(random_state=42, n_samples=750)
te2.fit(imdb_X_test_raw[1], bert_knn_pipeline)
# Kept as the cell's last expression so the notebook renders the explanation.
te2.show_prediction()

0/750 processed...




Contribution?,Feature
8.257,Highlighted in text (sum)
-0.083,<BIAS>


In [39]:
## Review 2 with NNK
# Second test review, explained against the weighted-NNK pipeline with the
# same explainer settings as the KNN run.
nnkte2 = TextExplainer(random_state=42, n_samples=750)
nnkte2.fit(imdb_X_test_raw[1], bert_nnk_pipeline)
# Kept as the cell's last expression so the notebook renders the explanation.
nnkte2.show_prediction()

0/750 processed...




Contribution?,Feature
7.976,Highlighted in text (sum)
0.054,<BIAS>
