In [None]:
!pip install shap
!pip install lime
!pip install transformers

In [None]:
import shap
import pickle
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB, GaussianNB, CategoricalNB
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

# LIME packages
from lime import lime_text
from lime.lime_text import LimeTextExplainer
from sklearn.pipeline import make_pipeline
from lime.lime_text import IndexedString,IndexedCharacters
from lime.lime_base import LimeBase
from sklearn.linear_model import Ridge, lars_path
from lime.lime_text import explanation
from functools import partial
import scipy as sp
from sklearn.utils import check_random_state
from sklearn.metrics import classification_report, confusion_matrix

from transformers import AutoTokenizer, TFAutoModel
import tensorflow as tf
from tqdm import tqdm

In [None]:
!pip install gdown

In [None]:
'''
https://drive.google.com/file/d/1Fs0PTy_xPsoX5bg4QTCrX6JAaQCHRzis/view?usp=drive_link
https://drive.google.com/file/d/1JLa-ELhUskQINi0syf3YJWBX8jcCXdn4/view?usp=drive_link
'''

In [None]:
# Download the two Google Drive files referenced above by their file ids.
# NOTE(review): presumably these are the train.pkl / test.pkl files loaded from
# /kaggle/working in the following cells — confirm the download directory matches.
!gdown 1Fs0PTy_xPsoX5bg4QTCrX6JAaQCHRzis
!gdown 1JLa-ELhUskQINi0syf3YJWBX8jcCXdn4

In [None]:
FILE_PATH = "/kaggle/working/train.pkl"

# Deserialize the training split.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open(FILE_PATH, mode='rb') as train_pickle:
    df_train = pickle.load(train_pickle)

In [None]:
FILE_PATH = "/kaggle/working/test.pkl"

# Deserialize the test split.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open(FILE_PATH, mode='rb') as test_pickle:
    df_test = pickle.load(test_pickle)

In [None]:
# Renumber the rows of both splits from 0 and discard the previous index,
# so positional (.iloc) and label-based row numbers agree further down.
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [None]:
df_train.head()

In [None]:
# Hugging Face identifier of the MARBERT checkpoint; shared by tokenizer and encoder.
marbert_model_path = 'UBC-NLP/MARBERT'
# NOTE(review): from_tf is normally a model-loading option — confirm it has any effect on the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(marbert_model_path, from_tf=True)
# TensorFlow encoder used by get_embeddings, which only reads element [0] of its output.
marbert_model = TFAutoModel.from_pretrained(marbert_model_path, output_hidden_states=True)

In [None]:
# Read by bert_tokenize: when 1, token ids 0-4 are zeroed and masked out of the attention mask.
remove_special_tokens=1  #change this to 0 if you want to keep the special token
# Name of the DataFrame column whose text is tokenized, embedded and explained.
stemtype = 'Light Stemming'

In [None]:
def bert_tokenize(text: "str | list[str]") -> dict:
  """Tokenize text with the notebook-level MARBERT tokenizer, padded/truncated to 50 tokens.

  Parameters:
    text: a single string or a list of strings.

  Returns:
    The tokenizer output. When the global remove_special_tokens equals 1,
    'input_ids' and 'attention_mask' are replaced by int32 numpy arrays in
    which every token id in 0..4 is set to 0 and masked out (attention 0),
    and every other position gets attention 1. Other entries (for example
    'token_type_ids') are left as produced by the tokenizer.
    NOTE(review): ids 0-4 are presumably the vocabulary's special tokens
    (padding, unknown, etc.) — confirm against the tokenizer.
  """
  tokens = tokenizer(text, padding='max_length', truncation=True, max_length=50)
  if remove_special_tokens == 1:
    # Vectorised over the whole array so both a single string (1-D ids) and a
    # batch (2-D ids) are handled; the former row loop failed on 1-D input.
    input_ids = np.asarray(tokens['input_ids'], dtype=np.int32)
    is_special = np.isin(input_ids, (0, 1, 2, 3, 4))
    # Update the input IDs and attention mask in the tokens dictionary
    tokens['input_ids'] = np.where(is_special, 0, input_ids).astype(np.int32)
    tokens['attention_mask'] = (~is_special).astype(np.int32)
  return tokens

In [None]:
def get_embeddings(ids, mask, type_ids):
  """Embed a batch of tokenized texts with the notebook-level MARBERT model.

  Parameters:
    ids: token ids for the batch.
    mask: attention mask for the batch.
    type_ids: token type ids for the batch.

  Returns:
    A numpy array holding, per input row, the mean over axis 1 of the first
    element of the model output.
    NOTE(review): the mean runs over every position along axis 1, including
    positions whose attention mask is 0 — confirm this is intended.
  """
  model_inputs = {
      'input_ids': tf.convert_to_tensor(ids),
      'attention_mask': tf.convert_to_tensor(mask),
      'token_type_ids': tf.convert_to_tensor(type_ids),
  }
  token_states = marbert_model(**model_inputs)[0]
  return tf.reduce_mean(token_states, axis=1).numpy()

In [None]:
# Emotion names indexed by integer class label: position k is the name of label k.
# Used to print true labels and as class/output names for the reports and explainers.
label_to_class = [
    'none',
    'anger',
    'joy',
    'sadness',
    'love',
    'sympathy',
    'surprise',
    'fear'
]

In [None]:
# Tokenize the training text taken from the column selected by stemtype.
x_train = bert_tokenize(df_train[stemtype].values.astype(str).tolist())

# Number of tokenized training rows.
xlen = len(x_train['input_ids'])

In [None]:
xlen

In [None]:
# Embed the training rows in batches of 100; each row becomes a 768-wide vector.
x_train_emb = np.zeros((xlen, 768))
for start in range(0, xlen, 100):
    # Clamp the batch end so the final, possibly shorter, batch is handled by the same code path.
    stop = min(start + 100, xlen)
    x_train_emb[start:stop] = get_embeddings(
        x_train['input_ids'][start:stop],
        x_train['attention_mask'][start:stop],
        x_train['token_type_ids'][start:stop],
    )

In [None]:
x_train_emb.shape

In [None]:
# Tokenize the test text taken from the column selected by stemtype.
x_test = bert_tokenize(df_test[stemtype].values.astype(str).tolist())

# Number of tokenized test rows (rebinds xlen, previously the training row count).
xlen = len(x_test['input_ids'])

In [None]:
xlen

In [None]:
# Embed the test rows in batches of 100; each row becomes a 768-wide vector.
x_test_emb = np.zeros((xlen, 768))
for start in range(0, xlen, 100):
    # Clamp the batch end so the final, possibly shorter, batch is handled by the same code path.
    stop = min(start + 100, xlen)
    x_test_emb[start:stop] = get_embeddings(
        x_test['input_ids'][start:stop],
        x_test['attention_mask'][start:stop],
        x_test['token_type_ids'][start:stop],
    )

In [None]:
x_test_emb.shape

## Naive Bayes Model

In [None]:
# TF-IDF features for the 'Root Stemming' column.
tfidf_vectorizer_rs = TfidfVectorizer()
# Fit the vocabulary/IDF weights on the training text only, and transform it.
X_train_tfidf_rs = tfidf_vectorizer_rs.fit_transform(df_train['Root Stemming'])
# Transform the test text with the vectorizer fitted on train (no refit).
X_test_tfidf_rs = tfidf_vectorizer_rs.transform(df_test['Root Stemming'])

# TF-IDF features for the 'Light Stemming' column; this vectorizer is also used by model_predict.
tfidf_vectorizer_ls = TfidfVectorizer()
# Fit the vocabulary/IDF weights on the training text only, and transform it.
X_train_tfidf_ls = tfidf_vectorizer_ls.fit_transform(df_train['Light Stemming'])
# Transform the test text with the vectorizer fitted on train (no refit).
X_test_tfidf_ls = tfidf_vectorizer_ls.transform(df_test['Light Stemming'])

### Using CategoricalNB

In [None]:
# Categorical NB on the densified root-stemming TF-IDF matrix.
# NOTE(review): CategoricalNB is documented for discrete categorical features,
# while TF-IDF weights are continuous — confirm this model is appropriate here.
nb_vec = CategoricalNB().fit(X_train_tfidf_rs.toarray(), df_train['label'])

# Accuracy on the training data itself (this cell does not score the test split).
y_pred = nb_vec.predict(X_train_tfidf_rs.toarray())
accuracy = accuracy_score(y_true=df_train['label'], y_pred=y_pred)
print("Accuracy:", accuracy)

In [None]:
# Categorical NB on the densified light-stemming TF-IDF matrix.
# NOTE(review): CategoricalNB is documented for discrete categorical features,
# while TF-IDF weights are continuous — confirm this model is appropriate here.
nb_vec = CategoricalNB().fit(X_train_tfidf_ls.toarray(), df_train['label'])

# Accuracy on the training data itself (this cell does not score the test split).
y_pred = nb_vec.predict(X_train_tfidf_ls.toarray())
accuracy = accuracy_score(y_true=df_train['label'], y_pred=y_pred)
print("Accuracy:", accuracy)

### Using GaussianNB

In [None]:
# Gaussian NB on root-stemming TF-IDF features, converted to dense arrays for fit and predict.
nb_vec = GaussianNB().fit(X_train_tfidf_rs.toarray(), df_train['label'])

# Accuracy on the held-out test split.
y_pred = nb_vec.predict(X_test_tfidf_rs.toarray())
accuracy = accuracy_score(y_true=df_test['label'], y_pred=y_pred)
print("Accuracy:", accuracy)

In [None]:
print(classification_report(df_test['label'], y_pred, target_names = label_to_class))

In [None]:
# Confusion matrix of the true test labels against the current y_pred.
confusion_matrix(df_test['label'], y_pred)

In [None]:
# Gaussian NB on light-stemming TF-IDF features, converted to dense arrays for fit and predict.
nb_vec = GaussianNB().fit(X_train_tfidf_ls.toarray(), df_train['label'])

# Accuracy on the held-out test split.
y_pred = nb_vec.predict(X_test_tfidf_ls.toarray())
accuracy = accuracy_score(y_true=df_test['label'], y_pred=y_pred)
print("Accuracy:", accuracy)

In [None]:
print(classification_report(df_test['label'], y_pred, target_names = label_to_class))

In [None]:
# Confusion matrix of the true test labels against the current y_pred.
confusion_matrix(df_test['label'], y_pred)

### Using Light Stemming Embeddings

In [None]:
# Gaussian NB on the MARBERT embeddings; nb_ls is the model shap_model_predict queries.
nb_ls = GaussianNB().fit(x_train_emb, df_train['label'])

# Accuracy of the embedding-based model on the held-out test split.
y_pred = nb_ls.predict(x_test_emb)
accuracy = accuracy_score(y_true=df_test['label'], y_pred=y_pred)
print("Accuracy:", accuracy)

In [None]:
print(classification_report(df_test['label'], y_pred, target_names = label_to_class))

In [None]:
# Confusion matrix of the true test labels against the current y_pred.
confusion_matrix(df_test['label'], y_pred)

### Random Forest

In [None]:
# Fixed seed so the forest, and therefore the metrics reported below, are reproducible across runs.
rf_vec = RandomForestClassifier(random_state=42)
rf_vec.fit(X_train_tfidf_rs, df_train['label'])

# Test accuracy on the root-stemming TF-IDF features.
y_pred = rf_vec.predict(X_test_tfidf_rs)
# calculating the accuracy of the classifier
accuracy = accuracy_score(df_test['label'], y_pred)
print("Accuracy:", accuracy)

In [None]:
print(classification_report(df_test['label'], y_pred, target_names = label_to_class))

In [None]:
# Confusion matrix of the true test labels against the current y_pred.
confusion_matrix(df_test['label'], y_pred)

In [None]:
# Fixed seed so the forest, and therefore the metrics reported below, are reproducible across runs.
rf_vec = RandomForestClassifier(random_state=42)
rf_vec.fit(X_train_tfidf_ls, df_train['label'])

# Test accuracy on the light-stemming TF-IDF features.
y_pred = rf_vec.predict(X_test_tfidf_ls)
# calculating the accuracy of the classifier
accuracy = accuracy_score(df_test['label'], y_pred)
print("Accuracy:", accuracy)

In [None]:
print(classification_report(df_test['label'], y_pred, target_names = label_to_class))

In [None]:
# Confusion matrix of the true test labels against the current y_pred.
confusion_matrix(df_test['label'], y_pred)

In [None]:
# Fixed seed so the forest, and therefore the metrics reported below, are reproducible across runs.
rf_ls = RandomForestClassifier(random_state=42)
rf_ls.fit(x_train_emb, df_train['label'])

# Test accuracy on the MARBERT embeddings.
y_pred = rf_ls.predict(x_test_emb)
# calculating the accuracy of the classifier
accuracy = accuracy_score(df_test['label'], y_pred)
print("Accuracy:", accuracy)

In [None]:
print(classification_report(df_test['label'], y_pred, target_names = label_to_class))

In [None]:
# Confusion matrix of the true test labels against the current y_pred.
confusion_matrix(df_test['label'], y_pred)

## Using MultinomialNB

In [None]:
# Multinomial NB with smoothing alpha=0.1, fitted on the sparse root-stemming TF-IDF matrix.
nb_vec = MultinomialNB(force_alpha=True, alpha=0.1, fit_prior=True)
nb_vec.fit(X_train_tfidf_rs, df_train['label'])

# Predict on the sparse matrix directly, as in fit; the former .toarray() only
# built a large dense copy of the test features.
y_pred = nb_vec.predict(X_test_tfidf_rs)
# calculating the accuracy of the classifier
accuracy = accuracy_score(df_test['label'], y_pred)
print("Accuracy:", accuracy)

In [None]:
print(classification_report(df_test['label'], y_pred, target_names = label_to_class))

In [None]:
# Confusion matrix of the true test labels against the current y_pred.
confusion_matrix(df_test['label'], y_pred)

In [None]:
# Multinomial NB with smoothing alpha=0.1, fitted on the sparse light-stemming TF-IDF matrix.
# When the notebook is run top to bottom, this nb_vec is the model that
# model_predict queries and this y_pred is the one the exploration cells compare against.
nb_vec = MultinomialNB(force_alpha=True, alpha=0.1, fit_prior=True)
nb_vec.fit(X_train_tfidf_ls, df_train['label'])

# Predict on the sparse matrix directly, as in fit; the former .toarray() only
# built a large dense copy of the test features.
y_pred = nb_vec.predict(X_test_tfidf_ls)
# calculating the accuracy of the classifier
accuracy = accuracy_score(df_test['label'], y_pred)
print("Accuracy:", accuracy)

In [None]:
print(classification_report(df_test['label'], y_pred, target_names = label_to_class))

In [None]:
# Confusion matrix of the true test labels against the current y_pred.
confusion_matrix(df_test['label'], y_pred)

## LIME

In [None]:
def model_predict(text):
    """Return class probabilities for raw texts.

    The texts are vectorized with the notebook-level tfidf_vectorizer_ls and
    scored with whatever model the global nb_vec refers to at call time.

    Parameters:
        text: an iterable of strings accepted by the vectorizer's transform.

    Returns:
        The result of nb_vec.predict_proba on the TF-IDF features.
    """
    return nb_vec.predict_proba(tfidf_vectorizer_ls.transform(text))

## SHAP

In [None]:
def shap_model_predict(text):
  """Return nb_ls class probabilities for an array of texts via MARBERT embeddings.

  Parameters:
    text: an array-like exposing .astype (e.g. a numpy array) of texts.

  Returns:
    The result of nb_ls.predict_proba on the (rows, 768) embedding matrix.

  NOTE(review): initializeShap in this notebook wraps model_predict, not this
  function — confirm which predictor the SHAP explainer is meant to use.
  """
  tokens = bert_tokenize(text.astype(str).tolist())
  n_rows = np.array(tokens['input_ids']).shape[0]
  x_emb = np.zeros((n_rows, 768))
  # Embed in batches of 100; the end index is clamped for the last batch.
  for start in range(0, n_rows, 100):
    stop = min(start + 100, n_rows)
    x_emb[start:stop] = get_embeddings(
        tokens['input_ids'][start:stop],
        tokens['attention_mask'][start:stop],
        tokens['token_type_ids'][start:stop],
    )
  return nb_ls.predict_proba(x_emb)

In [None]:
def initializeShap(model):
  '''
  function that initializes shap with the appropriate model

  parameters
  model: an instance of your model, ex: LogisticRegression instance.
         Only its type is checked here; the explainer itself wraps the
         notebook-level model_predict function.

  returns
  a shap.Explainer over model_predict using a regex text masker and
  label_to_class as output names

  raises
  TypeError if model is neither a LogisticRegression nor a MultinomialNB
  '''
  # Validate up front: previously an unsupported model only printed a message
  # and then failed with UnboundLocalError on the return statement.
  if not isinstance(model, (LogisticRegression, MultinomialNB)):
    raise TypeError("Please use a valid model !")
  # split input text into tokens
  masker = shap.maskers.Text(tokenizer=r"\W+")
  return shap.Explainer(model_predict, masker=masker, output_names=label_to_class)

In [None]:
nb_explainer = initializeShap(model=nb_vec)

## Exploration

In [None]:
# Boolean mask over the test rows: True where the current y_pred differs from the true label.
tp = df_test['label'] != y_pred

In [None]:
# Restrict the mask to rows whose true label is 7 ('fear' in label_to_class).
tp = (df_test['label'] == 7) & tp

In [None]:
tp[tp == True]

In [None]:
label_to_class

In [None]:
def explain_example_lime(i):
    """Print the true label and tweet of test row `i` and render its LIME explanation.

    Parameters:
        i: positional index into df_test.

    The explanation covers all 8 labels with up to 10 features each, using
    model_predict as the classifier function, and is displayed in the notebook.
    """
    instance = df_test[stemtype].iloc[i]
    true_label = df_test['label'].iloc[i]
    print("True label: ", label_to_class[true_label])
    print("Original tweet: ", df_test['tweet'].iloc[i])

    # A fresh explainer per call, labelled with the class names.
    lime_explainer = LimeTextExplainer(class_names=label_to_class)
    lime_result = lime_explainer.explain_instance(
        instance,
        model_predict,
        num_features=10,
        labels=range(8),
    )
    lime_result.show_in_notebook()

In [None]:
def explain_example_shap(i):
    """Print the true label and tweet of test row `i` and plot its SHAP text explanation.

    Parameters:
        i: positional index into df_test.

    Uses the notebook-level nb_explainer on a one-element list holding the
    row's text from the column selected by stemtype.
    """
    row_text = df_test[stemtype].iloc[i:i+1]
    instance = row_text.values.astype(str).tolist()
    true_label = df_test['label'].iloc[i]
    print("True label: ", label_to_class[true_label])
    print("Original tweet: ", df_test['tweet'].iloc[i])
    shap.text_plot(nb_explainer(instance))

## Label None predicted correctly

In [None]:
explain_example_lime(2)

In [None]:
explain_example_shap(2)

## Label None predicted incorrectly

In [None]:
explain_example_lime(244)

In [None]:
explain_example_shap(244)

## Label Anger predicted correctly

In [None]:
explain_example_lime(31)

In [None]:
explain_example_shap(31)

## Label Anger predicted incorrectly

In [None]:
explain_example_lime(49)

In [None]:
explain_example_shap(49)

## Label Joy predicted correctly

In [None]:
explain_example_lime(3)

In [None]:
explain_example_shap(3)

## Label Joy predicted incorrectly

In [None]:
explain_example_lime(21)

In [None]:
explain_example_shap(21)

## Label Sadness predicted correctly

In [None]:
explain_example_lime(65)

In [None]:
explain_example_shap(65)

## Label Sadness predicted incorrectly

In [None]:
explain_example_lime(12)

In [None]:
explain_example_shap(12)

## Label Love predicted correctly

In [None]:
explain_example_lime(0)

In [None]:
explain_example_shap(0)

## Label Love predicted incorrectly

In [None]:
explain_example_lime(10)

In [None]:
explain_example_shap(10)

## Label Sympathy predicted correctly

In [None]:
explain_example_lime(4)

In [None]:
explain_example_shap(4)

## Label Sympathy predicted incorrectly

In [None]:
explain_example_lime(98)

In [None]:
explain_example_shap(98)

## Label Surprise predicted correctly

In [None]:
explain_example_lime(48)

In [None]:
explain_example_shap(48)

## Label Surprise predicted incorrectly

In [None]:
explain_example_lime(23)

In [None]:
explain_example_shap(23)

## Label Fear predicted correctly

In [None]:
explain_example_lime(1)

In [None]:
explain_example_shap(1)

## Label Fear predicted incorrectly

In [None]:
explain_example_lime(38)

In [None]:
explain_example_shap(38)