In [None]:
!pip install datasets
!pip install shap

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.22.2-py3-none-any.

In [None]:
import pandas as pd
import numpy as np

import transformers
import pickle
import torch
import shap

from datasets import load_dataset, load_metric, Dataset, DatasetDict
from transformers import AutoTokenizer, pipeline
from collections import defaultdict
from tqdm import tqdm
from google.colab import drive

# Mount Google Drive at /content/drive. Later cells read their artifacts through
# paths starting with "drive/MyDrive/..." (presumably resolved relative to a
# /content working directory — confirm if the notebook is run elsewhere).
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print(f"Using '{device}' device")

Using 'cuda' device


In [None]:
# Tokenizer of the base checkpoint, capped at 128 tokens per input.
model_name = "google-bert/bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=128)

# Classifier stored as a pickle on Drive (presumably the fine-tuned BERT model).
# SECURITY: pickle.load can execute arbitrary code while deserialising, so only
# load files that you produced yourself or otherwise trust.
with open("drive/MyDrive/multilabel_emoji_prediction/bert.pkl", "rb") as model_file:
    model = pickle.load(model_file)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
# Read the balanced test split and wrap it as a Hugging Face Dataset.
df_test = pd.read_csv("drive/MyDrive/multilabel_emoji_prediction/balanced_test.csv")
test_dataset = Dataset.from_pandas(df_test)

# Number of test rows; reused as the loop bound in later cells.
ROW_NUM = len(df_test)

# Every column from position 2 onward is treated as a label column
# (assumes the first two columns are not labels — verify against the CSV).
labels = df_test.columns[2:]

# Index <-> label-name lookups, following the column order of `labels`.
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

def preprocess_data(examples):
    """Tokenize a batch of texts and attach a multi-hot label matrix.

    Args:
        examples: A batch mapping with a "text" entry (sequence of strings) and,
            optionally, one entry per name in the module-level `labels`.

    Returns:
        The tokenizer's encoding with an extra "labels" entry: a NumPy array of
        shape (batch size, number of labels) that holds 1 wherever the matching
        label column has a truthy value and 0 elsewhere. Label columns missing
        from `examples` stay all-zero.
    """
    texts = examples["text"]
    # Pad to the longest sequence in this batch and truncate over-long inputs.
    encoding = tokenizer(texts, padding="longest", truncation=True)

    label_array = np.zeros((len(texts), len(labels)))
    for label_name in labels:
        if label_name not in examples:
            continue
        column = label2id[label_name]
        for row, value in enumerate(examples[label_name]):
            if value:
                label_array[row, column] = 1

    encoding['labels'] = label_array
    return encoding

# batch_size=None hands the whole dataset to preprocess_data as a single batch,
# so its padding="longest" pads every row to one common length.
test_preprocessed = test_dataset.map(
    preprocess_data,
    batched=True,
    batch_size=None,
)

Map:   0%|          | 0/3001 [00:00<?, ? examples/s]

In [None]:
# Text-classification pipeline around the loaded model; return_all_scores=True
# makes it emit a score for every label instead of only the top one.
# NOTE(review): recent transformers releases appear to deprecate
# return_all_scores in favour of top_k=None — confirm before upgrading.
pred = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
    device=device,
    return_all_scores=True,
)

# SHAP explainer built directly on the pipeline callable.
explainer = shap.Explainer(pred)



In [None]:
# SHAP values for the test texts are cached on Drive because computing them is
# slow (about 4 hours for 3000 rows). Load the cache when it exists; otherwise
# compute the values once and store them, so a fresh environment can still
# reproduce this cell.
# SECURITY: pickle.load can execute arbitrary code — only load a cache file
# that you created yourself or otherwise trust.
shap_values_path = "drive/MyDrive/multilabel_emoji_prediction/shap_values.pkl"

try:
    with open(shap_values_path, 'rb') as f:
        shap_values = pickle.load(f)
except FileNotFoundError:
    print("No cached SHAP values found; computing them now (about 4 hours for 3000 rows)")
    shap_values = explainer(test_preprocessed["text"])
    with open(shap_values_path, 'wb') as f:
        pickle.dump(shap_values, f)

In [None]:
# Token strings (special tokens included) for every test text, in row order.
# The text column is materialised once up front: indexing
# test_preprocessed['text'] inside the loop rebuilt the whole column on every
# iteration.
# NOTE(review): encode() is called without truncation here, unlike the
# preprocessing step — confirm the token lists line up with the SHAP values
# for long inputs.
texts = test_preprocessed['text']
encoded_list = []

for i in tqdm(range(ROW_NUM)):
    token_ids = tokenizer.encode(texts[i], add_special_tokens=True)
    encoded_list.append(tokenizer.convert_ids_to_tokens(token_ids))

100%|██████████| 3001/3001 [00:23<00:00, 130.18it/s]


In [None]:
# Saved ground-truth and prediction arrays. Later cells index both as
# arr[row][label_idx] and test the entries for truthiness
# (presumably 0/1 indicators per test row and label — TODO confirm).
truth_arr = np.load("drive/MyDrive/multilabel_emoji_prediction/truth.npy")
preds_arr = np.load("drive/MyDrive/multilabel_emoji_prediction/preds.npy")

In [None]:
# impact_dict[label][case] -> sorted list of (token, mean impact) pairs, where
# case is 'TP', 'FP' or 'FN'; filled in by the per-label loop further down.
impact_dict = defaultdict(dict)

def update_case_dict(case_code, i, label_idx):
    """Record the per-token SHAP impacts of one example under its outcome bucket.

    Reads the module-level `shap_values` and `encoded_list`, and appends to the
    module-level `true_positive_dict`, `false_positive_dict` or
    `false_negative_dict`, which must exist before this is called.

    Args:
        case_code: "TP", "FP" or "FN"; any other value records nothing.
        i: Row index of the example.
        label_idx: Index of the label whose impacts are collected.
    """
    impact_list = shap_values[i, :, label_idx].values
    tokens_list = encoded_list[i]

    # Choose the destination once instead of re-testing the code per token.
    if case_code == "FP":
        bucket = false_positive_dict
    elif case_code == "FN":
        bucket = false_negative_dict
    elif case_code == "TP":
        bucket = true_positive_dict
    else:
        bucket = None

    # Tokens and impacts are paired by position (assumes both sequences are
    # aligned one-to-one — TODO confirm against the SHAP tokenisation).
    for position, token in enumerate(tokens_list):
        impact = impact_list[position]
        if bucket is not None:
            bucket[token].append(impact)

def generate_sorted_impact(case_dict, flag = True):
    """Average the impacts per token and return them ordered by that average.

    Args:
        case_dict: Mapping of token -> list of impact values.
        flag: True for descending order (largest mean first), False for ascending.

    Returns:
        A list of (token, mean impact) tuples. Tokens with fewer than two
        recorded impacts are left out.
    """
    averaged = []
    for token, impacts in case_dict.items():
        if len(impacts) < 2:
            continue
        averaged.append((token, sum(impacts) / len(impacts)))

    averaged.sort(key=lambda pair: pair[1], reverse=flag)
    return averaged

# For every label, bucket the per-token SHAP impacts of each test row by
# prediction outcome (TP / FP / FN) and store the sorted mean impacts.
for label in labels:
    print(label)

    # Fresh module-level buckets for this label; update_case_dict appends to
    # them by global name, so these names must not change.
    true_positive_dict  = defaultdict(list)
    false_positive_dict = defaultdict(list)
    false_negative_dict = defaultdict(list)

    label_idx = label2id[label]

    for i in tqdm(range(ROW_NUM)):
        # Deliberately not named `pred`: this loop runs at module level, and
        # that name would overwrite the `pred` pipeline object created in an
        # earlier cell, breaking any re-run of the explainer cell.
        is_true, is_predicted = truth_arr[i][label_idx], preds_arr[i][label_idx]

        if is_true and is_predicted: # TP
            update_case_dict("TP", i, label_idx)
        elif not is_true and is_predicted: # FP
            update_case_dict("FP", i, label_idx)
        elif is_true and not is_predicted: # FN
            update_case_dict("FN", i, label_idx)
        # True negatives are not recorded.

    # Mean impact per token, largest first, for each outcome.
    impact_dict[label]['TP'] = generate_sorted_impact(true_positive_dict, True)
    impact_dict[label]['FP'] = generate_sorted_impact(false_positive_dict, True)
    impact_dict[label]['FN'] = generate_sorted_impact(false_negative_dict, True)

👍


100%|██████████| 3001/3001 [01:16<00:00, 39.28it/s]


🙏


100%|██████████| 3001/3001 [01:38<00:00, 30.37it/s]


🎉


100%|██████████| 3001/3001 [02:12<00:00, 22.64it/s]


🔥


100%|██████████| 3001/3001 [01:40<00:00, 29.78it/s] 


🤦


100%|██████████| 3001/3001 [00:44<00:00, 67.77it/s] 


❤️


100%|██████████| 3001/3001 [02:10<00:00, 23.06it/s]


🤷


100%|██████████| 3001/3001 [00:52<00:00, 56.76it/s]


🎂


100%|██████████| 3001/3001 [00:45<00:00, 66.38it/s] 


👏


100%|██████████| 3001/3001 [01:09<00:00, 43.19it/s] 


👌


100%|██████████| 3001/3001 [00:51<00:00, 58.47it/s] 


💪


100%|██████████| 3001/3001 [01:01<00:00, 48.69it/s]


✨


100%|██████████| 3001/3001 [01:35<00:00, 31.58it/s]


👀


100%|██████████| 3001/3001 [01:17<00:00, 38.77it/s]


👉


100%|██████████| 3001/3001 [01:02<00:00, 48.39it/s]


🌹


100%|██████████| 3001/3001 [00:27<00:00, 109.30it/s]


🎈


100%|██████████| 3001/3001 [00:49<00:00, 60.37it/s]


💐


100%|██████████| 3001/3001 [00:37<00:00, 80.80it/s] 


🤞


100%|██████████| 3001/3001 [00:32<00:00, 93.63it/s] 


🙌


100%|██████████| 3001/3001 [01:41<00:00, 29.70it/s]


👇


100%|██████████| 3001/3001 [01:11<00:00, 41.94it/s]


🌞


100%|██████████| 3001/3001 [00:32<00:00, 91.02it/s] 


🌸


100%|██████████| 3001/3001 [00:35<00:00, 84.39it/s] 


🎶


100%|██████████| 3001/3001 [01:00<00:00, 49.55it/s]


✌️


100%|██████████| 3001/3001 [00:32<00:00, 91.84it/s]


🎊


100%|██████████| 3001/3001 [00:55<00:00, 54.06it/s] 


☀️


100%|██████████| 3001/3001 [01:05<00:00, 46.07it/s]


💰


100%|██████████| 3001/3001 [00:33<00:00, 89.74it/s]


👑


100%|██████████| 3001/3001 [00:39<00:00, 75.42it/s]


🎁


100%|██████████| 3001/3001 [01:05<00:00, 45.66it/s]


🙋


100%|██████████| 3001/3001 [00:32<00:00, 91.37it/s]


In [None]:
# Persist the per-label impact tables as a pickle. The path is relative, so the
# file lands in the current working directory.
file_path = 'impact_dict.pkl'

with open(file_path, mode='wb') as out_file:
    pickle.dump(impact_dict, out_file)

print("Dict saved to", file_path)

Dict saved to impact_dict.pkl
