In [1]:
import numpy as np
import pandas as pd
import sklearn
import os
import gc
import random
import re
import json
import csv
import torch
import torchtext
from torchtext.datasets import IMDB
from torch import Tensor
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import torch.nn.functional as F
from torch.utils.data import dataset, TensorDataset, DataLoader, RandomSampler, SequentialSampler
from itertools import *
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from tempfile import TemporaryDirectory
from typing import Tuple
from captum.concept import TCAV
from captum.concept import Concept
from captum.concept._utils.common import concepts_to_str
import matplotlib.pyplot as plt
from captum.attr._core.layer.layer_activation import LayerActivation 



  from .autonotebook import tqdm as notebook_tqdm


In [22]:
# Rebuild the classifier (constructor arguments 768 and 2 as in the original
# cell) and restore the fine-tuned weights from disk.
model = TextClassificationModel(768, 2)
# map_location="cpu" lets a checkpoint that was saved from a GPU session load on
# any machine; the model is moved to the target device in a later cell.
state_dict = torch.load("models/bert_suicide_BCELoss_27June.pt", map_location="cpu")
# Switch to inference mode so dropout is disabled and logged activations are
# deterministic.
model.eval()
# Kept as the final expression so the cell still displays the key-matching report.
model.load_state_dict(state_dict)

embedding dimension 768


<All keys matched successfully>

## Logging Attempts

In [5]:
# Display the module tree of the loaded classifier.
model

TextClassificationModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, eleme

In [6]:
def _print_module_tree(module, dotted_path, depth, max_depth=5):
    """Print the registered submodules of ``module`` in pre-order.

    Each printed line is the nesting depth immediately followed by the dotted
    path of the submodule. Children are not expanded beyond ``max_depth``.
    """
    for child_name, child in module._modules.items():
        child_path = f"{dotted_path}.{child_name}"
        print(f"{depth}{child_path}")
        if depth < max_depth:
            _print_module_tree(child, child_path, depth + 1, max_depth)


_print_module_tree(model, "model", 1)

1model.bert
2model.bert.embeddings
3model.bert.embeddings.word_embeddings
3model.bert.embeddings.position_embeddings
3model.bert.embeddings.token_type_embeddings
3model.bert.embeddings.LayerNorm
3model.bert.embeddings.dropout
2model.bert.encoder
3model.bert.encoder.layer
4model.bert.encoder.layer.0
5model.bert.encoder.layer.0.attention
5model.bert.encoder.layer.0.intermediate
5model.bert.encoder.layer.0.output
4model.bert.encoder.layer.1
5model.bert.encoder.layer.1.attention
5model.bert.encoder.layer.1.intermediate
5model.bert.encoder.layer.1.output
4model.bert.encoder.layer.2
5model.bert.encoder.layer.2.attention
5model.bert.encoder.layer.2.intermediate
5model.bert.encoder.layer.2.output
4model.bert.encoder.layer.3
5model.bert.encoder.layer.3.attention
5model.bert.encoder.layer.3.intermediate
5model.bert.encoder.layer.3.output
4model.bert.encoder.layer.4
5model.bert.encoder.layer.4.attention
5model.bert.encoder.layer.4.intermediate
5model.bert.encoder.layer.4.output
4model.bert.encode

In [7]:
# Maps a dotted module path to the module object, in traversal order.
all_layers = {}


def _collect_layers(module, dotted_path, depth, max_depth=4):
    """Record the submodules of ``module`` into ``all_layers``.

    A submodule is recorded when it has no children of its own, or when it
    sits at ``max_depth`` (deeper levels are never visited). Modules that have
    children and sit above ``max_depth`` are expanded but not recorded.
    """
    for child_name, child in module._modules.items():
        child_path = f"{dotted_path}.{child_name}"
        if depth == max_depth or not child._modules:
            all_layers[child_path] = child
        else:
            _collect_layers(child, child_path, depth + 1, max_depth)


_collect_layers(model, "model", 1)

In [8]:
# Display the collected path -> module mapping.
(all_layers)

{'model.bert.embeddings.word_embeddings': Embedding(30522, 768, padding_idx=0),
 'model.bert.embeddings.position_embeddings': Embedding(512, 768),
 'model.bert.embeddings.token_type_embeddings': Embedding(2, 768),
 'model.bert.embeddings.LayerNorm': LayerNorm((768,), eps=1e-12, elementwise_affine=True),
 'model.bert.embeddings.dropout': Dropout(p=0.1, inplace=False),
 'model.bert.encoder.layer.0': BertLayer(
   (attention): BertAttention(
     (self): BertSelfAttention(
       (query): Linear(in_features=768, out_features=768, bias=True)
       (key): Linear(in_features=768, out_features=768, bias=True)
       (value): Linear(in_features=768, out_features=768, bias=True)
       (dropout): Dropout(p=0.1, inplace=False)
     )
     (output): BertSelfOutput(
       (dense): Linear(in_features=768, out_features=768, bias=True)
       (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
       (dropout): Dropout(p=0.1, inplace=False)
     )
   )
   (intermediate): BertIntermed

In [9]:
# Confirm the container type of the collected layers.
type(all_layers)

dict

In [9]:
# List the dotted paths of every collected layer.
all_layers.keys()

dict_keys(['model.bert.embeddings.word_embeddings', 'model.bert.embeddings.position_embeddings', 'model.bert.embeddings.token_type_embeddings', 'model.bert.embeddings.LayerNorm', 'model.bert.embeddings.dropout', 'model.bert.encoder.layer.0', 'model.bert.encoder.layer.1', 'model.bert.encoder.layer.2', 'model.bert.encoder.layer.3', 'model.bert.encoder.layer.4', 'model.bert.encoder.layer.5', 'model.bert.encoder.layer.6', 'model.bert.encoder.layer.7', 'model.bert.encoder.layer.8', 'model.bert.encoder.layer.9', 'model.bert.encoder.layer.10', 'model.bert.encoder.layer.11', 'model.bert.pooler.dense', 'model.bert.pooler.activation', 'model.fc1', 'model.fc2', 'model.fc3', 'model.fc4'])

In [16]:
# print(len(attribution))
# for (l,x) in zip(list(all_layers.keys()),attribution[0]):
#     print(l,"   ",x.size())


model.bert.embeddings.word_embeddings     torch.Size([5, 512, 768]) \
model.bert.embeddings.position_embeddings     torch.Size([1, 512, 768]) \
model.bert.embeddings.token_type_embeddings     torch.Size([5, 512, 768]) \
model.bert.embeddings.LayerNorm     torch.Size([5, 512, 768]) \
model.bert.embeddings.dropout     torch.Size([5, 512, 768]) \
model.bert.encoder.layer.0     torch.Size([5, 512, 768]) \
model.bert.encoder.layer.1     torch.Size([5, 512, 768]) \
model.bert.encoder.layer.2     torch.Size([5, 512, 768]) \
model.bert.encoder.layer.3     torch.Size([5, 512, 768]) \
model.bert.encoder.layer.4     torch.Size([5, 512, 768]) \
model.bert.encoder.layer.5     torch.Size([5, 512, 768]) \
model.bert.encoder.layer.6     torch.Size([5, 512, 768]) \
model.bert.encoder.layer.7     torch.Size([5, 512, 768]) \
model.bert.encoder.layer.8     torch.Size([5, 512, 768]) \
model.bert.encoder.layer.9     torch.Size([5, 512, 768]) \
model.bert.encoder.layer.10     torch.Size([5, 512, 768]) \
model.bert.encoder.layer.11     torch.Size([5, 512, 768]) \
model.bert.pooler.dense     torch.Size([5, 768]) \
model.bert.pooler.activation     torch.Size([5, 768]) \
model.fc1     torch.Size([5, 64]) \
model.fc2     torch.Size([5, 32]) \
model.fc3     torch.Size([5, 8]) \
model.fc4     torch.Size([5, 2])


In [5]:
# Run a garbage-collection pass; the displayed value is gc.collect()'s return value.
gc.collect()

25

## Actual Logging

In [2]:
# Load the dataset; the first CSV column is used as the row index.
data = pd.read_csv("suicide/Suicide_Detection.csv", index_col=0)
# Preview the first two rows.
data.head(2)

Unnamed: 0,text,class
2,Ex Wife Threatening SuicideRecently I left my ...,suicide
3,Am I weird I don't get affected by compliments...,non-suicide


In [3]:
# Binary target: 1 where class == 'suicide', 0 otherwise, stored with the
# nullable 'Int64' dtype.
data['label'] = (data['class']=='suicide').astype('Int64')
# Preview the first three rows including the new column.
data.head(3)

Unnamed: 0,text,class,label
2,Ex Wife Threatening SuicideRecently I left my ...,suicide,1
3,Am I weird I don't get affected by compliments...,non-suicide,0
4,Finally 2020 is almost over... So I can never ...,non-suicide,0


In [4]:
# NumPy arrays of the raw texts and their binary labels.
X= np.array(data['text'])
y = np.array(data['label'])
# Shuffled 80/20 split, stratified on the label, with a fixed seed so the
# partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y ,test_size=0.2, shuffle=True, stratify= y, random_state=42)
# Sanity check of the resulting split sizes.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((185659,), (185659,), (46415,), (46415,))

In [5]:
# Pretrained 'bert-base-uncased' tokenizer shared by bert_tokenizer below.
# NOTE(review): truncation/max_length are also passed explicitly on every call
# in bert_tokenizer; whether passing them here has any additional effect should
# be confirmed against the transformers documentation.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', truncation=True, max_length=512)
def bert_tokenizer(text: str) -> Tuple[Tensor, Tensor, list]:
    """Tokenize one text with the module-level BERT ``tokenizer``.

    The text is truncated or padded to exactly 512 token ids.

    Args:
        text: Raw input string.

    Returns:
        A 3-tuple ``(input_ids, attention_mask, words)``. The first two are
        the PyTorch tensors returned by the tokenizer; ``words`` is the list
        of token strings for the ids of the first (only) sequence.
    """
    # Call the tokenizer directly rather than through its __call__ dunder.
    tokens = tokenizer(text, return_tensors='pt', padding='max_length', max_length=512, truncation=True)
    words = tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])

    return tokens['input_ids'], tokens['attention_mask'], words
def encode_labels(labels, num_classes: int = 2) -> Tensor:
    """One-hot encode integer class labels as a float32 tensor.

    Args:
        labels: Iterable of integer class ids in ``[0, num_classes)``.
        num_classes: Width of the one-hot encoding. Fixed (default 2, matching
            the binary classifier in this notebook) rather than inferred from
            the data, so a batch that happens to contain only class 0 still
            yields two columns.

    Returns:
        Tensor of shape ``(len(labels), num_classes)`` and dtype float32.
    """
    # An explicit integer dtype keeps empty input valid: torch.tensor([])
    # would otherwise default to float, which one_hot rejects.
    label_ids = torch.tensor(list(labels), dtype=torch.long)
    return F.one_hot(label_ids, num_classes=num_classes).to(torch.float32)

In [6]:
# Boolean mask over the training set: True where the label is 1 (suicide).
is_positive = y_train.astype(bool)
# Partition the training texts by class using the mask and its complement.
X_train_pos = X_train[is_positive]
X_train_neg = X_train[~is_positive]

In [7]:
# Inspect the label-0 (non-suicide) training texts.
X_train_neg

array(['normalise watching dubbed anime i don’t care if it’s not the original audio i prefer it',
       'sad gorl hours I have school in a few minutes and I feel like fuckin shet bro\n\n\nI want to cuddle with someone so bad it hurts',
       'THE REDDIT APP ICON IS GOOD AGAIN It looks so weird seeing it with color again',
       ...,
       'Is posting a photo of me edited into an old photo with my dad karma whoring? I want to know if I should delete it. Recently posted a photo of my dad and a younger me in which I edited myself into it to have an updated photo because I can take one with him anymore. I simply posted it because it made me happy, but I want to know if that’s karma whoring because I know people have posted before purely just to get karma and awards which is horrible. Just want to get an idea if I should delete the post or not because I don’t want to be a karma whore.',
       'I just took a online test And i got 25% because some of the question teacher needs to grade a

In [8]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def prepare_no_labels(data, start_idx=None, end_idx=None):
    """Tokenize a slice of texts into an unlabeled ``TensorDataset``.

    Args:
        data: Sliceable sequence of raw text strings.
        start_idx: Slice start (``None`` means from the beginning).
        end_idx: Slice end, exclusive (``None`` means through the end).

    Returns:
        A ``TensorDataset`` of ``(input_ids, attention_mask)`` pairs, with
        both tensors already moved to the module-level ``device``.
    """
    # Only the first two items returned by bert_tokenizer are kept.
    encoded = [bert_tokenizer(text)[:2] for text in data[start_idx:end_idx]]
    # Stack the per-text tensors along the batch dimension.
    input_ids = torch.cat([ids for ids, _ in encoded], dim=0)
    attention_masks = torch.cat([mask for _, mask in encoded], dim=0)
    print(input_ids.shape, attention_masks.shape)
    return torch.utils.data.TensorDataset(input_ids.to(device), attention_masks.to(device))

# Tokenize the first 5000 training texts of each class (neg = label 0, pos = label 1).
train_dataset_neg =prepare_no_labels(X_train_neg, 0, 5000)
train_dataset_pos =prepare_no_labels(X_train_pos, 0, 5000)

torch.Size([5000, 512]) torch.Size([5000, 512])
torch.Size([5000, 512]) torch.Size([5000, 512])


In [10]:
# Inspect the first (input_ids, attention_mask) pair of the positive-class dataset.
train_dataset_pos[0]

(tensor([  101,  2296,  2210,  3291,  3084,  2033,  2215,  2000,  3102,  2870,
          1012,  4714, 11809,  4485,  3957,  2033,  2200,  2172,  3255,  1012,
          1045,  2215,  2000,  3102,  2870,  2138,  2026,  2482,  2003,  3714,
          1010,  1045,  2215,  2000,  3102,  2870,  2738,  2084,  4550,  2026,
          2282,  1010,  1045,  2215,  2000,  3102,  2870,  7188,  1045,  2031,
          2000,  2079,  2070,  2524,  4485,  2030,  1045,  4558,  2030,  3338,
          2242,  1012,  2023,  2003,  2035,  3722,  4485,  2008,  6433,  2000,
          3071,  2021,  2035,  2023,  2785,  1997, 14636,  2064,  3426,  2200,
          2172,  3255,  2000,  2033,  1012,  1998,  2045,  1005,  1055,  2053,
          2126,  2000,  2644,  2383,  2023,  2785,  1997,  4485,  2144,  2166,
          2003,  2440,  1997,  2210,  3471,  2005,  3071,  1012,  1045,  2031,
          2060,  1010,  7046,  4436,  2005,  5782,  2000,  3280,  2029,  1045,
          2453,  2191,  2488,  8466,  2055,  2021,  

In [30]:
# Drop the DataFrame and the unused test texts, then run a collection pass.
# NOTE(review): this cell is not idempotent — running it a second time raises
# NameError because the names no longer exist.
del data
del X_test
gc.collect()

9

In [31]:
# Move the classifier to the same device that holds the tokenized datasets.
model = model.to(device)

In [32]:
# Settings shared by both loaders: 32 samples per batch, served in dataset
# order (no sampler, no shuffling).
batch_size = 32
_loader_options = dict(sampler=None, batch_size=batch_size, shuffle=False)

# One loader per class so that activations can be logged separately.
train_dataloader_neg = DataLoader(train_dataset_neg, **_loader_options)
train_dataloader_pos = DataLoader(train_dataset_pos, **_loader_options)

In [None]:
# Directory that receives one JSON file per batch of non-suicide activations.
layer_num = 10
save_dir = f"attribution_results/non-suicide{layer_num}"

os.makedirs(save_dir, exist_ok=True)

# Restrict logging to a single encoder block. The index is 0-based, so
# layer_num = 10 selects the 11th of the 12 BertLayer blocks.
all_layers = {f'model.bert.encoder.layer.{layer_num}': model.bert.encoder.layer[layer_num]}

layer_names = list(all_layers.keys())

# Inference mode: dropout must be off so the saved activations are deterministic.
model.eval()
layer_act = LayerActivation(model, list(all_layers.values()))
for batch_idx, (input_ids, attention_masks) in enumerate(train_dataloader_neg):
    # One entry per requested layer, in the same order as layer_names.
    attribution = layer_act.attribute((input_ids, attention_masks))
    batch_attributions = {
        name: activation.detach().cpu().tolist()
        for name, activation in zip(layer_names, attribution)
    }

    filename = f"{save_dir}/layer{layer_num}_batch{batch_idx}_attrs.json"
    with open(filename, "w") as outfile:
        # Stream straight to disk instead of first materialising the whole
        # document as one string; the bytes written are the same.
        json.dump(batch_attributions, outfile, indent=4)

    print(f"Saved attribution for batch {batch_idx} to {filename}")
    # Release the nested lists before the next batch to bound peak memory.
    del attribution
    del batch_attributions
    gc.collect()

print("Attribution saving complete.")

Saved attribution for batch 151 to attribution_results/non-suicide10/layer10_batch151_attrs.json
Saved attribution for batch 152 to attribution_results/non-suicide10/layer10_batch152_attrs.json
Saved attribution for batch 153 to attribution_results/non-suicide10/layer10_batch153_attrs.json
Saved attribution for batch 154 to attribution_results/non-suicide10/layer10_batch154_attrs.json
Saved attribution for batch 155 to attribution_results/non-suicide10/layer10_batch155_attrs.json
Saved attribution for batch 156 to attribution_results/non-suicide10/layer10_batch156_attrs.json
Attribution saving complete.


In [33]:
# Directory that receives one JSON file per batch of suicide-class activations.
# layer_num is reused from the non-suicide logging cell above.
save_dir = f"attribution_results/suicide{layer_num}"

os.makedirs(save_dir, exist_ok=True)

# Restrict logging to a single encoder block. The index is 0-based, so
# layer_num = 10 selects the 11th of the 12 BertLayer blocks.
all_layers = {f'model.bert.encoder.layer.{layer_num}': model.bert.encoder.layer[layer_num]}

layer_names = list(all_layers.keys())

# Batches before this index are skipped (resume point of an earlier,
# interrupted run). Set to 0 to regenerate every batch from scratch.
resume_from_batch = 150

# Inference mode: dropout must be off so the saved activations are deterministic.
model.eval()
layer_act = LayerActivation(model, list(all_layers.values()))
for batch_idx, (input_ids, attention_masks) in enumerate(train_dataloader_pos):
    if batch_idx < resume_from_batch:
        continue
    # One entry per requested layer, in the same order as layer_names.
    attribution = layer_act.attribute((input_ids, attention_masks))
    batch_attributions = {
        name: activation.detach().cpu().tolist()
        for name, activation in zip(layer_names, attribution)
    }

    filename = f"{save_dir}/layer{layer_num}_batch{batch_idx}_attrs.json"
    with open(filename, "w") as outfile:
        # Stream straight to disk instead of first materialising the whole
        # document as one string; the bytes written are the same.
        json.dump(batch_attributions, outfile, indent=4)

    print(f"Saved attribution for batch {batch_idx} to {filename}")
    # Release the nested lists before the next batch to bound peak memory.
    del attribution
    del batch_attributions
    gc.collect()
print("Attribution saving complete.")

Saved attribution for batch 150 to attribution_results/suicide10/layer10_batch150_attrs.json
Saved attribution for batch 151 to attribution_results/suicide10/layer10_batch151_attrs.json
Saved attribution for batch 152 to attribution_results/suicide10/layer10_batch152_attrs.json
Saved attribution for batch 153 to attribution_results/suicide10/layer10_batch153_attrs.json
Saved attribution for batch 154 to attribution_results/suicide10/layer10_batch154_attrs.json
Saved attribution for batch 155 to attribution_results/suicide10/layer10_batch155_attrs.json
Saved attribution for batch 156 to attribution_results/suicide10/layer10_batch156_attrs.json
Attribution saving complete.


In [58]:
# NOTE(review): the bare X[1] below displays nothing, since only a cell's last
# expression is rendered.
X[1]
# Tokenize the second text of the full dataset.
sample_tok = bert_tokenizer(X[1])

In [59]:
# Show the raw text that was tokenized into sample_tok.
X[1]

"Am I weird I don't get affected by compliments if it's coming from someone I know irl but I feel really good when internet strangers do it"

In [83]:
# Activations of the word-embedding lookup (model.bert.embeddings.word_embeddings)
# for the single tokenized sample.
layer_act = LayerActivation(model, model.bert.embeddings.word_embeddings)
# bert_tokenizer returns (input_ids, attention_mask, words); only the two
# tensors are model inputs, and they must be on the same device as the model.
sample_inputs = tuple(t.to(device) for t in sample_tok[:2])
attribution = layer_act.attribute(sample_inputs)
# First (only) sequence of the batch, token positions 0-499.
attribution[0][0:500]

tensor([[ 0.0136, -0.0265, -0.0235,  ...,  0.0087,  0.0071,  0.0151],
        [-0.0437, -0.0150,  0.0029,  ..., -0.0282,  0.0474, -0.0448],
        [-0.0211,  0.0059, -0.0179,  ...,  0.0163,  0.0122,  0.0073],
        ...,
        [-0.0102, -0.0615, -0.0265,  ..., -0.0199, -0.0372, -0.0098],
        [-0.0102, -0.0615, -0.0265,  ..., -0.0199, -0.0372, -0.0098],
        [-0.0102, -0.0615, -0.0265,  ..., -0.0199, -0.0372, -0.0098]])

In [79]:
# Activations of encoder block index 5 (0-based) for the single tokenized sample.
layer_act = LayerActivation(model, model.bert.encoder.layer[5])
# bert_tokenizer returns (input_ids, attention_mask, words); only the two
# tensors are model inputs, and they must be on the same device as the model.
sample_inputs = tuple(t.to(device) for t in sample_tok[:2])
attribution = layer_act.attribute(sample_inputs)
# First (only) sequence of the batch, token positions 350-499.
attribution[0][350:500]

tensor([[-0.3966,  0.0843, -0.1734,  ..., -0.1993,  0.0263,  0.2048],
        [-0.0821, -0.0142,  0.2886,  ...,  0.2647,  0.1573, -0.1333],
        [-0.2309, -0.0342, -0.1535,  ..., -0.1660, -0.0533,  0.2310],
        ...,
        [ 0.1632, -0.1154,  0.6387,  ...,  0.2678, -0.0523,  0.1187],
        [ 0.3812, -0.0009,  0.6487,  ...,  0.3838, -0.0097, -0.0421],
        [-0.0630,  0.0188,  0.2642,  ..., -0.0731, -0.1828,  0.3592]])

In [82]:
# Activations of encoder block index 11 (0-based, the last block) for the
# single tokenized sample.
layer_act = LayerActivation(model, model.bert.encoder.layer[11])
# bert_tokenizer returns (input_ids, attention_mask, words); only the two
# tensors are model inputs, and they must be on the same device as the model.
sample_inputs = tuple(t.to(device) for t in sample_tok[:2])
attribution = layer_act.attribute(sample_inputs)
# First (only) sequence of the batch, token positions 350-499.
attribution[0][350:500]


tensor([[-0.1375, -0.8790,  0.1797,  ...,  0.1689,  0.5940, -0.3195],
        [-0.0205, -0.7896,  0.1557,  ...,  0.1930,  0.5488, -0.4349],
        [-0.0996, -0.8993,  0.1445,  ...,  0.2475,  0.5433, -0.3921],
        ...,
        [ 0.0565, -0.4912,  0.1698,  ...,  0.1381,  0.5907, -0.2065],
        [ 0.0384, -0.2033,  0.2311,  ...,  0.0278,  0.4072, -0.0010],
        [ 0.0765, -0.7767,  0.1176,  ...,  0.2635,  0.5953, -0.3277]])

In [72]:
# Select the activations at padding positions, i.e. where the attention mask
# (second item of sample_tok) is 0. The mask is negated with torch's own `~`
# instead of np.invert, which avoids a round trip through NumPy (the original
# call appears to have emitted a warning, echoed in the cell output).
padding_mask = ~sample_tok[1].bool().to(attribution.device)
attribution[padding_mask]

  attribution[np.invert(sample_tok[1].bool())]


tensor([[ 0.0226, -0.4424,  0.0795,  ...,  0.1186,  0.6583, -0.2850],
        [-0.0712, -0.7913,  0.0977,  ...,  0.2465,  0.6118, -0.3723],
        [-0.0239, -0.5132,  0.1971,  ...,  0.1724,  0.6006, -0.2474],
        ...,
        [-0.2629, -0.6130,  0.2353,  ...,  0.0360,  0.5379, -0.3160],
        [-0.2251, -0.8889,  0.1287,  ...,  0.0596,  0.6490, -0.4362],
        [-0.1725, -0.5995,  0.2570,  ..., -0.0263,  0.5621, -0.3175]])

In [62]:
# Number of space-separated pieces in the raw sample text.
len(X[1].split(" "))

27

In [67]:
# NOTE(review): Python's sum() iterates over the first dimension, so on the
# single-row id tensor this just returns that row of ids (see the output).
# If the goal was to count real tokens, summing the attention mask
# (sample_tok[1]) is presumably what was meant — confirm.
sum(sample_tok[0])

tensor([  101,  2572,  1045,  6881,  1045,  2123,  1005,  1056,  2131,  5360,
         2011, 19394,  2015,  2065,  2009,  1005,  1055,  2746,  2013,  2619,
         1045,  2113, 20868,  2140,  2021,  1045,  2514,  2428,  2204,  2043,
         4274, 12358,  2079,  2009,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [98]:
# Re-tokenize the same sample text.
# NOTE(review): the torch.Size line printed under this cell is not produced by
# the bert_tokenizer defined in this notebook, which suggests a different
# version was live in the kernel — verify with Restart & Run All.
somevarname = bert_tokenizer(X[1])

torch.Size([1, 512])


In [99]:
# Display the tuple returned by bert_tokenizer for the sample.
somevarname

(tensor([[  101,  2572,  1045,  6881,  1045,  2123,  1005,  1056,  2131,  5360,
           2011, 19394,  2015,  2065,  2009,  1005,  1055,  2746,  2013,  2619,
           1045,  2113, 20868,  2140,  2021,  1045,  2514,  2428,  2204,  2043,
           4274, 12358,  2079,  2009,   102,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,   