In [None]:
pip install transformers captum torch-summary

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from torchsummary import summary

import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from captum.attr import LayerIntegratedGradients
from captum.attr import visualization as viz

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

def mean_pooling(model_output, attention_mask):
    """Average the token vectors of each sequence, counting only unmasked positions.

    Args:
        model_output: indexable object whose element 0 holds the token vectors.
        attention_mask: mask that is broadcast over the last axis of the token
            vectors; positions with value 0 contribute nothing to the average.

    Returns:
        The mask-weighted sum over axis 1 divided by the mask total, with the
        divisor clamped to at least 1e-9 so fully masked rows do not divide by zero.
    """
    hidden = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
    masked_total = (hidden * weights).sum(dim=1)
    token_counts = weights.sum(dim=1).clamp(min=1e-9)
    return masked_total / token_counts

In [None]:
# Colab-only step: mount Google Drive at /content/drive. The checkpoint and parquet
# paths used in later cells all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Tokenizer for the same pretrained checkpoint name that BertClassifier loads as its encoder.
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/distilrubert-base-cased-conversational")

In [None]:
class BertClassifier(nn.Module):
    """Three-logit classifier head on top of a pretrained DistilRuBERT encoder.

    The encoder is loaded from the
    ``DeepPavlov/distilrubert-base-cased-conversational`` checkpoint. The head
    concatenates the first-token vector with a mean-pooled vector (2 x 768),
    appends the output of ``fc1`` to that concatenation, and maps the result
    through ``fc2`` and ``fc3`` to 3 logits.

    Args:
        freeze: when True, every encoder parameter gets ``requires_grad = False``
            so only the head layers remain trainable.
    """

    def __init__(self, freeze=False):
        super().__init__()

        input_layer = 768  # width of the encoder vectors fed to the head
        output_layer = 3   # number of logits produced by the head

        self.bert = AutoModel.from_pretrained("DeepPavlov/distilrubert-base-cased-conversational")

        # fc1 sees [first-token vector ; pooled vector]; fc2 sees that same
        # concatenation plus fc1's output, hence the 2*768 + 768//2 input width.
        self.fc1 = nn.Linear(input_layer*2, input_layer//2)
        self.fc2 = nn.Linear(input_layer*2 + input_layer//2, input_layer)
        self.fc3 = nn.Linear(input_layer, output_layer)

        self.drop1 = nn.Dropout(0.1)
        self.drop2 = nn.Dropout(0.1)

        self.act = nn.ReLU()

        if freeze:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """Compute the class logits for a batch.

        Args:
            input_ids: token ids, passed positionally to the encoder.
            attention_mask: attention mask, passed to the encoder and reused
                as the weighting mask for mean pooling.

        Returns:
            The output of ``fc3`` (last dimension of size 3); no softmax is applied.
        """
        outputs = self.bert(input_ids, attention_mask)

        # NOTE(review): ``outputs[1]`` only exists when the encoder returns more than
        # its last hidden state (e.g. hidden states enabled in the checkpoint config)
        # — TODO confirm. ``mean_pooling`` then reads element 0 of that entry, so the
        # pooled vector may not come from the final layer; verify this matches how
        # the saved weights were trained before changing it.
        sentence_embeddings = mean_pooling(outputs[1], attention_mask)

        first_token = outputs[0][:, 0, :]
        features = torch.cat([first_token, sentence_embeddings], dim=1)

        # Skip-style connection: keep the raw features next to their fc1 projection.
        projected = self.fc1(features)
        features = torch.cat([features, projected], dim=1)
        features = self.drop1(self.act(features))

        features = self.drop2(self.act(self.fc2(features)))

        return self.fc3(features)

In [None]:
model = BertClassifier().to(device)
# map_location keeps the load working when the checkpoint was saved from a different
# device than the one selected here (e.g. GPU-saved weights on a CPU-only runtime).
model.load_state_dict(
    torch.load('/content/drive/MyDrive/weights/dbert_tg_SNGS_1_day_mean.pth', map_location=device)
)
# Switch to inference mode (disables the dropout layers) for every later cell that calls `model`.
model.eval()

Some weights of the model checkpoint at DeepPavlov/distilrubert-base-cased-conversational were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertClassifier(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(

In [None]:
def model_output(inputs, mask):
    """Forward function handed to Captum: run the classifier and return element 0 of its output.

    NOTE(review): ``model`` returns one row of logits per example, so ``[0]``
    selects the first example's logits rather than one class score — confirm
    this is the quantity that should be attributed.
    """
    logits = model(inputs, mask)
    return logits[0]


# Layer against which attributions are computed: the encoder's embedding module.
# ``.to`` returns the module itself, so this is a reference into `model`, not a copy.
model_input = model.bert.embeddings.to(device)

In [None]:
# Layer Integrated Gradients explainer: `model_output` is the forward function and
# `model_input` (the embedding module) is the layer whose output gets attributed.
lig = LayerIntegratedGradients(model_output, model_input)

In [None]:
class NewsDataset(Dataset):
    """Dataset of eagerly tokenized texts paired with integer labels.

    Every text is tokenized once in the constructor with the notebook-level
    ``tokenizer``, padded/truncated to 512 tokens and returned as PyTorch tensors.
    """

    def __init__(self, texts, targets):
        """Store ``targets`` cast to int and the tokenizer output for each text."""
        self.labels = targets.astype(int)

        encoded = []
        for text in texts:
            encoded.append(
                tokenizer(
                    text,
                    padding='max_length',
                    max_length=512,
                    truncation=True,
                    return_tensors="pt",
                )
            )
        self.texts = encoded

    def __len__(self):
        """Number of samples, taken from the label array."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(tokenizer output, label)`` for position ``idx``."""
        return self.texts[idx], self.labels[idx]

# Telegram messages: keep the rows flagged for SNGS that actually have a message text.
df = (
    pd.read_parquet('/content/drive/MyDrive/Диссертация/Парсеры сайтов/tg_comps_raw_1,5sigma_.parquet')
    .loc[lambda frame: frame['SNGS'] == True]
    .dropna(subset=['message'])
)

# Chronological split. All comparisons are strict, so a row whose date equals a
# boundary value exactly ends up in none of the three splits.
df_train = df.loc[df['date'] < '2021-01-01'].copy()
df_val = df.loc[(df['date'] > '2021-01-01') & (df['date'] < '2021-06-01')].copy()
df_test = df.loc[(df['date'] > '2021-06-01') & (df['date'] < '2022-01-01')].copy()

test_texts = df_test['message'].values
test_target = df_test['SNGS_1_day_mean'].values

# No shuffling: batches follow the row order of df_test.
batch_size = 32
test_dataset = NewsDataset(test_texts, test_target)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
def construct_input_and_baseline(text, max_length=510):
    """Build the model input and the padding baseline used for attribution.

    The text is tokenized without special tokens, truncated to ``max_length``
    tokens, and wrapped as ``[CLS] ... [SEP]``. The baseline keeps ``[CLS]`` and
    ``[SEP]`` but replaces every text token with the pad token.

    Args:
        text: raw string to explain.
        max_length: maximum number of text tokens kept before the two special
            tokens are added. The default of 510 keeps the full sequence within
            the 512-token length used for tokenization elsewhere in this notebook.

    Returns:
        A 5-tuple ``(input_ids, mask, baseline_input_ids, baseline_mask, token_list)``.
        The four tensors live on ``device`` and all have shape ``(1, seq_len)``;
        both masks are all ones. ``token_list`` holds the token strings of
        ``input_ids`` for display.
    """
    pad_id = tokenizer.pad_token_id
    sep_id = tokenizer.sep_token_id
    cls_id = tokenizer.cls_token_id

    text_ids = tokenizer.encode(text, max_length=max_length, truncation=True, add_special_tokens=False)

    input_ids = [cls_id] + text_ids + [sep_id]
    baseline_input_ids = [cls_id] + [pad_id] * len(text_ids) + [sep_id]
    attention = [1] * len(input_ids)
    token_list = tokenizer.convert_ids_to_tokens(input_ids)

    # The masks carry the same leading batch axis as the id tensors. A 1-D mask
    # only works through broadcasting when exactly one sequence is processed at a time.
    return (
        torch.tensor([input_ids], device=device),
        torch.tensor([attention], device=device),
        torch.tensor([baseline_input_ids], device=device),
        torch.tensor([attention], device=device),
        token_list,
    )

# Example to explain: the test text at hard-coded position 56 of the current `test_texts`.
text = test_texts[56]
input_ids, mask, baseline_input_ids, baseline_mask, all_tokens = construct_input_and_baseline(text)

In [None]:
# Attribute the forward function's output to the embedding layer, integrating from the
# pad-token baseline to the real input. `delta` is the convergence delta requested via
# return_convergence_delta=True.
# NOTE(review): no `target` is passed, so the attribution is not tied to one specific
# class — confirm this is intended.
attributions, delta = lig.attribute(inputs=(input_ids, mask),
                                    baselines=(baseline_input_ids, baseline_mask),
                                    return_convergence_delta=True,
                                    internal_batch_size=1
                                    )

In [None]:
def summarize_attributions(attributions):
    """Reduce attributions to one normalized score per position.

    Sums over the last axis, drops a leading axis of size 1, and divides the
    result by its own norm.
    """
    per_token = attributions.sum(dim=-1).squeeze(0)
    return per_token / per_token.norm()

# One normalized attribution score per token of the explained example.
attributions_sum = summarize_attributions(attributions)

In [None]:
# Score the example in a single forward pass without tracking gradients, and convert
# the logits to class probabilities so `pred_prob` is a probability, not a raw logit.
with torch.no_grad():
    example_logits = model(input_ids, mask)[0]
example_probs = torch.softmax(example_logits, dim=-1)

score_vis = viz.VisualizationDataRecord(
                        word_attributions = attributions_sum,
                        pred_prob = example_probs.max().item(),
                        # Same "argmax index minus one" label convention as in `evaluate`.
                        pred_class = example_logits.argmax().cpu().numpy() - 1,
                        true_class = 1,
                        attr_class = text,
                        attr_score = attributions_sum.sum(),
                        raw_input_ids = all_tokens,
                        convergence_score = delta)

viz.visualize_text([score_vis])

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,-1 (0.09),⚠ Решение акционеров Сургутнефтегаз #SNGS «Утвердить годовую бухгалтерскую (финансовую) отчетность ПАО «Сургутнефтегаз» за 2020 год».,0.19,[CLS] [UNK] Решение акционеров Сургут ##нефте ##газ # SN ##GS « Утвер ##дить годов ##ую бухгалтер ##скую ( финансовую ) отчетность ПАО « Сургут ##нефте ##газ » за 2020 год » . [SEP]
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,-1 (0.09),⚠ Решение акционеров Сургутнефтегаз #SNGS «Утвердить годовую бухгалтерскую (финансовую) отчетность ПАО «Сургутнефтегаз» за 2020 год».,0.19,[CLS] [UNK] Решение акционеров Сургут ##нефте ##газ # SN ##GS « Утвер ##дить годов ##ую бухгалтер ##скую ( финансовую ) отчетность ПАО « Сургут ##нефте ##газ » за 2020 год » . [SEP]
,,,,


In [None]:
def evaluate(test_dataloader):
    """Predict a label for every item yielded by ``test_dataloader``.

    Uses the notebook-level ``model`` and ``device``. Each batch must be an
    ``(encoding, labels)`` pair where ``encoding`` has ``'input_ids'`` and
    ``'attention_mask'`` entries; the labels are not used.

    Args:
        test_dataloader: iterable of batches as described above.

    Returns:
        1-D numpy array with one prediction per item, in loader order. Each
        prediction is the argmax index of the logits minus one. An empty array
        is returned when the loader yields no batches.
    """
    batch_preds = []

    for batch_data, _ in tqdm(test_dataloader):
        # assumes each stored encoding is (1, seq_len), so batching yields
        # (batch, 1, seq_len) and axis 1 has to be dropped — TODO confirm
        batch_inputs = batch_data['input_ids'].squeeze(1).to(device)
        batch_masks = batch_data['attention_mask'].squeeze(1).to(device)

        with torch.no_grad():
            logits = model(batch_inputs, batch_masks)

        batch_preds.append((logits.argmax(dim=-1) - 1).cpu().numpy())

    # np.concatenate raises on an empty list, so handle the empty loader explicitly.
    if not batch_preds:
        return np.empty(0, dtype=np.int64)

    return np.concatenate(batch_preds, axis=0)

In [None]:
# Predictions of the currently loaded model for the current `test_dataloader`.
sngs_1day_preds = evaluate(test_dataloader)

100%|██████████| 11/11 [00:05<00:00,  2.07it/s]


In [None]:
# Reference labels taken from df_test, the same frame whose rows (in order) feed `test_dataloader`.
sngs_1day_true = df_test['SNGS_1_day_mean'].values

In [None]:
# News articles: keep the rows flagged for VTBR and build one `message` text per row.
df = (
    pd.read_parquet('/content/drive/MyDrive/Диссертация/Парсеры сайтов/news_comps_raw_1,5sigma_.parquet')
    .loc[lambda frame: frame['VTBR'] == True]
    .assign(
        # Blank out the placeholder strings that stand for a missing title / announce.
        title=lambda frame: frame['title'].str.replace('no title', ' '),
        announce=lambda frame: frame['announce'].str.replace('no announce', ' '),
    )
    .assign(message=lambda frame: frame['title'] + ' ' + frame['announce'])
    # NOTE(review): this only removes messages equal to a single space. A row where
    # both placeholders were replaced yields three spaces and passes the filter —
    # confirm whether blank messages are meant to be dropped here.
    .loc[lambda frame: frame['message'] != ' ']
    .dropna(subset=['message'])
)

# Chronological split. All comparisons are strict, so a row whose date equals a
# boundary value exactly ends up in none of the three splits.
df_train = df.loc[df['date'] < '2021-01-01'].copy()
df_val = df.loc[(df['date'] > '2021-01-01') & (df['date'] < '2021-06-01')].copy()
df_test = df.loc[(df['date'] > '2021-06-01') & (df['date'] < '2022-01-01')].copy()

test_texts = df_test['message'].values
test_target = df_test['VTBR_30_min_mean'].values

# No shuffling: batches follow the row order of df_test.
batch_size = 32
test_dataset = NewsDataset(test_texts, test_target)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
# Look up the article(s) whose message contains this phrase in the current frame.
df[df['message'].str.contains('экосистему рынка')]

Unnamed: 0,link,date,title,announce,text,ALRS,AFLT,VTBR,GAZP,GMKN,...,HHRU_30_min_mean,HHRU_1_hour_mean,HHRU_1_day_mean,YNDX_5_min_mean,YNDX_10_min_mean,YNDX_15_min_mean,YNDX_30_min_mean,YNDX_1_hour_mean,YNDX_1_day_mean,message
151856,https://lenta.ru//news/2021/07/28/ecosistema/,2021-07-28 14:36:00,ВТБ создаст экосистему рынка имущественных торгов,,"ВТБ стал первым российским банком, начавшим ра...",False,False,True,False,False,...,1,1,0,1,1,1,0,1,0,ВТБ создаст экосистему рынка имущественных тор...


In [None]:
# Swap in the GAZP 30-minute checkpoint. map_location keeps the load working when the
# weights were saved from a different device than the one selected for this runtime.
model.load_state_dict(
    torch.load('/content/drive/MyDrive/weights/dbert_news_GAZP_30_min_mean.pth', map_location=device)
)
model.eval()

BertClassifier(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(

In [None]:
# NOTE(review): read top to bottom, `test_dataloader` was last built from the
# VTBR-filtered frame, while the GAZP-filtered frame is only built in a later cell.
# Confirm the execution order so the GAZP model is scored on GAZP data.
gazp_30min_preds = evaluate(test_dataloader)

100%|██████████| 56/56 [00:26<00:00,  2.12it/s]


In [None]:
# Reference labels read from whichever `df_test` is current.
# NOTE(review): must be the same frame that produced `test_dataloader` for the
# predictions above, otherwise rows and labels do not line up — verify.
gazp_30min_true = df_test['GAZP_30_min_mean'].values

In [None]:
# Two-column table of the correctly predicted samples: column 0 is the position in
# the test set, column 1 is the (correct) predicted label.
true_preds = np.hstack([
    np.argwhere(gazp_30min_true == gazp_30min_preds),
    gazp_30min_preds[gazp_30min_true == gazp_30min_preds][:, np.newaxis],
])

In [None]:
# NOTE(review): index 757 is hard-coded against whichever `test_texts` is current when
# this cell runs — confirm the intended (GAZP) split has been loaded first.
text = test_texts[757]
input_ids, mask, baseline_input_ids, baseline_mask, all_tokens = construct_input_and_baseline(text)

attributions, delta = lig.attribute(inputs=(input_ids, mask),
                                    baselines=(baseline_input_ids, baseline_mask),
                                    return_convergence_delta=True,
                                    internal_batch_size=1
                                    )
attributions_sum = summarize_attributions(attributions)

# Score the example in a single forward pass without tracking gradients, and convert
# the logits to class probabilities so `pred_prob` is a probability, not a raw logit.
with torch.no_grad():
    example_logits = model(input_ids, mask)[0]
example_probs = torch.softmax(example_logits, dim=-1)

score_vis = viz.VisualizationDataRecord(
                        word_attributions = attributions_sum,
                        pred_prob = example_probs.max().item(),
                        # Same "argmax index minus one" label convention as in `evaluate`.
                        pred_class = example_logits.argmax().cpu().numpy() - 1,
                        true_class = -1,
                        attr_class = text,
                        attr_score = attributions_sum.sum(),
                        raw_input_ids = all_tokens,
                        convergence_score = delta)

viz.visualize_text([score_vis])

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
-1.0,-1 (0.44),«Газпром» урежет транзит через Польшу Компания забронировала на октябрь только треть мощностей,-2.72,[CLS] « Газпром » уре ##жет транзит через Польшу Компания заброни ##ровала на октябрь только треть мощностей [SEP]
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
-1.0,-1 (0.44),«Газпром» урежет транзит через Польшу Компания забронировала на октябрь только треть мощностей,-2.72,[CLS] « Газпром » уре ##жет транзит через Польшу Компания заброни ##ровала на октябрь только треть мощностей [SEP]
,,,,


In [None]:
# News articles: keep the rows flagged for GAZP and build one `message` text per row.
df = (
    pd.read_parquet('/content/drive/MyDrive/Диссертация/Парсеры сайтов/news_comps_raw_1,5sigma_.parquet')
    .loc[lambda frame: frame['GAZP'] == True]
    .assign(
        # Blank out the placeholder strings that stand for a missing title / announce.
        title=lambda frame: frame['title'].str.replace('no title', ' '),
        announce=lambda frame: frame['announce'].str.replace('no announce', ' '),
    )
    .assign(message=lambda frame: frame['title'] + ' ' + frame['announce'])
    # NOTE(review): this only removes messages equal to a single space. A row where
    # both placeholders were replaced yields three spaces and passes the filter —
    # confirm whether blank messages are meant to be dropped here.
    .loc[lambda frame: frame['message'] != ' ']
    .dropna(subset=['message'])
)

# Chronological split. All comparisons are strict, so a row whose date equals a
# boundary value exactly ends up in none of the three splits.
df_train = df.loc[df['date'] < '2021-01-01'].copy()
df_val = df.loc[(df['date'] > '2021-01-01') & (df['date'] < '2021-06-01')].copy()
df_test = df.loc[(df['date'] > '2021-06-01') & (df['date'] < '2022-01-01')].copy()

test_texts = df_test['message'].values
test_target = df_test['GAZP_30_min_mean'].values

# No shuffling: batches follow the row order of df_test.
batch_size = 32
test_dataset = NewsDataset(test_texts, test_target)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
# Look up the test-split article(s) whose message contains this phrase.
df_test[df_test['message'].str.contains('урежет транзит')]

Unnamed: 0,link,date,title,announce,text,ALRS,AFLT,VTBR,GAZP,GMKN,...,HHRU_30_min_mean,HHRU_1_hour_mean,HHRU_1_day_mean,YNDX_5_min_mean,YNDX_10_min_mean,YNDX_15_min_mean,YNDX_30_min_mean,YNDX_1_hour_mean,YNDX_1_day_mean,message
153293,https://www.kommersant.ru//doc/4996237,2021-09-20 17:19:41,«Газпром» урежет транзит через Польшу,Компания забронировала на октябрь ...,«Газпром» в очередной раз отказался бронироват...,False,False,False,True,False,...,1,1,0,0,-1,-1,-1,0,0,«Газпром» урежет транзит через Польшу ...
