# Imports

In [3]:
from pathlib import Path
import sys
import warnings

import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel

abs_path = Path().resolve()
sys.path.append(str(abs_path.parent / "modules"))

from files_funcs import *


warnings.filterwarnings("ignore")

# Config

In [4]:
main_dir = abs_path.parent.parent

config_path = abs_path.parent.parent / "config" / "config.yaml"
config = load_yaml(config_path)

data_dir = main_dir / "data" / "ds"

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", DEVICE)

Device: cpu


# BERT init

In [5]:
bert_model = (
    BertModel
    .from_pretrained(config["bert"], output_hidden_states=True)
    .to(DEVICE)
)
tokenizer = BertTokenizer.from_pretrained(config["bert"])

display(bert_model)
tokenizer

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [22]:
def extract_text_vectors(df, column, model, tokenizer, device, verbose=True, verbose_interval=100):
    df = df.copy()
    model = model.to(device)
    if verbose:
        total_images = df.shape[0]
        _ = 0

    vectors = []
    for index, row in df.reset_index(drop=True).iterrows():
        index += 1
        text = row[column][:1800]
        input_ids = torch.tensor(
            [tokenizer.encode(text, add_special_tokens=True)]).to(device)
        with torch.no_grad():
            model_output = model(input_ids)
        vectors.append(
            model_output.last_hidden_state.mean(dim=1).squeeze().tolist())

        if verbose and (index % verbose_interval == 0 or index == total_images - 1):
            print(f"{_}. Processed text {index}/{total_images}.")
            _ += 1

    assert all(len(vector) == len(vectors[0]) for vector in vectors), "Text vectors have different dimensions"

    vectors_np = np.array(vectors)
    df[[f"text_vector_el{x}"
        for x in range(len(vectors_np[0]))]] = vectors_np

    return df

In [152]:
ds = pd.read_csv(data_dir / "ds.unl", sep="|")
ds.columns

Index(['id', 'case_name', 'full_uc_text', 'full_ssts_text',
       'alternative_scenario', 'main_scenario', 'goal', 'other',
       'preconditions', 'postconditions', 'differences', 'description',
       'complience_level', 'target'],
      dtype='object')

In [124]:
ssts_texts = pd.DataFrame(ds["full_ssts_text"]).fillna("EMPTY")

ssts_texts_vectorized = extract_text_vectors(
    ssts_texts,
    "full_ssts_text",
    bert_model,
    tokenizer,
    DEVICE
).drop("full_ssts_text", axis=1)

0. Processed text 11/12.


In [125]:
from sklearn.metrics.pairwise import cosine_similarity, manhattan_distances, euclidean_distances


In [153]:
def create_cosinus_feature(texts: pd.Series, ssts_texts_vectorized: pd.DataFrame) -> pd.DataFrame:
    feature = texts.name
    vec = extract_text_vectors(
        pd.DataFrame(texts).fillna("EMPTY"),
        feature,
        bert_model,
        tokenizer,
        DEVICE
    ).drop(feature, axis=1)

    cosins = cosine_similarity(vec, ssts_texts_vectorized).diagonal()
    manh = manhattan_distances(vec, ssts_texts_vectorized).diagonal()
    euc = euclidean_distances(vec, ssts_texts_vectorized).diagonal()

    return cosins, manh# , euc

In [154]:
to_features = [
    "full_uc_text",
    # "alternative_scenario",
    # "main_scenario",
    # "goal",
    # "other",
    # "preconditions",
    # "postconditions"
]
for f in to_features:
    # ds[f'cos_{f}'], ds[f'man_{f}'], ds[f'euc_{f}'] = create_cosinus_feature(ds[f], ssts_texts_vectorized)
    ds[f'cos_{f}'], ds[f'man_{f}'] = create_cosinus_feature(ds[f], ssts_texts_vectorized)
    # ds[f'cos_{f}'] = create_cosinus_feature(ds[f], ssts_texts_vectorized)

0. Processed text 11/12.


In [155]:
Xy = ds[[f for f in ds.columns if f.startswith(("cos_", "man_", "euc_"))] + ["target"]]

display(Xy.corr())

X = Xy.drop("target", axis=1)
y = Xy["target"]

Unnamed: 0,cos_full_uc_text,man_full_uc_text,target
cos_full_uc_text,1.0,-0.989051,0.622494
man_full_uc_text,-0.989051,1.0,-0.600218
target,0.622494,-0.600218,1.0


In [158]:
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error


model = LinearRegression()
model.fit(X.head(100), y.head(100))
preds = model.predict(X)
mean_squared_error(y, preds)

np.float64(0.74772893361318)

In [159]:
Xy.assign(preds=preds)[["target", "preds"]]

Unnamed: 0,target,preds
0,3,3.677595
1,2,3.654093
2,4,3.699339
3,5,3.576079
4,4,3.551161
5,4,3.557618
6,4,3.651002
7,2,3.650755
8,4,3.609977
9,4,3.626707
