# Imports

In [1]:
from pathlib import Path
import sys
import warnings
import re

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Absolute path of the current working directory (the directory the kernel runs in).
abs_path = Path().resolve()
# Make the sibling "modules" directory importable so the helper modules below resolve.
sys.path.append(str(abs_path.parent / "modules"))

# NOTE(review): star imports hide where names come from. Names used later in this
# notebook without an explicit import (e.g. torch, BertModel, BertTokenizer,
# load_yaml, atom_score, obj_to_pickle) presumably arrive through these modules —
# confirm and consider importing them explicitly.
from files_funcs import *
from nlp_funcs import *
from ml_funcs import *


# Suppress every warning for the rest of the session (this hides all categories,
# including ones that may matter while debugging).
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


# Config

In [2]:
# Project root: two directory levels above abs_path.
main_dir = abs_path.parent.parent

# Project configuration lives under <root>/config/config.yaml.
config_path = main_dir / "config" / "config.yaml"
config = load_yaml(config_path)

data_dir = main_dir / "data" / "ds"
model_dir = main_dir / "models"

# De-duplicated sub-header names used as the text columns to turn into features.
# The names are sorted because iterating a set of strings yields a different
# order from one interpreter session to the next (string hash randomisation);
# an unsorted list would make the order of the derived "f_*" feature columns,
# and therefore of the fitted coefficients, change between kernel restarts.
text_cols = sorted(
    str(col) for col in set(config["ds_topology"]["hmi_subheaders"].values())
)

# Seed every random number generator used in this notebook.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

# Prefer the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", DEVICE)

Device: cpu


# BERT init

In [3]:
# Checkpoint identifier shared by the encoder and its tokenizer.
bert_checkpoint = config["bert"]

# Load the encoder with hidden states exposed, then move it to the selected device.
bert_model = BertModel.from_pretrained(bert_checkpoint, output_hidden_states=True)
bert_model = bert_model.to(DEVICE)

tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

# Show the model architecture, then the tokenizer as the cell's final value.
display(bert_model)
tokenizer

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

# Loading

In [4]:
# The dataset is a pipe-delimited text file stored in the data directory.
ds_path = data_dir / "ds.unl"
ds = pd.read_csv(ds_path, sep="|")

# List the available columns.
ds.columns

Index(['id', 'case_name', 'full_uc_text', 'full_ssts_text', 'other',
       'preconditions', 'goal', 'postconditions', 'main_scenario',
       'differences', 'description', 'complience_level', 'target'],
      dtype='object')

# Processing & Fitting

In [5]:
# Single-column frame of SSTS texts; missing entries become the "EMPTY" placeholder.
ssts_texts = ds[["full_ssts_text"]].fillna("EMPTY")

# Run the project helper over the SSTS texts, then drop the raw text column so
# only the helper's derived columns remain.
ssts_texts_vectorized = extract_text_vectors(
    ssts_texts, "full_ssts_text", bert_model, tokenizer, DEVICE, verbose=False
)
ssts_texts_vectorized = ssts_texts_vectorized.drop(columns="full_ssts_text")

# Keyword arguments shared by every per-column call below.
encoder_kwargs = dict(
    bert_model=bert_model,
    tokenizer=tokenizer,
    device=DEVICE,
    verbose=False,
)

# One new "f_<column>" feature per text column, computed against the SSTS vectors
# with the metric named in the config (presumably a per-row distance — confirm
# against get_pairwise_dist_with_ssts).
for col in text_cols:
    ds[f'f_{col}'] = get_pairwise_dist_with_ssts(
        ds[col], ssts_texts_vectorized, config["distance_metric"], **encoder_kwargs
    )

# Linear Regression

In [6]:
# Features are all columns whose name starts with "f_"; the label is "target".
feature_cols = [name for name in ds.columns if name.startswith("f_")]
Xy = ds[feature_cols + ["target"]]
X = Xy.drop(columns="target")
y = Xy["target"]

# NOTE(review): predictions are made on the same rows used for fitting, so the
# MSE reported below is an in-sample figure.
model = LinearRegression()
model.fit(X, y)
preds = model.predict(X)

mse = mean_squared_error(y, preds)
atom_mse = atom_score(mse)

# Side-by-side view of the true target and the model's prediction.
display(Xy[["target"]].assign(preds=preds))

print(f'MSE: {mse}')
print(f'Atom metric: {atom_mse}')

Unnamed: 0,target,preds
0,4,3.261212
1,4,3.68943
2,2,1.993804
3,4,4.395567
4,5,3.782691
5,4,3.890308
6,4,4.093768
7,4,3.840981
8,2,2.1598
9,3,3.084027


MSE: 0.3346075613621902
Atom metric: 0.7769282924252066


# Model EDA

In [7]:
# Inspect the fitted linear coefficients (scikit-learn exposes one weight per
# feature column of X, in X's column order).
model.coef_

array([  0.55740915, -20.97915001,  -1.69028758,  21.09215399,
        -3.31688254])

# Saving model

In [8]:
# Build the destination path inside the models directory, then persist the fitted model.
model_path = model_dir / create_model_name(model)
obj_to_pickle(model, model_path)