# Debug feature engineering

# 1. Imports

## 1.1 Packages

In [2]:
import numpy as np
import pandas as pd


## 1.2 Options

## 1.3 Datasets

In [3]:
# dict_metadata_datasets = pd.read_pickle("../data/02_intermediate/dict_metadata_datasets.pkl")

## 1.4 Functions

In [4]:
from impostor_hunt_in_texts.pipelines.feature_engineering.nodes import (
    extract_features,
    load_model_and_tokenizer,
)
from impostor_hunt_in_texts.utils.utils import load_hf_datasetdict, split_dataset_dict

  from .autonotebook import tqdm as notebook_tqdm


# 2. Feature engineering

In [5]:
# Inline stand-in for the pickled metadata dict (the read_pickle call in section 1.3
# is commented out); only the "save_path" key is provided here.
dict_metadata_datasets = {"save_path": "../data/02_intermediate/dict_datasets"}

In [6]:
# Load the dataset dict described by the metadata, then split it into its
# train and test parts using the project helpers.
dataset_dict = load_hf_datasetdict(dict_metadata_datasets)
dataset_train, dataset_test = split_dataset_dict(dataset_dict)

In [7]:
# Get the model/tokenizer pair for the "bert-base-uncased" identifier via the project helper.
model, tokenizer = load_model_and_tokenizer("bert-base-uncased")

In [8]:
# Run feature extraction on the training split on CPU; the call returns the feature
# matrix together with the ids that go with it.
# NOTE(review): max_length=512 and stride=216 are forwarded as-is to extract_features;
# presumably the tokenizer window size and the overlap between windows — TODO confirm
# against the feature_engineering nodes module and move them to named constants.
dataset_train_features, train_ids = extract_features(
    dataset=dataset_train,
    model=model,
    tokenizer=tokenizer,
    max_length=512,
    stride=216,
    device="cpu",
)

Extracting features: 100%|██████████| 95/95 [00:36<00:00,  2.57it/s]


In [10]:
# Inspect the extracted feature matrix.
# NOTE(review): in the saved output the visible rows repeat only two distinct vectors;
# confirm extract_features really yields one feature vector per sample.
dataset_train_features

array([[-1.8221563e-01,  1.8503173e-01,  2.6603371e-01, ...,
         1.9781753e-02,  5.7555810e-03,  1.6396128e-03],
       [-1.6375388e-01,  3.2518438e-01,  4.7533748e-01, ...,
         3.5610597e-04, -3.1663883e-03, -7.9679349e-03],
       [-1.8221563e-01,  1.8503173e-01,  2.6603371e-01, ...,
         1.9781753e-02,  5.7555810e-03,  1.6396128e-03],
       ...,
       [-1.6375388e-01,  3.2518438e-01,  4.7533748e-01, ...,
         3.5610597e-04, -3.1663883e-03, -7.9679349e-03],
       [-1.6375388e-01,  3.2518438e-01,  4.7533748e-01, ...,
         3.5610597e-04, -3.1663883e-03, -7.9679349e-03],
       [-1.8221563e-01,  1.8503173e-01,  2.6603371e-01, ...,
         1.9781753e-02,  5.7555810e-03,  1.6396128e-03]],
      shape=(95, 3072), dtype=float32)

In [18]:
def convert_features_to_dataframe(dataset_features: np.ndarray, ids: list[int]) -> pd.DataFrame:
    """
    Convert the features extracted from the texts to a pandas DataFrame.

    The result has an ``id`` column followed by one ``token_feat_<i>`` column per
    feature dimension, with one row per row of ``dataset_features``.

    Args:
        dataset_features (np.ndarray): 2-D array of shape (n_rows, n_features) holding the
            features extracted from the text using the huggingface model.
        ids (list[int]): List of the IDs, one per row of ``dataset_features``.

    Returns:
        (pd.DataFrame): A DataFrame containing the ids and a column for each feature.

    Raises:
        ValueError: If ``dataset_features`` is not 2-D, or if the number of ids does not
            match the number of feature rows.
    """
    if dataset_features.ndim != 2:
        raise ValueError(
            f"dataset_features must be a 2-D array, got {dataset_features.ndim} dimension(s)."
        )

    n_rows, n_features = dataset_features.shape

    # pd.concat(axis=1) outer-joins on the index, so a length mismatch would not fail:
    # it would silently pad the shorter side with NaN. Reject it explicitly instead.
    if len(ids) != n_rows:
        raise ValueError(
            f"Number of ids ({len(ids)}) does not match number of feature rows ({n_rows})."
        )

    feature_columns = [f"token_feat_{i}" for i in range(n_features)]

    # For a plain list of ids both frames carry the default RangeIndex, so the
    # column-wise concat pairs ids[i] with dataset_features[i].
    return pd.concat([
        pd.DataFrame({"id": ids}),
        pd.DataFrame(dataset_features, columns=feature_columns),
    ], axis=1)

# Build the training feature frame: the "id" column plus one column per feature dimension.
df_train_feat = convert_features_to_dataframe(dataset_train_features, train_ids)

In [19]:
# Inspect the resulting frame.
# NOTE(review): in the saved output the visible rows have `id` equal to 1 or 2 only,
# with identical feature values for each id; verify that train_ids are per-sample
# identifiers and that the features are not being collapsed upstream.
df_train_feat

Unnamed: 0,id,token_feat_0,token_feat_1,token_feat_2,token_feat_3,token_feat_4,token_feat_5,token_feat_6,token_feat_7,token_feat_8,...,token_feat_3062,token_feat_3063,token_feat_3064,token_feat_3065,token_feat_3066,token_feat_3067,token_feat_3068,token_feat_3069,token_feat_3070,token_feat_3071
0,1,-0.182216,0.185032,0.266034,-0.021760,0.305553,-0.012980,0.108271,0.246084,-0.007443,...,0.041054,0.001174,-0.001081,0.006440,0.023864,0.171575,-0.007841,0.019782,0.005756,0.001640
1,2,-0.163754,0.325184,0.475337,-0.124496,0.336826,-0.098111,0.001948,0.481946,-0.052951,...,0.027463,-0.001525,-0.008738,-0.005016,0.034096,0.202270,0.010722,0.000356,-0.003166,-0.007968
2,1,-0.182216,0.185032,0.266034,-0.021760,0.305553,-0.012980,0.108271,0.246084,-0.007443,...,0.041054,0.001174,-0.001081,0.006440,0.023864,0.171575,-0.007841,0.019782,0.005756,0.001640
3,2,-0.163754,0.325184,0.475337,-0.124496,0.336826,-0.098111,0.001948,0.481946,-0.052951,...,0.027463,-0.001525,-0.008738,-0.005016,0.034096,0.202270,0.010722,0.000356,-0.003166,-0.007968
4,2,-0.163754,0.325184,0.475337,-0.124496,0.336826,-0.098111,0.001948,0.481946,-0.052951,...,0.027463,-0.001525,-0.008738,-0.005016,0.034096,0.202270,0.010722,0.000356,-0.003166,-0.007968
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,2,-0.163754,0.325184,0.475337,-0.124496,0.336826,-0.098111,0.001948,0.481946,-0.052951,...,0.027463,-0.001525,-0.008738,-0.005016,0.034096,0.202270,0.010722,0.000356,-0.003166,-0.007968
91,1,-0.182216,0.185032,0.266034,-0.021760,0.305553,-0.012980,0.108271,0.246084,-0.007443,...,0.041054,0.001174,-0.001081,0.006440,0.023864,0.171575,-0.007841,0.019782,0.005756,0.001640
92,2,-0.163754,0.325184,0.475337,-0.124496,0.336826,-0.098111,0.001948,0.481946,-0.052951,...,0.027463,-0.001525,-0.008738,-0.005016,0.034096,0.202270,0.010722,0.000356,-0.003166,-0.007968
93,2,-0.163754,0.325184,0.475337,-0.124496,0.336826,-0.098111,0.001948,0.481946,-0.052951,...,0.027463,-0.001525,-0.008738,-0.005016,0.034096,0.202270,0.010722,0.000356,-0.003166,-0.007968
