# 06 - Feature Engineering

## 6.1 - Set-Up

### 6.1.1 Setting Up Project Root

In [15]:
# logger.info("Setting up root by appending the parent to the sys...")
from jupyter_init import setup

setup()

from src_code.config import *

### 6.1.2 Setting Up Logger

In [16]:
from notebooks.logging_config import setup_notebook_logging

logger, log_start, log_check, log_result = setup_notebook_logging(label="ENGINEERING")

log_start(print_to_console=True)
log_result("Logging configured.", print_to_console=True)

[ENGINEERING RESULT] Logging configured.


### 6.1.3 Loading Dataset

In [17]:
log_check("Loading the dataset...")
import pandas as pd
import numpy as np
import seaborn as sns

SUBSET: SubsetType = 'train'
# TRANSFORMED_DF = EXTRACTED_DATA_DIR / "test_labeled_features_partial_copy.feather"
# DF = PROCESSED_DATA_DIR / "train_preprocessed.feather"
DF_PATH = ENGINEERING_MAPPINGS[SUBSET]['input']

# ---- LOAD ----
df = pd.read_feather(DF_PATH)
msg = f"Loaded dataframe with {len(df)} rows and {len(df.columns)} columns\n"
# print(msg)
log_result(msg, print_to_console=True)

# For large datasets
pd.set_option('display.max_columns', 50)
sns.set_theme(style="whitegrid", context="notebook", palette="muted")

[ENGINEERING CHECK] Loading the dataset...
[ENGINEERING RESULT] Loaded dataframe with 139545 rows and 29 columns



## 6.2 - Data Engineering

### 6.2.1 - Create interaction / derived features

In [18]:
# Example: churn ratio
if "loc_added" in df.columns and "loc_deleted" in df.columns:
    df["loc_churn_ratio"] = df["loc_added"] / (df["loc_deleted"] + 1)  # avoid division by zero

# Example: recent activity per experience
if "author_recent_activity_pre" in df.columns and "author_exp_pre" in df.columns:
    df["activity_per_exp"] = df["author_recent_activity_pre"] / (df["author_exp_pre"] + 1)

In [19]:
df.columns

Index(['winsorize__author_exp_pre', 'winsorize__author_recent_activity_pre',
       'winsorize__loc_added', 'winsorize__loc_deleted',
       'winsorize__files_changed', 'winsorize__hunks_count',
       'winsorize__msg_len', 'winsorize__ast_delta',
       'winsorize__complexity_delta', 'winsorize__max_func_change',
       'winsorize__time_since_last_change', 'winsorize__recent_churn',
       'log_tokens__todo', 'log_tokens__fixme', 'log_tokens__try',
       'log_tokens__except', 'log_tokens__raise',
       'log_numeric__author_exp_pre',
       'log_numeric__author_recent_activity_pre', 'log_numeric__loc_added',
       'log_numeric__loc_deleted', 'log_numeric__files_changed',
       'log_numeric__hunks_count', 'log_numeric__msg_len',
       'log_numeric__ast_delta', 'log_numeric__complexity_delta',
       'log_numeric__max_func_change', 'log_numeric__time_since_last_change',
       'log_numeric__recent_churn'],
      dtype='object')

### 6.2.2 - Binning / categorical transformations

In [23]:
# Example: bucket commits by size
from sklearn.compose import ColumnTransformer

from notebooks.transformers import QuantileThresholdFlag
from sklearn import set_config
import joblib

set_config(transform_output='pandas')

if "log_numeric__loc_added" in df.columns:
    # bins = [0, 10, 50, 200, 1000, np.inf]
    # labels = ["very_small", "small", "medium", "large", "very_large"]
    # df["loc_added_bucket"] = pd.cut(df["loc_added"], bins=bins, labels=labels)
    bins = [0, 2.3, 3.9, 5.3, 7.0, np.inf]
    labels = ["very_small", "small", "medium", "large", "very_large"]

    df["loc_added_bucket"] = pd.cut(df["log_numeric__loc_added"], bins=bins, labels=labels, include_lowest=True)

# # Example: boolean feature for extreme churn
# if "recent_churn" in df.columns:
#     threshold = df["recent_churn"].quantile(0.95)
#     df["extreme_churn_flag"] = (df["recent_churn"] > threshold).astype(int)

# df['extreme_churn_flag'].describe()

# You'd need to create a list of features for which you want to create a flag
# EXTREME_FLAG_FEATURES = ["recent_churn"]


# if SUBSET == 'train':
#     log_check("Detected train subset. Creating new preprocessor...", print_to_console=True)

#     preprocessor = ColumnTransformer(
#         # ... existing transformers ...
#         transformers=[('extreme_flags', QuantileThresholdFlag(quantile=0.95), EXTREME_FLAG_FEATURES),
#         ]
#         # ...
#     )

#     preprocessor.fit(df)
#     df = preprocessor.transform(df)
#     joblib.dump(preprocessor, ENGINEERING_PREPROCESSOR)
# elif SUBSET in ('test', 'validate'):
#     log_check("Detected test subset. Loading fitted preprocessor...", print_to_console=True)
#     loaded_preprocessor = joblib.load(ENGINEERING_PREPROCESSOR)
#     df = loaded_preprocessor.transform(df)
# else:
#     msg = "Unknown subset value!"
#     logger.error(msg)
#     raise ValueError(msg)


In [24]:
df["loc_added_bucket_cat"] = df["loc_added_bucket"].cat.codes
print(df["loc_added_bucket_cat"].corr(df["label"]))
print(df['loc_added'].corr(df['label']))

KeyError: 'label'

#### why loc_add_bucket?

Because ML models often perform better when very skewed numeric features are also represented in categorical (binned) form.

✓ Models detect thresholds better

Bug likelihood typically increases when a commit crosses certain “size” thresholds:

- tiny commits (<10 LOC) rarely introduce bugs
- medium commits (50–200 LOC) are more risky
- huge commits (1000+ LOC) are extremely risky

Binning makes these thresholds explicit rather than hidden inside a numeric feature.

✓ Models become more robust to noise

- Instead of memorizing exact values like “3.044522” (your log-transformed LOC),
the model gets a stable category: "small".

✓ Helps tree-based models (XGBoost, RF, LightGBM)

Trees thrive on categorical thresholds.
One-hot-encoded buckets give them interpretable splits.


#### Why extreme_churn_flag?

*recent_churn* = how many lines were changed recently in the project

High churn = a project area under rapid change

High churn is known in research to correlate with bug-inducing commits
(rapidly changing files are less stable)

So the idea is:
- commits with huge previous churn → more likely to be unstable → possibly bug-inducing

This is a domain-inspired feature.

### 6.2.3 - Aggregate LINE_TOKEN_FEATURES

In [None]:
from notebooks.constants import LINE_TOKEN_FEATURES


df["line_token_total"] = df[LINE_TOKEN_FEATURES].sum(axis=1)

# Optionally create ratios per total lines (if loc_added exists)
if "loc_added" in df.columns:
    for token in LINE_TOKEN_FEATURES:
        df[f"{token}_ratio"] = df[token] / (df["loc_added"] + 1)

### 6.2.4 - Feature interactions (example)

In [None]:
# interaction_features = ["loc_added", "loc_deleted", "hunks_count"]

from notebooks.constants import INTERACTION_FEATURES


for i in range(len(INTERACTION_FEATURES)):
    for j in range(i+1, len(INTERACTION_FEATURES)):
        f1 = INTERACTION_FEATURES[i]
        f2 = INTERACTION_FEATURES[j]
        df[f"{f1}_x_{f2}"] = df[f1] * df[f2]

## 6.3 - Summary of engineered features

In [None]:
from notebooks.constants import ENGINEERED_FEATURES


# engineered_cols = [c for c in df.columns if c not in NUMERIC_FEATURES + LINE_TOKEN_FEATURES]
# msg = "Engineered features:", ENGINEERED_FEATURES

log_result(f"Engineered features: {ENGINEERED_FEATURES}", print_to_console=True)

[ENGINEERING RESULT] Engineered features: ['loc_churn_ratio', 'activity_per_exp', 'loc_added_bucket', 'extreme_churn_flag', 'line_token_total', 'todo_ratio', 'fixme_ratio', 'try_ratio', 'except_ratio', 'raise_ratio', 'loc_added_x_loc_deleted', 'loc_added_x_hunks_count', 'loc_deleted_x_hunks_count']


## 6.4 Save the dataset


In [None]:

log_check("Saving preprocessed dataset...")
# OUTPUT_PATH = PROCESSED_DATA_DIR / "train_engineered.feather"
OUTPUT_PATH = ENGINEERING_MAPPINGS[SUBSET]['output']
df.to_feather(OUTPUT_PATH)

log_result(f"Preprocessed dataset saved to {OUTPUT_PATH}", print_to_console=True)

Preprocessed dataset saved to C:\Users\fmojt\Code\Software Projects\DiplomaThesis\data\preprocessed\train_engineered.feather
