# 05 - Preprocessing

## 5.1 - Set-Up

### 5.1.1 - Logger

In [1]:
from logging_config import setup_notebook_logging

# Configure logging for this notebook under the "PREPROCESSING" label, then
# unpack the four returned handles under the names the later cells use.
logging_handles = setup_notebook_logging(label="PREPROCESSING")
logger, log_start, log_check, log_result = logging_handles

### 5.1.2 Configuring Root

In [2]:
log_check("Setting up root by appending the parent to the sys...", print_to_console=True)
from jupyter_init import setup

# Must run before the `src_code` import below. Per the log message above it is
# expected to make the project root importable — TODO confirm in jupyter_init.
setup()

# NOTE(review): the star import hides where names come from. The cells in this
# notebook appear to rely on it for SubsetType, PREPROCESSING_MAPPINGS,
# ETL_MAPPINGS and FITTED_PREPROCESSOR — consider importing those explicitly.
from src_code.config import *

[PREPROCESSING CHECK] Setting up root by appending the parent to the sys...


### 5.1.3 Loading Dataset

In [3]:
log_check("Loading the dataset...", print_to_console=True)
import pandas as pd
import numpy as np
import seaborn as sns

# Which split this run preprocesses. The transformation cell fits a new
# preprocessor for 'train' and reuses the saved one for 'test' / 'validate'.
subset: SubsetType = 'test'
TARGET_DF_FILE = PREPROCESSING_MAPPINGS[subset]['input']
# Print the path that is actually read below (previously this printed the
# ETL mapping entry, which is not necessarily the file being loaded).
print(TARGET_DF_FILE)

# ---- LOAD ----
df = pd.read_feather(TARGET_DF_FILE)
log_result(f"Loaded dataframe with {len(df)} rows and {len(df.columns)} columns\n", print_to_console=True)

# Display and plotting defaults: show up to 50 columns of wide frames and
# apply a consistent seaborn theme.
pd.set_option('display.max_columns', 50)
sns.set_theme(style="whitegrid", context="notebook", palette="muted")

[PREPROCESSING CHECK] Loading the dataset...
C:\Users\fmojt\Code\DPThesis\DP_Thesis\data\interim\test_labeled_features_partial_v10.feather
[PREPROCESSING RESULT] Loaded dataframe with 7363 rows and 31 columns



## 5.2 - Preprocessing

In [4]:
# from sklearn.compose import ColumnTransformer


# preprocessor = ColumnTransformer(transformers=[])
# append a transformer tuple (name, transformer, columns)
# preprocessor.transformers.append(('new_passthrough', 'passthrough', ['col1', 'col2']))

### 5.2.1 Data Cleansing

#### 5.2.1.1 - Fix negative values before log transform
Some features (e.g., time_since_last_change) contain negative values.

Rows with a negative value in such a feature are dropped before applying log1p; negative values in any other numeric feature raise an error:

In [5]:
# def shift_min_to_zero(df, col):
#     """Shift column so minimum is 0 if negative values exist."""
#     min_val = df[col].min()
#     if min_val < 0:
#         df[col] = df[col] - min_val
#     return df

# for col in NUMERIC_FEATURES:
#     df = shift_min_to_zero(df, col)

from notebooks.utils import contains_negative
from notebooks.constants import NUMERIC_FEATURES

# Features for which rows holding a negative value are removed rather than
# treated as an error.
NEG_FEATURES_TO_DROP = ['time_since_last_change']

# Every other numeric feature is required to be free of negative values.
features_to_check = [
    col for col in NUMERIC_FEATURES
    if col not in NEG_FEATURES_TO_DROP
]

# Collect the offending columns so the error says exactly what went wrong.
offending_features = [col for col in features_to_check if contains_negative(df, col)]
if offending_features:
    raise ValueError(
        "Unexpected negative values found in one or more numeric features "
        f"that are NOT set to be dropped: {offending_features}"
    )

# Flag a row when ANY feature listed in NEG_FEATURES_TO_DROP is negative, so
# the drop stays in sync with the list (and with the assertion cell that
# iterates over the same list) instead of hard-coding a single column name.
neg_mask = (df[NEG_FEATURES_TO_DROP] < 0).any(axis=1)
n_neg = neg_mask.sum()

print(f"Dropping {n_neg} rows with negative {', '.join(NEG_FEATURES_TO_DROP)}")

# Re-number the index so the remaining rows are contiguous.
df = df[~neg_mask].reset_index(drop=True)

Dropping 0 rows with negative time_since_last_change


#### 5.2.1.2 Assertion Check

In [6]:
log_check("[NEG_FEATURES_TO_DROP] Performing assertion check...")
# After the drop step, no feature in NEG_FEATURES_TO_DROP may still hold a negative value.
assert not any(contains_negative(df, feature) for feature in NEG_FEATURES_TO_DROP)
log_result("[NEG_FEATURES_TO_DROP] Check succesfull!")

[PREPROCESSING CHECK] [NEG_FEATURES_TO_DROP] Performing assertion check...
[PREPROCESSING RESULT] [NEG_FEATURES_TO_DROP] Check succesfull!


### 5.2.2 Data Transformation

#### 5.2.2.1 - Winsorization (IQR-based)

Applies the same IQR bounds that were used in the EDA.

To keep preprocessing consistent with the EDA findings, values are clamped to the lower/upper fences.

In [7]:
import joblib
from sklearn import set_config
from sklearn.compose import ColumnTransformer
# FunctionTransformer's public home is sklearn.preprocessing; sklearn.pipeline
# only happens to re-export it.
from sklearn.preprocessing import FunctionTransformer
from notebooks.transformers import WinsorizerIQR
from notebooks.constants import NUMERIC_FEATURES, LINE_TOKEN_FEATURES

# Ask scikit-learn transformers to return pandas DataFrames so that column
# names survive the transform.
set_config(transform_output='pandas')
log_transformer = FunctionTransformer(np.log1p, validate=False)

if subset == 'train':
    log_check("Detected train subset. Creating new preprocessor...", print_to_console=True)

    # NOTE(review): ColumnTransformer applies each transformer to the ORIGINAL
    # input columns side by side, not one after another. The output therefore
    # holds two copies of every numeric feature: `winsorize__*` (clipped, not
    # logged) and `log_numeric__*` (logged, not clipped). Columns that are not
    # listed here are dropped (default remainder='drop'). If the intent is
    # winsorize-then-log on a single column, chain both steps in a Pipeline —
    # confirm with the downstream consumers before changing the output schema.
    preprocessor = ColumnTransformer(transformers=[
        ('winsorize', WinsorizerIQR(factor=1.5), NUMERIC_FEATURES),
        ('log_tokens', log_transformer, LINE_TOKEN_FEATURES),
        ('log_numeric', log_transformer, NUMERIC_FEATURES),
    ])

    # Fit ONLY on the training data, then transform it with the fitted state.
    preprocessor.fit(df)
    df = preprocessor.transform(df)

    # Persist the fitted preprocessor so the other subsets reuse the state
    # learned on train (presumably the IQR fences of WinsorizerIQR — verify).
    joblib.dump(preprocessor, FITTED_PREPROCESSOR)
elif subset in ('test', 'validate'):
    # Report the subset that was actually detected (this branch also handles 'validate').
    log_check(f"Detected {subset} subset. Loading fitted preprocessor...", print_to_console=True)
    # joblib.load unpickles the file: only load artifacts produced by this project.
    loaded_preprocessor = joblib.load(FITTED_PREPROCESSOR)
    df = loaded_preprocessor.transform(df)
else:
    msg = "Unknown subset value!"
    logger.error(msg)
    raise ValueError(msg)


log_result("Transformations applied successfully.", print_to_console=True)


# def winsorize_iqr(df, col, preserve_original: bool = False):
#     """
#     Caps extreme outliers using IQR fences.
#     Keeps the distribution shape mostly intact.
#     """
#     Q1 = df[col].quantile(0.25)
#     Q3 = df[col].quantile(0.75)
#     IQR = Q3 - Q1
#     lower = Q1 - 1.5 * IQR
#     upper = Q3 + 1.5 * IQR

#     print(f"Df len before winsorization ({col}): {len(df)}")

#     new_col_name = col + "_winsorized" if preserve_original else col

#     df[new_col_name] = df[col].clip(lower=lower, upper=upper)
#     print(f"Df len before winsorization ({col}): {len(df)}")

#     return df


# --- Apply to all numeric columns ---
# for col in NUMERIC_FEATURES:
#     df = winsorize_iqr(df, col, preserve_original=True) if col == 'recent_churn' else winsorize_iqr(df, col)

# for col in LINE_TOKEN_FEATURES:
#     # df = winsorize_iqr(df, col)
#     df[col] = np.log1p(df[col])

[PREPROCESSING CHECK] Detected test subset. Loading fitted preprocessor...
[PREPROCESSING RESULT] Transformations applied successfully.


<!-- ### 5.2.2 - Fix negative values before log transform
Some features (e.g., time_since_last_change) contain negative values.

We shift them to be ≥ 0 before applying log1p: -->

In [8]:
# # def shift_min_to_zero(df, col):
# #     """Shift column so minimum is 0 if negative values exist."""
# #     min_val = df[col].min()
# #     if min_val < 0:
# #         df[col] = df[col] - min_val
# #     return df

# # for col in NUMERIC_FEATURES:
# #     df = shift_min_to_zero(df, col)

# from notebooks.utils import contains_negative
# from notebooks.constants import NUMERIC_FEATURES

# NEG_FEATURES_TO_DROP = ['time_since_last_change']

# # List of features to check: NUMERIC_FEATURES excluding NEG_FEATURES_TO_DROP
# features_to_check = [
#     col for col in NUMERIC_FEATURES 
#     if col not in NEG_FEATURES_TO_DROP
# ]

# # Check if any of the features in features_to_check contain negative values
# if any(contains_negative(df, col) for col in features_to_check):
#     # If True, raise an exception
#     raise ValueError("Unexpected negative values found in one or more numeric features that are NOT set to be dropped.")


# neg_mask = df["time_since_last_change"] < 0
# n_neg = neg_mask.sum()

# print(f"Dropping {n_neg} rows with negative time_since_last_change")

# df = df[~neg_mask].reset_index(drop=True)

### 5.2.3 - Log1p Transformation
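Reduces heavy right-skew (the EDA showed skewness values above 100). The log1p transform is now applied by the preprocessor in section 5.2.2.1, so the cell below is intentionally disabled.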

In [9]:
# for col in NUMERIC_FEATURES:
#     df[col] = np.log1p(df[col])

## 5.3. - Save preprocessed dataset

In [10]:
log_check("Saving the preprocessed dataset...", print_to_console=True)

# Resolve the destination configured for the current subset.
subset_paths = PREPROCESSING_MAPPINGS[subset]
OUTPUT_PATH = subset_paths['output']

# Persist the transformed frame in Feather format.
df.to_feather(OUTPUT_PATH)

log_result(f"Preprocessed dataset saved to {OUTPUT_PATH}", print_to_console=True)

[PREPROCESSING CHECK] Saving the preprocessed dataset...
[PREPROCESSING RESULT] Preprocessed dataset saved to C:\Users\fmojt\Code\DPThesis\DP_Thesis\data\processed\test_preprocessed.feather


In [11]:
# Display the column labels of the final `df` as the cell output.
df.columns

Index(['winsorize__author_exp_pre', 'winsorize__author_recent_activity_pre',
       'winsorize__loc_added', 'winsorize__loc_deleted',
       'winsorize__files_changed', 'winsorize__hunks_count',
       'winsorize__msg_len', 'winsorize__ast_delta',
       'winsorize__complexity_delta', 'winsorize__max_func_change',
       'winsorize__time_since_last_change', 'winsorize__recent_churn',
       'log_tokens__todo', 'log_tokens__fixme', 'log_tokens__try',
       'log_tokens__except', 'log_tokens__raise',
       'log_numeric__author_exp_pre',
       'log_numeric__author_recent_activity_pre', 'log_numeric__loc_added',
       'log_numeric__loc_deleted', 'log_numeric__files_changed',
       'log_numeric__hunks_count', 'log_numeric__msg_len',
       'log_numeric__ast_delta', 'log_numeric__complexity_delta',
       'log_numeric__max_func_change', 'log_numeric__time_since_last_change',
       'log_numeric__recent_churn'],
      dtype='object')