# 05 - Preprocessing

## 5.1 - Set-Up

### 5.1.1 - Logger

In [1]:
from logging_config import setup_notebook_logging

# Build the notebook's logging helpers. Messages are tagged with the "PREPROCESSING" label
# (visible as the "[PREPROCESSING CHECK]" / "[PREPROCESSING RESULT]" prefixes in the cell outputs).
logger, log_start, log_check, log_result = setup_notebook_logging(label="PREPROCESSING")

### 5.1.2 - Configuring Root

In [2]:
log_check("Setting up root by appending the parent to the sys...", print_to_console=True)
from jupyter_init import setup

# Must run before the project import below: per the log message above, setup() is expected
# to append the parent directory to sys.path — TODO confirm against jupyter_init.
setup()

# NOTE(review): the star import hides where names such as EXTRATED_TRAIN_DF_FILE and
# PROCESSED_DATA_DIR come from; presumably both are defined in src_code.config — confirm and
# consider importing them explicitly.
from src_code.config import *

[PREPROCESSING CHECK] Setting up root by appending the parent to the sys...


### 5.1.3 - Loading Dataset

In [3]:
log_check("Loading the dataset...", print_to_console=True)
import pandas as pd
import numpy as np
import seaborn as sns


# Read the training frame from the feather file configured as EXTRATED_TRAIN_DF_FILE
# and report its dimensions.
df = pd.read_feather(EXTRATED_TRAIN_DF_FILE)
log_result(f"Loaded dataframe with {len(df)} rows and {len(df.columns)} columns\n", print_to_console=True)

# Notebook-wide defaults: show up to 50 columns when a frame is displayed,
# and use one seaborn theme for every plot.
pd.options.display.max_columns = 50
sns.set_theme(context="notebook", style="whitegrid", palette="muted")

[PREPROCESSING CHECK] Loading the dataset...
[PREPROCESSING RESULT] Loaded dataframe with 133273 rows and 31 columns



In [4]:
# Summary statistics of the `todo` column as loaded, before any preprocessing cell has run.
df['todo'].describe()

count    133273.000000
mean          0.152326
std           2.480067
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max         108.000000
Name: todo, dtype: float64

## 5.2 - Preprocessing

### 5.2.1 - Winsorization (IQR-based)

Applies the same bounds your EDA used.

You want preprocessing to match your EDA findings, so we clamp values to the lower/upper fences.

In [5]:
from notebooks.constants import NUMERIC_FEATURES, LINE_TOKEN_FEATURES


def winsorize_iqr(df, col, preserve_original: bool = False):
    """
    Cap the values of one column at its IQR fences (winsorization).

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, computed from the
    column's own 25th and 75th percentiles. Values outside the fences are
    clipped to the nearest fence; no rows are added or removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``col``. It is modified in place.
    col : str
        Name of the column to cap.
    preserve_original : bool, default False
        If True, the capped values are written to a new column named
        ``col + "_winsorized"`` and ``col`` itself is left untouched.
        If False, ``col`` is overwritten with the capped values.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    When Q1 == Q3 (an IQR of 0) both fences collapse onto that single value,
    so every non-missing value of the column is clipped to it.
    """
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr

    print(f"Df len before winsorization ({col}): {len(df)}")

    new_col_name = col + "_winsorized" if preserve_original else col

    df[new_col_name] = df[col].clip(lower=lower, upper=upper)
    # Second report is taken after clipping; the pair shows that the row count is unchanged.
    print(f"Df len after winsorization ({col}): {len(df)}")

    return df

# --- Winsorize every numeric feature ---
# 'recent_churn' is the one feature whose raw column is kept: its capped values
# are written to a separate 'recent_churn_winsorized' column instead.
for col in NUMERIC_FEATURES:
    df = winsorize_iqr(df, col, preserve_original=(col == 'recent_churn'))

# Line/token features are not winsorized; they only receive a log1p transform.
for col in LINE_TOKEN_FEATURES:
    df[col] = np.log1p(df[col])

Df len before winsorization (author_exp_pre): 133273
Df len before winsorization (author_exp_pre): 133273
Df len before winsorization (author_recent_activity_pre): 133273
Df len before winsorization (author_recent_activity_pre): 133273
Df len before winsorization (loc_added): 133273
Df len before winsorization (loc_added): 133273
Df len before winsorization (loc_deleted): 133273
Df len before winsorization (loc_deleted): 133273
Df len before winsorization (files_changed): 133273
Df len before winsorization (files_changed): 133273
Df len before winsorization (hunks_count): 133273
Df len before winsorization (hunks_count): 133273
Df len before winsorization (msg_len): 133273
Df len before winsorization (msg_len): 133273
Df len before winsorization (ast_delta): 133273
Df len before winsorization (ast_delta): 133273
Df len before winsorization (complexity_delta): 133273
Df len before winsorization (complexity_delta): 133273
Df len before winsorization (max_func_change): 133273
Df len befor

In [6]:
# Summary statistics of `todo` after the transformation cell above, for comparison with the values as loaded.
df['todo'].describe()

count    133273.000000
mean          0.054613
std           0.250436
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           4.691348
Name: todo, dtype: float64

### 5.2.2 - Fix negative values before log transform
Some features (e.g., time_since_last_change) contain negative values.

Instead of shifting them to be ≥ 0, we drop the affected rows before applying log1p; a negative value in any other numeric feature raises an error:

In [7]:
from notebooks.utils import contains_negative
from notebooks.constants import NUMERIC_FEATURES

# Features whose negative values are handled by dropping the affected rows
# (rather than shifting the whole column so that its minimum becomes 0).
NEG_FEATURES_TO_DROP = ['time_since_last_change']

# Every other numeric feature is expected to be free of negative values.
features_to_check = [
    col for col in NUMERIC_FEATURES
    if col not in NEG_FEATURES_TO_DROP
]

# Collect the offending columns so the error names them instead of only
# reporting that at least one exists.
unexpected_negative = [col for col in features_to_check if contains_negative(df, col)]
if unexpected_negative:
    raise ValueError(
        "Unexpected negative values found in one or more numeric features "
        f"that are NOT set to be dropped: {unexpected_negative}"
    )

# A row is dropped when ANY feature listed in NEG_FEATURES_TO_DROP is negative,
# so the mask follows the constant instead of a hard-coded column name.
neg_mask = (df[NEG_FEATURES_TO_DROP] < 0).any(axis=1)
n_neg = neg_mask.sum()

print(f"Dropping {n_neg} rows with negative {', '.join(NEG_FEATURES_TO_DROP)}")

# Re-number the index so it is contiguous again after the rows are removed.
df = df[~neg_mask].reset_index(drop=True)

Dropping 66 rows with negative time_since_last_change


### 5.2.3 - Log1p Transformation
Reduces heavy right-skew (your EDA showed skews up to 100+).

In [8]:
# Apply log1p column by column, overwriting each numeric feature with its transformed values.
# NOTE(review): 'recent_churn_winsorized' (created in 5.2.1) is not transformed here unless it
# is listed in NUMERIC_FEATURES — confirm this is intended.
for col in NUMERIC_FEATURES:
    df[col] = np.log1p(df[col])

## 5.3 - Save preprocessed dataset

In [None]:


OUTPUT_PATH = PROCESSED_DATA_DIR / "train_preprocessed.feather"
# Create the target directory if it is missing so the notebook also runs end to end on a
# fresh checkout. Assumes PROCESSED_DATA_DIR is a pathlib.Path (it is joined with "/" above)
# — TODO confirm against src_code.config.
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
df.to_feather(OUTPUT_PATH)

print(f"Preprocessed dataset saved to {OUTPUT_PATH}")

Preprocessed dataset saved to C:\Users\fmojt\Code\Software Projects\DiplomaThesis\data\preprocessed\train_preprocessed.feather
