# 05 - Preprocessing

## 5.1 - Set-Up

### 5.1.1 - Logger

In [32]:
from logging_config import NotebookLogger
from constants import LOG_FILE
# Notebook-wide logger. Its label is what appears in the "[PREPROCESSING ...]"
# prefixes of the cell outputs; LOG_FILE is handed over as the file log path.
logger = NotebookLogger(
    label="PREPROCESSING",
    notebook_name=None,
    file_log_path=LOG_FILE,
)

### 5.1.2 Project Root

In [33]:
from jupyter_init import setup
from config import setup as cfg_setup

# Announce the bootstrap step before running it.
logger.log_check("Setting up root by appending the parent to the sys...", print_to_console=True)

# Project bootstrap: jupyter_init.setup() first, then config.setup().
# NOTE(review): going by the log message above, setup() presumably appends the
# parent directory to sys.path, which is what would make the `src_code.*` imports
# in the Libs cell resolvable. What cfg_setup() configures is not visible here;
# confirm in config.py.
setup()
cfg_setup()

logger.log_result("Done.")

[PREPROCESSING CHECK] Setting up root by appending the parent to the sys...
[PREPROCESSING RESULT] Done.


### 5.1.3 Libs

In [34]:
# -----------------------------------------------------------------------------
# System & External Libs
# -----------------------------------------------------------------------------


import numpy as np


# -----------------------------------------------------------------------------
# My Libs
# -----------------------------------------------------------------------------

from src_code.config import FITTED_TRANSFORMER
from src_code.ml_pipeline.preprocessing.transform import transform
from src_code.config import *
from src_code.ml_pipeline.df_load import load_df
from src_code.ml_pipeline.preprocessing.transform import pca_explained_variance
from src_code.ml_pipeline.df_load import save_df
from src_code.ml_pipeline.preprocessing.preprocessing import drop_invalid_rows
from juputils import display_func

### 5.1.4 Loading Dataset

In [35]:
# Which split this run processes. It selects the input path here and is reused
# later for transform() and for the output path when saving.
subset: SubsetType = 'test'
TARGET_DF_FILE = PREPROCESSING_MAPPINGS[subset]['input']

# NOTE(review): this prints the ETL stage's "current_newest" path, which is not
# necessarily the file loaded below (TARGET_DF_FILE); confirm the two mappings agree.
print(ETL_MAPPINGS[subset]['current_newest'])

# load_df() logs its own "Loading the dataset..." check and a rows/columns
# summary, so the cell does not announce the start of the load a second time.
df = load_df(df_file_path=TARGET_DF_FILE, logger=logger)

logger.log_result("Dataset loaded successfully.", print_to_console=True)

[PREPROCESSING CHECK] Loading the dataset...
C:\Users\fmojt\Code\DPThesis\DP_Thesis\data\interim\test_labeled_features_partial_v10.feather
[PREPROCESSING CHECK] Loading the dataset...
[PREPROCESSING RESULT] Loaded dataframe with 7363 rows and 31 columns

[PREPROCESSING RESULT] Dataset loaded successfully.


## 5.2 - Preprocessing

In [36]:
# from sklearn.compose import ColumnTransformer


# preprocessor = ColumnTransformer(transformers=[])
# append a transformer tuple (name, transformer, columns)
# preprocessor.transformers.append(('new_passthrough', 'passthrough', ['col1', 'col2']))

### 5.2.1 Data Cleansing

#### 5.2.1.1 - Fix negative values before log transform
Some features (e.g., time_since_last_change) contain negative values.

Rows where `time_since_last_change` is negative are dropped (not shifted) before `log1p` is applied:

In [37]:
# Show the helper used in the next cell so the filtering logic can be read in place.
# NOTE(review): assumes display_func renders the function's source; confirm in juputils.
display_func(drop_invalid_rows)

In [38]:
# Row-level cleansing: keep only rows whose time_since_last_change is >= 0.
# row_filters maps a column name to a predicate over that column.
# NOTE(review): "True means keep the row" is inferred from the >= 0 predicate and
# the "Dropping N rows due to filter" log line; the exact contract, and what
# sanity_check=True verifies, is defined in drop_invalid_rows.
df = drop_invalid_rows(
    df=df,
    row_filters={"time_since_last_change": lambda s: s >= 0},
    logger=logger,
    sanity_check=True,
)

[PREPROCESSING CHECK] Applying row-level filters on numeric features...
[PREPROCESSING RESULT] Dropping 0 rows due to filter on 'time_since_last_change'


#### 5.2.1.2 Assertion Check

In [39]:
# log_check("[NEG_FEATURES_TO_DROP] Performing assertion check...")
# assert(any(contains_negative(df, col) for col in NEG_FEATURES_TO_DROP) == False)
# log_result("[NEG_FEATURES_TO_DROP] Check succesfull!")

### 5.2.2 Data Transformation

#### 5.2.2.1 - Reproducibility

In [40]:
# Reproducibility: one seed for the whole notebook. It seeds NumPy's global RNG
# here and is also passed to transform() as random_state further down.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

#### 5.2.2.2 - Transformations

Numeric values are clamped to the same lower/upper fences that were used in the EDA,
so that preprocessing stays consistent with the EDA findings.

In [41]:
# Show the transform() helper so the applied pipeline can be read next to the call below.
# NOTE(review): assumes display_func renders the function's source; confirm in juputils.
display_func(transform)

In [42]:
# Apply the preprocessing transformer to the loaded split.
#   subset             - split being processed ('test' in this run)
#   random_state       - the notebook-wide seed
#   fitted_transformer - location of the persisted fitted transformer
# Returns the transformed frame plus the transformer that was used; the
# transformer is needed afterwards for the PCA explained-variance check.
#
# This cell reassigns `df`, so re-running it on its own would feed already
# transformed data back into transform(). Re-run from the load cell instead.
#
# NOTE(review): the inline prototype that used to live in this cell fitted and
# saved the transformer on 'train' and only loaded and applied it on
# 'test'/'validate'. Its steps were: winsorize -> log1p -> variance threshold for
# numeric features, log1p for line/token features, and expand + PCA for the code
# and message embeddings. Confirm that transform() still follows that contract.
df, transformer = transform(
    df=df,
    subset=subset,
    logger=logger,
    random_state=RANDOM_STATE,
    fitted_transformer=FITTED_TRANSFORMER,
)

[PREPROCESSING CHECK] Performing df transformation...
[PREPROCESSING RESULT] Detected test subset. Loading fitted preprocessor...
[PREPROCESSING RESULT] Transformations applied successfully.


#### 5.2.2.3 - Variance Explanation of Embeddings

In [43]:
# Show the helper used in the next cell to compute the retained PCA variance.
# NOTE(review): assumes display_func renders the function's source; confirm in juputils.
display_func(pca_explained_variance)

In [44]:
# Report how much of the original embedding variance the PCA step retains.
# `transformer` is the object returned by transform(); `name` selects the
# embedding branch to inspect. The helper's result is formatted as a percentage,
# so it is expected to be a fraction (e.g. 0.854 -> "85.40%").
logger.log_check("Checking explanation of variance by embeddings...")

for label, branch in (("Code", "code_embed"), ("Message", "msg_embed")):
    explained = pca_explained_variance(transformer=transformer, name=branch)
    logger.log_result(f"{label} embeddings explain {explained:.2%} of variance")


[PREPROCESSING CHECK] Checking explanation of variance by embeddings...
[PREPROCESSING RESULT] Code embeddings explain 85.40% of variance
[PREPROCESSING RESULT] Message embeddings explain 82.61% of variance


<!-- ### 5.2.2 - Fix negative values before log transform
Some features (e.g., time_since_last_change) contain negative values.

We shift them to be ≥ 0 before applying log1p: -->

In [45]:
# # def shift_min_to_zero(df, col):
# #     """Shift column so minimum is 0 if negative values exist."""
# #     min_val = df[col].min()
# #     if min_val < 0:
# #         df[col] = df[col] - min_val
# #     return df

# # for col in NUMERIC_FEATURES:
# #     df = shift_min_to_zero(df, col)

# from notebooks.utils import contains_negative
# from notebooks.constants import NUMERIC_FEATURES

# NEG_FEATURES_TO_DROP = ['time_since_last_change']

# # List of features to check: NUMERIC_FEATURES excluding NEG_FEATURES_TO_DROP
# features_to_check = [
#     col for col in NUMERIC_FEATURES 
#     if col not in NEG_FEATURES_TO_DROP
# ]

# # Check if any of the features in features_to_check contain negative values
# if any(contains_negative(df, col) for col in features_to_check):
#     # If True, raise an exception
#     raise ValueError("Unexpected negative values found in one or more numeric features that are NOT set to be dropped.")


# neg_mask = df["time_since_last_change"] < 0
# n_neg = neg_mask.sum()

# print(f"Dropping {n_neg} rows with negative time_since_last_change")

# df = df[~neg_mask].reset_index(drop=True)

### 5.2.3 - Log1p Transformation
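`log1p` reduces the heavy right-skew seen in the EDA (skewness values of 100 and above).
The loop in the cell below is commented out, so that cell is currently a no-op.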

In [46]:
# for col in NUMERIC_FEATURES:
#     df[col] = np.log1p(df[col])

## 5.3 - Save preprocessed dataset

In [47]:
# Persist the preprocessed frame to the output path configured for this subset.
# save_df() logs the start of the save and the destination path itself, so no
# extra logging is needed in the cell.
save_df(df=df, df_file_path=PREPROCESSING_MAPPINGS[subset]["output"], logger=logger)

[PREPROCESSING CHECK] Saving the preprocessed dataset...


[PREPROCESSING RESULT] Preprocessed dataset saved to C:\Users\fmojt\Code\DPThesis\DP_Thesis\data\processed\test_preprocessed.feather
