# 07 - Model Training

## 7.1 Set-Up

### 7.1.1 - Initial Configuration

In [1]:
# Bootstrap the notebook environment before any project module is imported.
# NOTE(review): `setup()` presumably appends the project root to sys.path (the
# original comment in this cell said so) — confirm in jupyter_init.
from jupyter_init import setup

setup()

# NOTE(review): the star import hides where names come from; PROCESSED_DATA_DIR and
# MODEL_DIR, used later in this notebook, are presumably supplied here — consider
# importing them explicitly.
from src_code.config import *

### 7.1.2 - Setting up Logger

In [2]:
from notebooks.logging_config import setup_notebook_logging

# The label is echoed in every log line (e.g. "[ENGINEERING RESULT] ...").
# NOTE(review): "ENGINEERING" looks carried over from the feature-engineering
# notebook; confirm whether this training notebook should use its own label.
logger, log_start, log_check, log_result = setup_notebook_logging(label="ENGINEERING")

log_start(print_to_console=True)
log_result("Logging configured.", print_to_console=True)

[ENGINEERING RESULT] Logging configured.


### 7.1.3 Imports & Configuration

In [3]:
import numpy as np
import pandas as pd

# ML
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    precision_recall_curve,
    confusion_matrix
)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Feature selection
from sklearn.feature_selection import VarianceThreshold
from sklearn.inspection import permutation_importance

# Imbalance handling
from sklearn.utils.class_weight import compute_class_weight

# Reproducibility: one seed shared by the split, PCA, the forest, CV and PFI below.
RANDOM_STATE = 42
# Seeds NumPy's legacy global RNG; the scikit-learn objects in this notebook
# additionally receive RANDOM_STATE explicitly.
np.random.seed(RANDOM_STATE)

### 7.1.4 Loading Dataset

In [4]:
log_check("Loading the dataset...")
# NOTE(review): pandas and numpy are already imported in the imports cell, and
# seaborn is not used anywhere in the visible notebook — candidates for cleanup.
import pandas as pd
import numpy as np
import seaborn as sns

# Path of the engineered training set (Feather file) inside PROCESSED_DATA_DIR.
# TRANSFORMED_DF = EXTRACTED_DATA_DIR / "train_labeled_features_partial.feather"
PREPROCESSED_DF = PROCESSED_DATA_DIR / "train_engineered.feather"

# ---- LOAD ----
# `df` is the working frame that the following cells narrow down and expand.
df = pd.read_feather(PREPROCESSED_DF)

## 7.2 Model Training

### 7.2.1 Target & Column Separation

In [5]:
# Name of the column holding the supervised-learning target.
TARGET = "label"

# Identifier and timestamp columns (leakage-prone) that must not reach the model.
DROP_COLS = ["commit", "repo", "filepath", "author_email", "datetime", "canonical_datetime"]

# Remove only those listed columns that are actually present, so a frame that
# already lacks some of them is handled without an error.
df = df.drop(columns=[col for col in DROP_COLS if col in df.columns])

### 7.2.2 Embedding Handling (CodeBERT)
The embedding columns must be transformed because machine learning models, especially traditional ones like Logistic Regression, Random Forests, or Gradient Boosting, cannot directly process a list or a NumPy array stored as a single entry (a cell) in a pandas DataFrame.

The process is a necessary feature engineering step that converts the single embedding column into many separate numerical columns. This technique is often referred to as feature expansion or flattening the embedding vector.

In [6]:
def expand_embedding(df, col_name, prefix):
    """Flatten a column of embedding vectors into one numeric column per dimension.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the embedding column.
    col_name : str
        Name of the column whose cells each contain one 1-D vector
        (list or NumPy array). All vectors must have the same length.
    prefix : str
        Prefix of the generated column names: ``f"{prefix}_{i}"`` for
        dimension ``i``.

    Returns
    -------
    pandas.DataFrame
        A frame with the same index as ``df`` and one column per embedding
        dimension. For an empty ``df`` an empty frame (no columns) carrying
        ``df.index`` is returned.

    Raises
    ------
    KeyError
        If ``col_name`` is not a column of ``df``.
    ValueError
        If the vectors do not all have the same length (raised by ``np.vstack``).
    """
    vectors = df[col_name].to_numpy()

    # np.vstack raises ValueError on an empty sequence, and the embedding width
    # is unknown without at least one row, so return an index-only frame.
    if len(vectors) == 0:
        return pd.DataFrame(index=df.index)

    emb = np.vstack(vectors)
    columns = [f"{prefix}_{i}" for i in range(emb.shape[1])]
    # Reuse the source index so the result aligns row-for-row in pd.concat(axis=1).
    return pd.DataFrame(emb, index=df.index, columns=columns)

In [7]:
# Expand both list-valued embedding columns into per-dimension numeric columns.
code_emb_df = expand_embedding(df, "code_embed", "code_emb")
msg_emb_df  = expand_embedding(df, "msg_embed", "msg_emb")

# Swap the two original embedding columns for their expanded versions; the
# frames share df's index, so the column-wise concat aligns row-for-row.
# NOTE: this rebinds `df`, so the cell cannot be re-run without reloading the
# data — "code_embed"/"msg_embed" no longer exist after the first run.
df = pd.concat(
    [df.drop(columns=["code_embed", "msg_embed"]), code_emb_df, msg_emb_df],
    axis=1
)

### 7.2.3 Feature Type Identification

In [27]:
# Numeric model inputs. The target is excluded with a filter instead of
# list.remove(), which raises ValueError whenever the label column is not among
# the selected dtypes (e.g. stored as bool or int32).
# NOTE(review): only float64/int64/int8 columns are selected; columns stored as
# float32/int32/int16 would be silently left out of the model — confirm this is intended.
numeric_features = [
    col
    for col in df.select_dtypes(include=["float64", "int64", "int8"]).columns
    if col != TARGET
]
log_result(f"Numeric features: {numeric_features}", print_to_console=True)

# Categorical columns are only reported here; the preprocessing pipeline below
# does not consume them.
categorical_features = df.select_dtypes(include=["category"]).columns.tolist()
log_result(f"Categorical features: {categorical_features}", print_to_console=True)

# Columns produced by expand_embedding carry one of these prefixes.
EMBEDDING_PREFIXES = ("code_emb_", "msg_emb_")

# Structured features: every numeric column that is not an embedding dimension.
structured_features = [
    f for f in numeric_features
    if not f.startswith(EMBEDDING_PREFIXES)
]
log_result(f"Structural features: {structured_features}", print_to_console=True)
log_result(len(structured_features), print_to_console=True)


# Embedding features: the expanded code and commit-message embedding dimensions.
embedding_features = [
    f for f in numeric_features
    if f.startswith(EMBEDDING_PREFIXES)
]
log_result(f"embedding_features: {embedding_features}", print_to_console=True)



[ENGINEERING RESULT] Numeric features: ['author_exp_pre', 'author_recent_activity_pre', 'loc_added', 'loc_deleted', 'files_changed', 'hunks_count', 'msg_len', 'has_fix_kw', 'has_bug_kw', 'ast_delta', 'complexity_delta', 'max_func_change', 'time_since_last_change', 'todo', 'fixme', 'try', 'except', 'raise', 'recent_churn', 'recent_churn_winsorized', 'loc_churn_ratio', 'activity_per_exp', 'extreme_churn_flag', 'loc_added_bucket_cat', 'line_token_total', 'todo_ratio', 'fixme_ratio', 'try_ratio', 'except_ratio', 'raise_ratio', 'loc_added_x_loc_deleted', 'loc_added_x_hunks_count', 'loc_deleted_x_hunks_count', 'code_emb_0', 'code_emb_1', 'code_emb_2', 'code_emb_3', 'code_emb_4', 'code_emb_5', 'code_emb_6', 'code_emb_7', 'code_emb_8', 'code_emb_9', 'code_emb_10', 'code_emb_11', 'code_emb_12', 'code_emb_13', 'code_emb_14', 'code_emb_15', 'code_emb_16', 'code_emb_17', 'code_emb_18', 'code_emb_19', 'code_emb_20', 'code_emb_21', 'code_emb_22', 'code_emb_23', 'code_emb_24', 'code_emb_25', 'code_em

### 7.2.4 Train / Test Split (Random, Non-Stratified)

In [9]:
# Separate the feature matrix from the target column.
X = df.drop(columns=[TARGET])
y = df[TARGET]

# 80/20 hold-out split, shuffled with a fixed seed. The split is NOT stratified,
# so the class ratio of each part only approximates that of `df`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    # stratify=y, not required since the training subset of the original df is balanced
    random_state=RANDOM_STATE
)

### 7.2.5 Preprocessing Pipeline

Design choices (aligned with the EDA):
- No scaling for trees
- Remove zero-variance features
- Keep engineered features (selection later)

Why VarianceThreshold is useful: This step removes any numerical features that have zero variance (i.e., all values are identical). Features with zero variance provide no information to the model and can sometimes cause issues or slow down training, so it's good practice to remove them.

#### PCA
PCA is an unsupervised linear transformation technique used for dimensionality reduction. Its goal is to reduce the number of features while retaining as much of the original variance (information) as possible.

`n_components` is the target number of dimensions. PCA will transform the original 1536 embedding features (768 code + 768 message dimensions) into a new, smaller set of 100 features.

**In Context:** You are combining 768 Code embeddings and 768 Message embeddings, resulting in 1536 embedding features. PCA reduces this set of 1536 features down to a manageable and non-redundant set of 100 features that still capture most of the semantic meaning.


In [10]:
from sklearn.decomposition import PCA


# Structured (non-embedding) numeric columns: only constant columns are removed.
# threshold=0.0 drops features whose training-set variance is zero.
numeric_transformer = Pipeline(steps=[
    ("var_thresh", VarianceThreshold(threshold=0.0))
])

# All expanded embedding columns (code + message together) are compressed
# jointly into 100 principal components.
embedding_transformer = Pipeline(steps=[
    ("pca", PCA(n_components=100, random_state=RANDOM_STATE))
])


# Route each column group through its transformer. Any column that appears in
# neither list is discarded by remainder="drop".
preprocessor = ColumnTransformer(
    transformers=[
        # Alternative kept for reference: pass the structured columns through untouched.
        # ("struct", "passthrough", structured_features),
        ("struct", numeric_transformer, structured_features),
        ("embed", embedding_transformer, embedding_features),
    ],
    remainder="drop"
)

### 7.2.5 Baseline Random Forest Model

This combined object, model, is a Pipeline, which ensures that the preprocessing steps are always applied correctly before the Random Forest Classifier is trained or used for prediction.

In [20]:
# Baseline forest: 50 trees of unrestricted depth with at least 2 samples per leaf.
rf = RandomForestClassifier(
    n_estimators=50,
    max_depth=None,
    min_samples_leaf=2,
    random_state=RANDOM_STATE,
    # class_weight=CLASS_WEIGHT,
    n_jobs=1              # IMPORTANT: keep the forest itself single-threaded.
                          # NOTE(review): presumably to avoid nested parallelism with
                          # the n_jobs=2 cross-validation below — confirm.
)

# Full model: preprocessing (variance filter + PCA) followed by the forest, so the
# same transformations are applied at fit and at predict time.
model = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("rf", rf)
])

### 7.2.6 Cross-Validation (Primary Evaluation)

In [12]:
# Cross-validate the full pipeline on the training split.
# An earlier variant of this cell used StratifiedKFold(n_splits=2) with the same
# scoring dictionary; it was replaced by the plain 10-fold KFold below.
# NOTE: the run recorded under this cell ended in a KeyboardInterrupt, so the
# notebook currently holds no cross-validation results.

# -------------------------------------------------------------------------
# 1. Define the KFold splitter (non-stratified)
# -------------------------------------------------------------------------
# NOTE: This does NOT guarantee equal class proportions in each fold.
from sklearn.model_selection import KFold


cv = KFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)

# -------------------------------------------------------------------------
# 2. Define the scoring metrics
# -------------------------------------------------------------------------
# Keys become the "test_<key>" columns of cv_results; values are scikit-learn
# scorer names.
scoring = {
    "roc_auc": "roc_auc",
    "f1": "f1",
    "precision": "precision",
    "recall": "recall"
}

# -------------------------------------------------------------------------
# 3. Execute the cross-validation
# -------------------------------------------------------------------------
# The whole pipeline (preprocessing + forest) is re-fitted inside every fold.
cv_results = cross_validate(
    model,      # The preprocessing + Random Forest pipeline
    X_train,    # Training features
    y_train,    # Training labels
    cv=cv,      # The KFold splitter
    scoring=scoring,
    n_jobs=2    # Run up to 2 folds in parallel
)

# -------------------------------------------------------------------------
# 4. View and aggregate the results
# -------------------------------------------------------------------------
# Column-wise mean over the folds; this covers the fit/score timings that
# cross_validate reports alongside the four test metrics.
average_metrics = pd.DataFrame(cv_results).mean()

print("\n--- Average Cross-Validation Metrics (KFold) ---")
print(average_metrics)

KeyboardInterrupt: 

In [21]:
# Fit the single, final pipeline (preprocessing + Random Forest) on the whole
# training split. The fitted `model` is the object used by the timing, PFI and
# saving cells below.
model.fit(X_train, y_train)

0,1,2
,"steps  steps: list of tuples List of (name of step, estimator) tuples that are to be chained in sequential order. To be compatible with the scikit-learn API, all steps must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details.","[('preprocess', ...), ('rf', ...)]"
,"transform_input  transform_input: list of str, default=None The names of the :term:`metadata` parameters that should be transformed by the pipeline before passing it to the step consuming it. This enables transforming some input arguments to ``fit`` (other than ``X``) to be transformed by the steps of the pipeline up to the step which requires them. Requirement is defined via :ref:`metadata routing `. For instance, this can be used to pass a validation set through the pipeline. You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. .. versionadded:: 1.6",
,"memory  memory: str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. See :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py` for an example on how to enable caching.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed.",False

0,1,2
,"transformers  transformers: list of tuples List of (name, transformer, columns) tuples specifying the transformer objects to be applied to subsets of the data. name : str  Like in Pipeline and FeatureUnion, this allows the transformer and  its parameters to be set using ``set_params`` and searched in grid  search. transformer : {'drop', 'passthrough'} or estimator  Estimator must support :term:`fit` and :term:`transform`.  Special-cased strings 'drop' and 'passthrough' are accepted as  well, to indicate to drop the columns or to pass them through  untransformed, respectively. columns : str, array-like of str, int, array-like of int, array-like of bool, slice or callable  Indexes the data on its second axis. Integers are interpreted as  positional columns, while strings can reference DataFrame columns  by name. A scalar string or int should be used where  ``transformer`` expects X to be a 1d array-like (vector),  otherwise a 2d array will be passed to the transformer.  A callable is passed the input data `X` and can return any of the  above. To select multiple columns by name or dtype, you can use  :obj:`make_column_selector`.","[('struct', ...), ('embed', ...)]"
,"remainder  remainder: {'drop', 'passthrough'} or estimator, default='drop' By default, only the specified columns in `transformers` are transformed and combined in the output, and the non-specified columns are dropped. (default of ``'drop'``). By specifying ``remainder='passthrough'``, all remaining columns that were not specified in `transformers`, but present in the data passed to `fit` will be automatically passed through. This subset of columns is concatenated with the output of the transformers. For dataframes, extra columns not seen during `fit` will be excluded from the output of `transform`. By setting ``remainder`` to be an estimator, the remaining non-specified columns will use the ``remainder`` estimator. The estimator must support :term:`fit` and :term:`transform`. Note that using this feature requires that the DataFrame columns input at :term:`fit` and :term:`transform` have identical order.",'drop'
,"sparse_threshold  sparse_threshold: float, default=0.3 If the output of the different transformers contains sparse matrices, these will be stacked as a sparse matrix if the overall density is lower than this value. Use ``sparse_threshold=0`` to always return dense. When the transformed output consists of all dense data, the stacked result will be dense, and this keyword will be ignored.",0.3
,"n_jobs  n_jobs: int, default=None Number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"transformer_weights  transformer_weights: dict, default=None Multiplicative weights for features per transformer. The output of the transformer is multiplied by these weights. Keys are transformer names, values the weights.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each transformer will be printed as it is completed.",False
,"verbose_feature_names_out  verbose_feature_names_out: bool, str or Callable[[str, str], str], default=True - If True, :meth:`ColumnTransformer.get_feature_names_out` will prefix  all feature names with the name of the transformer that generated that  feature. It is equivalent to setting  `verbose_feature_names_out=""{transformer_name}__{feature_name}""`. - If False, :meth:`ColumnTransformer.get_feature_names_out` will not  prefix any feature names and will error if feature names are not  unique. - If ``Callable[[str, str], str]``,  :meth:`ColumnTransformer.get_feature_names_out` will rename all the features  using the name of the transformer. The first argument of the callable is the  transformer name and the second argument is the feature name. The returned  string will be the new feature name. - If ``str``, it must be a string ready for formatting. The given string will  be formatted using two field names: ``transformer_name`` and ``feature_name``.  e.g. ``""{feature_name}__{transformer_name}""``. See :meth:`str.format` method  from the standard library for more info. .. versionadded:: 1.0 .. versionchanged:: 1.6  `verbose_feature_names_out` can be a callable or a string to be formatted.",True
,"force_int_remainder_cols  force_int_remainder_cols: bool, default=False This parameter has no effect. .. note::  If you do not access the list of columns for the remainder columns  in the `transformers_` fitted attribute, you do not need to set  this parameter. .. versionadded:: 1.5 .. versionchanged:: 1.7  The default value for `force_int_remainder_cols` will change from  `True` to `False` in version 1.7. .. deprecated:: 1.7  `force_int_remainder_cols` is deprecated and will be removed in 1.9.",'deprecated'

0,1,2
,"threshold  threshold: float, default=0 Features with a training-set variance lower than this threshold will be removed. The default is to keep all features with non-zero variance, i.e. remove the features that have the same value in all samples.",0.0

0,1,2
,"n_components  n_components: int, float or 'mle', default=None Number of components to keep. if n_components is not set all components are kept::  n_components == min(n_samples, n_features) If ``n_components == 'mle'`` and ``svd_solver == 'full'``, Minka's MLE is used to guess the dimension. Use of ``n_components == 'mle'`` will interpret ``svd_solver == 'auto'`` as ``svd_solver == 'full'``. If ``0 < n_components < 1`` and ``svd_solver == 'full'``, select the number of components such that the amount of variance that needs to be explained is greater than the percentage specified by n_components. If ``svd_solver == 'arpack'``, the number of components must be strictly less than the minimum of n_features and n_samples. Hence, the None case results in::  n_components == min(n_samples, n_features) - 1",100
,"copy  copy: bool, default=True If False, data passed to fit are overwritten and running fit(X).transform(X) will not yield the expected results, use fit_transform(X) instead.",True
,"whiten  whiten: bool, default=False When True (False by default) the `components_` vectors are multiplied by the square root of n_samples and then divided by the singular values to ensure uncorrelated outputs with unit component-wise variances. Whitening will remove some information from the transformed signal (the relative variance scales of the components) but can sometime improve the predictive accuracy of the downstream estimators by making their data respect some hard-wired assumptions.",False
,"svd_solver  svd_solver: {'auto', 'full', 'covariance_eigh', 'arpack', 'randomized'}, default='auto' ""auto"" :  The solver is selected by a default 'auto' policy is based on `X.shape` and  `n_components`: if the input data has fewer than 1000 features and  more than 10 times as many samples, then the ""covariance_eigh""  solver is used. Otherwise, if the input data is larger than 500x500  and the number of components to extract is lower than 80% of the  smallest dimension of the data, then the more efficient  ""randomized"" method is selected. Otherwise the exact ""full"" SVD is  computed and optionally truncated afterwards. ""full"" :  Run exact full SVD calling the standard LAPACK solver via  `scipy.linalg.svd` and select the components by postprocessing ""covariance_eigh"" :  Precompute the covariance matrix (on centered data), run a  classical eigenvalue decomposition on the covariance matrix  typically using LAPACK and select the components by postprocessing.  This solver is very efficient for n_samples >> n_features and small  n_features. It is, however, not tractable otherwise for large  n_features (large memory footprint required to materialize the  covariance matrix). Also note that compared to the ""full"" solver,  this solver effectively doubles the condition number and is  therefore less numerical stable (e.g. on input data with a large  range of singular values). ""arpack"" :  Run SVD truncated to `n_components` calling ARPACK solver via  `scipy.sparse.linalg.svds`. It requires strictly  `0 < n_components < min(X.shape)` ""randomized"" :  Run randomized SVD by the method of Halko et al. .. versionadded:: 0.18.0 .. versionchanged:: 1.5  Added the 'covariance_eigh' solver.",'auto'
,"tol  tol: float, default=0.0 Tolerance for singular values computed by svd_solver == 'arpack'. Must be of range [0.0, infinity). .. versionadded:: 0.18.0",0.0
,"iterated_power  iterated_power: int or 'auto', default='auto' Number of iterations for the power method computed by svd_solver == 'randomized'. Must be of range [0, infinity). .. versionadded:: 0.18.0",'auto'
,"n_oversamples  n_oversamples: int, default=10 This parameter is only relevant when `svd_solver=""randomized""`. It corresponds to the additional number of random vectors to sample the range of `X` so as to ensure proper conditioning. See :func:`~sklearn.utils.extmath.randomized_svd` for more details. .. versionadded:: 1.1",10
,"power_iteration_normalizer  power_iteration_normalizer: {'auto', 'QR', 'LU', 'none'}, default='auto' Power iteration normalizer for randomized SVD solver. Not used by ARPACK. See :func:`~sklearn.utils.extmath.randomized_svd` for more details. .. versionadded:: 1.1",'auto'
,"random_state  random_state: int, RandomState instance or None, default=None Used when the 'arpack' or 'randomized' solvers are used. Pass an int for reproducible results across multiple function calls. See :term:`Glossary `. .. versionadded:: 0.18.0",42

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",50
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'sqrt'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [22]:
# Names of the columns the forest actually receives after preprocessing:
# "struct__<name>" for the structured features and "embed__pca<i>" for the PCA
# components (see the output below).
index=model.named_steps["preprocess"].get_feature_names_out()
index

array(['struct__author_exp_pre', 'struct__author_recent_activity_pre',
       'struct__loc_added', 'struct__loc_deleted',
       'struct__files_changed', 'struct__hunks_count', 'struct__msg_len',
       'struct__has_fix_kw', 'struct__has_bug_kw', 'struct__ast_delta',
       'struct__complexity_delta', 'struct__max_func_change',
       'struct__time_since_last_change', 'struct__todo', 'struct__fixme',
       'struct__try', 'struct__except', 'struct__raise',
       'struct__recent_churn', 'struct__recent_churn_winsorized',
       'struct__loc_churn_ratio', 'struct__activity_per_exp',
       'struct__extreme_churn_flag', 'struct__loc_added_bucket_cat',
       'struct__line_token_total', 'struct__todo_ratio',
       'struct__fixme_ratio', 'struct__try_ratio', 'struct__except_ratio',
       'struct__raise_ratio', 'struct__loc_added_x_loc_deleted',
       'struct__loc_added_x_hunks_count',
       'struct__loc_deleted_x_hunks_count', 'embed__pca0', 'embed__pca1',
       'embed__pca2', 'embed_

In [23]:
import time

# Time one batch prediction over the whole test split. The pipeline must already
# be fitted (see the model.fit cell above); the call runs preprocessing (incl. PCA)
# followed by the Random Forest prediction.
#
# time.perf_counter() is used rather than time.time(): it is a monotonic,
# high-resolution clock, so the measured interval cannot be distorted by system
# clock adjustments.
start_time = time.perf_counter()

# Only the elapsed time matters here, so the predictions are discarded.
_ = model.predict(X_test)

end_time = time.perf_counter()
single_inference_duration = end_time - start_time

print(f"Time for a single inference run on X_test ({len(X_test)} rows): {single_inference_duration:.2f} seconds")

Time for a single inference run on X_test (26642 rows): 0.77 seconds


### 7.2.7 Feature Importance (Permutation-Based)

Critical due to correlated engineered features

You are calculating and displaying the **Permutation Feature Importance (PFI)** for your entire machine learning pipeline (model) using a 3,000-row random sample of the held-out test set. This is a crucial step in model interpretation, especially for complex models like Random Forests.

The permutation_importance function calculates the drop in a model's score when a single feature is randomly shuffled (permuted).

How it Works:

1. The function first calculates the model's baseline score on the unshuffled test data. Because no `scoring` argument is passed here, this is the pipeline's default score (accuracy), not ROC AUC.

2. For each feature (e.g., loc_added), it randomly shuffles the values in that column across the entire evaluation set.

3. It then recalculates the model's score using this corrupted data.

4. The Permutation Importance is the difference between the baseline score and the score with the shuffled feature. A large drop in score indicates the feature was highly important.

In [36]:
# NOTE(review): parallel_backend and tqdm are not used by this cell any more
# (they belonged to an abandoned progress-bar experiment); kept so other cells
# relying on these names keep working.
from joblib import parallel_backend
import tqdm


# PFI re-scores the whole pipeline once per input column and repeat, so it is
# run on a sub-sample of the test split to keep the runtime manageable.
PFI_SAMPLE_SIZE = 3000
# Number of shuffles per column. 5-10 repeats give more stable estimates;
# 2 keeps the run short.
PFI_N_REPEATS = 2

# Cap the sample at the size of the test split: DataFrame.sample raises
# ValueError when asked for more rows than are available.
X_test_small = X_test.sample(
    n=min(PFI_SAMPLE_SIZE, len(X_test)),
    random_state=RANDOM_STATE,
)
y_test_small = y_test.loc[X_test_small.index]

# permutation_importance shuffles the RAW input columns of X_test_small one at a
# time and re-scores the complete pipeline. It therefore returns one importance
# per input column — including columns the preprocessor later drops, which score
# 0 — and NOT one per preprocessed "struct__*" / "embed__pca*" feature.
# No `scoring` is passed, so the pipeline's default scorer (accuracy for a
# classifier) is used.
perm = permutation_importance(
    model,
    X_test_small,
    y_test_small,
    n_repeats=PFI_N_REPEATS,
    random_state=RANDOM_STATE,
    n_jobs=1,  # serial execution; raise this value to parallelise over columns
)

# Index by the input columns: the preprocessor's get_feature_names_out() lists
# the post-PCA features and has a different length than perm.importances_mean.
importances = pd.Series(
    perm.importances_mean,  # mean score drop over the PFI_N_REPEATS shuffles
    index=X_test_small.columns,
).sort_values(ascending=False)

importances.head(20)

msg_emb_588        0.047667
code_emb_588       0.029667
msg_emb_551        0.029000
code_emb_77        0.021500
msg_emb_217        0.016167
msg_emb_77         0.015333
code_emb_551       0.015167
msg_emb_97         0.012667
msg_emb_749        0.010500
msg_emb_82         0.008833
msg_emb_453        0.007833
max_func_change    0.007667
msg_emb_496        0.007667
msg_emb_331        0.006833
msg_emb_259        0.006167
msg_emb_570        0.005833
msg_emb_61         0.005667
code_emb_453       0.005500
msg_emb_247        0.005000
loc_deleted        0.004500
dtype: float64

PFI is generally preferred over the Random Forest's built-in impurity-based feature importances because:

1. **Model Agnostic**: It works for any model (Random Forest, Neural Network, etc.).

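2. **Includes Preprocessing**: Because the whole pipeline is passed to `permutation_importance`, each raw input column is shuffled *before* preprocessing and the complete pipeline (including PCA) is re-scored. The result is therefore the importance of the original input columns (e.g., `msg_emb_588`), not of the post-PCA components, which is essential when dealing with complex pipelines.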

In [32]:
# Rebuild the ranked importance Series from the current `perm` result.
# NOTE(review): this repeats the Series construction of the PFI cell, and the
# output stored under this cell stems from an earlier run (lower execution
# count, different values) — re-run or remove the cell to avoid confusion.
importances = pd.Series(
    perm.importances_mean, # mean score drop over the n_repeats shuffles of each column
    # permutation_importance permutes the raw input columns of the pipeline, so
    # the values line up with X_test_small.columns — not with the preprocessor's
    # get_feature_names_out(), which lists the post-PCA feature names.
    index=X_test_small.columns

).sort_values(ascending=False)

importances.head(20)

code_emb_588                 0.2
loc_deleted_x_hunks_count    0.1
code_emb_82                  0.1
msg_emb_77                   0.1
msg_emb_330                  0.1
msg_emb_551                  0.1
msg_emb_588                  0.1
raise                        0.0
content                      0.0
msg_emb_753                  0.0
msg_emb_754                  0.0
msg_emb_755                  0.0
msg_emb_756                  0.0
msg_emb_757                  0.0
msg_emb_758                  0.0
msg_emb_759                  0.0
msg_emb_760                  0.0
msg_emb_761                  0.0
msg_emb_762                  0.0
msg_emb_763                  0.0
dtype: float64

### 7.2.8 Feature Subset Refinement (Optional Iteration)
(Re-run steps 8–11 using the reduced feature set)

In [None]:
# Number of highest-ranked input columns to keep.
TOP_K = 100
# `importances` is already sorted in descending order, so its first TOP_K
# labels are the top-ranked features.
top_features = importances.index[:TOP_K].tolist()

# Restrict both splits to the selected columns (all rows kept).
X_train_reduced = X_train.loc[:, top_features]
X_test_reduced  = X_test.loc[:, top_features]

## 7.3 Saving the Model

*joblib.dump()* is used to save the model object to a file. It is generally preferred over standard Python pickle for large objects containing NumPy arrays (like your Random Forest and PCA objects).

In [37]:
import joblib
import os

log_check("Saving the model ")

# Build the target path once; it is used for both the dump and the log message.
MODEL_SAVE_PATH = MODEL_DIR / "random_forest_pipeline.joblib"

# Create the directory that is actually written to, i.e. MODEL_DIR itself
# (not a relative "models" folder in the current working directory).
os.makedirs(MODEL_DIR, exist_ok=True)

# Persist the entire fitted pipeline (preprocessing + Random Forest).
# MODEL_SAVE_PATH already contains MODEL_DIR, so it must not be joined with
# MODEL_DIR a second time: for a relative MODEL_DIR that would nest the
# directory twice and the logged path would not match the file on disk.
joblib.dump(model, MODEL_SAVE_PATH)

log_result(f"✅ Model successfully saved to: {MODEL_SAVE_PATH}", print_to_console=True)

[ENGINEERING RESULT] âœ… Model successfully saved to: C:\Users\fmojt\Code\Software Projects\DiplomaThesis\models\random_forest_pipeline.joblib
