# 07 - Model Training

## 7.1 Set-Up

### 7.1.1 - Initial Configuration

In [102]:
# logger.info("Setting up root by appending the parent to the sys...")
from jupyter_init import setup

setup()

from src_code.config import *

### 7.1.2 - Setting up Logger

In [103]:
from notebooks.logging_config import setup_notebook_logging

logger, log_start, log_check, log_result = setup_notebook_logging(label="ENGINEERING")

log_start(print_to_console=True)
log_result("Logging configured.", print_to_console=True)

[ENGINEERING RESULT] Logging configured.


### 7.1.3 Imports & Configuration

In [104]:
import numpy as np
import pandas as pd

# ML
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    precision_recall_curve,
    confusion_matrix
)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Feature selection
from sklearn.feature_selection import VarianceThreshold
from sklearn.inspection import permutation_importance

# Imbalance handling
from sklearn.utils.class_weight import compute_class_weight

# Reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

### 7.1.4 Loading Dataset

In [105]:
log_check("Loading the dataset...")
import pandas as pd
import numpy as np
import seaborn as sns

# TRANSFORMED_DF = EXTRACTED_DATA_DIR / "train_labeled_features_partial.feather"
# PREPROCESSED_DF = PROCESSED_DATA_DIR / "train_engineered.feather"
DF_PATH = ENGINEERING_MAPPINGS['train']['output']

# ---- LOAD ----
df = pd.read_feather(DF_PATH)
# df.describe()
my_list = df.columns.values.tolist()
print(my_list)

[ENGINEERING CHECK] Loading the dataset...
['author_exp_pre', 'author_recent_activity_pre', 'loc_added', 'loc_deleted', 'files_changed', 'hunks_count', 'msg_len', 'ast_delta', 'complexity_delta', 'max_func_change', 'time_since_last_change', 'recent_churn', 'todo', 'fixme', 'try', 'except', 'raise', 'code_emb_0', 'code_emb_1', 'code_emb_2', 'code_emb_3', 'code_emb_4', 'code_emb_5', 'code_emb_6', 'code_emb_7', 'code_emb_8', 'code_emb_9', 'msg_emb_0', 'msg_emb_1', 'msg_emb_2', 'msg_emb_3', 'msg_emb_4', 'msg_emb_5', 'msg_emb_6', 'msg_emb_7', 'msg_emb_8', 'msg_emb_9', 'msg_emb_10', 'msg_emb_11', 'msg_emb_12', 'msg_emb_13', 'msg_emb_14', 'msg_emb_15', 'msg_emb_16', 'msg_emb_17', 'msg_emb_18', 'msg_emb_19', 'msg_emb_20', 'msg_emb_21', 'msg_emb_22', 'msg_emb_23', 'msg_emb_24', 'msg_emb_25', 'msg_emb_26', 'msg_emb_27', 'msg_emb_28', 'msg_emb_29', 'msg_emb_30', 'msg_emb_31', 'msg_emb_32', 'msg_emb_33', 'msg_emb_34', 'msg_emb_35', 'msg_emb_36', 'msg_emb_37', 'msg_emb_38', 'msg_emb_39', 'msg_emb_4

## 7.2 Model Training

### 7.2.1 Target & Column Separation

In [106]:
TARGET = "label"

# Drop identifiers & leakage-prone columns
DROP_COLS = [
    "commit",
    "repo",
    "filepath",
    "author_email",
    "datetime",
    "canonical_datetime",
    "content",
    "methods",
    "lines",
    "files_changed",
    "loc_added_bucket"
]

df = df.drop(columns=DROP_COLS, errors="ignore")
my_list = df.columns.values.tolist()
print(my_list)

['author_exp_pre', 'author_recent_activity_pre', 'loc_added', 'loc_deleted', 'hunks_count', 'msg_len', 'ast_delta', 'complexity_delta', 'max_func_change', 'time_since_last_change', 'recent_churn', 'todo', 'fixme', 'try', 'except', 'raise', 'code_emb_0', 'code_emb_1', 'code_emb_2', 'code_emb_3', 'code_emb_4', 'code_emb_5', 'code_emb_6', 'code_emb_7', 'code_emb_8', 'code_emb_9', 'msg_emb_0', 'msg_emb_1', 'msg_emb_2', 'msg_emb_3', 'msg_emb_4', 'msg_emb_5', 'msg_emb_6', 'msg_emb_7', 'msg_emb_8', 'msg_emb_9', 'msg_emb_10', 'msg_emb_11', 'msg_emb_12', 'msg_emb_13', 'msg_emb_14', 'msg_emb_15', 'msg_emb_16', 'msg_emb_17', 'msg_emb_18', 'msg_emb_19', 'msg_emb_20', 'msg_emb_21', 'msg_emb_22', 'msg_emb_23', 'msg_emb_24', 'msg_emb_25', 'msg_emb_26', 'msg_emb_27', 'msg_emb_28', 'msg_emb_29', 'msg_emb_30', 'msg_emb_31', 'msg_emb_32', 'msg_emb_33', 'msg_emb_34', 'msg_emb_35', 'msg_emb_36', 'msg_emb_37', 'msg_emb_38', 'msg_emb_39', 'msg_emb_40', 'msg_emb_41', 'msg_emb_42', 'msg_emb_43', 'msg_emb_44', 

### 7.2.2 Embedding Handling (CodeBERT)
Transformation on the embedding columns because machine learning models, especially traditional ones like Logistic Regression, Random Forests, or Gradient Boosting, cannot directly process a list or a NumPy array stored as a single entry (a cell) in a pandas DataFrame.

The process is a necessary feature engineering step that converts the single embedding column into many separate numerical columns. This technique is often referred to as feature expansion or flattening the embedding vector.

In [107]:
# def expand_embedding(df, col_name, prefix):
#     emb = np.vstack(df[col_name].values)
#     emb_df = pd.DataFrame(
#         emb,
#         index=df.index,
#         columns=[f"{prefix}_{i}" for i in range(emb.shape[1])]
#     )
#     return emb_df

In [108]:
# code_emb_df = expand_embedding(df, "code_embed", "code_emb")
# msg_emb_df  = expand_embedding(df, "msg_embed", "msg_emb")

# df = pd.concat(
#     [df.drop(columns=["code_embed", "msg_embed"]), code_emb_df, msg_emb_df],
#     axis=1
# )

### 7.2.3 Feature Type Identification

In [109]:
numeric_features = df.select_dtypes(include=["float64", "int64", "int8"]).columns.tolist()
numeric_features.remove(TARGET)
log_result(f"Numeric features: {numeric_features}", print_to_console=True)

categorical_features = df.select_dtypes(include=["category"]).columns.tolist()
log_result(f"Categorical features: {categorical_features}", print_to_console=True)

structured_features = [
    f for f in numeric_features
    if not f.startswith(("code_emb_", "msg_emb_"))
]
log_result(f"Structural features: {structured_features}", print_to_console=True)
log_result(len(structured_features), print_to_console=True)


embedding_features = [
    f for f in numeric_features
    if f.startswith(("code_emb_", "msg_emb_"))
]
log_result(f"embedding_features: {embedding_features}", print_to_console=True)



[ENGINEERING RESULT] Numeric features: ['author_exp_pre', 'author_recent_activity_pre', 'loc_added', 'loc_deleted', 'hunks_count', 'msg_len', 'ast_delta', 'complexity_delta', 'max_func_change', 'time_since_last_change', 'recent_churn', 'todo', 'fixme', 'try', 'except', 'raise', 'code_emb_0', 'code_emb_1', 'code_emb_2', 'code_emb_3', 'code_emb_4', 'code_emb_5', 'code_emb_6', 'code_emb_7', 'code_emb_8', 'code_emb_9', 'msg_emb_0', 'msg_emb_1', 'msg_emb_2', 'msg_emb_3', 'msg_emb_4', 'msg_emb_5', 'msg_emb_6', 'msg_emb_7', 'msg_emb_8', 'msg_emb_9', 'msg_emb_10', 'msg_emb_11', 'msg_emb_12', 'msg_emb_13', 'msg_emb_14', 'msg_emb_15', 'msg_emb_16', 'msg_emb_17', 'msg_emb_18', 'msg_emb_19', 'msg_emb_20', 'msg_emb_21', 'msg_emb_22', 'msg_emb_23', 'msg_emb_24', 'msg_emb_25', 'msg_emb_26', 'msg_emb_27', 'msg_emb_28', 'msg_emb_29', 'msg_emb_30', 'msg_emb_31', 'msg_emb_32', 'msg_emb_33', 'msg_emb_34', 'msg_emb_35', 'msg_emb_36', 'msg_emb_37', 'msg_emb_38', 'msg_emb_39', 'msg_emb_40', 'msg_emb_41', 'ms

### 7.2.4 Train / Test Split (Stratified)

In [110]:
X = df.drop(columns=[TARGET])
y = df[TARGET]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    # stratify=y, not required since the training subset of the original df is balanced
    random_state=RANDOM_STATE
)

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import make_scorer, matthews_corrcoef
# from sklearn.model_selection import GridSearchCV

# # 1. Define the model
# rf = RandomForestClassifier(random_state=42, class_weight='balanced')

# # 2. Define the parameter grid
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [10, 20],
#     'min_samples_split': [2, 5, 10],
#     'max_features': ['sqrt', 'log2']
# }

# # 3. Create a custom scorer for MCC
# mcc_scorer = make_scorer(matthews_corrcoef)

# # 4. Set up Grid Search
# grid_search = GridSearchCV(
#     estimator=rf, 
#     param_grid=param_grid, 
#     scoring=mcc_scorer, 
#     cv=5,            # 5-fold cross-validation
#     n_jobs=6,       # Use all CPU cores
#     verbose=3
# )

# # 5. Run the search
# grid_search.fit(X_train, y_train)

# print(f"Best Parameters: {grid_search.best_params_}")
# print(f"Best MCC Score: {grid_search.best_score_}")

Fitting 5 folds for each of 36 candidates, totalling 180 fits


KeyboardInterrupt: 

### 7.2.5 Preprocessing Pipeline

Design choices (aligned with the EDA):
- No scaling for trees
- Remove zero-variance features
- Keep engineered features (selection later)

Why VarianceThreshold is useful: This step removes any numerical features that have zero variance (i.e., all values are identical). Features with zero variance provide no information to the model and can sometimes cause issues or slow down training, so it's good practice to remove them.

#### PCA
PCA is an unsupervised linear transformation technique used for dimensionality reduction. Its goal is to reduce the number of features while retaining as much of the original variance (information) as possible.

`n_components` is the target number of dimensions. PCA will transform the original 768 embedding features into a new, smaller set of 100 features.

**In Context:** You are combining 768 Code embeddings and 768 Message embeddings, resulting in 1536 embedding features. PCA reduces this set of 1536 features down to a manageable and non-redundant set of 100 features that still capture most of the semantic meaning.


In [None]:
# from sklearn.decomposition import PCA


# numeric_transformer = Pipeline(steps=[
#     ("var_thresh", VarianceThreshold(threshold=0.0))
# ])

# embedding_transformer = Pipeline(steps=[
#     ("pca", PCA(n_components=100, random_state=RANDOM_STATE))
# ])


# preprocessor = ColumnTransformer(
#     transformers=[
#         # ("struct", "passthrough", structured_features),
#         ("struct", numeric_transformer, structured_features),
#         ("embed", embedding_transformer, embedding_features),
#     ],
#     remainder="drop"
# )

### 7.2.5 Baseline Random Forest Model

This combined object, model, is a Pipeline, which ensures that the preprocessing steps are always applied correctly before the Random Forest Classifier is trained or used for prediction.

In [None]:
rf = RandomForestClassifier(
    n_estimators=50,
    max_depth=None,
    min_samples_leaf=2,
    random_state=RANDOM_STATE,
    # class_weight=CLASS_WEIGHT,
    n_jobs=1              # ðŸ”´ IMPORTANT
)

model = Pipeline(steps=[
    # ("preprocess", preprocessor),
    ("rf", rf)
])

### 7.2.6 Cross-Validation (Primary Evaluation)

In [None]:
# cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=RANDOM_STATE)

# scoring = {
#     "roc_auc": "roc_auc",
#     "f1": "f1",
#     "precision": "precision",
#     "recall": "recall"
# }

# cv_results = cross_validate(
#     model,
#     X_train,
#     y_train,
#     cv=cv,
#     scoring=scoring,
#     n_jobs=2
# )

# pd.DataFrame(cv_results).mean()
# Assuming model, X_train, y_train, and RANDOM_STATE are defined

# -------------------------------------------------------------------------
# 1. Define the KFold splitter (non-stratified)
# -------------------------------------------------------------------------
# NOTE: This does NOT guarantee equal class proportions in each fold.
from sklearn.model_selection import KFold


cv = KFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)

# -------------------------------------------------------------------------
# 2. Define the scoring metrics
# -------------------------------------------------------------------------
scoring = {
    "roc_auc": "roc_auc",
    "f1": "f1",
    "precision": "precision",
    "recall": "recall"
}

# -------------------------------------------------------------------------
# 3. Execute the cross-validation
# -------------------------------------------------------------------------
cv_results = cross_validate(
    model,      # Your machine learning pipeline
    X_train,    # Training features
    y_train,    # Training labels
    cv=cv,      # The KFold splitter
    scoring=scoring,
    n_jobs=2    # Use 2 CPU cores
)

# -------------------------------------------------------------------------
# 4. View and aggregate the results
# -------------------------------------------------------------------------
# The mean of the results gives the model's average performance.
average_metrics = pd.DataFrame(cv_results).mean()

print("\n--- Average Cross-Validation Metrics (KFold) ---")
print(average_metrics)

KeyboardInterrupt: 

In [None]:
# This step trains the single, final model pipeline that is saved
# in the 'model' variable and used for prediction and PFI.
model.fit(X_train, y_train)

0,1,2
,"steps  steps: list of tuples List of (name of step, estimator) tuples that are to be chained in sequential order. To be compatible with the scikit-learn API, all steps must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details.","[('rf', ...)]"
,"transform_input  transform_input: list of str, default=None The names of the :term:`metadata` parameters that should be transformed by the pipeline before passing it to the step consuming it. This enables transforming some input arguments to ``fit`` (other than ``X``) to be transformed by the steps of the pipeline up to the step which requires them. Requirement is defined via :ref:`metadata routing `. For instance, this can be used to pass a validation set through the pipeline. You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. .. versionadded:: 1.6",
,"memory  memory: str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. See :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py` for an example on how to enable caching.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed.",False

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",50
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'sqrt'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [None]:
import time

# --- Setup: Ensure the model is fitted ---
# model.fit(X_train, y_train) 
# Assuming this has already been run.

start_time = time.time()

# This call runs the entire pipeline: Preprocessing (PCA) + Random Forest Prediction
# The output is not needed, just the execution time.
_ = model.predict(X_test)

end_time = time.time()
single_inference_duration = end_time - start_time

print(f"Time for a single inference run on X_test ({len(X_test)} rows): {single_inference_duration:.2f} seconds")

Time for a single inference run on X_test (26642 rows): 0.77 seconds


### 7.2.7 Feature Importance (Permutation-Based)

Critical due to correlated engineered features

You are calculating and displaying the **Permutation Feature Importance (PFI)** for your entire machine learning pipeline (model) using the held-out test set. This is a crucial step in model interpretation, especially for complex models like Random Forests.

The permutation_importance function calculates the drop in a model's score when a single feature is randomly shuffled (permuted).

How it Works:

1. The function first calculates the model's baseline score (e.g., ROC AUC) on the unshuffled X_test and y_test.

2. For each feature (e.g., lines_added), it randomly shuffles the values in that column across the entire X_test set.

3. It then recalculates the model's score using this corrupted data.

4. The Permutation Importance is the difference between the baseline score and the score with the shuffled feature. A large drop in score indicates the feature was highly important.

In [None]:
from joblib import parallel_backend
import tqdm


X_test_small = X_test.sample(n=5000, random_state=RANDOM_STATE)
y_test_small = y_test.loc[X_test_small.index]


# The total number of tasks is N_features * n_repeats
# n_features = len(model.named_steps["preprocess"].get_feature_names_out())
n_features = X_test_small.shape[1]
total_tasks = n_features * 2

# with parallel_backend('loky', n_jobs=-1): # Use all cores
    # with tqdm.tqdm(total=total_tasks, desc="PFI Permutations") as progress_bar:
        # Wrap the function call in a helper that updates the progress bar
        # This is a bit advanced but forces joblib to use the tqdm callback
        
# NOTE: In modern scikit-learn/joblib, simply setting the backend 
# is often enough to show the progress. If not, this is the safest way:
perm = permutation_importance(
    model,
    X_test_small,
    y_test_small,
    n_repeats=2,
    random_state=RANDOM_STATE,
    n_jobs=-1, # <--- Re-enabled parallel processing
)

importances = pd.Series(
    perm.importances_mean, # Retrieves the average importance score
                            # (the average drop in model performance)
                            # calculated across the n_repeats=2 runs 
                            # for each feature.
    # index=model.named_steps["preprocess"].get_feature_names_out()\
    index=X_test_small.columns

    # This is a crucial step for pipelines. After the ColumnTransformer 
    # ("preprocess") has run (including PCA and any other steps), the feature
    #  names are transformed (e.g., code_emb_0 becomes embed__pca__0). This 
    # method retrieves the correct, final feature names that the model actually used.
).sort_values(ascending=False)

importances.head(20)

loc_deleted                  0.0041
msg_emb_0                    0.0033
msg_len                      0.0025
max_func_change              0.0024
hunks_count                  0.0021
loc_deleted_x_hunks_count    0.0018
code_emb_5                   0.0014
msg_emb_3                    0.0013
loc_added_x_hunks_count      0.0012
code_emb_2                   0.0012
ast_delta                    0.0010
msg_emb_20                   0.0009
code_emb_3                   0.0008
author_exp_pre               0.0004
loc_added_bucket_cat         0.0004
msg_emb_8                    0.0003
has_fix_kw                   0.0003
line_token_total             0.0003
code_emb_8                   0.0002
has_bug_kw                   0.0002
dtype: float64

PFI is generally preferred because:

1. **Model Agnostic**: It works for any model (Random Forest, Neural Network, etc.).

2. **Includes Preprocessing**: It measures the importance of features after they have gone through the entire pipeline (including PCA), giving you the importance of the final, processed features, which is essential when dealing with complex pipelines.

In [None]:
# importances = pd.Series(
#     perm.importances_mean, # Retrieves the average importance score
#                             # (the average drop in model performance)
#                             # calculated across the n_repeats=2 runs 
#                             # for each feature.
#     # index=model.named_steps["preprocess"].get_feature_names_out()\
#     index=X_test_small.columns

#     # This is a crucial step for pipelines. After the ColumnTransformer 
#     # ("preprocess") has run (including PCA and any other steps), the feature
#     #  names are transformed (e.g., code_emb_0 becomes embed__pca__0). This 
#     # method retrieves the correct, final feature names that the model actually used.
# ).sort_values(ascending=False)

# importances.head(20)
# print(model.named_steps["preprocess"].get_feature_names_out())
# print(len(perm.importances_mean))
importances.head(50)
# print(importances.values.tolist())

loc_deleted                  0.0041
msg_emb_0                    0.0033
msg_len                      0.0025
max_func_change              0.0024
hunks_count                  0.0021
loc_deleted_x_hunks_count    0.0018
code_emb_5                   0.0014
msg_emb_3                    0.0013
loc_added_x_hunks_count      0.0012
code_emb_2                   0.0012
ast_delta                    0.0010
msg_emb_20                   0.0009
code_emb_3                   0.0008
author_exp_pre               0.0004
loc_added_bucket_cat         0.0004
msg_emb_8                    0.0003
has_fix_kw                   0.0003
line_token_total             0.0003
code_emb_8                   0.0002
has_bug_kw                   0.0002
except                       0.0001
fixme_ratio                  0.0001
todo                         0.0000
try                          0.0000
code_emb_4                   0.0000
fixme                        0.0000
msg_emb_41                   0.0000
todo_ratio                  

### 7.2.8 Feature Subset Refinement (Optional Iteration)
(Re-run steps 8â€“11 using reduced feature set)

In [163]:
threshold = 0.0001 # Or use 0.0 to be more inclusive
top_features = importances[importances > threshold].index.tolist()

# Filter your training and testing sets
X_train_filtered = X_train[top_features]
X_test_filtered = X_test[top_features]
df_test = pd.read_feather(ENGINEERING_MAPPINGS['test']['output'])
top_filter = top_features.copy()
top_filter.append('label')
print(top_filter)
df_test = df_test[top_filter]
df_test.to_feather("lala.feather")


print(f"Reduced feature count from {len(importances)} to {len(top_features)}")

['loc_deleted', 'msg_emb_0', 'msg_len', 'max_func_change', 'hunks_count', 'loc_deleted_x_hunks_count', 'code_emb_5', 'msg_emb_3', 'loc_added_x_hunks_count', 'code_emb_2', 'ast_delta', 'msg_emb_20', 'code_emb_3', 'author_exp_pre', 'loc_added_bucket_cat', 'msg_emb_8', 'has_fix_kw', 'line_token_total', 'code_emb_8', 'has_bug_kw', 'label']
Reduced feature count from 85 to 20


In [None]:
# This step trains the single, final model pipeline that is saved
# in the 'model' variable and used for prediction and PFI.
model.fit(X_train_filtered, y_train)

0,1,2
,"steps  steps: list of tuples List of (name of step, estimator) tuples that are to be chained in sequential order. To be compatible with the scikit-learn API, all steps must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details.","[('rf', ...)]"
,"transform_input  transform_input: list of str, default=None The names of the :term:`metadata` parameters that should be transformed by the pipeline before passing it to the step consuming it. This enables transforming some input arguments to ``fit`` (other than ``X``) to be transformed by the steps of the pipeline up to the step which requires them. Requirement is defined via :ref:`metadata routing `. For instance, this can be used to pass a validation set through the pipeline. You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. .. versionadded:: 1.6",
,"memory  memory: str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. See :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py` for an example on how to enable caching.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed.",False

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",50
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'sqrt'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [None]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

# 1. Use the RF model we already defined
# Note: RFECV uses 'feature_importances_' (built-in) which is faster than Permutation
rf = RandomForestClassifier(
    n_estimators=100, 
    class_weight='balanced', 
    random_state=42,
    n_jobs=-1
)

# 2. Set up the selector
# we use StratifiedKFold to keep the bug/no-bug ratio consistent
min_features_to_select = 5 
step = 1 # remove 1 feature at a time

selector = RFECV(
    estimator=rf,
    step=step,
    cv=StratifiedKFold(5),
    scoring=mcc_scorer, # Using the MCC scorer we made earlier!
    min_features_to_select=min_features_to_select,
    n_jobs=-1,
    verbose=1
)

# 3. Fit to the training data
selector = selector.fit(X_train, y_train)

# 4. Results
print(f"Optimal number of features: {selector.n_features_}")
selected_features = X_train.columns[selector.support_].tolist()
print(f"Selected Features: {selected_features}")

KeyboardInterrupt: 

## 7.3 Saving the Model

*joblib.dump()* is used to save the model object to a file. It is generally preferred over standard Python pickle for large objects containing NumPy arrays (like your Random Forest and PCA objects).

In [None]:
import joblib
import os

# Define the filename (e.g., in a 'models' directory)
# MODEL_SAVE_PATH = "models/random_forest_pipeline.joblib"
log_check("Saving the model ")

# Ensure the directory exists
os.makedirs("models", exist_ok=True)
MODEL_SAVE_PATH = MODEL_DIR / "random_forest_pipeline.joblib"
# Save the entire fitted pipeline
joblib.dump(model, MODEL_DIR / MODEL_SAVE_PATH)

log_result(f"âœ… Model successfully saved to: {MODEL_SAVE_PATH}", print_to_console=True)

[ENGINEERING CHECK] Saving the model 
[ENGINEERING RESULT] âœ… Model successfully saved to: C:\Users\fmojt\Code\DPThesis\DP_Thesis\models\random_forest_pipeline.joblib


In [157]:
df_test_path = ENGINEERING_MAPPINGS['test']['output']
# top_features.append('label')
df_test = pd.read_feather(df_test_path)
df_test = df_test[top_features]

df_test.columns
df_test.to_feather("test.feather")


ValueError: Duplicate column names found: ['has_bug_kw', 'loc_added_x_hunks_count', 'line_token_total', 'ast_delta', 'msg_emb_20', 'code_emb_3', 'loc_deleted', 'msg_emb_8', 'loc_deleted_x_hunks_count', 'label', 'msg_emb_0', 'max_func_change', 'msg_len', 'msg_emb_3', 'hunks_count', 'has_fix_kw', 'code_emb_8', 'code_emb_2', 'code_emb_5', 'loc_added_bucket_cat', 'author_exp_pre', 'label']

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.model_selection import GridSearchCV

# 1. Define the model
rf = RandomForestClassifier(random_state=42, class_weight='balanced')

# 2. Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5, 10],
    'max_features': ['sqrt', 'log2']
}

# 3. Create a custom scorer for MCC
mcc_scorer = make_scorer(matthews_corrcoef)

# 4. Set up Grid Search
grid_search = GridSearchCV(
    estimator=rf, 
    param_grid=param_grid, 
    scoring=mcc_scorer, 
    cv=5,            # 5-fold cross-validation
    n_jobs=1,       # Use all CPU cores
    verbose=3
)

# 5. Run the search
grid_search.fit(X_train, y_train)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best MCC Score: {grid_search.best_score_}")

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END max_depth=10, max_features=sqrt, min_samples_split=2, n_estimators=100;, score=0.614 total time=  36.1s
[CV 2/5] END max_depth=10, max_features=sqrt, min_samples_split=2, n_estimators=100;, score=0.611 total time=  37.9s
[CV 3/5] END max_depth=10, max_features=sqrt, min_samples_split=2, n_estimators=100;, score=0.613 total time=  36.1s
[CV 4/5] END max_depth=10, max_features=sqrt, min_samples_split=2, n_estimators=100;, score=0.610 total time=  35.5s
[CV 5/5] END max_depth=10, max_features=sqrt, min_samples_split=2, n_estimators=100;, score=0.614 total time=  36.4s
[CV 1/5] END max_depth=10, max_features=sqrt, min_samples_split=2, n_estimators=200;, score=0.616 total time= 1.1min
[CV 2/5] END max_depth=10, max_features=sqrt, min_samples_split=2, n_estimators=200;, score=0.611 total time= 1.1min
