# 07 - Model Training

## 7.1 Set-Up

### 7.1.1 - Logger

In [1]:
from logging_config import LOG_PATH, NotebookLogger

# logger, log_start, log_check, log_result = setup_notebook_logging(label="ENGINEERING")
# Notebook-scoped logger: label "TRAIN", file log path taken from LOG_PATH.
logger = NotebookLogger(
    label="TRAIN",
    notebook_name=None,
    file_log_path=LOG_PATH,
)

# Record the start marker, then confirm on the console that logging is live.
logger.log_start(print_to_console=True)
logger.log_result("Logging configured.", print_to_console=True)

[TRAIN RESULT] Logging configured.


### 7.1.2 - Project Root

In [2]:
# logger.info("Setting up root by appending the parent to the sys...")
from jupyter_init import setup

# Project-root bootstrap for section 7.1.2.
# NOTE(review): per the commented log message at the top of this cell, this
# presumably appends the parent directory to sys.path so the project imports
# in 7.1.3 resolve — confirm in jupyter_init.
setup()

### 7.1.3 Libs

In [None]:
from notebooks.juputils import display_func
from src_code.config import *


# --- local modules ---

from src_code.ml_pipeline.df_utils import load_df

from main_config import setup as main_setup
from src_code.ml_pipeline.feature_config import DROP_COLS
from src_code.ml_pipeline.training.utils import drop_cols
from src_code.ml_pipeline.training.utils import analyze_features

from main_config import RANDOM_STATE
from notebooks.constants import TARGET
from main_config import TEST_SPLIT

from src_code.ml_pipeline.models import RFWrapper
from src_code.ml_pipeline.pipelines import RFPipelineWrapper
from src_code.ml_pipeline.validations import CVWrapper
from src_code.ml_pipeline.validations import CVWrapper
from src_code.ml_pipeline.training.train import fit_model
from src_code.ml_pipeline.training.train import split_train_test
from src_code.ml_pipeline.training.tuning import RFTuningWrapper
from src_code.ml_pipeline.training.train import check_single_infer
from src_code.ml_pipeline.feature_importance import PFIWrapper
from src_code.ml_pipeline.df_utils import save_model

main_setup()

### 7.1.4 Loading Dataset

In [None]:
logger.log_check("Loading the dataset...")

# TRANSFORMED_DF = EXTRACTED_DATA_DIR / "train_labeled_features_partial.feather"
# PREPROCESSED_DF = PROCESSED_DATA_DIR / "train_engineered.feather"
# Path of the engineered training frame, looked up in ENGINEERING_MAPPINGS.
# NOTE(review): ENGINEERING_MAPPINGS is not imported explicitly in this
# notebook; presumably it comes from the wildcard import of src_code.config.
DF_PATH = ENGINEERING_MAPPINGS['train']['output']

# ---- LOAD ----
# df = pd.read_feather(DF_PATH)
# df.describe()

# load_df receives the notebook logger; the recorded output shows it logging
# the row and column counts of the loaded frame.
df = load_df(DF_PATH, logger=logger)
# Last expression of the cell: the summary statistics are displayed.
df.describe()

[TRAIN CHECK] Loading the dataset...
[TRAIN CHECK] Loading the dataset...
[TRAIN RESULT] Loaded dataframe with 139545 rows and 103 columns



Unnamed: 0,author_exp_pre,author_recent_activity_pre,loc_added,loc_deleted,files_changed,hunks_count,msg_len,ast_delta,complexity_delta,max_func_change,time_since_last_change,recent_churn,todo,fixme,try,except,raise,code_emb_0,code_emb_1,code_emb_2,code_emb_3,code_emb_4,code_emb_5,code_emb_6,code_emb_7,...,msg_emb_34,msg_emb_35,msg_emb_36,msg_emb_37,msg_emb_38,msg_emb_39,msg_emb_40,msg_emb_41,msg_emb_42,msg_emb_43,msg_emb_44,label,has_fix_kw,has_bug_kw,loc_churn_ratio,activity_per_exp,line_token_total,todo_ratio,fixme_ratio,try_ratio,except_ratio,raise_ratio,loc_added_x_loc_deleted,loc_added_x_hunks_count,loc_deleted_x_hunks_count
count,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,...,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0,139545.0
mean,4.432032,2.91375,2.982764,3.539947,1.950455,3.151959,4.540631,2.134896,0.625291,3.43658,6.804231,6.250559,0.060705,0.012647,0.544129,0.263748,0.195102,-4.223387e-15,-7.299682e-16,1.538147e-15,7.821087000000001e-17,-2.607029e-17,2.313738e-16,-5.768052e-16,2.437572e-15,...,1.140575e-16,4.53786e-16,1.5479240000000002e-17,6.150959e-17,8.951988e-16,1.466454e-16,4.252716e-16,-1.162674e-15,-1.189457e-15,1.792333e-16,2.338179e-16,0.461149,0.304927,0.057279,0.638588,0.500729,1.076331,0.013312,0.004803,0.116986,0.053682,0.036393,12.748085,11.486308,13.122828
std,2.113777,1.577641,1.74927,1.671126,1.057575,1.40962,0.859233,2.288227,0.862086,1.841744,2.68301,3.207311,0.261651,0.135984,1.059338,0.703362,0.567292,2.219653,1.448152,1.023242,0.7669118,0.6201064,0.5688169,0.4886902,0.3937198,...,0.2104797,0.2041841,0.2027971,0.200119,0.1964164,0.1941571,0.1892906,0.188126,0.1853564,0.1834569,0.1833782,0.49849,0.460378,0.232376,0.407399,0.223706,1.945555,0.062077,0.050329,0.291973,0.156061,0.102202,10.214727,9.270532,9.355984
min,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.425968,-5.010717,-5.25859,-3.103425,-2.14257,-2.911938,-2.13968,-2.444615,...,-0.9364891,-1.056244,-1.008676,-1.142868,-1.119048,-0.9300965,-1.063362,-1.023447,-0.9268058,-1.955976,-1.077711,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.995732,1.791759,1.386294,2.302585,1.098612,1.94591,3.912023,0.0,0.0,2.772589,5.605802,4.644391,0.0,0.0,0.0,0.0,0.0,-0.6762461,-1.025519,-0.7850446,-0.4953535,-0.4271986,-0.2145677,-0.2887873,-0.2488683,...,-0.1351221,-0.1290165,-0.1292397,-0.128522,-0.1269535,-0.1172182,-0.1226275,-0.1173649,-0.1187762,-0.1144729,-0.1209108,0.0,0.0,0.0,0.422698,0.405511,0.0,0.0,0.0,0.0,0.0,0.0,3.131822,2.817885,4.651874
50%,4.875197,3.091042,3.135494,3.806662,1.791759,3.135494,4.330733,1.098612,0.0,4.025352,7.465083,7.16472,0.0,0.0,0.0,0.0,0.0,-0.2655769,-0.09212816,-0.09274639,-0.1490366,-0.07647201,0.0613629,0.02407541,-0.01038398,...,-0.0003622844,-0.0005184692,0.003640943,-0.00500266,-0.007044206,0.003589629,-0.0002091944,0.0008205552,-0.0003348134,-0.001941068,-0.00079104,0.0,0.0,0.0,0.67765,0.551684,0.0,0.0,0.0,0.0,0.0,0.0,11.29739,9.729218,12.030912
75%,6.167516,4.219508,4.543295,4.941642,2.833213,4.343805,5.241747,4.564348,1.386294,4.762174,8.81046,8.837246,0.0,0.0,0.693147,0.0,0.0,0.2085006,0.9974107,0.703516,0.414589,0.3604159,0.2961761,0.274274,0.2392909,...,0.1335562,0.1313729,0.1255679,0.1247857,0.1195289,0.1273497,0.121624,0.118188,0.1164196,0.1170017,0.119832,1.0,1.0,0.0,0.798161,0.655915,1.386294,0.0,0.0,0.165902,0.0,0.0,21.31589,18.920826,20.781406
max,7.058328,5.081404,5.433722,5.814131,3.637586,5.204007,5.985195,5.474369,2.140066,5.592851,9.702105,9.744433,4.691348,3.970292,12.435612,9.04133,7.526179,16.1572,5.846751,4.084592,3.901235,2.935733,1.421278,2.155233,1.403595,...,1.154362,1.319173,0.9797174,0.9108456,1.581407,1.092264,1.183203,0.9005868,1.503891,1.156565,0.9212782,1.0,1.0,1.0,5.433722,0.835564,33.835868,1.098612,1.098612,5.883322,2.639057,2.197225,31.592369,28.277126,30.256774


## 7.2 Model Training

### 7.2.1 Target & Column Separation

In [5]:
display_func(drop_cols)

In [6]:
# TARGET = "label"

# # Drop identifiers & leakage-prone columns
# DROP_COLS = [
#     "commit",
#     "repo",
#     "filepath",
#     "author_email",
#     "datetime",
#     "canonical_datetime",
#     "content",
#     "methods",
#     "lines",
#     "files_changed",
#     "loc_added_bucket"
# ]

# df = df.drop(columns=DROP_COLS, errors="ignore")
# my_list = df.columns.values.tolist()
# print(my_list)


# Drop the columns listed in DROP_COLS (imported from feature_config) and
# rebind `df` to the result.
# NOTE(review): `df` is overwritten, so the frame loaded in 7.1.4 is no longer
# available under this name once the cell has run.
df = drop_cols(df=df, cols=DROP_COLS, logger=logger)

[TRAIN CHECK] Dropping the specified columns...
[TRAIN RESULT] Dropping completed.
[TRAIN RESULT] Columns dropped: 10
[TRAIN RESULT] Columns remaining: 93


### 7.2.2 Embedding Handling (CodeBERT)
The embedding columns must be transformed because machine learning models — especially traditional ones such as Logistic Regression, Random Forests, or Gradient Boosting — cannot directly process a list or a NumPy array stored as a single entry (a cell) in a pandas DataFrame.

The process is a necessary feature engineering step that converts the single embedding column into many separate numerical columns. This technique is often referred to as feature expansion or flattening the embedding vector.

### 7.2.3 Feature Type Identification

In [7]:
display_func(analyze_features)

In [None]:
# Log a breakdown of the feature columns of `df`, with TARGET passed as the
# label column. The helper's return value is not stored.
analyze_features(df=df, target=TARGET, logger=logger)

# numeric_features = df.select_dtypes(include=["float64", "int64", "int8"]).columns.tolist()
# numeric_features.remove(TARGET)
# logger.log_result(f"Numeric features: {numeric_features}", print_to_console=True)

# categorical_features = df.select_dtypes(include=["category"]).columns.tolist()
# logger.log_result(f"Categorical features: {categorical_features}", print_to_console=True)

# structured_features = [
#     f for f in numeric_features
#     if not f.startswith(("code_emb_", "msg_emb_"))
# ]
# logger.log_result(f"Structural features: {structured_features}", print_to_console=True)
# logger.log_result(len(structured_features), print_to_console=True)


# embedding_features = [
#     f for f in numeric_features
#     if f.startswith(("code_emb_", "msg_emb_"))
# ]
# logger.log_result(f"embedding_features: {embedding_features}", print_to_console=True)



[TRAIN CHECK] Starting df feature analysis...
[TRAIN RESULT] Numeric features: ['author_exp_pre', 'author_recent_activity_pre', 'loc_added', 'loc_deleted', 'hunks_count', 'msg_len', 'ast_delta', 'complexity_delta', 'max_func_change', 'time_since_last_change', 'recent_churn', 'todo', 'fixme', 'try', 'except', 'raise', 'code_emb_0', 'code_emb_1', 'code_emb_2', 'code_emb_3', 'code_emb_4', 'code_emb_5', 'code_emb_6', 'code_emb_7', 'code_emb_8', 'code_emb_9', 'msg_emb_0', 'msg_emb_1', 'msg_emb_2', 'msg_emb_3', 'msg_emb_4', 'msg_emb_5', 'msg_emb_6', 'msg_emb_7', 'msg_emb_8', 'msg_emb_9', 'msg_emb_10', 'msg_emb_11', 'msg_emb_12', 'msg_emb_13', 'msg_emb_14', 'msg_emb_15', 'msg_emb_16', 'msg_emb_17', 'msg_emb_18', 'msg_emb_19', 'msg_emb_20', 'msg_emb_21', 'msg_emb_22', 'msg_emb_23', 'msg_emb_24', 'msg_emb_25', 'msg_emb_26', 'msg_emb_27', 'msg_emb_28', 'msg_emb_29', 'msg_emb_30', 'msg_emb_31', 'msg_emb_32', 'msg_emb_33', 'msg_emb_34', 'msg_emb_35', 'msg_emb_36', 'msg_emb_37', 'msg_emb_38', 'msg_

### 7.2.4 Train / Test Split (Stratified)

In [None]:
display_func(split_train_test)

In [None]:
# X = df.drop(columns=[TARGET])
# y = df[TARGET]

# X_train, X_test, y_train, y_test = train_test_split(
#     X,
#     y,
#     test_size=0.2,
#     # stratify=y, not required since the training subset of the original df is balanced
#     random_state=RANDOM_STATE
# )

# from main_config import TEST_SPLIT

# Split `df` into train and test subsets. The seed (RANDOM_STATE), the label
# column (TARGET) and the test fraction (TEST_SPLIT) are all imported
# project constants.
X_train, X_test, y_train, y_test = split_train_test(
    df=df,
    target=TARGET,
    random_state=RANDOM_STATE,
    test_size=TEST_SPLIT,
    logger=logger,
)

[TRAIN CHECK] Splitting df into train & test subsets...
[TRAIN RESULT] Total rows before split: 139545
[TRAIN RESULT] Feature count (X): 92
[TRAIN RESULT] Target column: 'label'
[TRAIN RESULT] Test size: 20.00%
[TRAIN RESULT] Train rows: 111636 | Test rows: 27909
[TRAIN RESULT] Splitting completed.


### 7.2.5 Preprocessing Pipeline

Design choices (aligned with the EDA):
- No scaling for trees
- Remove zero-variance features
- Keep engineered features (selection later)

Why VarianceThreshold is useful: This step removes any numerical features that have zero variance (i.e., all values are identical). Features with zero variance provide no information to the model and can sometimes cause issues or slow down training, so it's good practice to remove them.

#### PCA
PCA is an unsupervised linear transformation technique used for dimensionality reduction. Its goal is to reduce the number of features while retaining as much of the original variance (information) as possible.

`n_components` is the target number of dimensions. PCA transforms the original embedding features (768 per embedding, 1536 once the code and message embeddings are combined) into a new, smaller set of 100 features.

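**In Context:** You are combining 768 Code embeddings and 768 Message embeddings, resulting in 1536 embedding features. PCA reduces this set of 1536 features down to a manageable and non-redundant set of 100 features that still capture most of the semantic meaning. Note that the dataframe loaded in 7.1.4 already contains reduced embedding columns (`code_emb_0`–`code_emb_9` and `msg_emb_0`–`msg_emb_44`), so the reduction appears to have been applied before this notebook.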


### 7.2.5 Baseline Random Forest Model

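The combined object is a Pipeline, which ensures that the preprocessing steps are always applied correctly before the Random Forest Classifier is trained or used for prediction. In the cell below, `model` is the estimator returned by `RFWrapper.get_model()`, and `rf_pipeline` is the `RFPipelineWrapper` built around it.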

In [None]:
# rf = RandomForestClassifier(
#     n_estimators=300,
#     max_depth=20,
#     # min_samples_leaf=2,
#     random_state=RANDOM_STATE,
#     max_features="log2",
#     min_samples_split=2,
#     # class_weight=CLASS_WEIGHT,
#     n_jobs=1              # 🔴 IMPORTANT
# )

# model = Pipeline(steps=[
#     # ("preprocess", preprocessor),
#     ("rf", rf)
# ])

# Build the Random Forest through the project wrapper, seeded with RANDOM_STATE.
rf_wrapper = RFWrapper(random_state=RANDOM_STATE)
# `model` is the object that the later cells tune, cross-validate, fit and save.
model = rf_wrapper.get_model()
# NOTE(review): `rf_pipeline` is not referenced by any later cell shown in this
# notebook; confirm whether it, rather than `model`, was meant to be trained.
rf_pipeline = RFPipelineWrapper(rf=model)

[DEF LOGGER CHECK] Defining Random Forest...
[DEF LOGGER RESULT] Model definition done.
[DEF LOGGER CHECK] Defining RF Pipeline...
[DEF LOGGER RESULT] Pipeline definition done.


### 7.2.6 Hyperparameter Tuning

In [None]:
# Tuning wrapper bound to the current model and the training split.
rf_tuning_wrapper = RFTuningWrapper(rf=model, X_train=X_train, y_train=y_train)

# Pass the grid-search method to display_func before it is executed in the
# next cell (presumably this renders the method for inspection — confirm).
display_func(rf_tuning_wrapper.run_grid_search)

[DEF LOGGER CHECK] Initializing RFTunning Wrapper object...
[DEF LOGGER RESULT] Initialization completed.


In [13]:
# NOTE(review): this repeats the construction from the previous cell; the
# existing `rf_tuning_wrapper` could be reused instead.
rf_tuning_wrapper = RFTuningWrapper(rf=model, X_train=X_train, y_train=y_train)
# Long-running: the recorded output reports 5 folds x 36 candidates = 180 fits.
# `best_score` is not used by any later cell shown in this notebook.
best_params, best_score = rf_tuning_wrapper.run_grid_search()
# --- Update Model ---
model.set_params(**best_params)  # make sure pipeline/model uses the tuned parameters

[DEF LOGGER CHECK] Initializing RFTunning Wrapper object...
[DEF LOGGER RESULT] Initialization completed.
[DEF LOGGER CHECK] Running grid search...
Fitting 5 folds for each of 36 candidates, totalling 180 fits


KeyboardInterrupt: 

### 7.2.7 Cross-Validation (Primary Evaluation)

In [None]:
# Cross-validation helper seeded with RANDOM_STATE.
cv_wrapper = CVWrapper(random_state=RANDOM_STATE)
# Pass both methods to display_func before they are called in the next cell.
display_func(cv_wrapper.cross_validate)
display_func(cv_wrapper.mean_results)

[DEF LOGGER CHECK] Defining cross-validation...
[DEF LOGGER RESULT] Validation definition done.


In [None]:
# # -------------------------------------------------------------------------
# # 1. Define the KFold splitter (non-stratified)
# # -------------------------------------------------------------------------
# # NOTE: This does NOT guarantee equal class proportions in each fold.
# from sklearn.model_selection import KFold


# cv = KFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)

# # -------------------------------------------------------------------------
# # 2. Define the scoring metrics
# # -------------------------------------------------------------------------
# scoring = {
#     "roc_auc": "roc_auc",
#     "f1": "f1",
#     "precision": "precision",
#     "recall": "recall"
# }

# # -------------------------------------------------------------------------
# # 3. Execute the cross-validation
# # -------------------------------------------------------------------------
# cv_results = cross_validate(
#     model,      # Your machine learning pipeline
#     X_train,    # Training features
#     y_train,    # Training labels
#     cv=cv,      # The KFold splitter
#     scoring=scoring,
#     n_jobs=6,    # Use CPU cores
#     verbose=3
# )

# # -------------------------------------------------------------------------
# # 4. View and aggregate the results
# # -------------------------------------------------------------------------
# # The mean of the results gives the model's average performance.
# average_metrics = pd.DataFrame(cv_results).mean()

# print("\n--- Average Cross-Validation Metrics (KFold) ---")
# print(average_metrics)

# Cross-validate the current model using the training split only.
cv_wrapper = CVWrapper(random_state=RANDOM_STATE)
cv_results = cv_wrapper.cross_validate(model=model, X_train=X_train, y_train=y_train)

# Last expression of the cell, so whatever it returns is displayed.
cv_wrapper.mean_results()

[DEF LOGGER CHECK] Defining cross-validation...
[DEF LOGGER RESULT] Validation definition done.
[DEF LOGGER CHECK] Cross validating provided model...


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.


KeyboardInterrupt: 

### 7.2.8 Model fit

In [None]:
display_func(fit_model)

In [17]:
# Fit the current `model` on the training split and rebind `model` to the
# object returned by fit_model; later cells use it for the inference check,
# permutation importance and saving.
# The notebook logger is passed explicitly so this step is recorded through
# the same logger as the other training cells (fit_model accepts a `logger`
# keyword, as the retraining call in 7.2.10.2 shows) instead of the
# helper's default logger.
model = fit_model(model=model, X_train=X_train, y_train=y_train, logger=logger)


[DEF LOGGER CHECK] Starting model fit...
[DEF LOGGER RESULT] Model fit completed. Time: 124.375742


### 7.2.9 Single inference check

In [None]:
display_func(check_single_infer)

In [None]:
# import time

# start_time = time.time()

# # This call runs the entire pipeline: Preprocessing (PCA) + Random Forest Prediction
# # The output is not needed, just the execution time.
# _ = model.predict(X_test)

# end_time = time.time()
# single_inference_duration = end_time - start_time

# print(f"Time for a single inference run on X_test ({len(X_test)} rows): {single_inference_duration:.2f} seconds")

# Run one prediction pass over the whole test split; the recorded output shows
# the elapsed time being reported through the notebook logger.
check_single_infer(model=model, X_test=X_test, logger=logger)

[TRAIN CHECK] Checking single model inference...
[TRAIN RESULT] Inference done.
[TRAIN RESULT] Time for a single inference run on X_test (27909 rows): 1.60 seconds


### 7.2.10 Feature Importance (Permutation-Based)

Critical due to correlated engineered features

You are calculating and displaying the **Permutation Feature Importance (PFI)** for your entire machine learning pipeline (model) using the held-out test set. This is a crucial step in model interpretation, especially for complex models like Random Forests.

The permutation_importance function calculates the drop in a model's score when a single feature is randomly shuffled (permuted).

How it Works:

1. The function first calculates the model's baseline score (e.g., ROC AUC) on the unshuffled X_test and y_test.

2. For each feature (e.g., `loc_added`), it randomly shuffles the values in that column across the entire X_test set.

3. It then recalculates the model's score using this corrupted data.

4. The Permutation Importance is the difference between the baseline score and the score with the shuffled feature. A large drop in score indicates the feature was highly important.

In [None]:
# Upper bound on the number of test rows handed to the PFI wrapper.
PFI_SAMPLE_SIZE = 5000

# Seeded subsample of the test split. The size is capped at the number of
# available rows because `sample` raises ValueError when `n` exceeds the
# population and sampling is done without replacement.
X_test_small = X_test.sample(
    n=min(PFI_SAMPLE_SIZE, len(X_test)), random_state=RANDOM_STATE
)
# Select the labels by index so they stay aligned with the sampled rows.
y_test_small = y_test.loc[X_test_small.index]

# Wrapper bound to the fitted model and the subsample; it is reused by the
# next cell to run the permutation pass.
pfi_wrapper = PFIWrapper(
    model=model,
    X_test=X_test_small,
    y_test=y_test_small,
    random_state=RANDOM_STATE,
    logger=logger,
)

# Pass both methods to display_func before they are called in the next cell.
display_func(pfi_wrapper.run_PFI)
display_func(pfi_wrapper.calc_importances)

[TRAIN CHECK] Initializing PFI wrapper..
[TRAIN CHECK] Initialization done.


In [None]:
# Number of top-ranked features requested from get_importances.
TOP_K = 20


# pfi_wrapper = PFIWrapper(
#     model=model,
#     X_test=X_test_small,
#     y_test=y_test_small,
#     random_state=RANDOM_STATE,
#     logger=logger,
# )

# Reuses the `pfi_wrapper` built in the previous cell: run the permutation
# pass, compute the importances, then fetch the TOP_K entries.
pfi_wrapper.run_PFI()
pfi_wrapper.calc_importances()
# NOTE(review): a later recorded error ("'zip' object has no attribute
# 'items'") suggests the returned value is a zip object rather than a
# Series or dict — confirm before iterating over it more than once.
importances = pfi_wrapper.get_importances(top_k=TOP_K)


# # The total number of tasks is N_features * n_repeats
# # n_features = len(model.named_steps["preprocess"].get_feature_names_out())
# n_features = X_test_small.shape[1]
# total_tasks = n_features * 2

# # with parallel_backend('loky', n_jobs=-1): # Use all cores
#     # with tqdm.tqdm(total=total_tasks, desc="PFI Permutations") as progress_bar:
#         # Wrap the function call in a helper that updates the progress bar
#         # This is a bit advanced but forces joblib to use the tqdm callback

# # NOTE: In modern scikit-learn/joblib, simply setting the backend
# # is often enough to show the progress. If not, this is the safest way:
# perm = permutation_importance(
#     model,
#     X_test_small,
#     y_test_small,
#     n_repeats=2,
#     random_state=RANDOM_STATE,
#     n_jobs=6, # <--- Re-enabled parallel processing
# )

# importances = pd.Series(
#     perm.importances_mean, # Retrieves the average importance score
#                             # (the average drop in model performance)
#                             # calculated across the n_repeats=2 runs
#                             # for each feature.
#     # index=model.named_steps["preprocess"].get_feature_names_out()\
#     index=X_test_small.columns

#     # This is a crucial step for pipelines. After the ColumnTransformer
#     # ("preprocess") has run (including PCA and any other steps), the feature
#     #  names are transformed (e.g., code_emb_0 becomes embed__pca__0). This
#     # method retrieves the correct, final feature names that the model actually used.
# ).sort_values(ascending=False)

# importances.head(20)

[TRAIN CHECK] Starting PFI...
[TRAIN RESULT] PFI completed.
[TRAIN CHECK] Calculating PFI importances...
[TRAIN RESULT] Calculation complete.
[TRAIN RESULT] Top 20 PFI features:
[TRAIN RESULT]  1. max_func_change: 0.007300
[TRAIN RESULT]  2. msg_len: 0.005900
[TRAIN RESULT]  3. msg_emb_0: 0.005800
[TRAIN RESULT]  4. loc_deleted_x_hunks_count: 0.005600
[TRAIN RESULT]  5. msg_emb_3: 0.004300
[TRAIN RESULT]  6. ast_delta: 0.003700
[TRAIN RESULT]  7. loc_added_x_hunks_count: 0.003500
[TRAIN RESULT]  8. code_emb_2: 0.003300
[TRAIN RESULT]  9. hunks_count: 0.003300
[TRAIN RESULT] 10. loc_added_x_loc_deleted: 0.003100
[TRAIN RESULT] 11. loc_deleted: 0.003000
[TRAIN RESULT] 12. loc_churn_ratio: 0.002800
[TRAIN RESULT] 13. code_emb_4: 0.002800
[TRAIN RESULT] 14. code_emb_0: 0.002700
[TRAIN RESULT] 15. code_emb_7: 0.002500
[TRAIN RESULT] 16. msg_emb_22: 0.002400
[TRAIN RESULT] 17. code_emb_5: 0.002300
[TRAIN RESULT] 18. msg_emb_24: 0.002200
[TRAIN RESULT] 19. msg_emb_14: 0.002200
[TRAIN RESULT] 

PFI is generally preferred because:

1. **Model Agnostic**: It works for any model (Random Forest, Neural Network, etc.).

2. **Includes Preprocessing**: It measures the importance of features after they have gone through the entire pipeline (including PCA), giving you the importance of the final, processed features, which is essential when dealing with complex pipelines.

In [None]:
# for index, (key, value) in enumerate(importances.items()):
#     print(f"{index}. {key}: {value}")

AttributeError: 'zip' object has no attribute 'items'

#### 7.2.10.1 Feature Subset Refinement (Optional Iteration)
(Re-run steps 8–11 using reduced feature set)

In [23]:
display_func(pfi_wrapper.refine_features)

In [24]:
# Importance cut-off handed to refine_features at the end of this cell.
threshold = 0.0001 # Or use 0.0 to be more inclusive
# top_features = importances[importances > threshold].index.tolist()

# # Filter your training and testing sets
# X_train_filtered = X_train[top_features]
# X_test_filtered = X_test[top_features]
# # df_test = pd.read_feather(ENGINEERING_MAPPINGS['test']['output'])
# # top_filter = top_features.copy()
# # top_filter.append('label')
# # print(top_filter)
# # df_test = df_test[top_filter]
# # df_test.to_feather("lala.feather")


# print(f"Reduced feature count from {len(importances)} to {len(top_features)}")

# Keep only the features selected by the PFI wrapper and rebind both splits.
# NOTE(review): the importances were computed on a sample of X_test, so the
# test split has influenced feature selection; scores later measured on
# X_test may be optimistic. Consider deriving the importances from a
# validation fold of the training data instead.
# NOTE(review): X_train / X_test are overwritten, so the full-width splits are
# no longer available under these names once the cell has run.
X_train, X_test = pfi_wrapper.refine_features(X_train=X_train, X_test=X_test, threshold=threshold)

[TRAIN CHECK] Refining features based on best PFI importances...
[TRAIN RESULT] Reduced feature count from 92 to 77


#### 7.2.10.2 Retraining

In [25]:
# This step trains the single, final model that is kept in the 'model'
# variable and saved in section 7.3.
# X_train here is the reduced feature set returned by refine_features in the
# previous cell, so the model is refitted on the retained columns only.
# model.fit(X_train_filtered, y_train)
# model.fit(X_train, y_train)
model = fit_model(model=model, X_train=X_train, y_train=y_train, logger=logger)

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",300
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",20
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'log2'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


### 7.2.11 RFECV (alternative to PFI, demo only)

In [None]:
# from sklearn.feature_selection import RFECV
# from sklearn.model_selection import StratifiedKFold

# # 1. Use the RF model we already defined
# # Note: RFECV uses 'feature_importances_' (built-in) which is faster than Permutation
# rf = RandomForestClassifier(
#     n_estimators=100, 
#     class_weight='balanced', 
#     random_state=42,
#     n_jobs=-1
# )

# # 2. Set up the selector
# # we use StratifiedKFold to keep the bug/no-bug ratio consistent
# min_features_to_select = 5 
# step = 1 # remove 1 feature at a time

# selector = RFECV(
#     estimator=rf,
#     step=step,
#     cv=StratifiedKFold(5),
#     scoring=mcc_scorer, # Using the MCC scorer we made earlier!
#     min_features_to_select=min_features_to_select,
#     n_jobs=-1,
#     verbose=1
# )

# # 3. Fit to the training data
# selector = selector.fit(X_train, y_train)

# # 4. Results
# print(f"Optimal number of features: {selector.n_features_}")
# selected_features = X_train.columns[selector.support_].tolist()
# print(f"Selected Features: {selected_features}")

KeyboardInterrupt: 

## 7.3 Saving the Model

*joblib.dump()* is used to save the model object to a file. It is generally preferred over standard Python pickle for large objects containing NumPy arrays (like your Random Forest and PCA objects).

In [None]:
display_func(save_model)

In [29]:
# Define the filename (e.g., in a 'models' directory)
# MODEL_SAVE_PATH = "models/random_forest_pipeline.joblib"
# Destination file for the serialized model.
# NOTE(review): MODEL_DIR is not imported explicitly in this notebook;
# presumably it comes from the wildcard import of src_code.config.
MODEL_SAVE_PATH = MODEL_DIR / "random_forest_pipeline.joblib"
# log_check("Saving the model ")

# # Ensure the directory exists
# os.makedirs("models", exist_ok=True)
# MODEL_SAVE_PATH = MODEL_DIR / "random_forest_pipeline.joblib"
# # Save the entire fitted pipeline
# joblib.dump(model, MODEL_DIR / MODEL_SAVE_PATH)

# log_result(f"✅ Model successfully saved to: {MODEL_SAVE_PATH}", print_to_console=True)
# save_df(df=df, df_file_path=MODEL_SAVE_PATH, logger=logger)
# Persist the model refitted in 7.2.10.2 to MODEL_SAVE_PATH.
# NOTE(review): that model was trained on the refined feature subset, so
# inference inputs presumably need the same columns — confirm downstream.
save_model(model=model, path=MODEL_SAVE_PATH, logger=logger)

[TRAIN CHECK] Saving the trained model...
[TRAIN RESULT] Saving done.


In [None]:
# df_test_path = ENGINEERING_MAPPINGS['test']['output']
# # top_features.append('label')
# df_test = pd.read_feather(df_test_path)
# df_test = df_test[top_features]

# df_test.columns
# df_test.to_feather("test.feather")


ValueError: Duplicate column names found: ['has_bug_kw', 'loc_added_x_hunks_count', 'line_token_total', 'ast_delta', 'msg_emb_20', 'code_emb_3', 'loc_deleted', 'msg_emb_8', 'loc_deleted_x_hunks_count', 'label', 'msg_emb_0', 'max_func_change', 'msg_len', 'msg_emb_3', 'hunks_count', 'has_fix_kw', 'code_emb_8', 'code_emb_2', 'code_emb_5', 'loc_added_bucket_cat', 'author_exp_pre', 'label']

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import make_scorer, matthews_corrcoef
# from sklearn.model_selection import GridSearchCV

# # 1. Define the model
# rf = RandomForestClassifier(random_state=42, class_weight='balanced')

# # 2. Define the parameter grid
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [10, 20],
#     'min_samples_split': [2, 5, 10],
#     'max_features': ['sqrt', 'log2']
# }

# # 3. Create a custom scorer for MCC
# mcc_scorer = make_scorer(matthews_corrcoef)

# # 4. Set up Grid Search
# grid_search = GridSearchCV(
#     estimator=rf, 
#     param_grid=param_grid, 
#     scoring=mcc_scorer, 
#     cv=5,            # 5-fold cross-validation
#     n_jobs=1,       # Use all CPU cores
#     verbose=3
# )

# # 5. Run the search
# grid_search.fit(X_train, y_train)

# print(f"Best Parameters: {grid_search.best_params_}")
# print(f"Best MCC Score: {grid_search.best_score_}")
# from src_code.ml_pipeline.training.tuning import RFTuningWrapper


# rf_tuning_wrapper = RFTuningWrapper(rf=model, X_train=X_train, y_train=y_train)
# best_params, best_score = rf_tuning_wrapper.run_grid_search()


Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END max_depth=10, max_features=sqrt, min_samples_split=2, n_estimators=100;, score=0.614 total time=  36.1s
[CV 2/5] END max_depth=10, max_features=sqrt, min_samples_split=2, n_estimators=100;, score=0.611 total time=  37.9s
[CV 3/5] END max_depth=10, max_features=sqrt, min_samples_split=2, n_estimators=100;, score=0.613 total time=  36.1s
[CV 4/5] END max_depth=10, max_features=sqrt, min_samples_split=2, n_estimators=100;, score=0.610 total time=  35.5s
[CV 5/5] END max_depth=10, max_features=sqrt, min_samples_split=2, n_estimators=100;, score=0.614 total time=  36.4s
[CV 1/5] END max_depth=10, max_features=sqrt, min_samples_split=2, n_estimators=200;, score=0.616 total time= 1.1min
[CV 2/5] END max_depth=10, max_features=sqrt, min_samples_split=2, n_estimators=200;, score=0.611 total time= 1.1min
[CV 3/5] END max_depth=10, max_features=sqrt, min_samples_split=2, n_estimators=200;, score=0.613 total time= 1.1min
[C