# 07 - Model Training

## 7.1 Set-Up

### 7.1.1 - Initial Configuration

In [1]:
# Bootstrap the notebook environment. The logger is configured in a later cell,
# so nothing is logged here.
# NOTE(review): presumably `setup()` appends the project root to `sys.path` so
# the project imports below resolve — confirm in `jupyter_init`.
from jupyter_init import setup

setup()

# Star import of the project configuration.
# NOTE(review): later cells use `PREPROCESSED_DATA_DIR`, which is not defined in
# this notebook and presumably comes from this import — an explicit import list
# would make that dependency visible.
from src_code.config import *

### 7.1.2 - Setting up Logger

In [2]:
from notebooks.logging_config import setup_notebook_logging

# Obtain the logger and the three logging helpers used throughout this notebook.
# The label shows up as the prefix of console messages
# (e.g. "[ENGINEERING RESULT] ...").
# NOTE(review): the label reads "ENGINEERING" although this notebook is titled
# "Model Training" — confirm whether that is intentional.
logger, log_start, log_check, log_result = setup_notebook_logging(label="ENGINEERING")

# Mark the start of the run and confirm that logging works, echoing to the console.
log_start(print_to_console=True)
log_result("Logging configured.", print_to_console=True)

[ENGINEERING RESULT] Logging configured.


### 7.1.3 Imports & Configuration

In [3]:
import numpy as np
import pandas as pd

# ML
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    precision_recall_curve,
    confusion_matrix
)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Feature selection
from sklearn.feature_selection import VarianceThreshold
from sklearn.inspection import permutation_importance

# Imbalance handling
from sklearn.utils.class_weight import compute_class_weight

# Reproducibility: a single seed shared by every stochastic step in this notebook
# (train/test split, PCA, random forest, CV shuffling, permutation importance).
RANDOM_STATE = 42
# Seeds NumPy's global legacy RNG; the scikit-learn objects below receive
# RANDOM_STATE explicitly through their `random_state` argument.
np.random.seed(RANDOM_STATE)

### 7.1.4 Loading Dataset

In [4]:
log_check("Loading the dataset...")
# NOTE(review): pandas and numpy are already imported in the imports cell above;
# seaborn is not used anywhere in the visible part of this notebook.
import pandas as pd
import numpy as np
import seaborn as sns

# Alternative input kept for reference (currently disabled):
# TRANSFORMED_DF = EXTRACTED_DATA_DIR / "train_labeled_features_partial.feather"
# Path of the engineered training set inside the preprocessed-data directory.
PREPROCESSED_DF = PREPROCESSED_DATA_DIR / "train_engineered.feather"

# ---- LOAD ----
# Read the feather file into the working frame used by every later cell.
df = pd.read_feather(PREPROCESSED_DF)

## 7.2 Model Training

### 7.2.1 Target & Column Separation

In [5]:
# Name of the column holding the prediction target.
TARGET = "label"

# Identifier and leakage-prone columns that must never reach the model.
DROP_COLS = [
    "commit", "repo",
    "filepath", "author_email",
    "datetime", "canonical_datetime",
]

# errors="ignore": columns already absent from the frame are skipped silently.
df = df.drop(DROP_COLS, axis="columns", errors="ignore")

### 7.2.2 Embedding Handling (CodeBERT)
The embedding columns must be transformed because machine learning models, especially traditional ones such as Logistic Regression, Random Forests, or Gradient Boosting, cannot directly process a list or a NumPy array stored as a single entry (a cell) in a pandas DataFrame.

The process is a necessary feature engineering step that converts the single embedding column into many separate numerical columns. This technique is often referred to as feature expansion or flattening the embedding vector.

In [6]:
def expand_embedding(df, col_name, prefix):
    """Flatten a column of embedding vectors into one numeric column per dimension.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the embedding column.
    col_name : str
        Name of the column whose cells each hold one embedding vector
        (a list or 1-D array). All vectors must have the same length.
    prefix : str
        Prefix of the generated column names; dimension ``i`` becomes
        ``f"{prefix}_{i}"``.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index as ``df`` and one column per embedding
        dimension. For an empty ``df`` an empty frame (no columns) carrying
        the same index is returned.

    Raises
    ------
    KeyError
        If ``col_name`` is not a column of ``df``.
    ValueError
        If the vectors in the column do not all have the same length.
    """
    # Fail on a missing column even when the frame is empty.
    vectors = df[col_name]

    # np.vstack raises on an empty sequence, so short-circuit the empty frame.
    if len(vectors) == 0:
        return pd.DataFrame(index=df.index)

    # Stack the per-row vectors into a (n_rows, n_dims) matrix.
    emb = np.vstack(vectors.tolist())
    columns = [f"{prefix}_{i}" for i in range(emb.shape[1])]

    # Reuse the original index so the result aligns with `df` on concat.
    return pd.DataFrame(emb, index=df.index, columns=columns)

In [7]:
# One wide numeric block per embedding source.
code_emb_df = expand_embedding(df, col_name="code_embed", prefix="code_emb")
msg_emb_df = expand_embedding(df, col_name="msg_embed", prefix="msg_emb")

# Swap the two raw vector columns for their expanded per-dimension columns.
df = pd.concat(
    objs=[
        df.drop(columns=["code_embed", "msg_embed"]),
        code_emb_df,
        msg_emb_df,
    ],
    axis="columns",
)

### 7.2.3 Feature Type Identification

In [8]:
# Numeric model inputs: every float64 / int64 / int8 column except the target.
# The target is filtered out rather than `.remove()`d, so the cell does not
# raise a ValueError when the label is stored with a dtype outside this list.
numeric_features = [
    col
    for col in df.select_dtypes(include=["float64", "int64", "int8"]).columns
    if col != TARGET
]
log_result(f"Numeric features: {numeric_features}", print_to_console=True)

# Columns with pandas `category` dtype. They are only reported here; the
# preprocessing cell in this notebook does not reference them.
categorical_features = df.select_dtypes(include=["category"]).columns.tolist()
log_result(f"Categorical features: {categorical_features}", print_to_console=True)

# Split the numeric columns by name into hand-crafted ("structured") features
# and the expanded embedding dimensions.
EMBEDDING_PREFIXES = ("code_emb_", "msg_emb_")

structured_features = [
    f for f in numeric_features
    if not f.startswith(EMBEDDING_PREFIXES)
]
log_result(f"Structural features: {structured_features}", print_to_console=True)


embedding_features = [
    f for f in numeric_features
    if f.startswith(EMBEDDING_PREFIXES)
]
log_result(f"embedding_features: {embedding_features}", print_to_console=True)



[ENGINEERING RESULT] Numeric features: ['author_exp_pre', 'author_recent_activity_pre', 'loc_added', 'loc_deleted', 'files_changed', 'hunks_count', 'msg_len', 'has_fix_kw', 'has_bug_kw', 'ast_delta', 'complexity_delta', 'max_func_change', 'time_since_last_change', 'todo', 'fixme', 'try', 'except', 'raise', 'recent_churn', 'recent_churn_winsorized', 'loc_churn_ratio', 'activity_per_exp', 'extreme_churn_flag', 'loc_added_bucket_cat', 'line_token_total', 'todo_ratio', 'fixme_ratio', 'try_ratio', 'except_ratio', 'raise_ratio', 'loc_added_x_loc_deleted', 'loc_added_x_hunks_count', 'loc_deleted_x_hunks_count', 'code_emb_0', 'code_emb_1', 'code_emb_2', 'code_emb_3', 'code_emb_4', 'code_emb_5', 'code_emb_6', 'code_emb_7', 'code_emb_8', 'code_emb_9', 'code_emb_10', 'code_emb_11', 'code_emb_12', 'code_emb_13', 'code_emb_14', 'code_emb_15', 'code_emb_16', 'code_emb_17', 'code_emb_18', 'code_emb_19', 'code_emb_20', 'code_emb_21', 'code_emb_22', 'code_emb_23', 'code_emb_24', 'code_emb_25', 'code_em

### 7.2.4 Train / Test Split (Stratified)

In [None]:
# Separate the label from the predictors.
y = df[TARGET]
X = df.drop(columns=TARGET)

# 80/20 hold-out split, stratified on the label so both partitions keep the
# class proportions of the full data set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

### 7.2.5 Preprocessing Pipeline

Design choices (aligned with the EDA):
- No scaling for trees
- Remove zero-variance features
- Keep engineered features (selection later)

In [None]:
from sklearn.decomposition import PCA


# Structured features: no scaling (tree model), only drop constant columns.
# threshold=0.0 removes features whose variance is exactly zero.
numeric_transformer = Pipeline(steps=[
    ("var_thresh", VarianceThreshold(threshold=0.0))
])

# Embedding features: compress the expanded embedding columns to 100 principal
# components. This needs at least 100 embedding columns and 100 training rows
# in every fit.
embedding_transformer = Pipeline(steps=[
    ("pca", PCA(n_components=100, random_state=RANDOM_STATE))
])

# Route each feature group through its own transformer. The structured group
# goes through `numeric_transformer` so that the zero-variance removal named in
# the design choices is actually applied (it was previously passed through
# untouched, leaving `numeric_transformer` unused).
# remainder="drop": any column not listed in either group is discarded.
preprocessor = ColumnTransformer(
    transformers=[
        ("struct", numeric_transformer, structured_features),
        ("embed", embedding_transformer, embedding_features),
    ],
    remainder="drop"
)

### 7.2.5 Baseline Random Forest Model

In [None]:
# Baseline random forest: 50 trees, unlimited depth, at least 2 samples per leaf.
rf = RandomForestClassifier(
    n_estimators=50,
    max_depth=None,
    min_samples_leaf=2,
    random_state=RANDOM_STATE,
    # Class weighting is disabled; CLASS_WEIGHT is not defined in this notebook.
    # class_weight=CLASS_WEIGHT,
    n_jobs=1              # IMPORTANT: keep the forest single-threaded — presumably to avoid nested parallelism with the n_jobs=2 used by the CV and permutation-importance cells; confirm
)

# Full model: preprocessing followed by the classifier, so every fit (including
# each CV fold) re-fits the preprocessing on its own training data.
model = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("rf", rf)
])

### 7.2.6 Cross-Validation (Primary Evaluation)

In [None]:
# Two shuffled, stratified folds with a fixed seed.
cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=RANDOM_STATE)

# Every metric is reported under the name of its built-in scorer.
scoring = {
    metric: metric
    for metric in ("roc_auc", "f1", "precision", "recall")
}

# Fit and score the whole pipeline on each fold, two folds in parallel.
cv_results = cross_validate(
    model, X_train, y_train, scoring=scoring, cv=cv, n_jobs=2
)

# Average of every timing / score column across the folds.
pd.DataFrame(cv_results).mean()

### 7.2.7 Final Training & Test Evaluation

In [None]:
# Fit the full pipeline (preprocessing + forest) on the whole training partition.
model.fit(X_train, y_train)

# Hard class predictions, and the second column of predict_proba as the
# score for ROC-AUC.
# NOTE(review): assumes a binary label whose positive class is the second entry
# of `model.classes_` — confirm.
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

# Per-class precision / recall / F1 on the held-out test set, then ROC-AUC.
print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

### 7.2.8 Feature Importance (Permutation-Based)

Critical due to correlated engineered features

In [None]:
# Permutation importance of the full pipeline on the held-out test set.
# Each column of X_test is shuffled in turn (2 repeats) and the resulting drop
# in the model's score is recorded.
perm = permutation_importance(
    model,
    X_test,
    y_test,
    n_repeats=2,
    random_state=RANDOM_STATE,
    n_jobs=2
)

# `importances_mean` holds one value per column of the X passed above, i.e. per
# raw input column of X_test — not per output of the preprocessing step. It is
# therefore indexed by X_test.columns; the transformed feature names have a
# different length and meaning and would make the Series constructor fail.
# Columns the preprocessor drops simply get an importance of 0.
importances = pd.Series(
    perm.importances_mean,
    index=X_test.columns
).sort_values(ascending=False)

importances.head(20)

### 7.2.9 Feature Subset Refinement (Optional Iteration)
(Re-run steps 8–11 using the reduced feature set)

In [None]:
# Number of highest-ranked features to keep.
TOP_K = 100

# `importances` is sorted in descending order, so its first TOP_K labels are
# the top features.
top_features = importances.index[:TOP_K].tolist()

# Restrict both partitions to the selected columns.
X_train_reduced = X_train.loc[:, top_features]
X_test_reduced = X_test.loc[:, top_features]