# Extra Trees Model Training (from scraped GitHub code)

This notebook trains an **Extra Trees** model on the metrics dataset produced by your pipeline.

**Expected input:** `data/processed/dataset.csv` (built from analyzing code scraped from GitHub repos).

**Output artifacts:** saved model + feature columns under `models/` (so you can reuse it for inference).

## 1) Install dependencies (if needed)
If you already have these installed, you can skip this cell.

In [None]:
# If running in a fresh environment, uncomment:
%pip install -U pandas numpy scikit-learn joblib matplotlib

## 2) Load dataset

In [3]:
from pathlib import Path
import json

import numpy as np
import pandas as pd

PROJECT_ROOT = Path.cwd().resolve().parent  # notebooks/ -> repo root
DATASET_PATH = PROJECT_ROOT / 'data' / 'processed' / 'dataset_processed.csv'
MODELS_DIR = PROJECT_ROOT / 'models'
MODELS_DIR.mkdir(parents=True, exist_ok=True)

if not DATASET_PATH.exists():
    raise FileNotFoundError(f'Missing dataset at {DATASET_PATH}. Build it first (scrape -> analyze -> dataset_builder).')

df = pd.read_csv(DATASET_PATH)
print('Loaded:', DATASET_PATH)
print('Shape:', df.shape)
display(df.head())

Loaded: /home/darth/the-duat/Notes/01_college/CSCI-417/project/data/processed/dataset_processed.csv
Shape: (3088, 51)


Unnamed: 0,abbreviation_density,average_cyclomatic_complexity,avg_line_length,comment_code_mismatch_score,comment_lines,comment_percentage,decision_density,documentation_coverage,external_vs_internal_field_access_ratio,functions,...,y_MisleadingComments,y_PoorDocumentation,y_UntestedCode,complexity_score,code_health,doc_quality,has_tests,coupling_complexity,smell_density,effort_impact_ratio
0,0.0,0.666324,0.568106,0.0,1.0,1.0,0.623314,0.0,0.458333,0.125,...,0.0,1.0,1.0,0.718428,0.880992,0.4165,0,0.541667,0.150928,0.44942
1,0.0,0.306982,0.531561,0.0,0.25,1.0,0.326629,0.0,0.125,0.0625,...,0.0,1.0,0.0,0.35237,0.932637,0.4525,1,0.083333,0.402474,0.0
2,0.333,0.204312,0.348837,0.0,0.0,0.0,0.1632,0.0,0.041667,0.0,...,0.0,1.0,0.0,0.30012,0.929269,0.5,0,0.083333,0.551215,0.0
3,0.5,0.204312,0.325581,0.0,0.0,0.0,0.104,0.0,0.041667,0.0,...,0.0,1.0,0.0,0.27922,0.932637,0.5,0,0.083333,0.291447,0.019656
4,0.5,0.204312,0.320598,0.0,0.0,0.0,0.099429,0.0,0.041667,0.0,...,0.0,1.0,0.0,0.27922,0.932637,0.5,0,0.083333,0.272644,0.0


## 4) Build `X` and `y` (cleaning + split)
This keeps only numeric features and fills missing values.

In [5]:
from sklearn.model_selection import train_test_split

# Keep numeric columns only (ExtraTrees in sklearn expects numeric input)
numeric_df = df.select_dtypes(include=[np.number]).copy()

# Drop rows with missing target
numeric_df = numeric_df.dropna(subset=[x for x in numeric_df.columns if x.startswith('y_')])

Y = numeric_df.iloc[:, 39:53]
X = numeric_df.drop(columns=[x for x in numeric_df.columns if x.startswith('y_')])

# Fill missing features with 0 (minimal, consistent default)
X = X.fillna(0)

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=1042
)

print('X_train:', X_train.shape, 'X_test:', X_test.shape)
print('Y distribution (train):')
display(Y_train.value_counts(dropna=False))

X_train: (2470, 42) X_test: (618, 42)
Y distribution (train):


y_PoorDocumentation  y_UntestedCode  complexity_score  code_health  doc_quality  has_tests  coupling_complexity  smell_density  effort_impact_ratio
0.0                  0.0             0.091123          0.932637     0.5          0          0.000000             0.000000       0.0                    196
                                                                                 1          0.000000             0.000000       0.0                     86
                                     0.195622          0.932637     0.5          0          0.083333             0.469554       0.0                     18
                                                                                                                 0.422598       0.0                     10
                                     0.279220          0.932637     0.5          0          0.166667             0.301856       0.0                      6
                                                                             

## 5) Train Extra Trees

In [7]:
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor

RANDOM_STATE = 10
est_params = dict(n_estimators=400, random_state=RANDOM_STATE, n_jobs=-1)

def _is_binary_series(s):
    vals = np.unique(s.dropna())
    return set(vals.tolist()) <= {0, 1}

# Choose classifier vs regressor (and multioutput wrappers) based on Y_train
if Y_train.shape[1] == 1:
    y = Y_train.iloc[:, 0]
    if pd.api.types.is_integer_dtype(y) or _is_binary_series(y):
        model = ExtraTreesClassifier(**est_params, class_weight='balanced_subsample')
    else:
        model = ExtraTreesRegressor(**est_params)
else:
    # Multi-output: use classifier only if all targets are binary {0,1}
    all_binary = all(_is_binary_series(Y_train[c]) for c in Y_train.columns)
    if all_binary:
        base = ExtraTreesClassifier(**est_params, class_weight='balanced_subsample')
        model = MultiOutputClassifier(base)
    else:
        base = ExtraTreesRegressor(**est_params)
        model = MultiOutputRegressor(base)

model.fit(X_train, Y_train)
print('Trained:', model.__class__.__name__)

Trained: MultiOutputRegressor


## 6) Evaluate

In [9]:
from sklearn.metrics import (
    accuracy_score, mean_absolute_error, mean_squared_error, r2_score,
)

Y_pred = model.predict(X_test)

# For multi-output: evaluate per-column accuracy and overall metrics
if Y_test.shape[1] > 1:
    print(f'Multi-output evaluation ({Y_test.shape[1]} targets):')
    overall_acc = (Y_pred == Y_test.values).mean()  # Element-wise accuracy
    print(f'  Overall accuracy (element-wise): {overall_acc:.4f}')
    
    for i, col in enumerate(Y_test.columns):
        col_acc = accuracy_score(Y_test.iloc[:, i], Y_pred[:, i])
        print(f'  Target "{col}": {col_acc:.4f}')
    
    print(f'\n  MAE: {mean_absolute_error(Y_test, Y_pred):.4f}')
    print(f'  MSE: {mean_squared_error(Y_test, Y_pred):.4f}')
else:
    # Single output: standard metrics
    y_test_col = Y_test.iloc[:, 0]
    y_pred_col = Y_pred[:, 0] if Y_pred.ndim > 1 else Y_pred
    
    print(f'Accuracy: {accuracy_score(y_test_col, y_pred_col):.4f}')
    print(f'MAE: {mean_absolute_error(y_test_col, y_pred_col):.4f}')

Multi-output evaluation (9 targets):
  Overall accuracy (element-wise): 0.4266


ValueError: Classification metrics can't handle a mix of binary and continuous targets

## 7) Feature importance (quick look)

In [None]:
import pandas as pd

fi = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
display(fi.head(20))

## 8) Save model + metadata
This writes:
- `models/extratrees_<task>_<target>.joblib`
- `models/extratrees_<task>_<target>_features.json`

In [None]:
import joblib

safe_target = ''.join(c if c.isalnum() or c in ('_', '-') else '_' for c in TARGET_COLUMN)
model_path = MODELS_DIR / f'extratrees_{TASK}_{safe_target}.joblib'
features_path = MODELS_DIR / f'extratrees_{TASK}_{safe_target}_features.json'

joblib.dump(model, model_path)
features_path.write_text(json.dumps({
    'target': TARGET_COLUMN,
    'task': TASK,
    'feature_columns': list(X.columns),
    'random_state': RANDOM_STATE,
}, indent=2))

print('Saved model to:', model_path)
print('Saved feature metadata to:', features_path)

## 9) (Optional) Inference helper
Given a single metrics record (dict), this predicts the target.

Note: your inference input must have the **same feature columns** as training.

In [None]:
def predict_one(metrics_record: dict):
    row = pd.DataFrame([metrics_record])
    row = row.reindex(columns=X.columns, fill_value=0)
    row = row.select_dtypes(include=[np.number]).fillna(0)
    return model.predict(row)[0]

# Example: take the first row of the dataset and predict
example = X.iloc[0].to_dict()
print('Prediction:', predict_one(example))