In [81]:

import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, RepeatedKFold, LeaveOneOut
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.linear_model import HuberRegressor, LinearRegression, BayesianRidge, Ridge, ElasticNet
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C, WhiteKernel
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, make_scorer
from sklearn.model_selection import cross_val_predict
import numpy as np, seaborn as sns, matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import os
import sys

# Add project root to path (for Jupyter notebooks)
# Get the current directory and navigate to project root
current_dir = os.getcwd()
project_root = os.path.dirname(current_dir)
sys.path.insert(0, project_root)

from src.config import Config
import src.eda as eda
import src.present_value as present_value
from src.ml_utils import remove_outliers, calculate_metrics, analysis_plots

%load_ext autoreload
%autoreload 2
%reload_ext autoreload


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [82]:
# Load the modelling inputs from the project database.
# NOTE(review): `anual_increment` is not referenced again in the visible cells;
# presumably the fetch is still needed to prepare `pv` before
# `pv.present_value_costs` is used below — confirm before removing it.
pv = present_value.PresentValue()
anual_increment = pv.fetch_incremento_from_database()

# Phase identifier passed to both EDA calls below.
fase = "III"
# Variable names (including spelling) are kept as-is because other cells may reference them.
preproccesing = eda.EDA()
# Projects assembled from the database for the selected phase.
df_raw = preproccesing.assemble_projects_from_database(fase)
# Dataset built with `pv.present_value_costs` as its first argument; presumably the
# cost columns are expressed in present value — TODO confirm in src.eda.
df_vp = preproccesing.create_dataset(pv.present_value_costs, fase=fase)


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [83]:
def _make_cv(n_samples, n_repeats, random_state=42):
    """Return the splitter for ``n_samples`` rows: repeated 5-fold from 10 rows up, else leave-one-out."""
    if n_samples >= 10:
        # With at least 10 rows, min(5, n_samples // 2) is always 5, so the fold
        # count is written out directly.
        return RepeatedKFold(n_splits=5, n_repeats=n_repeats, random_state=random_state)
    return LeaveOneOut()


def train_model(df_clean, predictor_name, hue_name, target_name, param_grid=None, random_state=42):
    """Tune an RBF SVR for one target column and return out-of-fold predictions.

    Features are the numeric predictor, its ``log1p`` transform (stored as
    ``predictor_name + ' LOG'``) and the one-hot encoded ``hue_name`` column.
    The target is fitted on the ``log1p`` scale and predictions are mapped back
    with ``expm1``.

    Parameters
    ----------
    df_clean : pandas.DataFrame
        Must contain ``predictor_name``, ``hue_name`` and ``target_name``.
    predictor_name : str
        Numeric predictor column.
    hue_name : str
        Categorical column, one-hot encoded.
    target_name : str
        Target column; cast to float.
    param_grid : dict, optional
        Grid passed to ``GridSearchCV``. Defaults to the grid over ``C``,
        ``epsilon`` and ``gamma`` defined below.
    random_state : int, optional
        Seed for the repeated K-fold splitters (default 42).

    Returns
    -------
    tuple
        ``(X, y, y_predicted, best_estimator)`` where ``X`` is the feature
        frame (including the LOG column), ``y`` the float target,
        ``y_predicted`` the out-of-fold predictions and ``best_estimator`` the
        grid-search winner refitted on all rows.

    Raises
    ------
    ValueError
        If fewer than two rows are available (cross-validation is impossible).

    Notes
    -----
    Hyper-parameters are selected on all rows before the out-of-fold
    predictions are computed, so the printed metrics are optimistic compared
    with a nested cross-validation.
    """
    X = df_clean[[predictor_name, hue_name]].copy()
    log_name = predictor_name + ' LOG'
    X[log_name] = np.log1p(X[predictor_name])
    y = df_clean[target_name].astype(float)

    n_samples = len(y)
    if n_samples < 2:
        # LeaveOneOut would raise a ValueError later; fail early with context.
        raise ValueError(
            f"Cannot cross-validate '{target_name}': need at least 2 rows, got {n_samples}"
        )

    # NOTE(review): with drop='first' and handle_unknown='ignore', a category unseen
    # in a training fold is encoded as all zeros, i.e. the same as the dropped
    # first category — confirm this is acceptable for small folds.
    pre = ColumnTransformer([
        ('num', StandardScaler(), [predictor_name, log_name]),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), [hue_name])
    ])
    pipe = Pipeline([('pre', pre), ('svr', SVR(kernel='rbf'))])
    model = TransformedTargetRegressor(regressor=pipe, func=np.log1p, inverse_func=np.expm1)

    if param_grid is None:
        param_grid = {
            'regressor__svr__C': [5, 10, 80, 200, 1000],
            'regressor__svr__epsilon': [0.01],
            'regressor__svr__gamma': ['scale', 'auto', 0.01, 0.1, 1.0],
        }

    # Model selection: 5x repeated 5-fold (or leave-one-out for small samples).
    search_cv = _make_cv(n_samples, n_repeats=5, random_state=random_state)
    gs = GridSearchCV(model, param_grid, scoring='neg_root_mean_squared_error',
                      cv=search_cv, n_jobs=-1, refit=True)
    gs.fit(X, y)

    # cross_val_predict needs every row in exactly one test fold, hence a single repeat.
    oof_cv = _make_cv(n_samples, n_repeats=1, random_state=random_state)
    y_oof = cross_val_predict(gs.best_estimator_, X, y, cv=oof_cv, n_jobs=-1)

    # Metrics come from the shared project helper so all models report alike.
    metrics = calculate_metrics(y, y_oof, target_name, include_rmsle=True)

    print('Best params:', gs.best_params_)
    print({'R2': metrics['R²'], 'MAE': metrics['MAE'], 'RMSE': metrics['RMSE'], 'RMSLE': metrics['RMSLE'], 'MAPE%': metrics['MAPE (%)']})

    return X, y, y_oof, gs.best_estimator_
    

In [84]:
def train_and_calculate_metrics(df, target_columns, predictor_name, hue_name):
    """Train one model per target column and collect the results.

    For each name in ``target_columns`` the predictor, hue and target columns
    are selected from ``df``, passed through ``remove_outliers`` and then to
    ``train_model``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``predictor_name``, ``hue_name`` and every target column.
    target_columns : iterable of str
        Target columns to model, processed in order.
    predictor_name : str
        Numeric predictor column.
    hue_name : str
        Categorical column.

    Returns
    -------
    dict
        Maps each target name to a dict with keys ``'X'``, ``'y'``,
        ``'y_predicted'`` and ``'trained_model'`` as returned by ``train_model``.
    """
    results = {}

    for target_name in target_columns:
        # Print the heading first: in the recorded output the outlier-removal
        # summary appeared under the previous target because the heading was
        # printed only after remove_outliers had run.
        print(target_name)
        df_item = df.loc[:, [predictor_name, hue_name, target_name]]
        df_item_cleaned = remove_outliers(df_item, target_name)
        X, y, y_predicted, trained_model = train_model(df_item_cleaned, predictor_name, hue_name, target_name)
        # Optional diagnostics, disabled; relies on the notebook-level `df_raw`:
        # analysis_plots(y, y_predicted, df_item_cleaned, predictor_name, target_name, hue_name, df_raw=df_raw)

        results[target_name] = {'X': X, 'y': y, 'y_predicted': y_predicted, 'trained_model': trained_model}

    return results

In [85]:
# Columns shared by every model: the numeric predictor and the categorical hue.
predictor_name = 'LONGITUD KM'
hue_name = 'ALCANCE'

# Target columns modelled for phase II (kept for reference; not used in this cell).
target_columns_fase_II = [
    '1 - TRANSPORTE',
    '2 - TRAZADO Y TOPOGRAFIA (incluye subcomponentes)',
    '3 - GEOLOGÍA (incluye subcomponentes)',
    '8 - PAVIMENTO',
    '9 - PREDIAL',
    '10 - AMBIENTAL Y SOCIAL',
    '11 - COSTOS Y PRESUPUESTOS',
    '12 - SOCIOECONÓMICA',
    '13 - DIRECCIÓN Y COORDINACIÓN',
]

# Target columns modelled for phase III.
target_columns_fase_III = [
    '1 - TRANSPORTE',
    '2.1 - INFORMACIÓN GEOGRÁFICA',
    '2.2 - TRAZADO Y DISEÑO GEOMÉTRICO',
    '2.3 - SEGURIDAD VIAL',
    '2.4 - SISTEMAS INTELIGENTES',
    '5 - TALUDES',
    '6 - PAVIMENTO',
    '7 - SOCAVACIÓN',
    '11 - PREDIAL',
    '12 - IMPACTO AMBIENTAL',
    '15 - OTROS - MANEJO DE REDES',
]

# Predictor and hue columns joined (on the index) with the label-sliced block of cost columns.
feature_columns = df_vp[[predictor_name, hue_name]]
cost_columns = df_vp.loc[:, '1 - TRANSPORTE':'15 - OTROS - MANEJO DE REDES']
df = feature_columns.join(cost_columns)

results = train_and_calculate_metrics(df, target_columns_fase_III, predictor_name, hue_name)

1 - TRANSPORTE
Best params: {'regressor__svr__C': 5, 'regressor__svr__epsilon': 0.01, 'regressor__svr__gamma': 'scale'}
{'R2': -2.999999999999962, 'MAE': 4643975.471230011, 'RMSE': np.float64(4643975.471230011), 'RMSLE': np.float64(0.23571981522235674), 'MAPE%': np.float64(23.790881617359197)}
  → Removed 1/10 outliers (10.0%) using ensemble
2.1 - INFORMACIÓN GEOGRÁFICA
Best params: {'regressor__svr__C': 1000, 'regressor__svr__epsilon': 0.01, 'regressor__svr__gamma': 0.01}
{'R2': 0.9997796987954439, 'MAE': 199650.3645912337, 'RMSE': np.float64(262145.3416134265), 'RMSLE': np.float64(0.09552108481900061), 'MAPE%': np.float64(5.977188326771584)}
  → Removed 5/50 outliers (10.0%) using ensemble
2.2 - TRAZADO Y DISEÑO GEOMÉTRICO
Best params: {'regressor__svr__C': 10, 'regressor__svr__epsilon': 0.01, 'regressor__svr__gamma': 'auto'}
{'R2': 0.8262669836001061, 'MAE': 2270169.4857858047, 'RMSE': np.float64(5467999.962276422), 'RMSLE': np.float64(0.38350201315039295), 'MAPE%': np.float64(22.30

In [78]:
# Per-row accuracy report for one target, built from the stored training results.
last_target = '15 - OTROS - MANEJO DE REDES'
# Work on a copy: the ACTUAL / PREDICTED columns added below must not leak into
# the feature frame kept in `results`, and the cell stays idempotent on re-run.
X = results[last_target]['X'].copy()
y = results[last_target]['y']
y_predicted = results[last_target]['y_predicted']

X['ACTUAL'] = y
X['PREDICTED'] = y_predicted
# The log-transformed predictor is a modelling feature, not part of the report.
output = X.drop(columns=['LONGITUD KM LOG'])

# Absolute percentage error per row; a zero ACTUAL becomes NaN instead of dividing by zero.
output['APE (%)'] = ((X['ACTUAL'] - X['PREDICTED']).abs() / X['ACTUAL'].replace(0, np.nan)) * 100
output['ACCURACY (%)'] = 100 - output['APE (%)']

# Flag predictions within ±20% of the actual value (NaN APE counts as not within).
output['WITHIN_20%'] = output['APE (%)'] <= 20
n_within = output['WITHIN_20%'].sum()

# Summary statistics
print("PREDICTION ACCURACY SUMMARY")
print(f"Mean Absolute Percentage Error (MAPE): {output['APE (%)'].mean():.2f}%")
print(f"Median Absolute Percentage Error: {output['APE (%)'].median():.2f}%")
print(f"Mean Accuracy: {output['ACCURACY (%)'].mean():.2f}%")
print(f"Predictions within ±20%: {n_within} / {len(output)} ({n_within/len(output)*100:.1f}%)")

# Worst predictions first
output.sort_values(by='APE (%)', ascending=False)

PREDICTION ACCURACY SUMMARY
Mean Absolute Percentage Error (MAPE): 20.87%
Median Absolute Percentage Error: 8.10%
Mean Accuracy: 79.13%
Predictions within ±20%: 20 / 30 (66.7%)


Unnamed: 0,LONGITUD KM,ALCANCE,ACTUAL,PREDICTED,APE (%),ACCURACY (%),WITHIN_20%
8,7.6,Segunda calzada,5187508.0,9631660.0,85.67026,14.32974,False
0,17.4,Segunda calzada,50279650.0,17512500.0,65.169807,34.830193,False
29,0.8,Construcción,2883774.0,4709587.0,63.313287,36.686713,False
9,5.0,Mejoramiento,29011250.0,11548410.0,60.193353,39.806647,False
42,1.0,Puesta a punto,3604718.0,1713578.0,52.462904,47.537096,False
2,10.0,Segunda calzada,6825669.0,3718551.0,45.52108,54.47892,False
31,5.17,Segunda calzada,18636390.0,10250500.0,44.997419,55.002581,False
26,15.0,Rehabilitación,54070770.0,33783750.0,37.519379,62.480621,False
27,3.12,Nuevo,11246720.0,7862367.0,30.091905,69.908095,False
40,3.15,Rehabilitación,11354860.0,14623210.0,28.783706,71.216294,False
