In [None]:

import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, RepeatedKFold, LeaveOneOut
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.linear_model import HuberRegressor, LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, make_scorer
from sklearn.model_selection import cross_val_predict
import numpy as np, seaborn as sns, matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import os
import sys

# Make the project's `src` package importable from this notebook.
# The project root is taken to be the parent of the kernel's current working
# directory, so this only works when the notebook runs from a direct
# subdirectory of the project root.
current_dir = os.getcwd()
project_root = os.path.dirname(current_dir)
# Prepend (rather than append) so this directory takes precedence over any
# other entry on sys.path that also provides a module named `src`.
sys.path.insert(0, project_root)

from src.config import Config
import src.eda as eda
import src.present_value as present_value
from src.ml_utils import remove_outliers

%load_ext autoreload
%autoreload 2
%reload_ext autoreload




In [68]:
## FROM DATABASE
# Load the project data for one phase; every call below goes through the
# project's own `src` modules.
# NOTE(review): `anual_increment` is not referenced again in the visible cells;
# presumably the fetch also prepares `pv` for `present_value_costs` - confirm
# before removing it.
pv = present_value.PresentValue()
anual_increment = pv.fetch_incremento_from_database()

# Phase identifier passed to both EDA calls below.
fase = "III"
preproccesing = eda.EDA()
# `df_raw` is read later by `analysis_plots` (project code / name for hover text).
df_raw = preproccesing.assemble_projects_from_database(fase)
# `df_vp` is the frame the modelling cells slice predictor and target columns from.
# NOTE(review): presumably the cost columns are expressed in present value via
# `pv.present_value_costs` - confirm in src.eda.
df_vp = preproccesing.create_dataset(pv.present_value_costs, fase=fase)

  w = (df[cols] / totals).fillna(0)


In [None]:
# remove_outliers function now imported from src.ml_utils
# This centralizes the outlier detection logic and eliminates code duplication

In [4]:
def rmsle_scorer(y_true, y_pred):
    """Return the root mean squared logarithmic error (RMSLE).

    Both inputs go through ``log1p`` before being differenced, so the result
    measures relative rather than absolute deviation. Lower is better; the
    value is an error, not a "higher is better" score.
    """
    log_gap = np.log1p(y_pred) - np.log1p(y_true)
    return np.sqrt(np.mean(log_gap ** 2))

def train_model(df_clean, predictor_name, hue_name, target_name):
    """Tune an RBF SVR on a log-transformed target and report out-of-fold metrics.

    The feature matrix holds the numeric predictor, its ``log1p`` transform
    (stored as ``predictor_name + ' LOG'``) and the ``hue_name`` column, which
    is one-hot encoded inside the pipeline. The target is fitted in ``log1p``
    space through ``TransformedTargetRegressor`` and mapped back with ``expm1``.

    Parameters
    ----------
    df_clean : pandas.DataFrame
        Must contain ``predictor_name``, ``hue_name`` and ``target_name``.
    predictor_name : str
        Numeric predictor column.
    hue_name : str
        Categorical column to one-hot encode.
    target_name : str
        Target column; cast to float.

    Returns
    -------
    tuple
        ``(X, y, y_predicted, best_estimator)`` where ``X`` is the feature
        frame (including the added log column), ``y`` the float target,
        ``y_predicted`` the out-of-fold predictions and ``best_estimator`` the
        refitted best model from the grid search.

    Raises
    ------
    ValueError
        If fewer than two rows are available; no cross-validation split exists.

    Notes
    -----
    Best parameters and the metric dictionary are printed as a side effect.
    The out-of-fold metrics are computed with hyper-parameters that were
    selected on the same rows, so they are optimistically biased; nested
    cross-validation would be needed for an unbiased estimate.
    """
    import warnings  # local import: only this function needs it

    X = df_clean[[predictor_name, hue_name]].copy()
    log_feature = predictor_name + ' LOG'
    X[log_feature] = np.log1p(X[predictor_name])
    y = df_clean[target_name].astype(float)

    n_samples = len(y)
    if n_samples < 2:
        # sklearn would raise a less specific ValueError deeper in the stack.
        raise ValueError(
            f"train_model needs at least 2 rows to cross-validate '{target_name}', got {n_samples}."
        )

    # NOTE: with drop='first' and handle_unknown='ignore', a category unseen in
    # a training fold is encoded as all zeros, i.e. exactly like the dropped
    # first category.
    pre = ColumnTransformer([
        ('num', StandardScaler(), [predictor_name, log_feature]),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), [hue_name])
    ])

    svr = SVR(kernel='rbf')
    pipe = Pipeline([('pre', pre), ('svr', svr)])
    model = TransformedTargetRegressor(regressor=pipe, func=np.log1p, inverse_func=np.expm1)

    param_grid = {
        'regressor__svr__C': [5, 10, 80, 200, 1000],
        'regressor__svr__epsilon': [0.01],
        'regressor__svr__gamma': ['scale', 'auto', 0.01, 0.1, 1.0],
    }

    # Repeated 5-fold CV needs a reasonable number of rows; below that fall
    # back to leave-one-out for both the search and the out-of-fold pass.
    min_rows_for_kfold = 10
    n_splits = 5
    if n_samples >= min_rows_for_kfold:
        cv = RepeatedKFold(n_splits=n_splits, n_repeats=5, random_state=42)
        # A single repeat, so every row is predicted exactly once.
        cv_simple = RepeatedKFold(n_splits=n_splits, n_repeats=1, random_state=42)
    else:
        warnings.warn(
            f"Only {n_samples} rows available for '{target_name}': using leave-one-out; "
            "the reported metrics are high-variance and may be meaningless "
            "(e.g. R2 is exactly -3 with two rows).",
            stacklevel=2,
        )
        cv = LeaveOneOut()
        cv_simple = LeaveOneOut()

    gs = GridSearchCV(model, param_grid, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1, refit=True)
    gs.fit(X, y)

    y_oof = cross_val_predict(gs.best_estimator_, X, y, cv=cv_simple, n_jobs=-1)

    r2_oof = r2_score(y, y_oof)
    mae_oof = mean_absolute_error(y, y_oof)
    rmse_oof = np.sqrt(mean_squared_error(y, y_oof))
    rmsle_oof = rmsle_scorer(y, y_oof)
    # Zero targets become NaN so they drop out of the mean instead of dividing by zero.
    mape_oof = np.mean(np.abs((y - y_oof) / y.replace(0, np.nan))) * 100

    print('Best params:', gs.best_params_)
    print({'R2': r2_oof, 'MAE': mae_oof, 'RMSE': rmse_oof, 'RMSLE': rmsle_oof, 'MAPE%': float(mape_oof)})

    y_predicted = y_oof

    return X, y, y_predicted, gs.best_estimator_
    

In [5]:
def analysis_plots(y, y_predicted, df_item_cleaned, predictor_name, target_name, hue_name, project_info=None):
    """Show an interactive Plotly scatter of actual versus predicted values.

    One trace is drawn per distinct value of ``hue_name`` plus a dashed
    "perfect prediction" diagonal. Each point carries hover text with the
    project code/name (when available), the predictor, the hue value and the
    actual/predicted amounts.

    Parameters
    ----------
    y : pandas.Series or array-like
        Actual values. Non-Series input is wrapped in a Series indexed like
        ``df_item_cleaned``.
    y_predicted : pandas.Series or array-like
        Predicted values, handled the same way as ``y``.
    df_item_cleaned : pandas.DataFrame
        Rows being plotted; must contain ``hue_name``.
    predictor_name : str
        Predictor column shown in the hover text (e.g. 'LONGITUD KM').
    target_name : str
        Target column name, used in the figure title (e.g. '5 - TALUDES').
    hue_name : str
        Column used to split and colour the traces (e.g. 'ALCANCE').
    project_info : pandas.DataFrame, optional
        Frame indexed like ``df_item_cleaned`` holding 'CÓDIGO DEL PROYECTO'
        and/or 'NOMBRE DEL PROYECTO'. When omitted, the module-level ``df_raw``
        is used if it exists, which is the previous behaviour.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # Prefer the explicit argument; fall back to the notebook-level frame.
    lookup = project_info if project_info is not None else globals().get('df_raw')

    # Column checks do not depend on the row, so evaluate them once.
    has_code = lookup is not None and 'CÓDIGO DEL PROYECTO' in lookup.columns
    has_name = lookup is not None and 'NOMBRE DEL PROYECTO' in lookup.columns
    show_predictor = predictor_name in df_item_cleaned.columns
    show_hue = hue_name in df_item_cleaned.columns

    # Hover text keyed by index label for constant-time lookup per point.
    hover_by_index = {}
    for idx in df_item_cleaned.index:
        hover_text = ""
        if lookup is not None and idx in lookup.index:
            if has_code:
                hover_text += f"<b>Código:</b> {lookup.loc[idx, 'CÓDIGO DEL PROYECTO']}<br>"
            if has_name:
                hover_text += f"<b>Nombre:</b> {lookup.loc[idx, 'NOMBRE DEL PROYECTO']}<br>"

        if show_predictor:
            hover_text += f"<b>{predictor_name}:</b> {df_item_cleaned.loc[idx, predictor_name]:.2f}<br>"
        if show_hue:
            hover_text += f"<b>{hue_name}:</b> {df_item_cleaned.loc[idx, hue_name]}<br>"
        hover_by_index[idx] = hover_text

    # Plot: actual vs predicted
    fig1 = go.Figure()

    # Fixed colour per known hue value; anything else falls back to grey below.
    colors_map = {
        'Segunda calzada': '#1f77b4',
        'operacion y mantenimiento': '#ff7f0e',
        'Mejoramiento': '#2ca02c',
        'Rehabilitación': '#d62728',
        'Nuevo': '#9467bd',
        'Construcción': '#8c564b',
        'Puesta a punto': '#e377c2'
    }

    # Work with Series so boolean masks and label lookups behave uniformly.
    if not isinstance(y, pd.Series):
        y = pd.Series(y, index=df_item_cleaned.index)
    if not isinstance(y_predicted, pd.Series):
        y_predicted = pd.Series(y_predicted, index=df_item_cleaned.index)

    for alcance_type in sorted(df_item_cleaned[hue_name].unique()):
        mask = df_item_cleaned[hue_name] == alcance_type
        indices = df_item_cleaned.index[mask]
        y_actual_subset = y[mask]
        y_pred_subset = y_predicted[mask]

        hover_subset = [hover_by_index[idx] +
                        f"<b>Valor Real:</b> ${y_actual_subset.loc[idx]:,.0f}<br><b>Predicción:</b> ${y_pred_subset.loc[idx]:,.0f}"
                        for idx in indices]

        fig1.add_trace(go.Scatter(
            x=y_actual_subset,
            y=y_pred_subset,
            mode='markers',
            name=alcance_type,
            marker=dict(
                size=12,
                color=colors_map.get(alcance_type, '#7f7f7f'),
                opacity=0.8,
                line=dict(width=1, color='white')
            ),
            hovertemplate='%{customdata}<extra></extra>',
            customdata=hover_subset
        ))

    # Diagonal reference: points on this line are predicted exactly.
    min_val = min(y.min(), y_predicted.min())
    max_val = max(y.max(), y_predicted.max())
    fig1.add_trace(go.Scatter(
        x=[min_val, max_val],
        y=[min_val, max_val],
        mode='lines',
        name='Predicción Perfecta',
        line=dict(color='red', width=2, dash='dash'),
        showlegend=True
    ))

    # Presentation-oriented layout: white background, light grid, boxed legend.
    fig1.update_layout(
        title=dict(
            text=f'<b>Predicción vs Realidad - {target_name}</b>',
            x=0.5,
            xanchor='center',
            font=dict(size=20, family='Arial Black')
        ),
        xaxis=dict(
            title=dict(
                text='<b>Valor Real ($)</b>',
                font=dict(size=14)
            ),
            showgrid=True,
            gridcolor='lightgray',
            gridwidth=0.5
        ),
        yaxis=dict(
            title=dict(
                text='<b>Valor Predicho ($)</b>',
                font=dict(size=14)
            ),
            showgrid=True,
            gridcolor='lightgray',
            gridwidth=0.5
        ),
        plot_bgcolor='white',
        paper_bgcolor='white',
        hovermode='closest',
        legend=dict(
            orientation="v",
            yanchor="top",
            y=0.99,
            xanchor="left",
            x=0.01,
            bgcolor="rgba(255, 255, 255, 0.9)",
            bordercolor="lightgray",
            borderwidth=1
        ),
        width=900,
        height=600
    )

    fig1.show()

In [6]:
def train_and_calculate_metrics(df, target_columns, predictor_name, hue_name):
    """Train, evaluate and plot one model per target column.

    For each target, in the order given: select the predictor, hue and target
    columns, drop outliers with ``remove_outliers``, print the target name,
    fit with ``train_model`` and plot with ``analysis_plots``.

    Returns a dict keyed by target name; each value holds 'X', 'y',
    'y_predicted' and 'trained_model' as returned by ``train_model``.
    """
    shared_columns = [predictor_name, hue_name]

    def _fit_one(target_name):
        # Restrict to the three columns this target's model needs.
        subset = df.loc[:, shared_columns + [target_name]]
        # Default outlier method; `method='all_strict'` is a stricter alternative.
        df_item_cleaned = remove_outliers(subset, target_name)
        print(target_name)
        features, actual, predicted, fitted = train_model(
            df_item_cleaned, predictor_name, hue_name, target_name
        )
        analysis_plots(actual, predicted, df_item_cleaned, predictor_name, target_name, hue_name)
        return {
            'X': features,
            'y': actual,
            'y_predicted': predicted,
            'trained_model': fitted,
        }

    # Dict comprehensions evaluate in iteration order, so targets are processed sequentially.
    return {target_name: _fit_one(target_name) for target_name in target_columns}

predictor_name = 'LONGITUD KM'
hue_name = 'ALCANCE'

# Column reference - Phase II dataset:
# ['NOMBRE DEL PROYECTO', 'CÓDIGO', 'LONGITUD KM',
#        'PUENTES VEHICULARES UND', 'PUENTES VEHICULARES M2',
#        'PUENTES PEATONALES UND', 'PUENTES PEATONALES M2', 'TUNELES UND',
#        'TUNELES KM', 'ALCANCE', 'ZONA', 'TIPO TERRENO', '1 - TRANSPORTE',
#        '2 - TRAZADO Y TOPOGRAFIA (incluye subcomponentes)',
#        '3 - GEOLOGÍA (incluye subcomponentes)', '4 - TALUDES',
#        '5 - HIDROLOGÍA E HIDRÁULICA', '6 - ESTRUCTURAS', '7 - TÚNELES',
#        '8 - PAVIMENTO', '9 - PREDIAL', '10 - AMBIENTAL Y SOCIAL',
#        '11 - COSTOS Y PRESUPUESTOS', '12 - SOCIOECONÓMICA',
#        '13 - DIRECCIÓN Y COORDINACIÓN']

# Column reference - Phase III dataset:
# ['LONGITUD KM', 'ALCANCE', '1 - TRANSPORTE', '2.1 - INFORMACIÓN GEOGRÁFICA', '2.2 - TRAZADO Y DISEÑO GEOMÉTRICO',
#        '2.3 - SEGURIDAD VIAL', '2.4 - SISTEMAS INTELIGENTES', '3.1 - GEOLOGÍA','3.2 - HIDROGEOLOGÍA', '4 - SUELOS', '5 - TALUDES', '6 - PAVIMENTO',
#        '7 - SOCAVACIÓN', '8 - ESTRUCTURAS', '9 - TÚNELES', '10 - URBANISMO Y PAISAJISMO', '11 - PREDIAL', '12 - IMPACTO AMBIENTAL',
#        '13 - CANTIDADES', '14 - EVALUACIÓN SOCIOECONÓMICA', '15 - OTROS - MANEJO DE REDES', '16 - DIRECCIÓN Y COORDINACIÓN']

# Targets modelled per phase (a subset of the cost-item columns listed above).
target_columns_fase_II = [
    '1 - TRANSPORTE',
    '2 - TRAZADO Y TOPOGRAFIA (incluye subcomponentes)',
    '3 - GEOLOGÍA (incluye subcomponentes)',
    '8 - PAVIMENTO',
    '9 - PREDIAL',
    '10 - AMBIENTAL Y SOCIAL',
    '11 - COSTOS Y PRESUPUESTOS',
    '12 - SOCIOECONÓMICA',
    '13 - DIRECCIÓN Y COORDINACIÓN',
]

target_columns_fase_III = [
    '1 - TRANSPORTE',
    '2.1 - INFORMACIÓN GEOGRÁFICA',
    '2.2 - TRAZADO Y DISEÑO GEOMÉTRICO',
    '2.3 - SEGURIDAD VIAL',
    '2.4 - SISTEMAS INTELIGENTES',
    '5 - TALUDES',
    '6 - PAVIMENTO',
    '7 - SOCAVACIÓN',
    '11 - PREDIAL',
    '12 - IMPACTO AMBIENTAL',
    '15 - OTROS - MANEJO DE REDES',
    '16 - DIRECCIÓN Y COORDINACIÓN',
]

# Predictor and hue columns followed by the Phase III cost-item column range.
df = df_vp[[predictor_name, hue_name]].join(
    df_vp.loc[:, '1 - TRANSPORTE':'16 - DIRECCIÓN Y COORDINACIÓN']
)
results = train_and_calculate_metrics(df, target_columns_fase_III, predictor_name, hue_name)

1 - TRANSPORTE
Best params: {'regressor__svr__C': 5, 'regressor__svr__epsilon': 0.01, 'regressor__svr__gamma': 'scale'}
{'R2': -2.999999999999962, 'MAE': 4643975.471230011, 'RMSE': np.float64(4643975.471230011), 'RMSLE': np.float64(0.23571981522235674), 'MAPE%': 23.790881617359197}


  → Removed 1/10 outliers (10.0%) using ensemble
     • isolation_forest: 1 outliers
     • lof: 1 outliers
     • robust_statistical: 4 outliers
     • z_score: 0 outliers
2.1 - INFORMACIÓN GEOGRÁFICA
Best params: {'regressor__svr__C': 1000, 'regressor__svr__epsilon': 0.01, 'regressor__svr__gamma': 0.01}
{'R2': 0.9997796987954439, 'MAE': 199650.3645912337, 'RMSE': np.float64(262145.3416134265), 'RMSLE': np.float64(0.09552108481900061), 'MAPE%': 5.977188326771584}


  → Removed 5/50 outliers (10.0%) using ensemble
     • isolation_forest: 5 outliers
     • lof: 5 outliers
     • robust_statistical: 3 outliers
     • z_score: 2 outliers
2.2 - TRAZADO Y DISEÑO GEOMÉTRICO
Best params: {'regressor__svr__C': 10, 'regressor__svr__epsilon': 0.01, 'regressor__svr__gamma': 'auto'}
{'R2': 0.8262669836001061, 'MAE': 2270169.4857858047, 'RMSE': np.float64(5467999.962276422), 'RMSLE': np.float64(0.38350201315039295), 'MAPE%': 22.301882321811455}


  → Removed 4/49 outliers (8.2%) using ensemble
     • isolation_forest: 5 outliers
     • lof: 5 outliers
     • robust_statistical: 2 outliers
     • z_score: 1 outliers
2.3 - SEGURIDAD VIAL
Best params: {'regressor__svr__C': 5, 'regressor__svr__epsilon': 0.01, 'regressor__svr__gamma': 'scale'}
{'R2': 0.7918920978029302, 'MAE': 2511792.6589240604, 'RMSE': np.float64(4750779.3538916), 'RMSLE': np.float64(0.49436419068004045), 'MAPE%': 35.99627260044706}


  → Removed 5/42 outliers (11.9%) using ensemble
     • isolation_forest: 5 outliers
     • lof: 5 outliers
     • robust_statistical: 4 outliers
     • z_score: 1 outliers
2.4 - SISTEMAS INTELIGENTES
Best params: {'regressor__svr__C': 200, 'regressor__svr__epsilon': 0.01, 'regressor__svr__gamma': 0.01}
{'R2': 0.8512799272969779, 'MAE': 409025.9121887115, 'RMSE': np.float64(1723149.2436338912), 'RMSLE': np.float64(0.12688379723284127), 'MAPE%': 4.590438781928622}


  → Removed 7/44 outliers (15.9%) using ensemble
     • isolation_forest: 5 outliers
     • lof: 5 outliers
     • robust_statistical: 9 outliers
     • z_score: 0 outliers
5 - TALUDES
Best params: {'regressor__svr__C': 80, 'regressor__svr__epsilon': 0.01, 'regressor__svr__gamma': 0.01}
{'R2': 0.5500808832166654, 'MAE': 4610935.4374588, 'RMSE': np.float64(11457757.603246277), 'RMSLE': np.float64(0.6167585671285556), 'MAPE%': 21.407052573444656}


  → Removed 4/34 outliers (11.8%) using ensemble
     • isolation_forest: 4 outliers
     • lof: 4 outliers
     • robust_statistical: 4 outliers
     • z_score: 0 outliers
6 - PAVIMENTO
Best params: {'regressor__svr__C': 5, 'regressor__svr__epsilon': 0.01, 'regressor__svr__gamma': 0.01}
{'R2': 0.24086125721603624, 'MAE': 4531609.340969036, 'RMSE': np.float64(10548559.482150026), 'RMSLE': np.float64(0.6518224834715809), 'MAPE%': 30.959784696545945}


  → Removed 5/50 outliers (10.0%) using ensemble
     • isolation_forest: 5 outliers
     • lof: 5 outliers
     • robust_statistical: 1 outliers
     • z_score: 1 outliers
7 - SOCAVACIÓN
Best params: {'regressor__svr__C': 5, 'regressor__svr__epsilon': 0.01, 'regressor__svr__gamma': 'scale'}
{'R2': 0.9191092259281675, 'MAE': 3759520.04480352, 'RMSE': np.float64(6367428.417797689), 'RMSLE': np.float64(0.28881793052083904), 'MAPE%': 19.058232143082638}


11 - PREDIAL
Best params: {'regressor__svr__C': 5, 'regressor__svr__epsilon': 0.01, 'regressor__svr__gamma': 1.0}
{'R2': -0.4044402705078842, 'MAE': 710990.1232883679, 'RMSE': np.float64(1052513.0803657123), 'RMSLE': np.float64(0.059109258098428374), 'MAPE%': 3.8924951032041744}


12 - IMPACTO AMBIENTAL
Best params: {'regressor__svr__C': 5, 'regressor__svr__epsilon': 0.01, 'regressor__svr__gamma': 'scale'}
{'R2': -3.000000000000001, 'MAE': 117199626.3808198, 'RMSE': np.float64(117199626.38081981), 'RMSLE': np.float64(4.2750618423999285), 'MAPE%': 3593.535818986291}


  → Removed 4/34 outliers (11.8%) using ensemble
     • isolation_forest: 4 outliers
     • lof: 4 outliers
     • robust_statistical: 7 outliers
     • z_score: 1 outliers
15 - OTROS - MANEJO DE REDES
Best params: {'regressor__svr__C': 200, 'regressor__svr__epsilon': 0.01, 'regressor__svr__gamma': 0.1}
{'R2': 0.7624701568730734, 'MAE': 3693765.6476776865, 'RMSE': np.float64(8049804.631489442), 'RMSLE': np.float64(0.3844994149087249), 'MAPE%': 20.865049176069167}


  → Removed 6/51 outliers (11.8%) using ensemble
     • isolation_forest: 5 outliers
     • lof: 5 outliers
     • robust_statistical: 9 outliers
     • z_score: 3 outliers
16 - DIRECCIÓN Y COORDINACIÓN
Best params: {'regressor__svr__C': 10, 'regressor__svr__epsilon': 0.01, 'regressor__svr__gamma': 'scale'}
{'R2': 0.21370390700900155, 'MAE': 8699400.727068625, 'RMSE': np.float64(26692933.321417972), 'RMSLE': np.float64(0.6642193520187814), 'MAPE%': 27.372159165142275}


In [19]:
# Column reference for the frame built below:
# ['LONGITUD KM', 'ALCANCE', '1 - TRANSPORTE', '2.1 - INFORMACIÓN GEOGRÁFICA', '2.2 - TRAZADO Y DISEÑO GEOMÉTRICO',
#        '2.3 - SEGURIDAD VIAL', '2.4 - SISTEMAS INTELIGENTES', '3.1 - GEOLOGÍA','3.2 - HIDROGEOLOGÍA', '4 - SUELOS', '5 - TALUDES', '6 - PAVIMENTO',
#        '7 - SOCAVACIÓN', '8 - ESTRUCTURAS', '9 - TÚNELES', '10 - URBANISMO Y PAISAJISMO', '11 - PREDIAL', '12 - IMPACTO AMBIENTAL',
#        '13 - CANTIDADES', '14 - EVALUACIÓN SOCIOECONÓMICA', '15 - OTROS - MANEJO DE REDES', '16 - DIRECCIÓN Y COORDINACIÓN']

# Length and scope columns followed by every column from '1 - TRANSPORTE' to the end.
df_items = df_vp.loc[:, ['LONGITUD KM', 'ALCANCE']].join(
    df_vp.loc[:, '1 - TRANSPORTE':]
)
df_items
# Optional spot check: remove_outliers(df_items, '2.1 - INFORMACIÓN GEOGRÁFICA')

Unnamed: 0,LONGITUD KM,ALCANCE,1 - TRANSPORTE,2.1 - INFORMACIÓN GEOGRÁFICA,2.2 - TRAZADO Y DISEÑO GEOMÉTRICO,2.3 - SEGURIDAD VIAL,2.4 - SISTEMAS INTELIGENTES,3.1 - GEOLOGÍA,3.2 - HIDROGEOLOGÍA,4 - SUELOS,...,7 - SOCAVACIÓN,8 - ESTRUCTURAS,9 - TÚNELES,10 - URBANISMO Y PAISAJISMO,11 - PREDIAL,12 - IMPACTO AMBIENTAL,13 - CANTIDADES,14 - EVALUACIÓN SOCIOECONÓMICA,15 - OTROS - MANEJO DE REDES,16 - DIRECCIÓN Y COORDINACIÓN
0,17.4,Segunda calzada,0.0,0.0,135340500.0,72875640.0,40135750.0,123936200.0,0.0,187843700.0,...,52759750.0,683821500.0,0.0,0.0,0.0,0.0,13772380.0,0.0,50279650.0,171404300.0
1,13.69,Nuevo,0.0,0.0,0.0,0.0,24430490.0,65075530.0,0.0,193779300.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,163428900.0
2,10.0,Segunda calzada,0.0,1332293.0,6119988.0,1619942.0,6750908.0,0.0,0.0,0.0,...,12401650.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6825669.0,3505045.0
3,14.3,Segunda calzada,0.0,1905179.0,8751583.0,2316517.0,9653799.0,0.0,0.0,0.0,...,17734360.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9760706.0,5012214.0
4,10.2,operacion y mantenimiento,0.0,1358939.0,6242388.0,1652341.0,6885926.0,0.0,0.0,0.0,...,12649680.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6962182.0,3575146.0
5,8.5,operacion y mantenimiento,0.0,1132449.0,5201990.0,1376951.0,5738272.0,0.0,0.0,0.0,...,10541400.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5801818.0,2979288.0
6,13.5,Segunda calzada,0.0,1798596.0,8261984.0,2186922.0,9113726.0,0.0,0.0,0.0,...,16742230.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9214653.0,4731811.0
7,47.2,operacion y mantenimiento,0.0,6288425.0,28886340.0,7646127.0,31864290.0,0.0,0.0,0.0,...,58535790.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32217160.0,16543810.0
8,7.6,Segunda calzada,0.0,1012543.0,4651191.0,1231156.0,5130690.0,0.0,0.0,0.0,...,9425254.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5187508.0,2663834.0
9,5.0,Mejoramiento,0.0,0.0,6311029.0,2105373.0,0.0,13858170.0,0.0,13261060.0,...,6289853.0,642177.4,0.0,0.0,0.0,0.0,0.0,0.0,29011250.0,6672842.0


In [None]:
# Extract the last target's results (16 - DIRECCIÓN Y COORDINACIÓN)
last_target = '16 - DIRECCIÓN Y COORDINACIÓN'
# Copy the feature frame: the columns added below must not leak into the
# frame stored in `results`, which is the model's input.
X = results[last_target]['X'].copy()
y = results[last_target]['y']
y_predicted = results[last_target]['y_predicted']

X['ACTUAL'] = y
X['PREDICTED'] = y_predicted
# The log feature is named the way train_model builds it: predictor_name + ' LOG'.
output = X.drop(columns=[predictor_name + ' LOG'])

# Per-row error metrics. Zero actuals become NaN so they neither divide by
# zero nor count as "within 20%".
output['APE (%)'] = (abs(X['ACTUAL'] - X['PREDICTED']) / X['ACTUAL'].replace(0, np.nan)) * 100  # Absolute Percentage Error
# Can be negative when a prediction is off by more than 100%.
output['ACCURACY (%)'] = 100 - output['APE (%)']  # Accuracy as percentage

# Flag predictions within the acceptance band.
output['WITHIN_20%'] = output['APE (%)'] <= 20

# Summary statistics
print("PREDICTION ACCURACY SUMMARY")
print(f"Mean Absolute Percentage Error (MAPE): {output['APE (%)'].mean():.2f}%")
print(f"Median Absolute Percentage Error: {output['APE (%)'].median():.2f}%")
print(f"Mean Accuracy: {output['ACCURACY (%)'].mean():.2f}%")
print(f"Predictions within ±20%: {output['WITHIN_20%'].sum()} / {len(output)} ({output['WITHIN_20%'].sum()/len(output)*100:.1f}%)")

# Worst predictions first
output.sort_values(by='APE (%)', ascending=False)

PREDICTION ACCURACY SUMMARY
Mean Absolute Percentage Error (MAPE): 21.41%
Median Absolute Percentage Error: 4.19%
Mean Accuracy: 78.59%
Predictions within ±20%: 25 / 37 (67.6%)


Unnamed: 0,LONGITUD KM,ALCANCE,ACTUAL,PREDICTED,APE (%),ACCURACY (%),WITHIN_20%
19,9.86,Mejoramiento,4078240.0,8088686.0,98.337683,1.662317,False
9,5.0,Mejoramiento,25736600.0,2072171.0,91.948543,8.051457,False
24,10.06,Mejoramiento,44276650.0,8204297.0,81.470375,18.529625,False
23,5.0,Mejoramiento,6931841.0,2072171.0,70.106484,29.893516,False
20,8.35,Rehabilitación,71114030.0,21717030.0,69.461685,30.538315,False
18,14.52,Mejoramiento,6005683.0,9764899.0,62.594305,37.405695,False
16,6.56,Mejoramiento,2713312.0,4390557.0,61.815448,38.184552,False
29,0.8,Construcción,2032312.0,928502.5,54.313,45.687,False
32,19.31,Puesta a punto,49054940.0,33585850.0,31.534214,68.465786,False
49,15.76,Segunda calzada,52424390.0,37333420.0,28.786173,71.213827,False
