In [1]:

import os
import sys

import pandas as pd
import numpy as np, seaborn as sns, matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from scipy.stats import linregress

from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, cross_val_score, RepeatedKFold, LeaveOneOut
from sklearn.model_selection import cross_val_predict
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.linear_model import HuberRegressor, LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, make_scorer
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import LocalOutlierFactor

# Add project root to path (for Jupyter notebooks)
# Get the current directory and navigate to project root
current_dir = os.getcwd()
project_root = os.path.dirname(current_dir)
sys.path.insert(0, project_root)

from src.config import Config
import src.eda as eda
import src.present_value as present_value
from src.ml_utils import remove_outliers, calculate_metrics, analysis_plots, create_scatter_plot_with_regression

%load_ext autoreload
%autoreload 2
%reload_ext autoreload




In [44]:
## EXTRACT DATA FROM DATABASE
pv = present_value.PresentValue()
anual_increment = pv.fetch_incremento_from_database()

fase = "III"
preproccesing = eda.EDA()
df_raw = preproccesing.assemble_projects_from_database(fase)
df_vp = preproccesing.create_dataset(pv.present_value_costs, fase=fase)

  w = (df[cols] / totals).fillna(0)


In [95]:
df = df_vp.drop(columns=['ALCANCE', 'ZONA', 'TIPO TERRENO'])

agg_dict = {col: 'sum' for col in df.columns if col not in ['CÓDIGO']}
# agg_dict['ALCANCE'] = lambda x: x.mode()[0] if not x.mode().empty else x.iloc[0]

df = df.groupby('CÓDIGO', as_index=False).agg(agg_dict)
df['3 - GEOLOGÍA'] = df['3.1 - GEOLOGÍA'] + df['3.2 - HIDROGEOLOGÍA']
df = df.drop(columns=['3.1 - GEOLOGÍA', '3.2 - HIDROGEOLOGÍA'])
df = df[df['3 - GEOLOGÍA'] != 0]
df = df[df['PUENTES VEHICULARES M2'] != 0]
df

Unnamed: 0,CÓDIGO,NOMBRE DEL PROYECTO,LONGITUD KM,PUENTES VEHICULARES UND,PUENTES VEHICULARES M2,PUENTES PEATONALES UND,PUENTES PEATONALES M2,TUNELES UND,TUNELES KM,1 - TRANSPORTE,...,8 - ESTRUCTURAS,9 - TÚNELES,10 - URBANISMO Y PAISAJISMO,11 - PREDIAL,12 - IMPACTO AMBIENTAL,13 - CANTIDADES,14 - EVALUACIÓN SOCIOECONÓMICA,15 - OTROS - MANEJO DE REDES,16 - DIRECCIÓN Y COORDINACIÓN,3 - GEOLOGÍA
0,1,TAMBILLO - COLIBRÍ,17.4,11,4856,14,1365,0,0.0,0.0,...,683821500.0,0.0,0.0,0.0,0.0,13772380.0,0.0,50279650.0,171404300.0,123936200.0
5,300605,BUGA - BUENAVENTURA UF4,17.0,13,16654,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37755150.0,49683150.0
7,347801,TERMINACIÓN TÚNEL DE LA LÍNEATERMINACIÓN TÚNEL...,25.45,26,51069,0,0,21,11.7947,0.0,...,1345857000.0,420419000.0,0.0,50715300.0,0.0,0.0,0.0,0.0,611515000.0,143078700.0
9,552903,TRANSVERSAL BOYACÁ,10.06,6,3837,0,0,0,0.0,17470400.0,...,216908800.0,0.0,0.0,0.0,0.0,3513204.0,0.0,0.0,22021940.0,29155670.0
10,581301,QUEREMAL - DANUBIO,5.243,2,38,0,0,3,0.146,22114380.0,...,264923700.0,0.0,0.0,18914860.0,118853000.0,17256940.0,17245100.0,0.0,79293030.0,42295440.0
12,6935,PEDREGAL - PASTO UF4-UF5 PEDREGAL - PASTO UF4...,37.96,4,6292,1,77,0,0.0,0.0,...,361903600.0,0.0,25815610.0,0.0,0.0,0.0,0.0,187474000.0,166864200.0,76864250.0


In [138]:
from sklearn.model_selection import LeaveOneOut
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import FunctionTransformer

features = ["2.2 - TRAZADO Y DISEÑO GEOMÉTRICO", "5 - TALUDES"]
# features = ["LONGITUD KM", "PUENTES VEHICULARES M2", "TUNELES KM"]
target = "3 - GEOLOGÍA"

df_clean = remove_outliers(df[features + [target]], target=target, method='ensemble')

X = df_clean[features]
y = df_clean[target]

loo = LeaveOneOut()
results = []

for name, model in [("Linear Regression", LinearRegression()), ("SVR", SVR(kernel='rbf', C=100, gamma='scale')), ("Random Forest", RandomForestRegressor(n_estimators=10, random_state=42))]:
    
    pipe = Pipeline([('scaler', StandardScaler()), ('model', model)])
    pipe_log_x = Pipeline([('log', FunctionTransformer(np.log1p)), ('scaler', StandardScaler()), ('model', model)])
    
    # No log
    y_pred = cross_val_predict(pipe, X, y, cv=loo)
    results.append(calculate_metrics(y, y_pred, model_name=f"{name} (No Log)"))
    
    # Log target only
    log_y_model = TransformedTargetRegressor(regressor=pipe, func=np.log1p, inverse_func=np.expm1)
    y_pred = cross_val_predict(log_y_model, X, y, cv=loo)
    results.append(calculate_metrics(y, y_pred, model_name=f"{name} (Log y)"))
    
    # Log inputs only
    y_pred = cross_val_predict(pipe_log_x, X, y, cv=loo)
    results.append(calculate_metrics(y, y_pred, model_name=f"{name} (Log X)"))
    
    # Log both
    log_both_model = TransformedTargetRegressor(regressor=pipe_log_x, func=np.log1p, inverse_func=np.expm1)
    y_pred = cross_val_predict(log_both_model, X, y, cv=loo)
    results.append(calculate_metrics(y, y_pred, model_name=f"{name} (Log X+y)"))

results_df = pd.DataFrame(results)
print(results_df.to_string(index=False))



                      Model        R²          MAE         RMSE  MAPE (%)    Median AE    Max Error
 Linear Regression (No Log)  0.520882 2.500568e+07 2.937721e+07 32.311894 2.110008e+07 4.554302e+07
  Linear Regression (Log y)  0.852490 1.284515e+07 1.630048e+07 14.174240 9.428689e+06 2.797938e+07
  Linear Regression (Log X)  0.341733 3.145376e+07 3.443422e+07 47.597548 3.555569e+07 4.736729e+07
Linear Regression (Log X+y)  0.580688 2.143745e+07 2.748262e+07 23.294271 2.191966e+07 4.367752e+07
               SVR (No Log) -0.775134 5.071464e+07 5.654638e+07 76.770698 4.113858e+07 9.339553e+07
                SVR (Log y) -0.055807 3.397421e+07 4.360954e+07 41.925405 1.906809e+07 8.070521e+07
                SVR (Log X) -0.775133 5.071461e+07 5.654635e+07 76.770643 4.113854e+07 9.339553e+07
              SVR (Log X+y) -0.375920 4.267532e+07 4.978353e+07 56.816440 3.648239e+07 8.340790e+07
     Random Forest (No Log) -0.368275 4.738096e+07 4.964505e+07 75.243429 4.127913e+07 7.113916e+07
