In [1]:

import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, RepeatedKFold, LeaveOneOut
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.linear_model import HuberRegressor, LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, make_scorer
from sklearn.model_selection import cross_val_predict
import numpy as np, seaborn as sns, matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import os
import sys

from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler

# Make the project's `src` package importable from this notebook.
# The parent of the kernel's working directory is prepended to the module
# search path (assumes the notebook is run from a folder directly under the
# project root — TODO confirm).
current_dir = os.getcwd()
project_root = os.path.split(current_dir)[0]
sys.path[:0] = [project_root]

from src.config import Config
import src.eda as eda
import src.present_value as present_value
from src.ml_utils import remove_outliers, calculate_metrics

# Enable IPython's autoreload extension. Mode 2 reloads imported modules
# before each cell executes, so edits to the project modules imported above
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2
# NOTE(review): re-loading the extension right after loading it looks
# redundant — confirm whether this line is still needed.
%reload_ext autoreload




In [2]:
## FROM DATABASE
# Build the present-value helper and fetch the increment data through it.
# NOTE(review): `anual_increment` (sic, "annual") is not read again in the
# visible cells — confirm whether the fetch is needed for its side effects
# on `pv` or can be dropped.
pv = present_value.PresentValue()
anual_increment = pv.fetch_incremento_from_database()

# `fase` is passed to both EDA calls below (presumably the project phase — verify).
fase = "III"
# NOTE(review): `preproccesing` is misspelled ("preprocessing"); the name is
# kept because other cells may reference it.
preproccesing = eda.EDA()
df_raw = preproccesing.assemble_projects_from_database(fase)
# `pv.present_value_costs` is handed over without being called here.
# `df_vp` is the frame the tunnel-model cell further down reads from.
df_vp = preproccesing.create_dataset(pv.present_value_costs, fase=fase)

  w = (df[cols] / totals).fillna(0)


In [4]:
def generate_synthetic_data(df_real, predictor1, predictor2, target, n_synthetic=20, random_state=42):
    """Augment a small real dataset with interpolated synthetic rows.

    Each synthetic row is produced as follows:

    1. Two distinct real rows are drawn: an anchor and a partner.
    2. Their predictor values are blended with a Beta(2, 2) weight.
    3. Each blended predictor is jittered by a factor drawn from [0.99, 1.01].
    4. The anchor's target is scaled by ``ratio1**0.6 * ratio2**0.4``, where
       ``ratio_i`` is the new predictor divided by the anchor's predictor
       (taken as 1 when the anchor's predictor is 0), then jittered by a
       factor drawn from [0.99, 1.01].

    Parameters
    ----------
    df_real : pandas.DataFrame
        Real observations; must contain ``predictor1``, ``predictor2`` and
        ``target`` columns with numeric values.
    predictor1, predictor2 : str
        Names of the two predictor columns.
    target : str
        Name of the target column.
    n_synthetic : int, default 20
        Number of synthetic rows to generate. Values <= 0 disable augmentation.
    random_state : int or None, default 42
        Seed for a private ``numpy.random.RandomState``. The default yields
        the same draws as the previous ``np.random.seed(42)`` implementation,
        but NumPy's global random state is no longer modified.

    Returns
    -------
    pandas.DataFrame
        The real rows (columns ``predictor1``, ``predictor2``, ``target``)
        followed by the synthetic rows, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If synthetic rows are requested but ``df_real`` has fewer than two
        rows (two distinct rows are needed for every blend).
    """
    real_rows = df_real[[predictor1, predictor2, target]]

    # Nothing to generate: hand back the real rows without going through a
    # concat with an empty frame.
    if n_synthetic <= 0:
        return real_rows.reset_index(drop=True)

    n_real = len(real_rows)
    if n_real < 2:
        raise ValueError(
            f"generate_synthetic_data needs at least 2 real rows to interpolate, got {n_real}"
        )

    # Explicit float arrays so the arithmetic below never runs on object/int dtypes.
    X_real = real_rows[[predictor1, predictor2]].to_numpy(dtype=float)
    y_real = real_rows[target].to_numpy(dtype=float)

    # Private generator: reproducible without reseeding the global NumPy RNG.
    rng = np.random.RandomState(random_state)

    X_synthetic = []
    y_synthetic = []

    # The draw order (choice, beta, uniform x2, uniform) is kept fixed so the
    # default seed reproduces the historical synthetic rows.
    for _ in range(n_synthetic):
        idx1, idx2 = rng.choice(n_real, 2, replace=False)
        alpha = rng.beta(2, 2)

        x_new = alpha * X_real[idx1] + (1 - alpha) * X_real[idx2]
        x_new *= rng.uniform(0.99, 1.01, size=2)

        # Guard against division by zero when the anchor's predictor is 0.
        ratio1 = x_new[0] / X_real[idx1, 0] if X_real[idx1, 0] != 0 else 1
        ratio2 = x_new[1] / X_real[idx1, 1] if X_real[idx1, 1] != 0 else 1
        y_new = y_real[idx1] * (ratio1**0.6) * (ratio2**0.4)
        y_new *= rng.uniform(0.99, 1.01)

        X_synthetic.append(x_new)
        y_synthetic.append(y_new)

    df_synthetic = pd.DataFrame(X_synthetic, columns=[predictor1, predictor2])
    df_synthetic[target] = y_synthetic

    return pd.concat([real_rows, df_synthetic], ignore_index=True)

def _add_log_features(frame, predictors):
    """Return a copy of ``frame[predictors]`` with a ``<name>_LOG`` = log1p(<name>) column per predictor."""
    features = frame[list(predictors)].copy()
    for name in predictors:
        features[name + '_LOG'] = np.log1p(features[name])
    return features


def train_model_tunnels(df_clean, predictor_name, hue_name, target_name, n_synthetic=200):
    """Fit a log-target linear model on synthetically augmented data.

    The real rows in ``df_clean`` are augmented with ``generate_synthetic_data``,
    the two predictors plus their ``log1p`` transforms are standardised, and a
    ``LinearRegression`` wrapped in a ``TransformedTargetRegressor``
    (``log1p`` / ``expm1`` on the target) is fitted on the augmented set.
    Predictions and metrics are then computed for the real rows only.

    Note: the real rows are part of the augmented training set, so the
    printed R² / MAPE are in-sample figures, not a held-out estimate.

    Parameters
    ----------
    df_clean : pandas.DataFrame
        Real observations containing both predictor columns and the target.
    predictor_name : str
        Name of the first predictor column.
    hue_name : str
        Name of the second predictor column (despite its name, it is used as
        a regular model feature).
    target_name : str
        Name of the target column.
    n_synthetic : int, default 200
        Number of synthetic rows requested from ``generate_synthetic_data``.

    Returns
    -------
    tuple
        ``(X_real, y_real, y_pred, scaled_model)`` where ``X_real`` holds the
        real predictors with their ``_LOG`` columns, ``y_real`` is the real
        target as float, ``y_pred`` are the model predictions for the real
        rows, and ``scaled_model`` bundles scaler and model behind a single
        ``predict(X_new)`` method.
    """
    predictors = [predictor_name, hue_name]

    df_augmented = generate_synthetic_data(
        df_clean, predictor_name, hue_name, target_name, n_synthetic=n_synthetic
    )

    # Training design matrix: raw predictors followed by their log1p versions.
    X = _add_log_features(df_augmented, predictors)
    y = df_augmented[target_name].astype(float)

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    model = TransformedTargetRegressor(regressor=LinearRegression(), func=np.log1p, inverse_func=np.expm1)
    model.fit(X_scaled, y)

    # Same feature engineering for the real rows, via the same helper, so the
    # training and evaluation columns cannot drift apart.
    X_real = _add_log_features(df_clean, predictors)
    y_real = df_clean[target_name].astype(float)

    X_real_scaled = scaler.transform(X_real)
    y_pred = model.predict(X_real_scaled)

    # Calculate comprehensive metrics using the centralized function
    metrics = calculate_metrics(y_real, y_pred, target_name)

    print(f"\nR² = {metrics['R²']:.4f} | MAPE = {metrics['MAPE (%)']:.2f}%")

    class ScaledModel:
        """Bundle of fitted scaler + model exposing a single ``predict``."""

        def __init__(self, model, scaler, feature_names, base_features):
            self.model, self.scaler, self.feature_names = model, scaler, feature_names
            # Raw predictor names, used to derive the log columns on demand.
            self.base_features = base_features

        def predict(self, X_new):
            # Accept frames that carry only the raw predictors: build the
            # '_LOG' columns here instead of failing with a KeyError.
            if not set(self.feature_names).issubset(X_new.columns):
                X_new = _add_log_features(X_new, self.base_features)
            return self.model.predict(self.scaler.transform(X_new[self.feature_names]))

    return X_real, y_real, y_pred, ScaledModel(model, scaler, X.columns.tolist(), predictors)
    

In [5]:
def train_and_calculate_metrics_tunnels(df, target_column, predictors):
    """Train the tunnel model for one target column and print a per-row comparison.

    Rows whose target is not strictly positive, or that have a missing value
    in either predictor or the target, are excluded before training.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the target column and both predictor columns.
    target_column : str
        Name of the column to model.
    predictors : sequence of two str
        Names of the two predictor columns, in the order they are passed to
        ``train_model_tunnels``.

    Returns
    -------
    dict
        ``{target_column: {'X', 'y', 'y_predicted', 'trained_model'}}`` with
        the values returned by ``train_model_tunnels``.
    """
    first_predictor, second_predictor = predictors
    wanted_columns = [first_predictor, second_predictor, target_column]

    # Keep only rows with a positive target and no gaps in the modelling columns.
    has_positive_target = df[target_column] > 0
    df_item = df.loc[has_positive_target, wanted_columns].dropna()

    X, y, y_pred, model = train_model_tunnels(
        df_item, first_predictor, second_predictor, target_column
    )

    # Side-by-side table of observed vs. predicted values with the signed
    # percentage error relative to the observed value.
    actual = y.values
    comparison = pd.DataFrame({
        first_predictor: df_item[first_predictor].values,
        second_predictor: df_item[second_predictor].values,
        'Actual': actual,
        'Predicted': y_pred,
        'Error%': ((actual - y_pred) / actual * 100)
    })
    print(comparison)

    fitted = {'X': X, 'y': y, 'y_predicted': y_pred, 'trained_model': model}
    return {target_column: fitted}

# Column to model and the two predictor columns used for it.
target_column = '9 - TÚNELES'
predictors = ['TUNELES UND', 'TUNELES KM']

# Modelling frame built from df_vp in one expression: the two leading
# columns, the label range '1 - TRANSPORTE' .. '16 - DIRECCIÓN Y COORDINACIÓN',
# and finally the predictor columns.
df = (
    df_vp[['LONGITUD KM', 'ALCANCE']]
    .join(df_vp.loc[:, '1 - TRANSPORTE':'16 - DIRECCIÓN Y COORDINACIÓN'])
    .assign(**{name: df_vp[name] for name in predictors})
)

results = train_and_calculate_metrics_tunnels(df, target_column, predictors)


R² = 0.8339 | MAPE = 9.76%
   TUNELES UND  TUNELES KM        Actual     Predicted     Error%
0            5      0.9200  6.644646e+07  6.562381e+07   1.238065
1            1      8.6000  1.632824e+08  1.275371e+08  21.891687
2           15      2.2747  1.906902e+08  2.023911e+08  -6.136095
