In [69]:

import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, RepeatedKFold, LeaveOneOut
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.linear_model import HuberRegressor, LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, make_scorer
import numpy as np, seaborn as sns, matplotlib.pyplot as plt

import eda
import present_value

%load_ext autoreload
%autoreload 2
%reload_ext autoreload


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
# Helper object from the local `present_value` module; its `present_value_costs`
# attribute is passed to the dataset builder in a later cell.
pv = present_value.PresentValue()
# Result of the helper's per-year salary-increase lookup (presumably the series used
# to bring historical costs to present value — TODO confirm). It is not referenced
# again in the cells visible here.
# NOTE(review): `anual` looks like a typo for `annual`; left unchanged because other
# cells may reference this name.
anual_increment = pv.fetch_salary_increase_per_year()

In [25]:
# Raw budget workbook; the path is relative to the notebook's working directory.
filename = "../data/raw/BASE DE DATOS PRESUPUESTOS.xlsx"
# EDA wrapper from the local `eda` module, constructed from the workbook path.
preproccesing = eda.EDA(filename)
# Build the modelling DataFrame. `pv.present_value_costs` is handed over as-is (not
# called here) — presumably a callback that converts costs to present value; TODO confirm.
df = preproccesing.create_dataset(pv.present_value_costs)

  w = (df[cols] / totals).fillna(0)


In [26]:
def remove_outliers(df, target: str, lower_q: float = 0.05, upper_q: float = 0.95,
                    whisker: float = 1.5, drop_zeros: bool = True) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``target`` value lies inside quantile-based fences.

    The fences are ``[Q_low - whisker * spread, Q_high + whisker * spread]`` where
    ``Q_low`` / ``Q_high`` are the ``lower_q`` / ``upper_q`` quantiles of the target
    column and ``spread = Q_high - Q_low``. With the defaults this is a deliberately
    wide fence built on the 5th and 95th percentiles, not the classic 25 %/75 % IQR rule.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    target : str
        Name of the column used to detect outliers.
    lower_q, upper_q : float
        Quantiles (in ``[0, 1]``, ``lower_q < upper_q``) that define the spread.
    whisker : float
        Multiple of the spread added on each side of the quantiles.
    drop_zeros : bool
        When True (default) rows whose target is exactly 0 are removed as well.

    Returns
    -------
    pd.DataFrame
        An independent copy of the retained rows, original index preserved.
        Rows whose target is NaN are never retained (comparisons with NaN are False).

    Raises
    ------
    ValueError
        If the quantiles are outside ``[0, 1]`` or not strictly increasing.
    """
    if not 0.0 <= lower_q < upper_q <= 1.0:
        raise ValueError(
            f"Expected 0 <= lower_q < upper_q <= 1, got lower_q={lower_q}, upper_q={upper_q}"
        )

    values = df[target]
    low, high = values.quantile(lower_q), values.quantile(upper_q)
    margin = whisker * (high - low)

    # `between` is inclusive on both ends, matching the original >= / <= comparison.
    keep = values.between(low - margin, high + margin)
    if drop_zeros:
        keep &= values != 0

    # Copy so callers can add columns without triggering SettingWithCopyWarning.
    return df[keep].copy()

In [None]:
def _plot_fit_diagnostics(y, y_predicted):
    """Show an actual-vs-predicted scatter and a residual histogram for one fit."""
    fig, ax = plt.subplots(1, 2, figsize=(12, 5))

    ax[0].scatter(y, y_predicted, alpha=0.6)
    lim = [y.min(), y.max()]
    ax[0].plot(lim, lim, 'r--')  # identity line: perfect predictions fall on it
    ax[0].set_title('Actual vs Pred')
    ax[0].set_xlabel('Actual')
    ax[0].set_ylabel('Predicted')

    sns.histplot(y - y_predicted, kde=True, ax=ax[1], color='slateblue')
    ax[1].set_title('Residuals')

    plt.tight_layout()
    plt.show()


def train_model(df_clean, predictor_name, hue_name, target_name, show_plots=False, random_state=42):
    """Tune and evaluate an RBF support-vector regressor for one target column.

    Features are the numeric predictor, its ``log1p`` transform (added as
    ``"<predictor_name> LOG"``) and the one-hot encoded categorical ``hue_name``.
    The target is modelled on a ``log1p`` scale and mapped back with ``expm1``.
    Hyper-parameters are selected with a grid search scored by RMSE over a
    repeated 5-fold split; the best estimator is then re-scored with the same
    splitter and the summary is printed.

    Parameters
    ----------
    df_clean : pd.DataFrame
        Frame containing ``predictor_name``, ``hue_name`` and ``target_name``.
    predictor_name : str
        Numeric feature column.
    hue_name : str
        Categorical feature column.
    target_name : str
        Column to predict; cast to float.
    show_plots : bool, default False
        When True, display actual-vs-predicted and residual plots for the fit.
    random_state : int, default 42
        Seed of the repeated K-fold splitter.

    Returns
    -------
    tuple
        ``(X, y, y_predicted)``: the feature frame (including the LOG column),
        the float target series, and the predictions of the refitted best model
        on that same ``X``.

    Notes
    -----
    * The printed R2 / MAE / RMSE are cross-validated on the same folds that were
      used to pick the hyper-parameters, so they are optimistic estimates.
    * ``Full_MAPE%`` is computed on the training data itself (in-sample); rows with
      a zero target are excluded from it.
    """
    log_feature = predictor_name + ' LOG'

    X = df_clean[[predictor_name, hue_name]].copy()
    X[log_feature] = np.log1p(X[predictor_name])
    y = df_clean[target_name].astype(float)

    pre = ColumnTransformer([
        ('num', StandardScaler(), [predictor_name, log_feature]),
        # NOTE(review): with drop='first', categories unseen during fit are encoded as
        # all zeros, i.e. identically to the dropped first category.
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), [hue_name])
    ])

    svr = SVR(kernel='rbf')
    pipe = Pipeline([('pre', pre), ('svr', svr)])
    # Fit on log1p(y); predictions are returned on the original scale via expm1.
    model = TransformedTargetRegressor(regressor=pipe, func=np.log1p, inverse_func=np.expm1)

    param_grid = {
        'regressor__svr__C': [5, 10, 80, 200, 1000],
        'regressor__svr__epsilon': [0.01],
        'regressor__svr__gamma': ['scale', 'auto', 0.01, 0.1, 1.0],
    }

    cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=random_state)
    gs = GridSearchCV(model, param_grid, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1, refit=True)
    gs.fit(X, y)

    scores = cross_validate(gs.best_estimator_, X, y, cv=cv,
                            scoring={'r2': 'r2', 'mae': 'neg_mean_absolute_error',
                                     'rmse': 'neg_root_mean_squared_error'}, n_jobs=-1)

    # In-sample predictions from the best estimator refitted on all rows.
    y_predicted = gs.predict(X)
    # Zero targets become NaN so they are skipped by the mean instead of dividing by zero.
    mape = np.mean(np.abs((y - y_predicted) / y.replace(0, np.nan))) * 100

    # Cast every metric to a plain float so the printed summary is uniform
    # (otherwise NumPy scalars print as `np.float64(...)`).
    metrics = {
        'R2': float(scores['test_r2'].mean()),
        'MAE': float(-scores['test_mae'].mean()),
        'RMSE': float(-scores['test_rmse'].mean()),
        'Full_MAPE%': float(mape),
    }

    print('Best params:', gs.best_params_)
    print(metrics)

    if show_plots:
        _plot_fit_diagnostics(y, y_predicted)

    return X, y, y_predicted
    

In [None]:
predictor_name = 'LONGITUD KM'
hue_name = 'ALCANCE'
target_names_function_of_longitude = ['2.2 TRAZADO Y DISEÑO GEOMÉTRICO',
                                      '2.3 - SEGURIDAD VIAL',
                                      '2.4 - SISTEMAS INTELIGENTES',
                                      '5 - TALUDES',
                                      '6 - PAVIMENTO',
                                      '7 - SOCAVACIÓN',
                                      '11 - PREDIAL',
                                      '12 - IMPACTO AMBIENTAL',
                                      '15 - OTROS - MANEJO DE REDES']

# Target whose fit is exposed as X / y / y_predicted for the inspection cell that follows.
# Previously this was achieved with a `break` inside the loop, which silently skipped
# every target listed after it.
inspected_target = '11 - PREDIAL'

# (X, y, y_predicted) per target, so any fit can be inspected without re-training.
fit_results = {}

for target_name in target_names_function_of_longitude:
    df_item = df.loc[:, [predictor_name, hue_name, target_name]]
    df_item_cleaned = remove_outliers(df_item, target_name)
    print(f'*********{target_name}*********')
    # Optional EDA plots: preproccesing.show_plots_eda(predictor_name, target_name, hue_name, df_item_cleaned)
    fit_results[target_name] = train_model(df_item_cleaned, predictor_name, hue_name, target_name)

X, y, y_predicted = fit_results[inspected_target]
    

*********2.2 TRAZADO Y DISEÑO GEOMÉTRICO*********
Best params: {'regressor__svr__C': 5, 'regressor__svr__epsilon': 0.01, 'regressor__svr__gamma': 1.0}
{'R2': np.float64(0.7962218833740572), 'MAE': np.float64(4725717.8684844645), 'RMSE': np.float64(8921131.037139025), 'Full_MAPE%': 4.722184642741052}
*********2.3 - SEGURIDAD VIAL*********
Best params: {'regressor__svr__C': 10, 'regressor__svr__epsilon': 0.01, 'regressor__svr__gamma': 0.1}
{'R2': np.float64(0.7214713003273263), 'MAE': np.float64(2614164.488472851), 'RMSE': np.float64(4924509.751561174), 'Full_MAPE%': 17.224377251099256}
*********2.4 - SISTEMAS INTELIGENTES*********
Best params: {'regressor__svr__C': 1000, 'regressor__svr__epsilon': 0.01, 'regressor__svr__gamma': 0.01}
{'R2': np.float64(0.7391468529364189), 'MAE': np.float64(1232016.4907846171), 'RMSE': np.float64(2868227.839904746), 'Full_MAPE%': 2.158906992256163}
*********5 - TALUDES*********
Best params: {'regressor__svr__C': 200, 'regressor__svr__epsilon': 0.01, 'reg

In [77]:
# Attach the observed target and the model's predictions to the feature frame
# (note: this adds the two columns to X in place).
X['ACTUAL'], X['PREDICTED'] = y, y_predicted

# Reporting view: drop the engineered log feature and add the absolute error per row.
output = X.drop(columns=['LONGITUD KM LOG'])
output['difference'] = (X['ACTUAL'] - X['PREDICTED']).abs()

# Worst predictions first.
output.sort_values(by='difference', ascending=False)

Unnamed: 0,LONGITUD KM,ALCANCE,ACTUAL,PREDICTED,difference
44,14.6,Construcción,17694770.0,19751580.0,2056811.0
45,49.4,Construcción,59871330.0,60487020.0,615692.4
47,26.4,Construcción,31996010.0,31663010.0,333003.9
41,26.2,Construcción,31753620.0,31438850.0,314770.5
25,5.243,Mejoramiento,18914860.0,18731030.0,183829.0
43,24.9,Construcción,30178060.0,29999490.0,178568.3
46,40.8,Construcción,49448380.0,49317420.0,130967.3
42,23.8,Construcción,28844890.0,28806020.0,38866.97
