In [13]:

# SVR-only: preprocessing, log-feature, target log-transform, CV + grid search, plots
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np, seaborn as sns, matplotlib.pyplot as plt

import eda
import present_value

In [7]:
# Instantiate the project-local present-value helper; `pv` is reused below when
# the dataset is built.
pv = present_value.PresentValue()
# Ask the helper for the salary increase per year. The result is not referenced in
# the cells visible here.
# NOTE(review): name is probably a typo for "annual_increment"; rename only after
# checking every cell that reads it.
anual_increment = pv.fetch_salary_increase_per_year()

In [14]:
# Relative path to the raw Excel workbook that feeds the analysis.
filename = "../data/raw/BASE DE DATOS PRESUPUESTOS.xlsx"
# Build the working dataset through the project-local EDA class, handing it
# `pv.present_value_costs` (passed as-is, not called here).
# NOTE(review): the saved output of this cell is
#   "TypeError: EDA.assemble_project() takes 1 positional argument but 2 were given"
# so `df` is not created on a fresh run. The argument mismatch appears to be inside
# the eda module (not visible from this notebook) — confirm and fix it there.
df = eda.EDA(filename).create_dataset(pv.present_value_costs)

TypeError: EDA.assemble_project() takes 1 positional argument but 2 were given

In [15]:
def remove_outliers(df, target='2.2 TRAZADO Y DISEÑO GEOMÉTRICO',
                    lower_q=0.05, upper_q=0.95, whisker=1.5, drop_zeros=True):
    """Return the rows of ``df`` whose ``target`` value lies inside a quantile fence.

    The fence is ``[low - whisker * spread, high + whisker * spread]`` where
    ``low`` and ``high`` are the ``lower_q`` / ``upper_q`` quantiles of the target
    column and ``spread = high - low``. With the defaults (5th/95th percentiles)
    the fence is much wider than a classic Tukey IQR fence; pass
    ``lower_q=0.25, upper_q=0.75`` to get the classic rule.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    target : str
        Name of the numeric column used to decide which rows to keep.
    lower_q, upper_q : float
        Quantiles (between 0 and 1, ``lower_q <= upper_q``) that anchor the fence.
    whisker : float
        Multiple of the inter-quantile spread added on each side of the anchors.
    drop_zeros : bool
        When True (default), rows whose target is exactly 0 are also removed.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with their original index labels. Rows whose target
        is NaN are dropped because they fail the fence comparison.

    Raises
    ------
    ValueError
        If the quantiles are out of range or out of order.
    KeyError
        If ``target`` is not a column of ``df``.
    """
    if not 0 <= lower_q <= upper_q <= 1:
        raise ValueError("lower_q and upper_q must satisfy 0 <= lower_q <= upper_q <= 1")

    values = df[target]
    low, high = values.quantile(lower_q), values.quantile(upper_q)
    spread = high - low

    # Series.between is inclusive on both ends, matching the original >= / <= test.
    keep = values.between(low - whisker * spread, high + whisker * spread)
    if drop_zeros:
        keep = keep & (values != 0)

    # Boolean-mask indexing returns a new frame, so the caller's df is untouched.
    return df[keep]

In [None]:
# Filter outliers on the default target, then keep only the columns used for modelling.
model_columns = ['LONGITUD KM', 'ALCANCE', '2.2 TRAZADO Y DISEÑO GEOMÉTRICO']
df_clean = remove_outliers(df).loc[:, model_columns]

In [None]:
# --- Features and target ------------------------------------------------------
# Work on a copy so adding the log feature never writes back into df_clean.
X = df_clean[['LONGITUD KM', 'ALCANCE']].copy()
X['LONGITUD KM LOG'] = np.log1p(X['LONGITUD KM'])
y = df_clean['2.2 TRAZADO Y DISEÑO GEOMÉTRICO'].astype(float)

# --- Preprocessing + model ----------------------------------------------------
# Standardize both numeric features (the RBF kernel is distance based) and
# one-hot encode the categorical one.
# NOTE(review): with drop='first' + handle_unknown='ignore', a category unseen
# during fit is encoded as all zeros, i.e. the same as the dropped first category.
pre = ColumnTransformer([
    ('num', StandardScaler(), ['LONGITUD KM', 'LONGITUD KM LOG']),
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), ['ALCANCE'])
])

svr = SVR(kernel='rbf')
pipe = Pipeline([('pre', pre), ('svr', svr)])
# The regressor is fitted on log1p(y); predictions are mapped back with expm1, so
# predict() and every metric below are expressed in the original target units.
model = TransformedTargetRegressor(regressor=pipe, func=np.log1p, inverse_func=np.expm1)

# Parameter names are prefixed with 'regressor__' because the pipeline lives
# inside the TransformedTargetRegressor.
param_grid = {
    'regressor__svr__C': [10, 100, 300, 1000],
    'regressor__svr__epsilon': [0.1, 0.3, 0.5, 1.0],
    'regressor__svr__gamma': ['scale', 'auto', 0.1, 0.01],
}

# --- Hold-out split and grid search -------------------------------------------
# Fixed random_state keeps the 70/30 split reproducible across runs.
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.3, random_state=1982)
gs = GridSearchCV(
    model,
    param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    refit=True,  # refit the best configuration on the whole training split
)
gs.fit(Xtr, ytr)

# --- Hold-out evaluation ------------------------------------------------------
yp = gs.predict(Xte)
mae = mean_absolute_error(yte, yp)
# mean_squared_error returns the MSE; take the square root so the value reported
# as RMSE really is one (and is comparable with the CV scoring metric).
rmse = np.sqrt(mean_squared_error(yte, yp))
r2 = r2_score(yte, yp)
# Zero actuals become NaN so they are skipped instead of producing inf in the MAPE.
mape = np.mean(np.abs((yte - yp) / yte.replace(0, np.nan))) * 100

print('Best params:', gs.best_params_)
print({'Test_R2': r2, 'Test_MAE': mae, 'Test_RMSE': rmse, 'Test_MAPE%': float(mape)})

# --- Diagnostic plots ---------------------------------------------------------
plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: actual vs predicted, with the y = x reference line spanning both
# the actual and the predicted ranges so it is never cut short.
ax[0].scatter(yte, yp, alpha=0.6)
lim = [min(yte.min(), yp.min()), max(yte.max(), yp.max())]
ax[0].plot(lim, lim, 'r--')
ax[0].set_title('Actual vs Pred')
ax[0].set_xlabel('Actual')
ax[0].set_ylabel('Predicted')

# Right panel: distribution of the hold-out residuals (actual - predicted).
sns.histplot(yte - yp, kde=True, ax=ax[1], color='slateblue')
ax[1].set_title('Residuals')
ax[1].set_xlabel('Actual - Predicted')

plt.tight_layout()
plt.show()