In [None]:
# Detect Google Colab by attempting the import; Drive is mounted only there.
try:
    import google.colab  # type: ignore
except Exception:
    IN_COLAB = False
else:
    IN_COLAB = True

if not IN_COLAB:
    print("No Colab: saltando montaje de Drive.")
else:
    from google.colab import drive  # type: ignore
    drive.mount('/content/drive', force_remount=True)


In [None]:
from pathlib import Path
import pandas as pd

# Official output of the cleaning stage; fail fast if it has not been produced yet.
# NOTE(review): later cells reload `dataset_modified` (with a fallback CSV), so this
# assert makes that fallback unreachable when clean.csv is missing — confirm intent.
CLEAN_PATH = Path("data/processed/clean.csv")
assert CLEAN_PATH.exists(), f"No existe {CLEAN_PATH}. Corre el stage de cleaning primero."
# convert_dtypes() lets pandas pick its best-supported (nullable) dtype per column.
dataset_modified = pd.read_csv(CLEAN_PATH).convert_dtypes()


## Load Libraries

In [None]:
#!pip install scikit-learn==1.5.0

In [None]:
# Parameters (the runner/papermill may override them if desired)
PATH_IN = "data/processed/clean.csv"                  # official input produced by the clean stage
MODEL_OUT = "models/best_model.joblib"                # used if a single "best model" is kept
REPORT_OUT = "reports/models/cv_results_summary.csv"  # cross-validation summary
METRICS_OUT = "reports/models/metrics.json"           # metrics consumed by DVC

# Data loading
import pandas as pd
from pathlib import Path

# Make sure the output folders exist before anything tries to write to them.
Path("models").mkdir(parents=True, exist_ok=True)
Path("reports/models").mkdir(parents=True, exist_ok=True)

# Prefer the pipeline input; fall back to the legacy CSV in the working directory.
if Path(PATH_IN).exists():
    dataset_modified = pd.read_csv(PATH_IN, low_memory=False).convert_dtypes()
elif Path("df_final_validated.csv").exists():  # fallback in case someone still uses it
    dataset_modified = pd.read_csv("df_final_validated.csv", low_memory=False).convert_dtypes()
else:
    raise FileNotFoundError("No se encontró data/processed/clean.csv ni df_final_validated.csv")

print("dataset_modified:", dataset_modified.shape)


In [None]:
# Patched: avoid failing outside Colab.
# The import and the mount are guarded separately so that a genuine mount
# failure inside Colab is reported as such instead of being labelled "No Colab".
try:
    import google.colab  # type: ignore
    from google.colab import drive  # type: ignore
except Exception:
    # google.colab is not importable -> we are not running in Colab
    print('No Colab: saltando montaje de Drive.')
else:
    try:
        drive.mount('/content/drive', force_remount=True)
        print('Google Drive montado (Colab).')
    except Exception as exc:
        # Still best-effort (no crash), but surface the real reason.
        print(f'Colab detectado, pero no se pudo montar Drive: {exc!r}')


## Load data

In [None]:
#drive.mount('/content/drive')
#os.chdir('/content/drive/MyDrive/Colab Notebooks/3_Trimestre/MLOps/Semana_4/Tarea')
#os.chdir('/content/drive/MyDrive/Colab Notebooks/MNA/4rto_Trimestre/MLOps/Semana_4/Tarea')

In [None]:
# --- Load data (Colab-agnostic) ---
from pathlib import Path
import pandas as pd

# (Optional) When running in Colab, mount Drive; locally this does nothing.
# Only the import decides "Colab or not"; a failing mount is reported instead of
# being silently swallowed together with the import error.
try:
    import google.colab  # type: ignore
    from google.colab import drive  # type: ignore
except Exception:
    pass  # not running in Colab
else:
    print("Colab detectado -> montando Drive…")
    try:
        drive.mount('/content/drive', force_remount=True)
    except Exception as exc:
        # Best-effort: keep going, the CSV lookup below will fail loudly if needed.
        print(f"No se pudo montar Drive: {exc!r}")

# Official pipeline (DVC) path and local fallback
CLEAN_PATH = Path("data/processed/clean.csv")
FALLBACK_PATH = Path("df_final_validated.csv")

if CLEAN_PATH.exists():
    dataset_modified = pd.read_csv(CLEAN_PATH, low_memory=False)
    print(f"✓ Cargado {CLEAN_PATH} ->", dataset_modified.shape)
elif FALLBACK_PATH.exists():
    dataset_modified = pd.read_csv(FALLBACK_PATH, low_memory=False)
    print(f"✓ Cargado {FALLBACK_PATH} ->", dataset_modified.shape)
else:
    raise FileNotFoundError(
        "No encuentro data/processed/clean.csv ni df_final_validated.csv.\n"
        "Corre el stage de cleaning (python -m dvc repro) o deja el CSV de respaldo en la raíz."
    )

# Normalize dtypes (pandas picks its best-supported dtype per column)
dataset_modified = dataset_modified.convert_dtypes()
print("dtypes (primeras 8):")
print(dataset_modified.dtypes.head(8))


In [None]:
#pd.set_option('display.max_rows', None)

In [None]:
# Legacy load kept only as a last resort: reading the backup CSV unconditionally
# would overwrite the frame loaded from the official pipeline path and would crash
# on a fresh run when 'df_final_validated.csv' is not present.
if "dataset_modified" not in globals():
    dataset_modified = pd.read_csv('df_final_validated.csv', low_memory=False)

In [None]:
# Re-apply dtype inference on the loaded frame (rebinds the same name).
dataset_modified=dataset_modified.convert_dtypes()

In [None]:
# Display the loaded frame using the notebook's rich repr.
dataset_modified

In [None]:
# Print the dtype of every column as plain text.
print(dataset_modified.dtypes)

## Step 1 EDA - Clean Dataframe and describe columns

### Classes and functions to clean columns and insert into pipeline

In [None]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin


In [None]:
class CleanNumericColumns(BaseEstimator, TransformerMixin):
    """
    Clean numeric-like columns and impute missing values.

    1. Casts each selected column to string, lower-cases and trims it
    2. Casts to numeric (non-parsable values -> NaN)
    3. Imputes NaN with the median learned in ``fit``

    Parameters
    ----------
    columns : list-like of column names to clean; if None (or empty), all
        columns of the frame passed to ``fit`` are used.

    Attributes
    ----------
    cols_ : list of the column names resolved during ``fit``.
    medians_ : dict mapping column name -> median of the parsed training values.
    """
    def __init__(self, columns=None):
        self.columns = columns              # hyper-parameter, stored untouched

    @staticmethod
    def _to_numeric(series):
        """Normalize text (lower-case + strip) and coerce to numeric; failures become NaN."""
        return pd.to_numeric(series.astype(str).str.lower().str.strip(), errors='coerce')

    def _resolve_columns(self, X):
        """Return the columns to process as a plain list.

        An explicit ``is None`` / length check is used because ``columns or ...``
        raises "truth value is ambiguous" when ``columns`` is a pandas Index or a
        NumPy array. An empty selection keeps its historical meaning: all columns.
        """
        if self.columns is not None and len(self.columns) > 0:
            return list(self.columns)
        return list(X.columns)

    # ----------- Fit -----------
    def fit(self, X, y=None):
        """Learn the per-column median of the parsed values. ``y`` is ignored."""
        cols = self._resolve_columns(X)
        # learned parameters end with "_"; X is only read, so no copy is needed
        self.medians_ = {c: self._to_numeric(X[c]).median() for c in cols}
        self.cols_ = cols                   # saved for use in transform
        return self                         # always return self

    # ----------- Transform -----------
    def transform(self, X):
        """Return a copy of X with the selected columns parsed and median-imputed."""
        X = X.copy()
        for c in self.cols_:
            X[c] = self._to_numeric(X[c]).fillna(self.medians_[c])
        return X

    def get_feature_names_out(self, input_features=None):
        """Return the names of the cleaned columns (``input_features`` is ignored)."""
        return np.array(self.cols_)

In [None]:
class CleanBooleanColumns(BaseEstimator, TransformerMixin):
    """
    Clean flag-like columns into strict 0/1 integers.

    1. Casts each selected column to string, lower-cases and trims it
    2. Casts to numeric; NaN if coercion fails
    3. Imputes NaN with the mode (most frequent value) learned in ``fit``
    4. Converts to a strict 0/1 flag based on equality with 1

    Parameters
    ----------
    columns : list-like of column names to clean; if None (or empty), all
        columns of the frame passed to ``fit`` are used.

    Attributes
    ----------
    cols_ : list of the column names resolved during ``fit``.
    modes_ : dict mapping column name -> most frequent parsed training value.
    """
    def __init__(self, columns=None):
        self.columns = columns          # list of columns (or None = all)

    @staticmethod
    def _to_numeric(series):
        """Normalize text (lower-case + strip) and coerce to numeric; failures become NaN."""
        return pd.to_numeric(series.astype(str).str.lower().str.strip(), errors='coerce')

    # ---------------- fit ----------------
    def fit(self, X, y=None):
        """Learn the most frequent parsed value of each column. ``y`` is ignored."""
        # Explicit None/length check: `columns or X.columns` raises for Index/ndarray
        # inputs because their truth value is ambiguous.
        if self.columns is not None and len(self.columns) > 0:
            cols = list(self.columns)
        else:
            cols = list(X.columns)

        self.modes_ = {}
        for c in cols:
            modes = self._to_numeric(X[c]).mode(dropna=True)
            # .mode() returns a Series that is EMPTY when nothing could be parsed;
            # indexing it blindly raised KeyError. Fall back to 0, which yields an
            # all-zero flag for such a column after the `== 1` test in transform.
            self.modes_[c] = modes.iloc[0] if not modes.empty else 0

        self.cols_ = cols
        return self

    # ------------- transform -------------
    def transform(self, X):
        """Return a copy of X with the selected columns converted to 0/1 integers."""
        X = X.copy()
        for c in self.cols_:
            X[c] = (
                self._to_numeric(X[c])
                    .fillna(self.modes_[c])
                    .isin([1])          # True if value == 1
                    .astype(int)        # -> 0/1 integer
            )
        return X

    def get_feature_names_out(self, input_features=None):
        """Return the names of the cleaned columns (``input_features`` is ignored)."""
        return np.array(self.cols_)

In [None]:
def _clean_string_columns(X, columns):
    """
    • Cast to string
    • Lower-case + strip
    • Keep value only if it starts with 'http', else set to <NA>

    Parameters
    ----------
    X : pandas DataFrame
    columns : list[str] – columns to clean (must exist in X)

    Returns
    -------
    X_new : pandas DataFrame (copy of X with cleaned columns)
    """
    X = X.copy()
    for c in columns:
        X[c] = (
            X[c].astype(str)
                 .str.lower()
                 .str.strip()
                 .where(lambda s: s.str.startswith('http'), pd.NA)
        )
    return X

In [None]:
class CleanStringColumns(BaseEstimator, TransformerMixin):
    """
    Stateless transformer wrapping ``_clean_string_columns``.

    Parameters
    ----------
    columns : list-like of column names to clean; if None (or empty), all
        columns of the frame passed to ``fit`` are used.
    """
    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        """Resolve the column selection; nothing is learned from the data."""
        # Explicit None/length check: `columns or ...` raises for Index/ndarray
        # inputs because their truth value is ambiguous.
        if self.columns is not None and len(self.columns) > 0:
            self.cols_ = list(self.columns)
        else:
            self.cols_ = X.columns.tolist()
        return self

    def transform(self, X):
        """Return a copy of X with the selected columns cleaned."""
        return _clean_string_columns(X, self.cols_)

    def get_feature_names_out(self, input_features=None):
        """Return the names of the cleaned columns (``input_features`` is ignored)."""
        return np.array(self.cols_)

In [None]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer


In [None]:

def _drop_rows_where_url_is_nan(X):
    """Return a copy of X with rows removed if url is missing (<NA>)."""
    return X.loc[X['url'].notna()].reset_index(drop=True)

# Pipeline-compatible wrapper around _drop_rows_where_url_is_nan.
# NOTE(review): this step removes rows, so it only suits pipelines fitted without y — confirm usage.
drop_bad_url = FunctionTransformer(
    _drop_rows_where_url_is_nan,
    feature_names_out='one-to-one'   # keeps column names unchanged
)

def _drop_duplicate_url(X):
    return X.drop_duplicates(subset='url', keep='first').reset_index(drop=True)

# Pipeline-compatible wrapper around _drop_duplicate_url; column names pass through unchanged.
drop_dup_url = FunctionTransformer(_drop_duplicate_url,
                                   feature_names_out='one-to-one')

In [None]:
def delete_outliers(
        df,
        k=1.5,          # IQR multiplier
        max_cols=1,     # row dropped if > max_cols columns flag it
        return_counts=False
    ):
    """
    Drop rows that are IQR outliers in more than ``max_cols`` numeric columns.

    A value is flagged when it lies below Q1 - k*IQR or above Q3 + k*IQR of
    its own column. Only numeric columns are inspected.

    Returns the filtered frame (index reset); when ``return_counts`` is True,
    also returns a Series with the number of flagged values per column,
    sorted in descending order.
    """
    numeric_names = df.select_dtypes(include=[np.number]).columns

    # One boolean mask per numeric column, in column order.
    flags = {}
    for name in numeric_names:
        values = df[name]
        q1, q3 = values.quantile([0.25, 0.75])
        spread = q3 - q1
        flags[name] = (values < q1 - k*spread) | (values > q3 + k*spread)

    flag_table = pd.DataFrame(flags, index=df.index)
    too_many = flag_table.sum(axis=1) > max_cols
    kept = df.loc[~too_many].reset_index(drop=True)

    if not return_counts:
        return kept

    per_column = {name: int(mask.sum()) for name, mask in flags.items()}
    return kept, pd.Series(per_column).sort_values(ascending=False)

def _drop_outliers(X):
    """Apply delete_outliers with k=1.5, dropping rows flagged in more than 10 numeric columns."""
    # delete_outliers returns only the cleaned df when return_counts=False
    return delete_outliers(X, k=1.5, max_cols=10, return_counts=False)

# Pipeline-compatible wrapper; column names pass through unchanged.
drop_outliers = FunctionTransformer(_drop_outliers,
                                    feature_names_out='one-to-one')


In [None]:
def clip_numeric_ranges(X):
    """
    Return a copy of X whose range-bounded columns are coerced to numeric and clipped.

    The column groups are read from the module-level lists columns_0_to_1,
    columns_neg1_to_1 and columns_neg1_to_0; names missing from X are skipped.
    """
    X = X.copy()
    bounded_groups = (
        (columns_0_to_1, 0, 1),
        (columns_neg1_to_1, -1, 1),
        (columns_neg1_to_0, -1, 0),
    )
    for names, low, high in bounded_groups:
        for c in names:
            if c not in X.columns:
                continue
            X[c] = pd.to_numeric(X[c], errors='coerce').clip(low, high)
    return X

# Pipeline-compatible wrapper around clip_numeric_ranges; column names pass through unchanged.
clip_ranges = FunctionTransformer(clip_numeric_ranges,
                                  feature_names_out='one-to-one')

### Define column type

In [None]:
# Column groups consumed by the cleaning ColumnTransformer below.
# NOTE(review): num_columns includes the target 'shares', so it gets median-imputed like any feature — confirm intended.
string_columns=['url']
bool_columns=['data_channel_is_lifestyle', 'data_channel_is_entertainment', 'data_channel_is_bus', 'data_channel_is_socmed', 'data_channel_is_tech', 'data_channel_is_world', 'weekday_is_monday', 'weekday_is_tuesday', 'weekday_is_wednesday', 'weekday_is_thursday', 'weekday_is_friday', 'weekday_is_saturday', 'weekday_is_sunday','is_weekend']
num_columns=['LDA_00', 'LDA_01', 'LDA_02', 'LDA_03','LDA_04', 'abs_title_sentiment_polarity', 'abs_title_subjectivity', 'average_token_length', 'avg_negative_polarity', 'avg_positive_polarity', 'global_rate_negative_words', 'global_rate_positive_words', 'global_sentiment_polarity', 'global_subjectivity', 'kw_avg_avg', 'kw_avg_max', 'kw_avg_min', 'kw_max_avg', 'kw_max_max', 'kw_max_min', 'kw_min_avg', 'kw_min_max', 'kw_min_min', 'max_negative_polarity', 'max_positive_polarity', 'min_negative_polarity', 'min_positive_polarity', 'mixed_type_col', 'n_non_stop_unique_tokens', 'n_non_stop_words', 'n_tokens_content', 'n_tokens_title', 'n_unique_tokens', 'num_hrefs', 'num_imgs', 'num_keywords', 'num_self_hrefs', 'num_videos', 'rate_negative_words', 'rate_positive_words', 'timedelta', 'title_sentiment_polarity', 'title_subjectivity','self_reference_min_shares','self_reference_max_shares','self_reference_avg_sharess','shares']

In [None]:
# Sanity check: number of columns in each group.
len(string_columns),len(bool_columns),len(num_columns)

### Classify numeric columns

In [None]:
# Column groups by valid value range, used to clip values to the original dataset range.
# (A missing comma after "max_positive_polarity" used to fuse it with
# "title_subjectivity" into one non-existent name, so neither column was clipped.)
columns_0_to_1 = [
    "n_unique_tokens", "n_non_stop_words", "n_non_stop_unique_tokens",
    "LDA_00", "LDA_01", "LDA_02", "LDA_03", "LDA_04",
    "global_rate_positive_words", "global_subjectivity", "global_rate_negative_words",
    "rate_positive_words", "rate_negative_words",
    "avg_positive_polarity", "min_positive_polarity", "max_positive_polarity",
    "title_subjectivity", "abs_title_subjectivity", "abs_title_sentiment_polarity",
]

columns_neg1_to_1 = ["global_sentiment_polarity", "title_sentiment_polarity"]

columns_neg1_to_0 = ["max_negative_polarity", "avg_negative_polarity", "min_negative_polarity"]

In [None]:
# Sanity check: number of columns in each range group.
len(columns_0_to_1),len(columns_neg1_to_1),len(columns_neg1_to_0)

### Preprocess columns

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline



In [None]:
# Route each column group to its cleaner. Each cleaner is also given the same
# column list explicitly, so it processes every column it receives.
preprocess = ColumnTransformer(
    transformers=[
        ('num' , CleanNumericColumns(num_columns) ,  num_columns),
        ('bool', CleanBooleanColumns(bool_columns),  bool_columns),
        ('str' , CleanStringColumns(string_columns), string_columns),
    ],
    remainder='drop',                # keep only the columns above
    verbose_feature_names_out=False
).set_output(transform='pandas')     # ← keep pandas DataFrame & names


In [None]:
# Cleaning pipeline. Only the column-wise preprocessing is active; the
# row-dropping and clipping steps below are currently disabled.
cleaning_pipe = Pipeline([
    ('prep', preprocess),
 #   ('drop_bad_url', drop_bad_url),
 #   ('drop_dup_url',  drop_dup_url),
 #   ('drop_outliers',  drop_outliers),
 #   ('clip_ranges',    clip_ranges)
])


In [None]:
# Fit *once* on your full DataFrame (no y needed)
# NOTE(review): medians/modes are learned on the full data before the train/val/test split — confirm this leakage is acceptable.
cleaned_df = cleaning_pipe.fit_transform(dataset_modified)

In [None]:
# Quick before/after report. With the drop steps disabled in cleaning_pipe,
# missing or duplicated urls are only reported here, not removed.
print("Original shape :", dataset_modified.shape)
print("Cleaned shape  :", cleaned_df.shape)
print("Any NA in url? :", cleaned_df['url'].isna().any())
print("Duplicate urls :", cleaned_df['url'].duplicated().any())

### Describe the columns

In [None]:
# Dtypes after cleaning.
cleaned_df.dtypes

In [None]:
# Summary statistics for every column, one row per column.
cleaned_df.describe(include='all').T

## Step 2 EDA - graphs

### Function to graph

In [None]:
def graph_features(df,num_columns,bool_columns):
    """
    Draw a grid of distribution plots for the given columns of ``df``.

    Each name in ``num_columns`` gets a histogram (with KDE line) followed by a
    boxplot; each name in ``bool_columns`` gets a count plot. The grid is 4
    plots wide and leftover axes are removed before the figure is shown.
    """
    # 1. Grid size: two plots per numeric feature, one per boolean feature.
    total_plots = 2 * len(num_columns) + len(bool_columns)
    grid_cols = 4
    grid_rows = math.ceil(total_plots / grid_cols)

    # 2. Figure canvas (constrained_layout handles the spacing).
    fig, axes = plt.subplots(grid_rows, grid_cols,
                             figsize=(grid_cols * 4, grid_rows * 3),
                             constrained_layout=True)
    free_axes = iter(axes.ravel())   # hands out axes one at a time, row by row

    # 3. Numeric features: histogram + boxplot.
    hist_bins = 60
    for col in num_columns:
        hist_ax = next(free_axes)
        sns.histplot(df[col],
                     bins=hist_bins,
                     ax=hist_ax,
                     kde=True, color='steelblue')
        hist_ax.set_title(f'{col}\nHistogram', fontsize=9)

        box_ax = next(free_axes)
        sns.boxplot(y=df[col],
                    ax=box_ax,
                    color='salmon')
        box_ax.set_title(f'{col}\nBoxplot', fontsize=9)

    # Boolean / nominal features: count plot.
    for col in bool_columns:
        count_ax = next(free_axes)
        sns.countplot(x=df[col],
                      ax=count_ax,
                      palette='pastel')
        count_ax.set_title(f'{col}\nCount', fontsize=9)

    # 4. Whatever axes were not handed out are removed to keep the grid tidy.
    for spare_ax in free_axes:
        fig.delaxes(spare_ax)

    # Global style tweaks
    sns.despine(fig=fig)           # remove top/right spines
    plt.show()

In [None]:
import math
import matplotlib.pyplot as plt
import seaborn as sns



In [None]:
# Distribution plots for the cleaned data (before any feature transformation).
graph_features(df=cleaned_df,num_columns=num_columns,bool_columns=bool_columns)

## ML model

### Split dataframe for Train, Validation and Test and drop columns that are redundant

### Agregar análisis de correlación

In [None]:
# Pairwise correlation between the numeric columns only.
corr_matrix = cleaned_df.corr(numeric_only=True)

In [None]:
# Heatmap of the correlation matrix, colour scale centred at 0.
# NOTE(review): annot=True prints every coefficient; with many columns the labels may be unreadable at this figure size.
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
plt.title("Correlation Matrix")
plt.show()

##### The following columns were removed from X



1.   average_token_length since it is an average correlated to n_tokens_content
2.   kw_avg_min since it is an average correlated to kw_min_min and kw_max_min
3.   kw_avg_max since it is an average correlated to kw_max_min and kw_max_max
4.   self_reference_min_shares since it is normally data obtained after models have been deployed
5.   self_reference_max_shares since it is normally data obtained after models have been deployed
6.   self_reference_avg_sharess since it is normally data obtained after models have been deployed
7.   is_weekend since we have columns for saturday and sunday
8.   weekday_is_sunday since we can determine by knowing if the other days did not apply
9.  avg_positive_polarity since it is an average correlated to min_positive_polarity and max_positive_polarity
10.  avg_negative_polarity since it is an average correlated to min_negative_polarity and max_negative_polarity
11. url since it is a string column that can be used as the index
12. shares since that is our output
13. kw_avg_avg since it is correlated to kw_max_avg and kw_min_avg
14. timedelta since it is a non-predictive column







In [None]:
# Target (y) and feature matrix (X); the url column is kept aside as an identifier.
index = dataset_modified["url"]
y = cleaned_df["shares"]
X = cleaned_df.drop(columns=[
    'timedelta',
    'average_token_length',
    'kw_avg_min',
    'kw_avg_max',
    'self_reference_min_shares',
    'self_reference_max_shares',
    'self_reference_avg_sharess',
    'is_weekend',
    'weekday_is_sunday',
    'avg_positive_polarity',
    'avg_negative_polarity',
    'url',
    'shares',
    'kw_avg_avg',
])

In [None]:
# Minimal imports for the split and the metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# (optional) in case the kernel was restarted
# NOTE(review): this silences ALL warnings, including library deprecation warnings.
import warnings
warnings.filterwarnings("ignore")


In [None]:
# First split: 70% train, 30% held out.
X_train, X_valtest, y_train, y_valtest = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=None
)

# Second split: the held-out 30% is halved into validation and test (15% / 15% overall).
X_val, X_test, y_val, y_test = train_test_split(
    X_valtest, y_valtest,
    test_size=0.50,
    random_state=42,
    stratify=None
)

# Shape report for every partition.
print('Original dataframe',dataset_modified.shape)
print('Cleaned dataframe',cleaned_df.shape)
print('X_train', X_train.shape)
print('X_val', X_val.shape)
print('X_test', X_test.shape)
print('y_train', y_train.shape)
print('y_val', y_val.shape)
print('y_test', y_test.shape)

### Pipeline to improve features distributions





In [None]:
# === Imports mínimos para los pipelines ===
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [None]:
# 'shares' was removed from num_columns (it is the target).
# These lists REBIND the names defined in the cleaning section; they now hold
# only the columns that remain in X after the redundant ones were dropped.

string_columns=['url']
bool_columns=['data_channel_is_lifestyle', 'data_channel_is_entertainment', 'data_channel_is_bus', 'data_channel_is_socmed', 'data_channel_is_tech', 'data_channel_is_world', 'weekday_is_monday', 'weekday_is_tuesday', 'weekday_is_wednesday', 'weekday_is_thursday', 'weekday_is_friday', 'weekday_is_saturday']
num_columns=['LDA_00', 'LDA_01', 'LDA_02', 'LDA_03','LDA_04', 'abs_title_sentiment_polarity', 'abs_title_subjectivity', 'global_rate_negative_words', 'global_rate_positive_words', 'global_sentiment_polarity', 'global_subjectivity', 'kw_max_avg', 'kw_max_max', 'kw_max_min', 'kw_min_avg', 'kw_min_max', 'kw_min_min', 'max_negative_polarity', 'max_positive_polarity', 'min_negative_polarity', 'min_positive_polarity', 'mixed_type_col', 'n_non_stop_unique_tokens', 'n_non_stop_words', 'n_tokens_content', 'n_tokens_title', 'n_unique_tokens', 'num_hrefs', 'num_imgs', 'num_keywords', 'num_self_hrefs', 'num_videos', 'rate_negative_words', 'rate_positive_words', 'title_sentiment_polarity', 'title_subjectivity']
# Total number of model input columns (the leading unary + is a no-op).
print(+len(num_columns)+len(bool_columns))

In [None]:
# Numeric variables: median imputation -> Yeo-Johnson power transform (not standardized) -> standard scaling.
numeric_pipe = Pipeline(steps=[('impute_median',SimpleImputer(strategy='median')),('yeo-johnson_transformer',PowerTransformer(method='yeo-johnson',standardize=False)),('standard_scaler',StandardScaler())])


In [None]:
# Binary categorical variables: impute with the most frequent value.
# bool_pipe feeds columnasTransformer; bool_pipe2 is an identical pipeline for columnasTransformer2.
bool_pipe = Pipeline(steps=[('impute_mode',SimpleImputer(strategy='most_frequent'))])
bool_pipe2 = Pipeline(steps=[('impute_mode',SimpleImputer(strategy='most_frequent'))])

In [None]:
# Combine the transformations for every variable type and
# pass through, unprocessed, any column that was not assigned to a pipeline:

# Array-output version, used inside the model pipelines.
columnasTransformer = ColumnTransformer(transformers=[('num_pipe',numeric_pipe,num_columns),('bin_pipe',bool_pipe,bool_columns),],remainder='passthrough')

# pandas-output version (keeps the original column names), used for plotting.
columnasTransformer2 = ColumnTransformer(transformers=[('num_pipe',numeric_pipe,num_columns),('bin_pipe',bool_pipe2,bool_columns),],remainder='passthrough',verbose_feature_names_out=False).set_output(transform='pandas')

# Fit both on a copy of the training features and compare input/output shapes.
Xtmp = X_train.copy()
tmp = columnasTransformer.fit_transform(Xtmp)
tmp2=columnasTransformer2.fit_transform(Xtmp)
print("Dimensión de los datos de entrada:")
print("antes de aplicar las transformaciones:", Xtmp.shape)
print("después de aplicar las transformaciones:", tmp.shape)

### Histogram after pipeline

In [None]:
# Preview of the transformed training features (pandas output).
tmp2.head()

In [None]:
# Same distribution plots as before, now on the transformed training features.
graph_features(df=tmp2,num_columns=num_columns,bool_columns=bool_columns)

### Merge X_train and X_test

In [None]:
# Because cross-validation will be used, concatenate the training and test
# sets into a single augmented set.
# NOTE(review): the original comment called this set "trainval", but the code
# merges TRAIN + TEST (not validation); the validation split is what remains
# held out. Confirm this is the intended hold-out.


# ************* Add your code here: **************************


Xtraintest = pd.concat([X_train,X_test])
ytraintest = pd.concat([y_train,y_test])


# *********** End of the section for added code *************


# Compare the input dimensions before and after the column transformations
# (the transformer defined above has no one-hot step; remainder is passed through):
Xtmp = Xtraintest.copy()
tmp = columnasTransformer.fit_transform(Xtmp)
print("Dimensión de las variables de entrada ANTES de las transformaciones:", Xtmp.shape)
print("Dimensión de las variables de entrada DESPUÉS de las transformaciones:", tmp.shape)

#### functions

In [None]:
from sklearn.metrics import get_scorer, make_scorer, mean_squared_error


# RMSE scorer. scikit-learn ships it as 'neg_root_mean_squared_error', so the
# built-in is used instead of make_scorer(mean_squared_error, squared=False):
# the `squared` argument is deprecated and removed in newer releases.
# The sign convention is unchanged: the scorer returns NEGATIVE RMSE
# (higher is better), so callers flip the sign for reporting.
rmse_scorer = get_scorer('neg_root_mean_squared_error')

In [None]:
def mi_fun_nosampling(modelo, nombre, X, y):
    """
    Evaluate a *regression* model (wrapped in a Pipeline with the global
    columnasTransformer) using repeated k-fold CV and several regression
    metrics, then print the aggregated results.

    Parameters
    ----------
    modelo : estimator
        Any scikit-learn compatible regressor (e.g., RandomForestRegressor).
    nombre : str
        A tag that appears in the printed summary.
    X : pd.DataFrame or np.ndarray
        Features.
    y : array-like
        Target vector.

    Returns
    -------
    None. The mean and standard deviation of each test metric are printed.
    """
    # Imported here because cross_validate is not imported by any visible
    # notebook cell and RepeatedKFold is only imported in a later cell;
    # without this the function raises NameError on a fresh kernel.
    from sklearn.model_selection import RepeatedKFold, cross_validate

    print('No sampling method used with column transformation, results:')

    pipeline = Pipeline(steps=[
        ('ct', columnasTransformer),
        ('model', modelo)
    ])

    # 5 folds x 3 repeats = 15 fits, with a fixed seed for reproducibility
    cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=5)

    # Metrics to compute (error metrics come back negated from sklearn)
    mis_metricas = {
        'rmse' : rmse_scorer,                        # root-mean-squared error
        'mae'  : 'neg_mean_absolute_error',          # will be negated back
        'mape' : 'neg_mean_absolute_percentage_error',
        'r2'   : 'r2'
    }

    scores = cross_validate(
        pipeline,
        X, np.ravel(y),
        scoring=mis_metricas,
        cv=cv,
        return_train_score=True
    )

    # Pretty printing
    print(f'>> {nombre}')
    for metric_name in mis_metricas.keys():
        values = scores[f'test_{metric_name}']
        # flip the sign of the negated error scorers so the printed value is
        # the usual positive error (smaller is better)
        if metric_name in ['mae', 'mape', 'rmse']:
            values = -values
        mean_val = np.nanmean(values)
        std_val  = np.nanstd(values)
        print(f'\t{metric_name:5s}: {mean_val:.4f} ({std_val:.3f})')

    print('------------------------------------------------------------------------------------------')

In [None]:
# ---------------------------------------------------------
def mi_fun_grid(modelo, nombre, X, y, dicc_grid=None):
    """
    Grid-search helper for REGRESSION models.

    Wraps ``modelo`` in a Pipeline behind the global columnasTransformer, runs
    GridSearchCV with repeated 5-fold CV (3 repeats) and prints the best RMSE
    together with the range of every metric across the candidates.

    Parameters
    ----------
    modelo : estimator
        Any scikit-learn compatible regressor (e.g., XGBRegressor()).
    nombre : str
        Name that appears in the printed summary.
    X : pd.DataFrame or np.ndarray
        Feature matrix.
    y : array-like
        Target vector.
    dicc_grid : dict or list of dicts, optional
        Hyper-parameter grid to explore (as in GridSearchCV). When omitted,
        the model is evaluated once with its current parameters.

    Returns
    -------
    None. Results are printed.
    """
    # Imported here so the helper does not depend on a later cell having run.
    from sklearn.model_selection import GridSearchCV, RepeatedKFold

    print('Find best parameters while using column transformation')

    # The default used to be passed straight to GridSearchCV, which rejects
    # None; an empty grid means "one candidate: the estimator as configured".
    param_grid = {} if dicc_grid is None else dicc_grid

    # 1. Pipeline ----------------------------------------------------
    pipeline = Pipeline(steps=[
        ('ct', columnasTransformer),  # defined globally in an earlier cell
        ('model', modelo)
    ])

    # 2. Scoring dictionary -----------------------------------------
    mis_metricas = {
        'rmse':  'neg_root_mean_squared_error',      # negated: higher score = lower error
        'mae':   'neg_mean_absolute_error',
        'mape':  'neg_mean_absolute_percentage_error',
        'r2':    'r2'
    }

    # 3. CV strategy -------------------------------------------------
    cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)

    # 4. Grid search -------------------------------------------------
    grid = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=cv,
        scoring=mis_metricas,
        refit='rmse',           # the model returned by .best_estimator_ minimises RMSE
        n_jobs=-1,
        return_train_score=True,
        verbose=1
    )

    # 5. Fit ---------------------------------------------------------
    grid_result = grid.fit(X, np.ravel(y))

    # 6. Report ------------------------------------------------------
    best_rmse = -grid_result.best_score_   # flip sign back to positive RMSE
    print(f'>> {nombre}')
    print(f'Mejor RMSE (CV): {best_rmse:.4f} usando {grid_result.best_params_}')
    print('------------------------------------------------------------------------------------------')

    # Range (min – max over all candidates) of the mean CV value of every metric
    for m in mis_metricas:
        vals = grid_result.cv_results_[f'mean_test_{m}']
        stds = grid_result.cv_results_[f'std_test_{m}']
        # flip sign for neg_ scorers
        if m in ['rmse', 'mae', 'mape']:
            vals = -vals
        print(f'{m:5s}: {vals.min():.4f} – {vals.max():.4f}  (std avg {stds.mean():.3f})')
    print('------------------------------------------------------------------------------------------')

## ML models

### Regresión lineal

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

# The following link documents under-/over-sampling methods (not used in this regression cell):
# https://imbalanced-learn.org/stable/references/over_sampling.html

# ************* Add your code here: **************************

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, RepeatedKFold


# Model to tune; the grid below is searched by mi_fun_grid.
nombre='Linear_Regression'
modelo =LinearRegression()

#Grid parameters
# NOTE(review): copy_X only controls whether the input is copied, so it doubles
# the number of candidates without changing the fitted model.
dicc_grid = {
    'model__fit_intercept': [True, False],
    'model__copy_X': [True, False],
    'model__positive': [False, True]  # forces coefficients to be positive
}
#Evaluate modelo WITHOUT over/sub sampling and WITH column transformer
mi_fun_grid(modelo,nombre, Xtraintest, ytraintest,dicc_grid=dicc_grid)
#modelo =LinearRegression(C=0.5,penalty='l1',solver='liblinear',random_state=1)


#Evaluate modelo WITHOUT over/sub sampling
#mi_fun_nosampling(modelo,nombre, Xtraintest, ytraintest)


# *********** End of the section for added code *************




Find best parameters while using column transformation
Fitting 15 folds for each of 8 candidates, totalling 120 fits
>> Linear_Regression
Mejor RMSE (CV): 3974.5696 usando {'model__copy_X': True, 'model__fit_intercept': True, 'model__positive': False}
------------------------------------------------------------------------------------------
rmse : 3974.5696 – 4081.8782  (std avg 90.238)
mae  : 2340.2179 – 2381.9597  (std avg 32.343)
mape : 1.5913 – 1.6280  (std avg 0.325)
r2   : -0.0035 – 0.0486  (std avg 0.006)
------------------------------------------------------------------------------------------


In [None]:
# --- Save the minimal artifacts for DVC ---
from pathlib import Path
from joblib import dump
import json
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

Path("models").mkdir(parents=True, exist_ok=True)
Path("reports/models").mkdir(parents=True, exist_ok=True)

# EXAMPLES: adjust to the names of your trained objects
# dump(modelo_lineal, "models/Linear_Regression.joblib")
# dump(modelo_knn,   "models/KNN.joblib")

# One metrics dict per evaluated model; stays empty until the examples below are enabled.
artefactos = []
def _metricas(nombre, y_true, y_pred):
    """Return RMSE, MAE and R² of ``y_pred`` against ``y_true`` as a plain dict tagged with ``nombre``."""
    return {
        "model": nombre,
        # sqrt(MSE) computed explicitly: the `squared=False` argument of
        # mean_squared_error is deprecated and removed in newer scikit-learn.
        "rmse_val": float(mean_squared_error(y_true, y_pred) ** 0.5),
        "mae_val":  float(mean_absolute_error(y_true, y_pred)),
        "r2_val":   float(r2_score(y_true, y_pred)),
    }

# EXAMPLES of predictions on the validation split; use your own variables
# artefactos.append(_metricas("Linear_Regression", y_val, modelo_lineal.predict(X_val)))
# artefactos.append(_metricas("KNN",               y_val, modelo_knn.predict(X_val)))

pd.DataFrame(artefactos).to_csv("reports/models/cv_results_summary.csv", index=False)

# Best model by validation RMSE (placeholder entry when nothing was evaluated)
best = min(artefactos, key=lambda d: d["rmse_val"]) if artefactos else {"model":"NA","rmse_val":999999.0}
with open("reports/models/metrics.json", "w") as f:
    json.dump({"best_model": best["model"], "rmse_val": best["rmse_val"]}, f, indent=2)

# Report only what this cell actually wrote: the model .joblib files are not
# dumped here (the dump calls above are examples), so they are not listed.
print("✅ Artefactos guardados:")
print("- reports/models/cv_results_summary.csv")
print("- reports/models/metrics.json")
if not artefactos:
    print("Aviso: no se evaluó ningún modelo; metrics.json contiene valores de relleno.")


### k-Vecinos Más Cercanos (kNN)

In [None]:
from sklearn.neighbors import KNeighborsRegressor


In [None]:
# k-Nearest-Neighbors (kNN) regression:
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html


# ************* Add your code here: **************************

nombre = 'K_neighbors_nearest'

# Model to tune; the grid below is searched by mi_fun_grid.
modelo = KNeighborsRegressor()

#Grid parameters
dicc_grid = {
    'model__n_neighbors': [5, 11, 15, 21],   # number of neighbors
    'model__weights': ['uniform'],        # weight function
    'model__algorithm': ['auto'],  # search algorithm
    'model__p': [1, 2],                               # 1 = Manhattan, 2 = Euclidean
    #'model__leaf_size': [10, 20, 30, 40, 50]          # affects speed/memory balance
}

#Evaluate modelo WITHOUT over/sub sampling and WITH column transformer
mi_fun_grid(modelo,nombre, Xtraintest, ytraintest,dicc_grid=dicc_grid)
#modelo =KNeighborsRegressor(n_neighbors=200)


#Evaluate modelo WITHOUT over/sub sampling
#mi_fun_nosampling(modelo,nombre, Xtraintest, ytraintest)

### Decision tree

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html
#"""class sklearn.tree.DecisionTreeRegressor(*, criterion='squared_error', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, ccp_alpha=0.0, monotonic_cst=None)"""

# ************* Incluye aquí tu código:**************************

#nombre = 'Decision_tree'
#modelo = DecisionTreeRegressor()

#Grid parameters
#dicc_grid = {
    #'model__criterion': ['squared_error', 'absolute_error'],  # loss function
    #'model__splitter': ['best'],        # how to choose splits
    #'model__max_depth': [None, 7, 20],  # tree depth limit
    #'model__min_samples_split': [2, 7, 15],   # minimum samples to split an internal node
    #'model__min_samples_leaf': [1, 2, 6, 10],  # minimum samples at a leaf node
   # 'model__max_features': [None, 'sqrt'],# number of features to consider when looking for best split
    #'model__max_leaf_nodes': [None, 10, 20, 50]  # maximum number of leaf nodes
#}
#Evaluate modelo WITHOUT over/sub sampling and WITH column transformer
#mi_fun_grid(modelo,nombre, Xtraintest, ytraintest,dicc_grid=dicc_grid)
#modelo = DecisionTreeRegressor(criterion='squared_error',max_depth=3,max_features=25,min_samples_split=3,random_state=1)


#Evaluate modelo WITHOUT over/sub sampling
#mi_fun_nosampling(modelo,nombre, Xtraintest, ytraintest)

Find best parameters while using column transformation
Fitting 15 folds for each of 12 candidates, totalling 180 fits
>> Decision_tree
Mejor RMSE (CV): 23400.2251 usando {'model__criterion': 'absolute_error', 'model__max_depth': 7, 'model__max_features': 'sqrt'}
------------------------------------------------------------------------------------------
rmse : 23400.2251 – 39427.2034  (std avg 5531.391)
mae  : 3138.4086 – 6596.0320  (std avg 385.893)
mape : 0.6508 – 3.4537  (std avg 0.298)
r2   : -2.5399 – -0.0223  (std avg 0.896)

### Random Forest

In [None]:
# Bosque Aleatorio-RandomForest-RF:
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html


# ************* Incluye aquí tu código:**************************

#nombre = 'Random_forest'
#modelo=RandomForestRegressor()

#Grid parameters
#dicc_grid = {
    ##'model__criterion': ['squared_error', 'absolute_error'],
    #'model__max_depth': [None, 15, 30],       # maximum depth of each tree
    #'model__min_samples_split': [2, 5, 10],          # min samples to split an internal node
    #'model__min_samples_leaf': [1, 2, 4],            # min samples at a leaf node
    #'model__max_features': ['sqrt', None],   # number of features considered for split
    #'model__bootstrap': [True, False],               # whether to use bootstrap samples
    #'model__max_leaf_nodes': [None, 20, 50, 100]     # optional, controls model complexity
#}
##Evaluate modelo WITHOUT over/sub sampling and WITH column transformer
#mi_fun_grid(modelo,nombre, Xtraintest, ytraintest,dicc_grid=dicc_grid)
#modelo = RandomForestRegressor(criterion='squared_error',max_depth=17,max_features=7,min_samples_split=11,n_estimators=31,random_state=1)


#Evaluate modelo WITHOUT over/sub sampling
#mi_fun_nosampling(modelo,nombre, Xtraintest, ytraintest)

### XGBoosting

In [None]:
# XGBoosting:
# https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBRegressor
# https://xgboost.readthedocs.io/en/stable/parameter.html


# ************* Incluye aquí tu código:**************************

#nombre = 'Extreme_Gradient_Boost'
#modelo=XGBRegressor()

#Grid parameters
#dicc_grid =  {
    # Tree complexity
    #'model__n_estimators': [50, 100, 200],        # number of boosting rounds
    #'model__max_depth': [3, 10, 20],             # tree depth
    #'model__min_child_weight': [1, 3, 5],          # min sum of instance weight (controls overfitting)

    # Learning dynamics
    #'model__learning_rate': [0.01, 0.05, 0.1, 0.3], # shrinkage step
    #'model__subsample': [0.6, 0.8, 1.0],            # row sampling
    #'model__colsample_bytree': [0.6, 0.8, 1.0],     # feature sampling per tree

    # Regularization
    #'model__gamma': [0, 0.1, 0.3, 0.5],             # minimum loss reduction to make a split
    #'model__reg_alpha': [0, 0.1, 0.5, 1.0],         # L1 regularization
    #'model__reg_lambda': [0.5, 1.0, 2.0]            # L2 regularization
#}
#Evaluate modelo WITHOUT over/sub sampling and WITH column transformer,
#mi_fun_grid(modelo,nombre, Xtraintest, ytraintest,dicc_grid=dicc_grid)
#modelo=XGBRegressor(booster='gbtree',n_estimators=30,learning_rate=0.2,max_depth=3,subsample=0.8,random_state=1)


#Evaluate modelo WITHOUT over/sub sampling
#mi_fun_nosampling(modelo,nombre, Xtraintest, ytraintest)

### Neural network MLP

### Support vector machine SVM

In [None]:
# Red neuronal de Perceptrón Multicapa-MLP:
# https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html


# ************* Incluye aquí tu código:**************************

#nombre = "Red Neuronal Multicapa MLP"
#modelo = MLPRegressor()


#Grid parameters
#dicc_grid = {
    # Network architecture
    #'model__hidden_layer_sizes': [
    #    (50,), (100,), (100, 50), (50, 50, 50)
    #],  # number of layers and neurons per layer

    # Activation and solver
    #'model__activation': ['relu', 'tanh', 'logistic'],   # nonlinearities
    #'model__solver': ['adam', 'lbfgs'],                  # optimizer

    # Regularization and learning
    #'model__alpha': [0.0001, 0.001, 0.01],              # L2 penalty
    #'model__learning_rate': ['constant', 'adaptive'],    # learning rate schedule
    #'model__learning_rate_init': [0.001, 0.01, 0.05],    # initial learning rate
   # 'model__early_stopping': [True]                      # helps avoid overfitting
#}
#Evaluate modelo WITHOUT over/sub sampling and WITH column transformer
#mi_fun_grid(modelo,nombre, Xtraintest, ytraintest,dicc_grid=dicc_grid)
#modelo = MLPRegressor(hidden_layer_sizes=(75,),activation='tanh',solver='adam',learning_rate='constant',learning_rate_init=0.0001,max_iter=2000,random_state=1)

# Selecciona el método de submuestreo o sobremuestreo, si lo deseas incluir.
#metodo_uo = SMOTETomek(random_state=1)

#Evaluate modelo WITHOUT over/sub sampling
#mi_fun_nosampling(modelo,nombre, Xtraintest, ytraintest)

In [None]:
# Máquina de Vectores de Soporte-SVM:
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html


# ************* Incluye aquí tu código:**************************

#nombre = 'Support Vector Machine SVM'
#modelo = SVR()

#Grid parameters
#dicc_grid =  {
    #'model__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    #'model__C': [0.1, 1, 10, 100],          # regularization parameter
    #'model__epsilon': [0.01, 0.1, 0.2, 0.5],# insensitive loss margin
    #'model__gamma': ['scale', 'auto'],      # kernel coefficient
    #'model__degree': [2, 3, 4]              # only used for 'poly' kernel
#}
#Evaluate modelo WITHOUT over/sub sampling and WITH column transformer
#mi_fun_grid(modelo,nombre, Xtraintest, ytraintest,dicc_grid=dicc_grid)
#modelo = SVR(C=3.25,kernel='rbf',gamma='scale')


#Evaluate modelo WITHOUT over/sub sampling
#mi_fun_nosampling(modelo,nombre, Xtraintest, ytraintest)
