In [8]:
# Recarga automáticamente los módulos externos cuando cambian
%load_ext autoreload
%autoreload 2

import sys
import os

# 1. Obtener la ruta del directorio raíz del proyecto
# Sube un nivel desde el directorio actual del notebook (notebooks/)
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))

# 2. Agregar el directorio raíz al 'path' de Python
if project_root not in sys.path:
    print(f"Agregando {project_root} al sys.path")
    sys.path.append(project_root)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# --- Importacion de clase limpieza y eda
from mlops_online_news_popularity.preprocess import cleaning_eda

# --- Importaciones de Sklearn ---
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# =====================================================================
# CLASE: NewsPopularityModel
# =====================================================================

class NewsPopularityModel:
    """
    Clase principal refactorizada.
    """
    def __init__(self, filepath):
        self.filepath = filepath
        self.reports_path = '../docs/'
        self.model_pipeline = None 
        self.data = None
        self.X_train, self.X_test, self.y_train, self.y_test = [None] * 4
        
        self.expected_cols = [
            "url","timedelta","n_tokens_title","n_tokens_content","n_unique_tokens",
            "n_non_stop_words","n_non_stop_unique_tokens","num_hrefs","num_self_hrefs",
            "num_imgs","num_videos","average_token_length","num_keywords",
            "data_channel_is_lifestyle","data_channel_is_entertainment","data_channel_is_bus",
            "data_channel_is_socmed","data_channel_is_tech","data_channel_is_world",
            "kw_min_min","kw_max_min","kw_avg_min","kw_min_max","kw_max_max","kw_avg_max",
            "kw_min_avg","kw_max_avg","kw_avg_avg",
            "self_reference_min_shares","self_reference_max_shares","self_reference_avg_sharess",
            "weekday_is_monday","weekday_is_tuesday","weekday_is_wednesday","weekday_is_thursday",
            "weekday_is_friday","weekday_is_saturday","weekday_is_sunday","is_weekend",
            "LDA_00","LDA_01","LDA_02","LDA_03","LDA_04",
            "global_subjectivity","global_sentiment_polarity","global_rate_positive_words",
            "global_rate_negative_words","rate_positive_words","rate_negative_words",
            "avg_positive_polarity","min_positive_polarity","max_positive_polarity",
            "avg_negative_polarity","min_negative_polarity","max_negative_polarity",
            "title_subjectivity","title_sentiment_polarity","abs_title_subjectivity",
            "abs_title_sentiment_polarity","shares"
        ]
        self.lda_cols = ["LDA_00","LDA_01","LDA_02","LDA_03","LDA_04"]
        self.cols_to_drop = ['url', 'timedelta'] 
        self.TARGET_COL = 'shares'
        self.threshold = 1400 

    def load_data(self):
        print(f"Cargando datos desde {self.filepath}...")
        try:
            self.data = pd.read_csv(self.filepath)
            print(f"Datos cargados exitosamente. Shape={self.data.shape}")
            DataExplorer.explore_data(self.data)
            DataExplorer.generate_profiling_report(
                self.data, "Reporte 1: Datos Crudos (Raw)", 
                self.reports_path, "01_raw_data_report.html"
            )
        except FileNotFoundError:
            print(f"Error: No se encontró el archivo en {self.filepath}")
            return None
        except Exception as e:
            print(f"Error al cargar o explorar datos: {e}")
            return None
        return self

    def _handle_high_correlation(self, threshold=0.9):
        """
        Encuentra y elimina características altamente correlacionadas del set de entrenamiento.
        Actualiza self.X_train, self.X_test y self.cols_to_drop.
        """
        print(f"\n--- Buscando características altamente correlacionadas (umbral > {threshold}) ---")
        
        corr_matrix = self.X_train.corr(numeric_only=True).abs()
        to_drop = set()
        
        for i in range(len(corr_matrix.columns)):
            for j in range(i + 1, len(corr_matrix.columns)):
                col_i = corr_matrix.columns[i]
                col_j = corr_matrix.columns[j]
                
                if col_i in to_drop or col_j in to_drop:
                    continue
                    
                if corr_matrix.iloc[i, j] > threshold:
                    avg_corr_i = corr_matrix[col_i].mean()
                    avg_corr_j = corr_matrix[col_j].mean()
                    col_to_drop = col_i if avg_corr_i > avg_corr_j else col_j
                    to_drop.add(col_to_drop)

        to_drop_list = list(to_drop)
        if not to_drop_list:
            print("No se encontraron características nuevas altamente correlacionadas para eliminar.")
            return

        print(f"Columnas a eliminar por alta correlación ({len(to_drop_list)}): {to_drop_list}")
        
        self.X_train = self.X_train.drop(columns=to_drop_list)
        self.X_test = self.X_test.drop(columns=to_drop_list)
        
        self.cols_to_drop.extend(to_drop_list)
        print("X_train y X_test actualizados.")

    def preprocess_data(self):
        if self.data is None:
            print("Error: No hay datos cargados. Ejecute .load_data() primero.")
            return self

        print("\n" + "="*30)
        print("INICIANDO LIMPIEZA Y PREPROCESAMIENTO")
        print("="*30)
        
        cleaner = DataCleaner(self.data)
        cleaner.filter_expected_columns(self.expected_cols)
        cleaner.clean_primary_key(key="url")
        cleaner.force_numeric(exclude=["url"])
        cleaner.apply_business_rules()
        cleaner.normalize_lda(lda_cols=self.lda_cols)
        cleaned_data = cleaner.get_df()
        print("Limpieza de datos estática completada.")

        DataExplorer.generate_profiling_report(
            cleaned_data, 
            "Reporte 2: Datos Limpios (Post-Limpieza Estática)", 
            self.reports_path, 
            "02_cleaned_data_report.html"
        )
        
        X = cleaned_data.drop(self.TARGET_COL, axis=1)
        y = cleaned_data[self.TARGET_COL]
        y_binary = (y > self.threshold).astype(int)
        print(f"Target '{self.TARGET_COL}' binarizado con umbral > {self.threshold}.")
        
        print("\nDividiendo en sets de entrenamiento y prueba...")
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y_binary, test_size=0.2, random_state=42, stratify=y_binary
        )
        print("Datos divididos exitosamente.")
        
        print(f"\nEliminando columnas no-features de X_train/X_test: {self.cols_to_drop}")
        self.X_train = self.X_train.drop(self.cols_to_drop, axis=1, errors='ignore')
        self.X_test = self.X_test.drop(self.cols_to_drop, axis=1, errors='ignore')
        
        DataExplorer.plot_correlation_matrix(
            self.X_train, 
            title="Matriz de Correlación ANTES de eliminar alta correlación",
            save_path=os.path.join(self.reports_path, "05_corr_matrix_before.png")
        )
        
        self._handle_high_correlation(threshold=0.9)
        
        DataExplorer.plot_correlation_matrix(
            self.X_train, 
            title="Matriz de Correlación DESPUÉS de eliminar alta correlación",
            save_path=os.path.join(self.reports_path, "06_corr_matrix_after.png")
        )
        
        train_df_report = self.X_train.copy()
        train_df_report[self.TARGET_COL] = self.y_train
        DataExplorer.generate_profiling_report(
            train_df_report, "Reporte 3: Set de Entrenamiento (Train, Final)", 
            self.reports_path, "03_train_set_report.html"
        )
        test_df_report = self.X_test.copy()
        test_df_report[self.TARGET_COL] = self.y_test
        DataExplorer.generate_profiling_report(
            test_df_report, "Reporte 4: Set de Prueba (Test, Final)", 
            self.reports_path, "04_test_set_report.html"
        )

        ##### PIPELINE PREPROCESSING ###
        ###
        ###

        print("\nDefiniendo el Pipeline de preprocesamiento de Scikit-Learn...")
        
        numeric_features = self.X_train.select_dtypes(include=np.number).columns
        cols_bin, cols_no_bin = classify_numeric_columns(self.X_train[numeric_features])



        numeric_non_binary_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('power', PowerTransformer(method='yeo-johnson')),
            ('scaler', StandardScaler())
        ])
        binary_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent'))
        ])

        preprocessor = ColumnTransformer(transformers=[
            ('num_non_bin', numeric_non_binary_transformer, cols_no_bin),
            ('num_bin', binary_transformer, cols_bin)
        ], remainder='passthrough')


        print("Pipeline de preprocesamiento definido exitosamente.")
        print(f"X_train shape: {self.X_train.shape}, y_train shape: {self.y_train.shape}")
        print(f"X_test shape: {self.X_test.shape}, y_test shape: {self.y_test.shape}")
        

        print("="*30)
        print("FIN DE PREPROCESAMIENTO Y DEFINICIÓN DE PIPELINE")
        print("="*30)

        ###
        ###
        ##### PIPELINE PREPROCESSING ###


        return self

In [10]:
# MAIN

if __name__ == "__main__":
    
    filepath = '../Data/online_news_modified.csv.dvc' 
    
    print("\n\n" + "="*40)
    print("INICIANDO FLUJO AUTOMATIZADO")
    print("="*40)
    
    model = NewsPopularityModel(filepath)
    
    if model.load_data() is not None:
        model.preprocess_data()
        
        print("\n...FLUJO AUTOMATIZADO COMPLETADO.")
        print(f"Verifica la carpeta '{model.reports_path}' para los 4 reportes HTML y 2 imágenes PNG.")
        print(f"\nColumnas totales eliminadas del set de features: {model.cols_to_drop}")
    else:
        print("Fallo al cargar datos. Terminando script.")



INICIANDO FLUJO AUTOMATIZADO
Cargando datos desde ../Data/online_news_modified.csv.dvc...
Datos cargados exitosamente. Shape=(4, 1)
Error al cargar o explorar datos: name 'DataExplorer' is not defined
Fallo al cargar datos. Terminando script.
