In [None]:
import pandas as pd
import numpy as np

# ================================
# 1. DataLoader
# ================================
class DataLoader:
    """Minimal CSV input/output helper built on top of pandas."""

    def load_csv(self, path):
        """Read the CSV located at ``path`` and return it as a DataFrame."""
        frame = pd.read_csv(path)
        return frame

    def save_csv(self, df, path):
        """Write ``df`` to ``path`` (index omitted) and print a confirmation line."""
        df.to_csv(path_or_buf=path, index=False)
        confirmation = f"💾 Guardado en {path} (shape={df.shape})"
        print(confirmation)


In [2]:
# ================================
# 2. DataCleaner
# ================================
class DataCleaner:
    """Chainable cleaning pipeline for a tabular dataset held in ``self.df``.

    Every step mutates ``self.df`` and returns ``self`` so calls can be
    chained; ``get_df()`` returns the current frame. The cleaner works on its
    own copy of the input, so the DataFrame passed in is never modified.
    """

    def __init__(self, df):
        """Store a private copy of ``df`` for the pipeline to work on."""
        # Later steps assign columns in place; copying keeps those writes from
        # leaking into the caller's frame.
        self.df = df.copy()

    def filter_expected_columns(self, expected_cols):
        """Keep only the columns listed in ``expected_cols``, in that order.

        Prints a warning for columns present but not expected, and for
        expected columns that are absent. Absent columns are skipped.
        """
        expected = list(expected_cols)
        wanted = set(expected)
        existing = set(self.df.columns)
        extra = [c for c in self.df.columns if c not in wanted]
        missing = [c for c in expected if c not in existing]
        if extra:
            print("⚠️ Extras ignoradas:", extra)
        if missing:
            print("⚠️ Faltan columnas:", missing)
        # .copy() so the later in-place column assignments operate on an
        # independent frame rather than on a slice of the previous one.
        self.df = self.df[[c for c in expected if c in existing]].copy()
        return self

    def force_numeric(self, exclude=("url",)):
        """Convert every column not in ``exclude`` to a numeric dtype.

        Text columns are stripped and have ``","`` replaced by ``"."`` before
        conversion. Any value that still cannot be parsed becomes NaN.
        """
        for c in self.df.columns:
            if c in exclude:
                continue
            col = self.df[c]
            # Cover both legacy object columns and pandas string dtypes.
            if pd.api.types.is_object_dtype(col) or pd.api.types.is_string_dtype(col):
                col = (
                    col.astype(str)
                    .str.strip()
                    .str.replace(",", ".", regex=False)
                )
            # errors="coerce" turns unparseable text ("nan", "None", "", ...)
            # into NaN, so no explicit placeholder replacement is needed.
            self.df[c] = pd.to_numeric(col, errors="coerce")
        return self

    def apply_business_rules(self):
        """Clip selected columns to fixed ranges, when they are present.

        ``timedelta`` is clipped to [0, 731]; ``n_unique_tokens`` and
        ``global_subjectivity`` are clipped to [0, 1].
        """
        if "timedelta" in self.df:
            self.df["timedelta"] = self.df["timedelta"].clip(0, 731)
        # Columns treated as proportions.
        clip_01 = ["n_unique_tokens", "global_subjectivity"]
        for c in clip_01:
            if c in self.df:
                self.df[c] = self.df[c].clip(0, 1)
        return self

    def winsorize_columns(self, exclude=(), low=0.01, high=0.99):
        """Clip each numeric column to its [``low``, ``high``] quantile range.

        Columns named in ``exclude`` and columns with no non-missing values
        are left untouched.
        """
        num_cols = [
            c for c in self.df.select_dtypes(include=[np.number]).columns
            if c not in exclude
        ]
        for c in num_cols:
            s = self.df[c]
            if s.notna().sum() == 0:
                continue
            self.df[c] = s.clip(s.quantile(low), s.quantile(high))
        return self

    def normalize_lda(self, lda_cols=None):
        """Rescale ``lda_cols`` so each row with a positive total sums to 1.

        Names in ``lda_cols`` that are not columns of the frame are ignored.
        Rows whose total is not positive keep their values.
        """
        if not lda_cols:
            return self
        lda_cols = [c for c in lda_cols if c in self.df]
        if lda_cols:
            block = self.df[lda_cols]
            totals = block.sum(axis=1)
            # Dividing by 1 leaves non-positive-total rows unchanged; assigning
            # whole columns avoids writing floats into integer columns via .loc.
            divisor = totals.where(totals > 0, 1)
            self.df[lda_cols] = block.div(divisor, axis=0)
        return self

    def clean_primary_key(self, key="url"):
        """Normalize the ``key`` column and drop rows with an unusable key.

        Rows whose key is missing or empty are removed; the key is then
        stripped and lower-cased, and only rows whose key starts with
        ``"http"`` are kept.

        Raises:
            KeyError: if ``key`` is not a column of the frame.
        """
        if key not in self.df.columns:
            raise KeyError(f"La columna clave '{key}' no existe en el DataFrame.")
        keys = self.df[key]
        # Copy before assigning so the write never targets a view of the old frame.
        kept = self.df[keys.notna() & (keys != "")].copy()
        kept[key] = kept[key].astype(str).str.strip().str.lower()
        self.df = kept[kept[key].str.startswith("http", na=False)].copy()
        return self

    def impute_missing_values(self):
        """Fill missing values in every numeric column.

        Columns with skewness strictly between -1 and 1 are filled with their
        mean; all others (including columns whose skewness is undefined) are
        filled with their median. Non-numeric columns are left untouched.
        """
        # Select by dtype, not by position, so a text column anywhere in the
        # frame cannot break .skew() and no numeric column is skipped.
        for col in self.df.select_dtypes(include=[np.number]).columns:
            series = self.df[col]
            if not series.isna().any():
                continue  # nothing to fill
            skew = series.skew()
            # A NaN skew fails both comparisons and therefore uses the median.
            fill = series.mean() if -1 < skew < 1 else series.median()
            self.df[col] = series.fillna(fill)
        return self

    def get_df(self):
        """Return the frame in its current state."""
        return self.df

In [4]:
import pandas as pd

# ==============================
# 3. DataComparator
# ==============================
class DataComparator:
    """Build a per-column report comparing an original and a cleaned frame.

    The report is a DataFrame indexed by column name. Methods return ``self``
    so they can be chained; ``export_report`` returns the report itself.
    """

    # Columns produced by compare_stats() that add_differences() depends on.
    _STAT_COLUMNS = ("mean_orig", "mean_clean", "median_orig", "median_clean")

    def __init__(self, orig, clean):
        """Keep references to both frames and start with an empty report."""
        self.orig = orig
        self.clean = clean
        self.report = pd.DataFrame()  # empty frame instead of None

    def compare_stats(self):
        """Compute mean and median of the numeric columns of both frames."""
        self.report = pd.DataFrame({
            "mean_orig": self.orig.mean(numeric_only=True),
            "mean_clean": self.clean.mean(numeric_only=True),
            "median_orig": self.orig.median(numeric_only=True),
            "median_clean": self.clean.median(numeric_only=True)
        })
        return self

    def add_differences(self):
        """Add absolute mean/median differences between cleaned and original.

        Raises:
            ValueError: if the statistics from ``compare_stats()`` are not in
                the report yet.
        """
        # Check for the required columns rather than for emptiness: the report
        # can be non-empty (after missing_values_ratio()) and still lack them.
        if any(c not in self.report.columns for c in self._STAT_COLUMNS):
            raise ValueError("Primero ejecuta compare_stats() antes de add_differences().")
        self.report["diff_mean"] = (self.report["mean_clean"] - self.report["mean_orig"]).abs()
        self.report["diff_median"] = (self.report["median_clean"] - self.report["median_orig"]).abs()
        return self

    def missing_values_ratio(self):
        """Add the percentage of missing values per column for both frames."""
        # The mean of a boolean mask is the fraction of True values.
        self.report["missing_orig_%"] = self.orig.isna().mean() * 100
        self.report["missing_clean_%"] = self.clean.isna().mean() * 100
        return self

    def export_report(self, path):
        """Write the report to ``path`` as CSV and return the report.

        Raises:
            ValueError: if the report is still empty.
        """
        if self.report.empty:
            raise ValueError("No hay reporte que exportar. Ejecuta los métodos primero.")
        # The index holds the column names being compared; write it out so
        # every row of the exported file is identifiable.
        self.report.to_csv(path, index_label="feature")
        print(f"📊 Reporte exportado a {path}")
        return self.report


In [11]:
# ================================
# 4. Ejemplo de uso
# ================================
if __name__ == "__main__":
    # ================================
    # Configuration: every path and column group used below
    # ================================
    ORIGINAL_CSV = "../../Data/online_news_original.csv"
    MODIFIED_CSV = "../../Data/online_news_modified.csv"
    CLEANED_CSV = "online_news_cleaned.csv"
    REPORT_CSV = "comparacion_final.csv"
    LDA_COLS = ["LDA_00", "LDA_01", "LDA_02", "LDA_03", "LDA_04"]

    # ================================
    # Load the original and the modified datasets
    # ================================
    loader = DataLoader()
    orig = loader.load_csv(ORIGINAL_CSV)
    mod = loader.load_csv(MODIFIED_CSV)

    # ================================
    # Expected columns, in the order the cleaned file should have
    # ================================
    # NOTE(review): "self_reference_avg_sharess" (double "s") is kept exactly
    # as written; presumably it mirrors the CSV header — verify against the data.
    expected_cols = [
        "url","timedelta","n_tokens_title","n_tokens_content","n_unique_tokens",
        "n_non_stop_words","n_non_stop_unique_tokens","num_hrefs","num_self_hrefs",
        "num_imgs","num_videos","average_token_length","num_keywords",
        "data_channel_is_lifestyle","data_channel_is_entertainment","data_channel_is_bus",
        "data_channel_is_socmed","data_channel_is_tech","data_channel_is_world",
        "kw_min_min","kw_max_min","kw_avg_min","kw_min_max","kw_max_max","kw_avg_max",
        "kw_min_avg","kw_max_avg","kw_avg_avg",
        "self_reference_min_shares","self_reference_max_shares","self_reference_avg_sharess",
        "weekday_is_monday","weekday_is_tuesday","weekday_is_wednesday","weekday_is_thursday",
        "weekday_is_friday","weekday_is_saturday","weekday_is_sunday","is_weekend",
        "LDA_00","LDA_01","LDA_02","LDA_03","LDA_04",
        "global_subjectivity","global_sentiment_polarity",
        "global_rate_positive_words","global_rate_negative_words",
        "rate_positive_words","rate_negative_words",
        "avg_positive_polarity","min_positive_polarity","max_positive_polarity",
        "avg_negative_polarity","min_negative_polarity","max_negative_polarity",
        "title_subjectivity","title_sentiment_polarity",
        "abs_title_subjectivity","abs_title_sentiment_polarity",
        "shares"
    ]

    # ================================
    # Cleaning
    # ================================
    # filter_expected_columns() reports extra and missing columns and keeps
    # only the expected ones that exist, so no separate filtering is done here.
    # NOTE(review): winsorizing runs before clean_primary_key(), so quantiles
    # are computed on rows that are dropped afterwards — confirm this is intended.
    cleaner = DataCleaner(mod)
    mod_clean = (cleaner
        .filter_expected_columns(expected_cols)
        .force_numeric()
        .apply_business_rules()
        .winsorize_columns()
        .normalize_lda(LDA_COLS)
        .clean_primary_key()
        .impute_missing_values()
        .get_df())

    loader.save_csv(mod_clean, CLEANED_CSV)

    # ================================
    # Comparison against the original dataset
    # ================================
    comparator = DataComparator(orig, mod_clean)
    report = (comparator
        .compare_stats()
        .add_differences()
        .missing_values_ratio()
        .export_report(REPORT_CSV))

    # display() is only defined inside IPython/Jupyter; fall back to print()
    # so the block also runs as a plain Python script.
    try:
        display(report.head(15))
    except NameError:
        print(report.head(15))


⚠️ Columnas extra ignoradas: ['mixed_type_col']
💾 Guardado en online_news_cleaned.csv (shape=(40010, 61))
📊 Reporte exportado a comparacion_final.csv


Unnamed: 0,mean_orig,mean_clean,median_orig,median_clean,diff_mean,diff_median,missing_orig_%,missing_clean_%
timedelta,354.530471,357.429169,339.0,347.0,2.898698,8.0,0.0,0.0
n_tokens_title,10.398749,11.537167,10.0,10.0,1.138418,0.0,0.0,0.0
n_tokens_content,546.514731,551.29806,409.0,413.0,4.783329,4.0,0.0,0.0
n_unique_tokens,0.548216,0.535094,0.539226,0.538462,0.013122,0.0007640113,0.0,0.0
n_non_stop_words,0.996469,1.129708,1.0,1.0,0.13324,4.799994e-11,0.0,0.0
n_non_stop_unique_tokens,0.689175,0.73005,0.690476,0.691743,0.040875,0.00126693,0.0,0.0
num_hrefs,10.88369,11.870682,8.0,8.0,0.986992,0.0,0.0,0.0
num_self_hrefs,3.293638,3.579641,3.0,3.0,0.286002,0.0,0.0,0.0
num_imgs,4.544143,4.918595,1.0,1.0,0.374452,0.0,0.0,0.0
num_videos,1.249874,1.334141,0.0,0.0,0.084268,0.0,0.0,0.0
