# Pipeline solo con el archivo modificado

**Cargar archivo modificado**

In [16]:
import os

# Show the kernel's working directory; the relative "../data/..." paths used
# by the following cells are resolved against it.
# NOTE(review): the printed absolute path is stored in the saved output —
# consider clearing this output before sharing the notebook.
print(os.getcwd())



c:\Users\oscar\mlops-project\mlops-project\notebooks


In [17]:
import pandas as pd
import numpy as np
# Load the modified dataset; the path is relative to the kernel's working directory.
mod = pd.read_csv("../data/online_news_modified.csv")

**Mantener solo las columnas esperadas (si hay extras, se ignoran)**

In [18]:
# Columns the pipeline expects, listed in the order they are kept in `mod`.
expected_cols = [
    "url","timedelta","n_tokens_title","n_tokens_content","n_unique_tokens",
    "n_non_stop_words","n_non_stop_unique_tokens","num_hrefs","num_self_hrefs",
    "num_imgs","num_videos","average_token_length","num_keywords",
    "data_channel_is_lifestyle","data_channel_is_entertainment","data_channel_is_bus",
    "data_channel_is_socmed","data_channel_is_tech","data_channel_is_world",
    "kw_min_min","kw_max_min","kw_avg_min","kw_min_max","kw_max_max","kw_avg_max",
    "kw_min_avg","kw_max_avg","kw_avg_avg",
    "self_reference_min_shares","self_reference_max_shares","self_reference_avg_sharess",
    "weekday_is_monday","weekday_is_tuesday","weekday_is_wednesday","weekday_is_thursday",
    "weekday_is_friday","weekday_is_saturday","weekday_is_sunday","is_weekend",
    "LDA_00","LDA_01","LDA_02","LDA_03","LDA_04",
    "global_subjectivity","global_sentiment_polarity",
    "global_rate_positive_words","global_rate_negative_words",
    "rate_positive_words","rate_negative_words",
    "avg_positive_polarity","min_positive_polarity","max_positive_polarity",
    "avg_negative_polarity","min_negative_polarity","max_negative_polarity",
    "title_subjectivity","title_sentiment_polarity",
    "abs_title_subjectivity","abs_title_sentiment_polarity",
    "shares"
]

# Report schema drift in both directions before trimming the frame.
extra_cols = [c for c in mod.columns if c not in expected_cols]
missing_cols = [c for c in expected_cols if c not in mod.columns]
if extra_cols:
    print("⚠️ Columnas extra ignoradas:", extra_cols)
if missing_cols:
    print("⚠️ Columnas esperadas que no encontré (seguiré sin ellas):", missing_cols)

# Keep only the expected columns that are actually present, in expected order.
# The explicit .copy() makes `mod` an independent frame, so the column
# assignments performed by later cells do not trigger SettingWithCopyWarning.
keep_cols = [c for c in expected_cols if c in mod.columns]
mod = mod[keep_cols].copy()

⚠️ Columnas extra ignoradas: ['mixed_type_col']


**Forzar tipos numéricos en todo excepto 'url'**

In [19]:
# Text placeholders that stand for a missing value once a column is cast to str.
null_tokens = {"nan": np.nan, "None": np.nan, "NA": np.nan, "": np.nan}

# Every column except the 'url' key is coerced to a numeric dtype.
for col in [name for name in mod.columns if name != "url"]:
    values = mod[col]
    if values.dtype == "O":
        # Object columns may carry decimal commas: swap them for dots and
        # map the textual null placeholders back to real NaN.
        as_text = values.astype(str).str.replace(",", ".", regex=False)
        values = as_text.replace(null_tokens)
    # Anything that still cannot be parsed becomes NaN.
    mod[col] = pd.to_numeric(values, errors="coerce")


**Reglas de negocio: límites duros / binarios / proporciones / polaridades**

In [20]:
# --- Column groups (these names are reused by the winsorization cell) ---

# Binary indicator columns: weekday flags, data-channel flags and is_weekend.
binary_cols = [
    *(name for name in mod.columns if name.startswith("weekday_is_")),
    *(name for name in mod.columns if name.startswith("data_channel_is_")),
    *(["is_weekend"] if "is_weekend" in mod.columns else []),
]

# Columns constrained to [0, 1].
clip_01 = [
    "n_unique_tokens","n_non_stop_words","n_non_stop_unique_tokens",
    "global_subjectivity","global_rate_positive_words","global_rate_negative_words",
    "rate_positive_words","rate_negative_words",
    "title_subjectivity","abs_title_subjectivity","abs_title_sentiment_polarity",
    "LDA_00","LDA_01","LDA_02","LDA_03","LDA_04",
    "avg_positive_polarity","min_positive_polarity","max_positive_polarity",
]

# Polarities constrained to [-1, 1].
clip_m11 = ["global_sentiment_polarity","title_sentiment_polarity"]

# Negative polarities constrained to [-1, 0].
neg_pol = ["avg_negative_polarity","min_negative_polarity","max_negative_polarity"]

# --- Apply the hard limits ---

# Binary columns: clip to [0, 1], round to the nearest integer, missing -> 0.
for name in binary_cols:
    mod[name] = mod[name].clip(0, 1).round().fillna(0)

# (columns, lower bound, upper bound); columns absent from `mod` are skipped.
bounded_groups = [
    (["timedelta"], 0, 731),
    (clip_01, 0, 1),
    (clip_m11, -1, 1),
    (neg_pol, -1, 0),
    (["num_keywords"], 0, 10),
]
for group, lower, upper in bounded_groups:
    for name in group:
        if name in mod.columns:
            mod[name] = mod[name].clip(lower, upper)

**Winsorización (1%-99%) para numéricas que NO tienen rango fijo**

In [21]:
# Columns that already have a fixed range (plus the 'url' key) are excluded;
# every remaining numeric column is winsorized.
fixed_cols = {"url", "timedelta", "num_keywords"}.union(binary_cols, clip_01, clip_m11, neg_pol)
numeric_names = mod.select_dtypes(include=[np.number]).columns
num_cols = [name for name in numeric_names if name not in fixed_cols]

def winsorize(s, low=0.01, high=0.99):
    """Clip a Series to its own [low, high] quantile range.

    Parameters
    ----------
    s : pandas.Series
        Values to cap.
    low, high : float
        Quantile levels used as the lower and upper clipping bounds.

    Returns
    -------
    pandas.Series
        The clipped Series, or ``s`` itself (unchanged) when it has no
        non-missing values or the quantile bounds are unusable.
    """
    # Nothing to cap when every value is missing (or the Series is empty).
    if not s.notna().any():
        return s
    lower, upper = s.quantile(low), s.quantile(high)
    # Bounds are usable only if both are defined and correctly ordered.
    bounds_usable = pd.notna(lower) and pd.notna(upper) and upper >= lower
    return s.clip(lower, upper) if bounds_usable else s

# Cap each unbounded numeric column using winsorize's default quantile levels.
# Columns are independent, so all capped versions are computed first and then
# written back.
capped = {name: winsorize(mod[name]) for name in num_cols}
for name, series in capped.items():
    mod[name] = series

**Normalizar LDA para que por fila sumen ≈1 (si existen)**

In [22]:
# Rescale the LDA columns that are present so each row sums to 1.
# NOTE(review): this runs before the imputation cell; missing LDA values are
# skipped by the row sum here and filled afterwards, so imputed rows may no
# longer sum to 1 — confirm whether that ordering is intended.
lda_candidates = ("LDA_00","LDA_01","LDA_02","LDA_03","LDA_04")
lda_cols = [name for name in lda_candidates if name in mod.columns]
if lda_cols:
    row_totals = mod[lda_cols].sum(axis=1)
    # Rows whose total is not positive are left untouched (avoids dividing by zero).
    positive = row_totals > 0
    mod.loc[positive, lda_cols] = mod.loc[positive, lda_cols].div(row_totals[positive], axis=0)


**Eliminar filas cuya primary key (`url`) sea nula, vacía o no empiece con `http`**



In [23]:
primary_key = 'url'

# Drop rows whose primary key is missing or an empty string. The .copy()
# makes the filtered result an independent frame, so the assignment below
# does not trigger SettingWithCopyWarning.
mod = mod[mod[primary_key].notna() & (mod[primary_key] != '')].copy()

# Normalize the key: force string type, lower-case it and remove any
# whitespace before/after the URL.
mod[primary_key] = mod[primary_key].astype(str).str.lower().str.strip()

# Keep only the rows whose key starts with "http".
# NOTE(review): duplicated keys are not removed here — confirm whether the
# primary key is also expected to be unique.
mod = mod[mod[primary_key].str.startswith('http', na=False)].copy()

**Imputación de valores faltantes**

In [24]:
# Fill missing values column by column, choosing the statistic from the
# column's skewness. The 'url' key is excluded by name (as in the numeric
# coercion cell) rather than by assuming it is the first column.
for columna in [nombre for nombre in mod.columns if nombre != "url"]:
    # Skewness of the current column.
    sesgo = mod[columna].skew()

    if -1 < sesgo < 1:
        # Roughly symmetric distribution -> impute with the MEAN.
        valor_imputacion = mod[columna].mean()
        icono, estadistico = "✅", "MEDIA"
    else:
        # Skewed distribution (or undefined skewness, which fails the range
        # test above) -> impute with the MEDIAN.
        valor_imputacion = mod[columna].median()
        icono, estadistico = "🟡", "MEDIANA"

    mod[columna] = mod[columna].fillna(valor_imputacion)
    print(f"{icono} Columna '{columna}' (sesgo={sesgo:.2f}) -> Imputada con la {estadistico} ({valor_imputacion:.2f}).")



✅ Columna 'timedelta' (sesgo=0.12) -> Imputada con la MEDIA (357.40).
🟡 Columna 'n_tokens_title' (sesgo=9.28) -> Imputada con la MEDIANA (10.00).
🟡 Columna 'n_tokens_content' (sesgo=1.97) -> Imputada con la MEDIANA (413.00).
✅ Columna 'n_unique_tokens' (sesgo=-1.00) -> Imputada con la MEDIA (0.54).
🟡 Columna 'n_non_stop_words' (sesgo=-5.55) -> Imputada con la MEDIANA (1.00).
🟡 Columna 'n_non_stop_unique_tokens' (sesgo=-2.24) -> Imputada con la MEDIANA (0.69).
🟡 Columna 'num_hrefs' (sesgo=4.48) -> Imputada con la MEDIANA (8.00).
🟡 Columna 'num_self_hrefs' (sesgo=4.46) -> Imputada con la MEDIANA (3.00).
🟡 Columna 'num_imgs' (sesgo=3.55) -> Imputada con la MEDIANA (1.00).
🟡 Columna 'num_videos' (sesgo=4.59) -> Imputada con la MEDIANA (0.00).
🟡 Columna 'average_token_length' (sesgo=9.52) -> Imputada con la MEDIANA (4.67).
✅ Columna 'num_keywords' (sesgo=-0.15) -> Imputada con la MEDIA (7.25).
🟡 Columna 'data_channel_is_lifestyle' (sesgo=3.83) -> Imputada con la MEDIANA (0.00).
🟡 Columna 'd

**Guardar limpio**

In [25]:
# Write the cleaned frame to disk without the index column, then report the
# file name and the final (rows, columns) shape.
cleaned_path = "../data/online_news_cleaned.csv"
mod.to_csv(cleaned_path, index=False)
print("✅ Guardado: online_news_cleaned.csv")
print("Shape limpio:", mod.shape)

✅ Guardado: online_news_cleaned.csv
Shape limpio: (40010, 61)


**Mini-resumen para revisar rápido**

In [26]:
# 8) Quick summary (min / max / mean / median) of every numeric column.
numericas = mod.select_dtypes(include=[np.number])  # selected once, reused below
resumen = pd.DataFrame({
    "min": numericas.min(),
    "max": numericas.max(),
    "mean": numericas.mean(),
    "median": numericas.median(),
})

# Saving is opt-in. The "saved" message is only printed when the file is
# actually written, so the cell never claims a save that did not happen.
SAVE_SUMMARY = False
if SAVE_SUMMARY:
    resumen.to_csv("../data/resumen_cleaned.csv")
    print("📄 Guardado resumen: resumen_cleaned.csv")

resumen.head(10)

📄 Guardado resumen: resumen_cleaned.csv


Unnamed: 0,min,max,mean,median
timedelta,8.0,731.0,357.398092,347.0
n_tokens_title,6.0,124.16,11.537167,10.0
n_tokens_content,0.0,2640.8,551.29806,413.0
n_unique_tokens,0.0,1.0,0.535094,0.538462
n_non_stop_words,0.0,1.0,0.970807,1.0
n_non_stop_unique_tokens,0.0,1.0,0.676425,0.691743
num_hrefs,0.0,120.0,11.870682,8.0
num_self_hrefs,0.0,36.73,3.579641,3.0
num_imgs,0.0,61.0,4.918595,1.0
num_videos,0.0,26.0,1.334141,0.0


**Comparar el limpio contra el original para validar que todo quedó coherente.**

In [27]:
import pandas as pd

# 1) Load the original and the cleaned files
orig = pd.read_csv("../data/online_news_original.csv")
clean = pd.read_csv("../data/online_news_cleaned.csv")

print("Original shape:", orig.shape)
print("Cleaned shape:", clean.shape)

# 2) Compare descriptive statistics (mean and median) of the numeric columns
comparacion = pd.DataFrame({
    "mean_orig": orig.mean(numeric_only=True),
    "mean_clean": clean.mean(numeric_only=True),
    "median_orig": orig.median(numeric_only=True),
    "median_clean": clean.median(numeric_only=True),
})

# 3) Absolute differences between cleaned and original
comparacion["diff_mean"] = (comparacion["mean_clean"] - comparacion["mean_orig"]).abs()
comparacion["diff_median"] = (comparacion["median_clean"] - comparacion["median_orig"]).abs()

# 4) Percentage of missing values per column (aligned on the column name)
comparacion["missing_orig_%"] = (orig.isna().sum() / len(orig)) * 100
comparacion["missing_clean_%"] = (clean.isna().sum() / len(clean)) * 100

# 5) Optionally save to CSV for a detailed review. The "saved" message is only
#    printed when the file is actually written, so the cell never reports a
#    save that did not happen.
SAVE_COMPARISON = False
if SAVE_COMPARISON:
    comparacion.to_csv("../data/comparacion_final.csv")
    print("📊 Comparación guardada en comparacion_final.csv")

# 6) Show the first rows
display(comparacion.head(15))



Original shape: (39644, 61)
Cleaned shape: (40010, 61)
📊 Comparación guardada en comparacion_final.csv


Unnamed: 0,mean_orig,mean_clean,median_orig,median_clean,diff_mean,diff_median,missing_orig_%,missing_clean_%
timedelta,354.530471,357.398092,339.0,347.0,2.867621,8.0,0.0,0.0
n_tokens_title,10.398749,11.537167,10.0,10.0,1.138418,0.0,0.0,0.0
n_tokens_content,546.514731,551.29806,409.0,413.0,4.783329,4.0,0.0,0.0
n_unique_tokens,0.548216,0.535094,0.539226,0.538462,0.013122,0.0007640113,0.0,0.0
n_non_stop_words,0.996469,0.970807,1.0,1.0,0.025661,4.799994e-11,0.0,0.0
n_non_stop_unique_tokens,0.689175,0.676425,0.690476,0.691743,0.01275,0.00126693,0.0,0.0
num_hrefs,10.88369,11.870682,8.0,8.0,0.986992,0.0,0.0,0.0
num_self_hrefs,3.293638,3.579641,3.0,3.0,0.286002,0.0,0.0,0.0
num_imgs,4.544143,4.918595,1.0,1.0,0.374452,0.0,0.0,0.0
num_videos,1.249874,1.334141,0.0,0.0,0.084268,0.0,0.0,0.0


**Información del DataFrame limpio**

In [29]:
# Print the column names, non-null counts and dtypes of the cleaned frame
# that was re-read from disk.
clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40010 entries, 0 to 40009
Data columns (total 61 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   url                            40010 non-null  object 
 1   timedelta                      40010 non-null  float64
 2   n_tokens_title                 40010 non-null  float64
 3   n_tokens_content               40010 non-null  float64
 4   n_unique_tokens                40010 non-null  float64
 5   n_non_stop_words               40010 non-null  float64
 6   n_non_stop_unique_tokens       40010 non-null  float64
 7   num_hrefs                      40010 non-null  float64
 8   num_self_hrefs                 40010 non-null  float64
 9   num_imgs                       40010 non-null  float64
 10  num_videos                     40010 non-null  float64
 11  average_token_length           40010 non-null  float64
 12  num_keywords                   40010 non-null 