In [1]:
# Config paths (robusto para VS Code)
from pathlib import Path
import pandas as pd

# When the kernel starts inside /notebooks the project root is one level up;
# otherwise the current working directory is taken as the project root.
_cwd = Path.cwd()
PROJ = _cwd.parent if _cwd.name == "notebooks" else _cwd
DATA = PROJ.joinpath("data")

# Resolve both input files and fail early if either one is missing.
train_path = DATA.joinpath("train.csv")
assert train_path.exists(), f"No encuentro {train_path}. ¿Está en /data?"

test_path = DATA.joinpath("test.csv")
assert test_path.exists(), f"No encuentro {test_path}. ¿Está en /data?"

# Load the training set and preview its first rows.
train = pd.read_csv(train_path)
print("Shape:", train.shape)
display(train.head())

# Load the test set and preview its first rows.
test = pd.read_csv(test_path)
print("Shape:", test.shape)
display(test.head())

# Basic checks on the training set: column dtypes and the ten columns
# with the highest number of missing values.
null_counts = train.isna().sum()
print("\nTipos:")
print(train.dtypes)
print("\nNulos (top 10):")
print(null_counts.sort_values(ascending=False).head(10))


ModuleNotFoundError: No module named 'pandas'

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# HANDLING OF NULL / MISSING VALUES

# Name of the target column; it is split off from the features below.
target_col = "popularity"

y_train = train[target_col]
X_train = train.drop(columns=[target_col])

# Column groups are detected on X_train only, so the target is never included.
num_cols = X_train.select_dtypes(include=['number']).columns
cat_cols = X_train.select_dtypes(exclude=['number']).columns

# Median imputation for numeric columns; constant "Unknown" for all other columns.
numeric_imputer = SimpleImputer(strategy="median")
categorical_imputer = SimpleImputer(strategy="constant", fill_value="Unknown")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_imputer, num_cols),
        ("cat", categorical_imputer, cat_cols),
    ]
)

# Fit on the training features only (no target), then apply the fitted
# imputers to both the training and the test features.
train_prepared = preprocessor.fit(X_train).transform(X_train)
test_prepared = preprocessor.transform(test)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# CONTINUOUS NUMERIC FEATURES: the outlier analysis below is restricted to these columns
num_continuous = [
    "duration_ms",
    "danceability",
    "energy",
    "loudness",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo"
]

# 1. OUTLIER VISUALISATION
# Ceil-divide so the grid has exactly as many rows as needed; the previous
# `len // 3 + 1` added a whole empty row whenever the count was a multiple of 3.
n_plot_cols = 3
n_plot_rows = max(1, (len(num_continuous) + n_plot_cols - 1) // n_plot_cols)

# squeeze=False keeps `axes` a 2-D array for every grid shape, so flatten() is always valid.
fig, axes = plt.subplots(
    n_plot_rows, n_plot_cols,
    figsize=(15, len(num_continuous)*2),
    squeeze=False
)
axes = axes.flatten()

# One horizontal boxplot per feature, titled with the column name.
for i, col in enumerate(num_continuous):
    sns.boxplot(x=train[col], ax=axes[i])
    axes[i].set_title(col)

# Hide the grid cells that received no plot instead of rendering blank axes.
for unused_ax in axes[len(num_continuous):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()


# 2. MÉTODO IQR (Interquartile Range)
def detect_outliers_iqr(df, columns, factor=1.5):
    """Summarise IQR-rule outliers for each requested column.

    For every column the first and third quartiles are computed and a value is
    counted as an outlier when it lies strictly below ``Q1 - factor * IQR`` or
    strictly above ``Q3 + factor * IQR``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data to inspect.
    columns : iterable of str
        Names of the columns of ``df`` to analyse.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the bounds.

    Returns
    -------
    dict
        Maps each column name to a dict with the keys ``'count'`` (number of
        outlier rows, as ``int``), ``'percentage'`` (share of all rows of
        ``df``, 0-100), ``'lower'`` and ``'upper'`` (the two bounds).
    """
    n_rows = len(df)
    outliers_dict = {}

    for col in columns:
        series = df[col]
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1

        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr

        # Count through a boolean mask rather than building a filtered copy
        # of the whole frame just to take its length.
        outlier_mask = (series < lower_bound) | (series > upper_bound)
        n_outliers = int(outlier_mask.sum())

        outliers_dict[col] = {
            'count': n_outliers,
            # An empty frame has no outliers; avoid dividing by zero rows.
            'percentage': n_outliers / n_rows * 100 if n_rows else 0.0,
            'lower': lower_bound,
            'upper': upper_bound
        }

    return outliers_dict


# Run the IQR rule (factor 1.5) over the continuous features of the training set.
outliers_info = detect_outliers_iqr(train, num_continuous, factor=1.5)

# Print one summary line per analysed column: outlier count and share of rows.
print("RESUMEN DE OUTLIERS POR COLUMNA:")
for col in outliers_info:
    summary = outliers_info[col]
    n_found = summary['count']
    share = summary['percentage']
    print(f"{col}: {n_found} outliers ({share:.2f}%)")


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor


# ============================================================
# 1. CONTINUOUS VARIABLES (the ones considered for skew/outliers)
# ============================================================

num_continuous = [
    "duration_ms",
    "danceability",
    "energy",
    "loudness",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo"
]

# ============================================================
# 2. SKEWNESS ANALYSIS (only used to decide on the log transform)
# ============================================================

print("ANÁLISIS DE ASIMETRÍA (SKEWNESS):")
print("-" * 60)

# Sample skewness of every continuous column of the training set.
skewness_results = {}
for col in num_continuous:
    s = train[col].skew()
    skewness_results[col] = s
    # |skew| > 1 is reported as highly skewed, > 0.5 as moderate, otherwise symmetric.
    msg = "MUY ASIMÉTRICA ⚠️" if abs(s) > 1 else ("Asimetría moderada" if abs(s) > 0.5 else "Simétrica ✓")
    print(f"{col:20s}: skew = {s:6.2f}  --> {msg}")

# Columns that get log1p:
#   * strongly RIGHT-skewed only (skew > 1): log1p compresses the upper tail,
#     so applying it to a left-skewed column would increase its asymmetry;
#   * non-negative values only. Series.min() skips NaN, so a column with
#     missing values is no longer rejected just because `NaN >= 0` is False.
skewed_cols = [
    col for col in num_continuous
    if skewness_results[col] > 1 and train[col].min() >= 0
]

print("\nColumnas seleccionadas para log transform:", skewed_cols)


# ============================================================
# 3. TRANSFORMADOR LOGARÍTMICO (FUNCIONAL Y SIN ERRORES)
# ============================================================

class LogTransformer(FunctionTransformer):
    """
    FunctionTransformer that applies ``np.log1p`` to a chosen set of columns.

    The forward function returns a copy of ``X`` in which every column listed
    in ``columns_to_transform`` that is present in ``X.columns`` has been
    replaced by its ``log1p``; the inverse function does the same with
    ``np.expm1``. Listed columns missing from ``X`` are skipped. The input must
    expose ``.copy()`` and ``.columns`` (e.g. a pandas DataFrame).
    """
    def __init__(self, columns_to_transform):
        # Names of the columns the transform acts on; read at call time.
        self.columns_to_transform = columns_to_transform
        super().__init__(
            func=self._log_transform,
            inverse_func=self._inverse_log_transform,
            validate=False,
            # The original author notes this flag is required to avoid a failure.
            check_inverse=False,
        )

    def _apply_to_listed_columns(self, X, elementwise_fn):
        """Return a copy of X with ``elementwise_fn`` applied to each listed column found in X."""
        result = X.copy()
        for name in self.columns_to_transform:
            if name not in result.columns:
                continue
            result[name] = elementwise_fn(result[name])
        return result

    def _log_transform(self, X):
        """Forward transform: ``log1p`` on the configured columns."""
        return self._apply_to_listed_columns(X, np.log1p)

    def _inverse_log_transform(self, X):
        """Inverse transform: ``expm1`` on the configured columns."""
        return self._apply_to_listed_columns(X, np.expm1)


# Log transformer configured with the columns selected by the skewness analysis.
log_transformer = LogTransformer(columns_to_transform=skewed_cols)


# ============================================================
# 4. NUMERIC AND CATEGORICAL COLUMN LISTS
# ============================================================

# LOW-CARDINALITY CATEGORICAL COLUMNS → ONLY THESE
cat_cols = ["key", "mode", "time_signature", "explicit", "track_genre"]

# True numeric features: numeric dtype, minus the target and minus every column
# already treated as categorical. Without the second exclusion, any column of
# `cat_cols` stored with a numeric dtype would land in both lists and be fed to
# the model twice (once scaled, once one-hot encoded).
num_cols = [
    col for col in train.select_dtypes(include=["number"]).columns
    if col != "popularity" and col not in cat_cols
]

print(f"\nColumnas numéricas finales: {num_cols}")
print(f"Columnas categóricas finales (baja cardinalidad): {cat_cols}")


# ============================================================
# 5. NUMERIC AND CATEGORICAL PIPELINES
# ============================================================

# Numeric branch: median imputation followed by RobustScaler.
numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", RobustScaler()),
])

# Categorical branch: fill gaps with the constant "Unknown", then one-hot encode
# into a sparse matrix, ignoring categories not seen during fit.
categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=True)),
])


# ============================================================
# 6. FINAL COLUMN TRANSFORMER
# ============================================================

# Columns that are in neither list are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, num_cols),
        ("cat", categorical_pipeline, cat_cols),
    ],
    remainder="drop",
)


# ============================================================
# 7. FULL PIPELINE INCLUDING THE MODEL
# ============================================================

# Random forest regressor with a fixed seed, using all available cores.
forest = RandomForestRegressor(
    n_estimators=300,
    max_depth=20,
    min_samples_split=10,
    random_state=42,
    n_jobs=-1,
)

# Order of the steps: log transform → preprocessing → model.
pipeline = Pipeline(steps=[
    ("log_transform", log_transformer),
    ("preprocess", preprocessor),
    ("model", forest),
])


# ============================================================
# 8. TRAINING
# ============================================================

# Split the target off the features; the pipeline is fitted on the training set only.
y_train = train["popularity"]
X_train = train.drop(columns=["popularity"])

print("\nEntrenando modelo...")
pipeline.fit(X_train, y_train)
print("✓ Entrenamiento completado correctamente")


# ============================================================
# 9. Load the test set and predict
# ============================================================

test_path = DATA.joinpath("test.csv")
test = pd.read_csv(test_path)

y_pred = pipeline.predict(test)

# Report how many predictions were produced and the range they span.
print(f"\n✓ Predicciones generadas: {len(y_pred)}")
pred_low, pred_high = y_pred.min(), y_pred.max()
print(f"Rango de predicciones: {pred_low:.2f} – {pred_high:.2f}")



# ============================================================
# 10. VISUALISE THE EFFECT OF THE LOG TRANSFORM
# ============================================================

if skewed_cols:
    # One row per selected column: original histogram on the left, log1p on the right.
    # squeeze=False keeps `axes` 2-D even for a single row, so no special-casing is needed.
    fig, axes = plt.subplots(
        len(skewed_cols), 2,
        figsize=(12, len(skewed_cols)*3),
        squeeze=False
    )

    for i, col in enumerate(skewed_cols):
        ax0, ax1 = axes[i]

        ax0.hist(train[col], bins=50, color="red", edgecolor="black")
        ax0.set_title(f"{col} - Original (skew={train[col].skew():.2f})")

        transformed = np.log1p(train[col])
        ax1.hist(transformed, bins=50, color="green", edgecolor="black")
        ax1.set_title(f"{col} - Log Transform (skew={transformed.skew():.2f})")

    plt.tight_layout()
    plt.show()
else:
    # plt.subplots() rejects a grid with zero rows, so skip plotting when
    # no column was selected for the log transform.
    print("No hay columnas seleccionadas para log transform; nada que graficar.")


In [None]:
# Write next to the input files using the DATA directory resolved in the first
# cell, instead of a hard-coded "../data" that is only correct when the working
# directory is /notebooks.
# NOTE(review): in the visible cells `train` and `test` are only loaded, never
# reassigned, so these "processed" files appear to contain the raw data — confirm
# whether the pipeline's transformed output was meant to be saved here.
train.to_csv(DATA / "train_processed.csv", index=False)
test.to_csv(DATA / "test_processed.csv", index=False)