# Modelo 4: Regresión Bayesiana para Predicción de Revenue

Este modelo predice el `total_revenue` usando múltiples features del dataset de Amazon.

In [None]:
# Importaciones necesarias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pymc as pm
import arviz as az
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and numerical
# warnings raised by the libraries used below — consider narrowing it.
warnings.filterwarnings('ignore')

# Plot styling: matplotlib's 'default' style sheet and seaborn's "husl" palette
plt.style.use('default')
sns.set_palette("husl")

In [None]:
# Load the sales dataset (path is relative to the working directory) and
# report its dimensions.
df = pd.read_csv('dataset/amazon_sales_dataset.csv')
print(f"Dimensiones del dataset: {df.shape}")
print(f"\nPrimeras filas:")
# Last expression of the cell: the notebook renders it as the cell output
df.head()

In [None]:
# Basic dataset overview: schema, missing values and summary statistics
print("Información del dataset:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
df.info()
print(f"\nValores nulos:\n{df.isnull().sum()}")
print(f"\nEstadísticas descriptivas:")
# Last expression of the cell: the notebook renders it as the cell output
df.describe()

## Análisis Exploratorio de Datos (EDA)

In [None]:
# EDA - distribution of the target variable shown three ways:
# histogram, boxplot and normal Q-Q plot, side by side.
from scipy import stats

_, (ax_hist, ax_box, ax_qq) = plt.subplots(1, 3, figsize=(15, 5))

# Histogram
ax_hist.hist(df['total_revenue'], bins=50, alpha=0.7)
ax_hist.set_title('Distribución de Total Revenue')
ax_hist.set_xlabel('Total Revenue')
ax_hist.set_ylabel('Frecuencia')

# Boxplot
ax_box.boxplot(df['total_revenue'])
ax_box.set_title('Boxplot de Total Revenue')
ax_box.set_ylabel('Total Revenue')

# Q-Q plot against a normal distribution; probplot draws on the axes it is
# handed and sets its own title, which is then replaced below.
stats.probplot(df['total_revenue'], dist="norm", plot=ax_qq)
ax_qq.set_title('Q-Q Plot')

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, drawn as a heatmap
numeric_cols = [
    'price',
    'discount_percent',
    'quantity_sold',
    'rating',
    'review_count',
    'discounted_price',
    'total_revenue',
]
correlation_matrix = df[numeric_cols].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Matriz de Correlación')
plt.show()

# Correlation of every column with the target, strongest positive first
revenue_correlations = correlation_matrix['total_revenue'].sort_values(ascending=False)
print(f"Correlación con total_revenue:\n{revenue_correlations}")

In [None]:
# Scatter plots of four predictors against the target, on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# (column in df, x-axis label, panel title) for each panel
scatter_specs = [
    ('price', 'Price', 'Price vs Total Revenue'),
    ('quantity_sold', 'Quantity Sold', 'Quantity Sold vs Total Revenue'),
    ('discounted_price', 'Discounted Price', 'Discounted Price vs Total Revenue'),
    ('rating', 'Rating', 'Rating vs Total Revenue'),
]

# axes.flat walks the grid row by row: (0,0), (0,1), (1,0), (1,1)
for ax, (column, x_label, title) in zip(axes.flat, scatter_specs):
    ax.scatter(df[column], df['total_revenue'], alpha=0.5)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Total Revenue')
    ax.set_title(title)

plt.tight_layout()
plt.show()

## Preparación de Datos

In [None]:
# Choose the model inputs and the prediction target
target = 'total_revenue'
features = ['discounted_price', 'quantity_sold', 'rating']

# Work on copies so later transformations cannot modify df
y = df[target].copy()
X = df[features].copy()

print(f"Features seleccionadas: {features}")
print(f"Dimensiones X: {X.shape}")
print(f"Dimensiones y: {y.shape}")

# Count missing values: per column for X, in total for y
missing_in_X = X.isnull().sum()
missing_in_y = y.isnull().sum()
print(f"\nDatos faltantes en X:\n{missing_in_X}")
print(f"\nDatos faltantes en y: {missing_in_y}")

In [None]:
# Hold out 20% of the rows for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Tamaño conjunto entrenamiento: {X_train.shape[0]}")
print(f"Tamaño conjunto prueba: {X_test.shape[0]}")

# Standardise the features. The scaler is fitted on the training split only
# and that same fitted transform is applied to both splits; the target is
# left on its original scale.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"\nEstadísticas después de normalización (train):")
print(f"Media: {X_train_scaled.mean(axis=0)}")
print(f"Std: {X_train_scaled.std(axis=0)}")

## Modelo Bayesiano

In [None]:
# Bayesian linear regression model definition
def create_bayesian_model(X, y, prior_sigma=10):
    """Build a Bayesian linear regression of ``y`` on the columns of ``X``.

    The design matrix is registered in the model as a replaceable data
    container named ``'X'`` so that ``pm.set_data({'X': X_new})`` can swap it
    for out-of-sample prediction. Feeding the raw array straight into the
    graph would freeze it as a constant and leave no ``'X'`` entry for
    ``pm.set_data`` to find. The likelihood's shape follows the container, so
    posterior-predictive draws resize with whatever ``'X'`` currently holds.

    Args:
        X: 2-D array-like, one row per observation and one column per
            feature.
        y: 1-D array-like of observed target values, one per row of ``X``.
        prior_sigma: Scale shared by the zero-centred Normal priors on the
            intercept and coefficients and by the HalfNormal prior on the
            noise standard deviation. The default reproduces the previously
            hard-coded value.
            NOTE(review): the priors are not adapted to the scale of ``y``;
            if the target is much larger than ``prior_sigma`` they may be
            too tight — consider standardising ``y`` or raising this value.

    Returns:
        The constructed ``pm.Model``; no sampling is performed here.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    # Older PyMC releases only create a replaceable container through
    # MutableData; newer ones fold that behaviour into pm.Data. Use whichever
    # this installation provides.
    make_data = getattr(pm, 'MutableData', pm.Data)

    with pm.Model() as model:
        # Replaceable design matrix (see docstring)
        X_data = make_data('X', X)

        # Priors for the regression parameters
        intercept = pm.Normal('intercept', mu=0, sigma=prior_sigma)
        coeffs = pm.Normal('coeffs', mu=0, sigma=prior_sigma, shape=X.shape[1])

        # Prior for the standard deviation of the error term
        sigma = pm.HalfNormal('sigma', sigma=prior_sigma)

        # Linear predictor
        mu = intercept + pm.math.dot(X_data, coeffs)

        # Likelihood; shape is tied to mu so predictions track the size of 'X'
        pm.Normal('y', mu=mu, sigma=sigma, observed=y, shape=mu.shape)

    return model

# Build the model on the standardised training features
print("Creando modelo bayesiano...")
model = create_bayesian_model(X_train_scaled, y_train)

print("Entrenando modelo...")
with model:
    # MCMC sampling: 2 chains, each with 1000 tuning steps and 2000 draws,
    # seeded for reproducibility
    trace = pm.sample(draws=2000, tune=1000, chains=2, random_seed=42)

print("Modelo entrenado exitosamente!")

In [None]:
# Model diagnostics: tabular summary of the sampled parameters
print("Resumen de parámetros:")
print(az.summary(trace))

# Trace plots for the sampled parameters
az.plot_trace(trace)
plt.tight_layout()
plt.show()

# R-hat convergence diagnostic (values should be close to 1)
rhat = az.rhat(trace)
print(f"\nR-hat valores (deben estar cerca de 1):")
print(rhat)

## Predicciones y Evaluación

In [None]:
# Helper for out-of-sample predictions
def make_predictions(model, trace, X_new):
    """Draw posterior-predictive samples of ``'y'`` for new inputs.

    The model's ``'X'`` data container is temporarily replaced by ``X_new``
    and put back afterwards, so the model keeps matching the data it was
    fitted on even if sampling raises.

    Args:
        model: PyMC model that exposes its design matrix as a replaceable
            data container named ``'X'`` and its likelihood as ``'y'``.
        trace: Posterior samples to condition the predictions on.
        X_new: Design matrix to predict for, with the same columns (and the
            same preprocessing) as the matrix the model was built with.

    Returns:
        Tuple ``(pred_mean, pred_std)``: the mean and standard deviation of
        the posterior-predictive draws over chains and draws, one value per
        row of ``X_new``.

    Raises:
        KeyError: If the model has no variable named ``'X'``.
        TypeError: If ``'X'`` exists but is not a replaceable container.
    """
    x_container = model.named_vars.get('X')
    if x_container is None:
        raise KeyError(
            "model has no data container named 'X'; register the design "
            "matrix as a PyMC data container so it can be replaced"
        )
    if not hasattr(x_container, 'get_value'):
        raise TypeError(
            "model variable 'X' is not a mutable data container and "
            "cannot be replaced with pm.set_data"
        )

    # Remember the current contents so they can be restored afterwards
    original_X = x_container.get_value()
    try:
        with model:
            pm.set_data({'X': X_new})
            posterior_predictive = pm.sample_posterior_predictive(trace, random_seed=42)
    finally:
        with model:
            pm.set_data({'X': original_X})

    # Collapse the leading (chain, draw) axes, leaving one value per row
    predictions = posterior_predictive.posterior_predictive['y'].values
    pred_mean = predictions.mean(axis=(0, 1))
    pred_std = predictions.std(axis=(0, 1))

    return pred_mean, pred_std

# Posterior-predictive draws for the train and test splits.
# The model must expose its design matrix as a replaceable data container
# named 'X'. Only 'X' is swapped: 'y' is the name of the model's likelihood,
# not a data container, so it cannot be passed to pm.set_data, and observed
# targets are not needed to draw predictions.
print("Realizando predicciones...")

# Training split
with model:
    pm.set_data({'X': X_train_scaled})
    ppc_train = pm.sample_posterior_predictive(trace, random_seed=42)

# Test split
with model:
    pm.set_data({'X': X_test_scaled})
    ppc_test = pm.sample_posterior_predictive(trace, random_seed=42)

# Put the training matrix back so the model's inputs match the observations
# it was fitted on.
with model:
    pm.set_data({'X': X_train_scaled})

# Point predictions: average the draws over chains and draws
y_pred_train = ppc_train.posterior_predictive['y'].mean(dim=['chain', 'draw']).values
y_pred_test = ppc_test.posterior_predictive['y'].mean(dim=['chain', 'draw']).values

print("Predicciones completadas!")

In [None]:
# Regression metrics helper
def calculate_metrics(y_true, y_pred, dataset_name):
    """Compute, print and return RMSE, MAE and R² for one set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted values aligned with ``y_true``.
        dataset_name: Label used in the printed header.

    Returns:
        Dict with keys ``'RMSE'``, ``'MAE'`` and ``'R2'``.
    """
    scores = {
        # RMSE is the square root of the mean squared error
        'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
        'MAE': mean_absolute_error(y_true, y_pred),
        'R2': r2_score(y_true, y_pred),
    }

    print(f"\nMétricas {dataset_name}:")
    print(f"RMSE: {scores['RMSE']:.2f}")
    print(f"MAE: {scores['MAE']:.2f}")
    print(f"R²: {scores['R2']:.4f}")

    return scores

# Metrics on the training and test splits
metrics_train = calculate_metrics(y_train, y_pred_train, "Entrenamiento")
metrics_test = calculate_metrics(y_test, y_pred_test, "Prueba")


def _relative_gap_pct(train_value, test_value):
    """Return |train - test| as a percentage of |train|.

    The magnitude of the training value is used as the baseline so the
    result is never negative (R² can be below zero). A zero baseline yields
    0.0 when both values are equal and infinity otherwise, instead of
    raising a division error.
    """
    gap = abs(train_value - test_value)
    baseline = abs(train_value)
    if baseline == 0:
        return 0.0 if gap == 0 else float('inf')
    return gap / baseline * 100


# Overfitting check: relative train/test gap for RMSE and R²
overfitting_rmse = _relative_gap_pct(metrics_train['RMSE'], metrics_test['RMSE'])
overfitting_r2 = _relative_gap_pct(metrics_train['R2'], metrics_test['R2'])

print(f"\nAnálisis de Overfitting:")
print(f"Diferencia RMSE: {overfitting_rmse:.2f}%")
print(f"Diferencia R²: {overfitting_r2:.2f}%")

# Both gaps must stay under 5% to be reported as low overfitting
if overfitting_rmse < 5 and overfitting_r2 < 5:
    print("✅ Overfitting bajo: < 5%")
else:
    print("⚠️ Posible overfitting detectado")

In [None]:
# Evaluation plots: predicted-vs-actual (top row) and residuals (bottom row),
# with the training split in the left column and the test split in the right.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Residual = observed - predicted
residuals_train = y_train - y_pred_train
residuals_test = y_test - y_pred_test

# One entry per column: (actual, predicted, residuals, fit title, residual title)
panels = [
    (y_train, y_pred_train, residuals_train,
     f'Entrenamiento: R² = {metrics_train["R2"]:.4f}', 'Residuos - Entrenamiento'),
    (y_test, y_pred_test, residuals_test,
     f'Prueba: R² = {metrics_test["R2"]:.4f}', 'Residuos - Prueba'),
]

for column, (actual, predicted, residuals, fit_title, residual_title) in enumerate(panels):
    # Predicted vs actual, with the identity line as the perfect-fit reference
    fit_ax = axes[0, column]
    fit_ax.scatter(actual, predicted, alpha=0.5)
    fit_ax.plot([actual.min(), actual.max()], [actual.min(), actual.max()], 'r--', lw=2)
    fit_ax.set_xlabel('Valores Reales')
    fit_ax.set_ylabel('Predicciones')
    fit_ax.set_title(fit_title)

    # Residuals vs predictions, with a zero reference line
    residual_ax = axes[1, column]
    residual_ax.scatter(predicted, residuals, alpha=0.5)
    residual_ax.axhline(y=0, color='r', linestyle='--')
    residual_ax.set_xlabel('Predicciones')
    residual_ax.set_ylabel('Residuos')
    residual_ax.set_title(residual_title)

plt.tight_layout()
plt.show()

## Feature Importance

In [None]:
# Posterior samples of the regression coefficients; the two leading axes
# (chains and draws) are collapsed to get one mean and one std per feature.
coefficients = trace.posterior['coeffs'].values
coeff_means = coefficients.mean(axis=(0, 1))
coeff_stds = coefficients.std(axis=(0, 1))

# Table of per-feature coefficients, ranked by absolute posterior mean
feature_importance = pd.DataFrame(
    {
        'Feature': features,
        'Coefficient_Mean': coeff_means,
        'Coefficient_Std': coeff_stds,
        'Absolute_Importance': np.abs(coeff_means),
    }
).sort_values('Absolute_Importance', ascending=False)

print("Importancia de Features:")
print(feature_importance)

# Horizontal bars of the signed posterior-mean coefficients
plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Coefficient_Mean'])
plt.xlabel('Coeficiente Promedio')
plt.title('Importancia de Features (Coeficientes del Modelo Bayesiano)')
plt.grid(axis='x', alpha=0.3)
plt.show()

## Guardar Modelo y Resultados

In [None]:
# Persist the fitted artefacts to the working directory
import pickle
import joblib

# Fitted feature scaler
joblib.dump(scaler, 'modelo_4_scaler.pkl')

# Metrics and feature-importance summary
results = {
    'features': features,
    'metrics_train': metrics_train,
    'metrics_test': metrics_test,
    'feature_importance': feature_importance.to_dict(),
    'overfitting_rmse': overfitting_rmse,
    'overfitting_r2': overfitting_r2,
}

# Pickle the sampled trace and the results dict, in that order.
# NOTE(review): pickle files should only ever be loaded from trusted sources.
pickle_jobs = (
    ('modelo_4_trace.pkl', trace),
    ('modelo_4_results.pkl', results),
)
for path, payload in pickle_jobs:
    with open(path, 'wb') as handle:
        pickle.dump(payload, handle)

print("Modelo y resultados guardados exitosamente!")
print("Archivos generados:")
print("- modelo_4_scaler.pkl")
print("- modelo_4_trace.pkl")
print("- modelo_4_results.pkl")