# Modelo 4: Regresión Bayesiana Poisson - Predicción de Cantidad Vendida

**Objetivo**: Predecir `quantity_sold` usando modelo Poisson con variables temporales y descuentos.

**Variables**: `discount_percent`, `rating`, `is_weekend`, `day_of_week`, `month`

In [None]:
import polars as pl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Load the sales dataset with polars and show its shape plus a preview
DATASET_PATH = "dataset/amazon_sales_dataset.csv"
df = pl.read_csv(DATASET_PATH)
print(f"Dataset cargado: {df.shape}")
df.head()

In [None]:
# ========================================
# 1. PREPARACIÓN DE DATOS E INGENIERÍA DE VARIABLES
# ========================================

# Switch to pandas: the date features below are derived through its .dt accessor
df_pd = df.to_pandas()

# Each keyword is evaluated in order, so later columns can build on earlier ones
df_pd = df_pd.assign(
    # Cleaning: sold quantity is cast to integer and negatives are clipped to 0
    quantity_sold=lambda d: d['quantity_sold'].astype(int).clip(lower=0),
    # Temporal features extracted from order_date
    order_date=lambda d: pd.to_datetime(d['order_date']),
    day_of_week=lambda d: d['order_date'].dt.dayofweek,  # 0=Monday, 6=Sunday
    is_weekend=lambda d: (d['day_of_week'] >= 5).astype(int),  # Saturday and Sunday
    month=lambda d: d['order_date'].dt.month,
)

# Encoding: one dummy column per product_category value, appended column-wise
category_dummies = pd.get_dummies(df_pd['product_category'], prefix='category')
df_final = pd.concat([df_pd, category_dummies], axis=1)

# Short report of the engineered columns
weekend_counts = df_final['is_weekend'].value_counts().to_dict()
month_min, month_max = df_final['month'].min(), df_final['month'].max()
print("‚úÖ Variables creadas:")
print(f"- day_of_week: {df_final['day_of_week'].unique()}")
print(f"- is_weekend: {weekend_counts}")
print(f"- month: rango {month_min}-{month_max}")
print(f"- Categor√≠as: {list(category_dummies.columns)}")

df_final.head()

In [None]:
# ========================================
# 2. ANÁLISIS EXPLORATORIO (EDA)
# ========================================

# 2x2 exploratory figure: count distribution, dispersion check,
# weekday/weekend averages and discount-vs-quantity scatter.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

qty = df_final['quantity_sold']

# Distribution of quantity_sold (should look Poisson-like).
# One unit-wide bin per integer value, starting at the observed minimum:
# the cleaning step allows quantity 0, and bins starting at 1 would
# silently drop those rows from the histogram.
qty_bins = range(int(qty.min()), int(qty.max()) + 2)
axes[0,0].hist(qty, bins=qty_bins, alpha=0.7, color='skyblue', edgecolor='black')
axes[0,0].set_title('Distribuci√≥n de Quantity Sold (Forma de Poisson)')
axes[0,0].set_xlabel('Cantidad Vendida')
axes[0,0].set_ylabel('Frecuencia')

# Mean vs variance (overdispersion check): a Poisson variable has
# variance equal to its mean, so a ratio well above 1 is a warning sign.
mean_qty = qty.mean()
var_qty = qty.var()
dispersion_ratio = var_qty / mean_qty  # computed once, reused below
is_overdispersed = dispersion_ratio > 1.5
axes[0,1].bar(['Media', 'Varianza'], [mean_qty, var_qty], color=['orange', 'red'])
axes[0,1].set_title(f'Media vs Varianza\n(Ratio: {dispersion_ratio:.2f})')
axes[0,1].set_ylabel('Valor')
if is_overdispersed:
    axes[0,1].text(0.5, max(mean_qty, var_qty)*0.8, 'Sobredispersi√≥n\nDetectada',
                   ha='center', fontsize=10, color='red')

# Weekend vs quantity_sold. reindex([0, 1]) guarantees exactly two values in
# weekday/weekend order even if one of the groups has no rows (it becomes NaN
# and is simply not drawn) so the two hard-coded labels always line up.
weekend_data = df_final.groupby('is_weekend')['quantity_sold'].mean().reindex([0, 1])
axes[1,0].bar(['Weekdays', 'Weekends'], weekend_data.values, color=['lightblue', 'lightcoral'])
axes[1,0].set_title('Promedio Ventas: Weekdays vs Weekends')
axes[1,0].set_ylabel('Promedio Quantity Sold')

# Discount vs quantity_sold
axes[1,1].scatter(df_final['discount_percent'], qty, alpha=0.5)
axes[1,1].set_title('Descuento vs Cantidad Vendida')
axes[1,1].set_xlabel('Discount Percent')
axes[1,1].set_ylabel('Quantity Sold')

plt.tight_layout()
plt.show()

print(f"üìä Estad√≠sticas de quantity_sold:")
print(f"Media: {mean_qty:.2f}")
print(f"Varianza: {var_qty:.2f}")
print(f"Ratio Varianza/Media: {dispersion_ratio:.2f} {'(Sobredispersi√≥n)' if is_overdispersed else '(Normal)'}")

In [None]:
# ========================================
# 3. PREPARAR VARIABLES PARA EL MODELO
# ========================================

def _standardize(values):
    """Return *values* centred on their mean and divided by their standard deviation."""
    return (values - values.mean()) / values.std()


# Selected variables, pulled out of the frame as arrays:
# target, two continuous predictors and the weekend indicator
y, X_discount, X_rating, X_weekend = (
    df_final[column].values
    for column in ('quantity_sold', 'discount_percent', 'rating', 'is_weekend')
)

# Standardize the continuous predictors
X_discount_scaled = _standardize(X_discount)
X_rating_scaled = _standardize(X_rating)

# Sanity report: scaled predictors should show mean ~0 and std ~1
print(f"Variables preparadas:")
print(f"- y (quantity_sold): {len(y)} registros")
print(f"- X_discount_scaled: media={X_discount_scaled.mean():.3f}, std={X_discount_scaled.std():.3f}")
print(f"- X_rating_scaled: media={X_rating_scaled.mean():.3f}, std={X_rating_scaled.std():.3f}")
print(f"- X_weekend: {np.unique(X_weekend, return_counts=True)}")

In [None]:
# ========================================
# 4. MODELO BAYESIANO POISSON
# ========================================

import pymc as pm
import arviz as az
from sklearn.model_selection import train_test_split

# Hold out the evaluation rows BEFORE fitting. Fitting on every row and
# splitting afterwards would score the model on data it has already seen,
# which makes any train-vs-test comparison meaningless.
# NOTE: test_size and random_state deliberately match the split performed in
# the validation section, so the indices computed there reproduce exactly this
# partition. Keep the two calls in sync.
train_idx, test_idx = train_test_split(np.arange(len(y)), test_size=0.2, random_state=42)
y_train = y[train_idx]

with pm.Model() as modelo_4_poisson:
    # Priors for the coefficients. The intercept is centred on the log of the
    # training mean because the linear predictor lives on the log scale.
    intercept = pm.Normal("intercept", mu=np.log(y_train.mean()), sigma=1)
    beta_discount = pm.Normal("beta_discount", mu=0, sigma=1)
    beta_rating = pm.Normal("beta_rating", mu=0, sigma=1)
    beta_weekend = pm.Normal("beta_weekend", mu=0, sigma=1)

    # Linear predictor on the log scale, exponentiated into the Poisson rate
    # (training rows only)
    mu = pm.math.exp(intercept +
                     beta_discount * X_discount_scaled[train_idx] +
                     beta_rating * X_rating_scaled[train_idx] +
                     beta_weekend * X_weekend[train_idx])

    # Poisson likelihood on the training counts
    y_obs = pm.Poisson("y_obs", mu=mu, observed=y_train)

    # Sampling: 2 chains, 500 tuning steps and 1000 kept draws each, seeded
    print("üöÄ Iniciando muestreo MCMC...")
    trace = pm.sample(1000, tune=500, chains=2, random_seed=42)
    print("‚úÖ Modelo entrenado exitosamente!")

In [None]:
# ========================================
# 5. DIAGNÓSTICOS DEL MODELO
# ========================================

# Parameter summary table
print("üìä Resumen de Par√°metros:")
summary = az.summary(trace)
print(summary)

# Convergence check: one R-hat value per parameter, flagged against 1.1
rhat_values = az.rhat(trace)
print(f"\nüîç Convergencia (R-hat):")
for param in rhat_values.data_vars:
    rhat_val = float(rhat_values[param].values)
    if rhat_val < 1.1:
        status = "‚úÖ"
    else:
        status = "‚ö†Ô∏è"
    print(f"{status} {param}: {rhat_val:.4f}")

# Trace plots for every sampled parameter
az.plot_trace(trace)
plt.tight_layout()
plt.show()

In [None]:
# ========================================
# 6. PREDICCIONES Y VALIDACIÓN
# ========================================

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Train/test split over row positions (seeded, so the partition is reproducible)
indices = np.arange(len(y))
train_idx, test_idx = train_test_split(indices, test_size=0.2, random_state=42)

# Point estimates: the exact posterior mean of each coefficient, averaged over
# every chain and draw. They are read from the posterior samples directly
# rather than from the az.summary() table, because that table is rounded for
# display and the lost digits would be amplified by the exp() in the
# predictions (and persisted in the saved parameters).
param_names = ["intercept", "beta_discount", "beta_rating", "beta_weekend"]
posterior_means = pd.Series(
    {name: float(trace.posterior[name].mean()) for name in param_names},
    name="mean",
)
intercept_mean = posterior_means["intercept"]
beta_discount_mean = posterior_means["beta_discount"]
beta_rating_mean = posterior_means["beta_rating"]
beta_weekend_mean = posterior_means["beta_weekend"]

# Prediction function
def predict_poisson(discount, rating, weekend):
    """Return the expected count for the given (already scaled) predictors.

    The linear predictor is built from the module-level posterior-mean
    coefficients (intercept_mean, beta_discount_mean, beta_rating_mean,
    beta_weekend_mean) and exponentiated. Arguments may be scalars or
    NumPy arrays; the result has the broadcast shape of the inputs.
    """
    eta = intercept_mean
    eta = eta + beta_discount_mean * discount
    eta = eta + beta_rating_mean * rating
    eta = eta + beta_weekend_mean * weekend
    return np.exp(eta)

# Predictions for the train and test partitions
y_pred_train, y_pred_test = (
    predict_poisson(X_discount_scaled[idx], X_rating_scaled[idx], X_weekend[idx])
    for idx in (train_idx, test_idx)
)

# Error metrics per partition
y_true_train = y[train_idx]
y_true_test = y[test_idx]
mae_train = mean_absolute_error(y_true_train, y_pred_train)
mae_test = mean_absolute_error(y_true_test, y_pred_test)
rmse_train = np.sqrt(mean_squared_error(y_true_train, y_pred_train))
rmse_test = np.sqrt(mean_squared_error(y_true_test, y_pred_test))

# Overfitting analysis: absolute train/test gap as a percentage of the train metric
overfitting_mae = abs(mae_train - mae_test) / mae_train * 100
overfitting_rmse = abs(rmse_train - rmse_test) / rmse_train * 100

print(f"üìà M√âTRICAS DEL MODELO:")
print(f"\nEntrenamiento:")
print(f"  MAE: {mae_train:.3f}")
print(f"  RMSE: {rmse_train:.3f}")
print(f"\nPrueba:")
print(f"  MAE: {mae_test:.3f}")
print(f"  RMSE: {rmse_test:.3f}")
print(f"\nüîç AN√ÅLISIS DE OVERFITTING:")
print(f"  Diferencia MAE: {overfitting_mae:.2f}%")
print(f"  Diferencia RMSE: {overfitting_rmse:.2f}%")

# Both gaps must stay under 5% for the "low" verdict
gaps_are_low = (overfitting_mae < 5) and (overfitting_rmse < 5)
print(f"  ‚úÖ Overfitting BAJO (<5%)" if gaps_are_low
      else f"  ‚ö†Ô∏è Posible overfitting detectado (>5%)")

In [None]:
# ========================================
# 7. VISUALIZACIONES Y ANÁLISIS
# ========================================

# 2x2 results figure; each panel gets a descriptive alias
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
ax_fit, ax_resid = axes[0]
ax_coef, ax_weekend = axes[1]

# Predicted vs actual on the test rows, with the identity line as reference
y_test_actual = y[test_idx]
identity_line = [y.min(), y.max()]
ax_fit.scatter(y_test_actual, y_pred_test, alpha=0.6, color='blue')
ax_fit.plot(identity_line, identity_line, 'r--', lw=2)
ax_fit.set_xlabel('Cantidad Real')
ax_fit.set_ylabel('Cantidad Predicha')
ax_fit.set_title(f'Predicciones vs Reales (Test)\nMAE: {mae_test:.3f}')

# Residuals (actual minus predicted) against the predictions
residuals = y_test_actual - y_pred_test
ax_resid.scatter(y_pred_test, residuals, alpha=0.6)
ax_resid.axhline(y=0, color='r', linestyle='--')
ax_resid.set_xlabel('Predicciones')
ax_resid.set_ylabel('Residuos')
ax_resid.set_title('Gr√°fico de Residuos (Test)')

# Posterior-mean coefficients as horizontal bars
coeffs_df = pd.DataFrame({
    'Par√°metro': ['Descuento', 'Rating', 'Weekend'],
    'Coeficiente': [beta_discount_mean, beta_rating_mean, beta_weekend_mean],
})
ax_coef.barh(coeffs_df['Par√°metro'], coeffs_df['Coeficiente'],
             color=['orange', 'green', 'purple'])
ax_coef.set_xlabel('Coeficiente (Escala Log)')
ax_coef.set_title('Importancia de Variables')
ax_coef.grid(axis='x', alpha=0.3)

# Observed weekend effect: mean (and row count) of quantity_sold per group
weekend_effect = df_final.groupby('is_weekend')['quantity_sold'].agg(['mean', 'count'])
ax_weekend.bar(['Weekdays', 'Weekends'], weekend_effect['mean'],
               color=['lightblue', 'lightcoral'])
ax_weekend.set_ylabel('Promedio Quantity Sold')
ax_weekend.set_title('Efecto Weekend en Ventas')

plt.tight_layout()
plt.show()

In [None]:
# ========================================
# 8. INTERPRETACIÓN DE RESULTADOS
# ========================================

print("üìã INTERPRETACI√ìN DEL MODELO:")
print("=" * 50)

# Coefficient interpretation: in a log-link Poisson model, exp(beta) is the
# multiplicative change in the expected count per unit of the predictor
# (one standard deviation for the scaled predictors, 0 -> 1 for weekend).
discount_effect = np.exp(beta_discount_mean)
rating_effect = np.exp(beta_rating_mean)
weekend_effect = np.exp(beta_weekend_mean)

print(f"\nüéØ Efectos en la Cantidad Vendida:")
effect_report = [
    ("\n1. DESCUENTO:",
     "   - Por cada desviaci√≥n est√°ndar de aumento en descuento:",
     discount_effect),
    ("\n2. RATING:",
     "   - Por cada desviaci√≥n est√°ndar de aumento en rating:",
     rating_effect),
    ("\n3. WEEKEND:",
     "   - En weekends vs weekdays:",
     weekend_effect),
]
for heading, context_line, effect in effect_report:
    print(heading)
    print(context_line)
    print(f"   - Las ventas se multiplican por {effect:.3f}")
    print(f"   - Cambio: {(effect-1)*100:+.1f}%")

# R-hat is computed once and reused; evaluating az.rhat(trace) inside the
# generator would rerun the diagnostic for every variable.
rhat_all = az.rhat(trace)
converged = all(rhat_all[var].values < 1.1 for var in rhat_all.data_vars)

# Final summary
overfitting_label = '(BAJO ‚úÖ)' if overfitting_mae < 5 else '(ALTO ‚ö†Ô∏è)'
convergence_label = '‚úÖ BUENA' if converged else '‚ö†Ô∏è REVISAR'
print(f"\nüìä RESUMEN FINAL:")
print(f"- Modelo: Regresi√≥n Poisson Bayesiana")
print(f"- MAE Test: {mae_test:.3f} unidades")
print(f"- Overfitting: {overfitting_mae:.1f}% {overfitting_label}")
print(f"- Convergencia: {convergence_label}")

# "Most important" variable = coefficient with the largest absolute value
variable_mas_importante = coeffs_df.loc[coeffs_df['Coeficiente'].abs().idxmax(), 'Par√°metro']
print(f"- Variable m√°s importante: {variable_mas_importante}")

In [None]:
# ========================================
# 9. TESTS INTEGRADOS
# ========================================

def run_tests():
    """Run the notebook's integrated sanity checks and print a report.

    Six checks are evaluated against notebook-level globals (``df_final``,
    ``trace``, ``overfitting_mae``, ``mae_test``, ``y_pred_test``). Each check
    prints one pass/fail line; a check that raises any ordinary exception
    (failed assertion, missing column, undefined global, ...) counts as
    failed, while KeyboardInterrupt/SystemExit are allowed to propagate.

    Returns:
        bool: True only when all six checks passed.
    """
    print("üß™ EJECUTANDO TESTS...")
    print("=" * 40)

    tests_passed = 0
    total_tests = 6

    # Test 1: data loaded correctly
    try:
        assert len(df_final) > 0
        assert 'quantity_sold' in df_final.columns
    except Exception:
        print("‚ùå Test 1: Error en carga de datos")
    else:
        print("‚úÖ Test 1: Datos cargados correctamente")
        tests_passed += 1

    # Test 2: temporal features exist and is_weekend is binary
    try:
        assert 'day_of_week' in df_final.columns
        assert 'is_weekend' in df_final.columns
        assert 'month' in df_final.columns
        assert df_final['is_weekend'].isin([0, 1]).all()
    except Exception:
        print("‚ùå Test 2: Error en variables temporales")
    else:
        print("‚úÖ Test 2: Variables temporales creadas correctamente")
        tests_passed += 1

    # Test 3: quantity_sold is an integer column with no negative values.
    # is_integer_dtype accepts every integer width, so the check does not
    # depend on the platform's default int size.
    try:
        assert pd.api.types.is_integer_dtype(df_final['quantity_sold'])
        assert (df_final['quantity_sold'] >= 0).all()
    except Exception:
        print("‚ùå Test 3: Error en quantity_sold")
    else:
        print("‚úÖ Test 3: quantity_sold es entero >= 0")
        tests_passed += 1

    # Test 4: the sampler converged (R-hat < 1.1 for every variable).
    # R-hat is computed once instead of once per variable.
    try:
        rhat_values = az.rhat(trace)
        assert all(rhat_values[var].values < 1.1 for var in rhat_values.data_vars)
    except Exception:
        print("‚ùå Test 4: Problemas de convergencia")
    else:
        print("‚úÖ Test 4: Modelo convergi√≥ (R-hat < 1.1)")
        tests_passed += 1

    # Test 5: train/test MAE gap below 5%
    try:
        assert overfitting_mae < 5
    except Exception:
        print(f"‚ùå Test 5: Overfitting alto ({overfitting_mae:.1f}% >= 5%)")
    else:
        print(f"‚úÖ Test 5: Overfitting bajo ({overfitting_mae:.1f}% < 5%)")
        tests_passed += 1

    # Test 6: reasonable predictions (test MAE under 2 units, all positive)
    try:
        assert mae_test < 2.0
        assert np.all(y_pred_test > 0)
    except Exception:
        print(f"‚ùå Test 6: Predicciones problem√°ticas (MAE: {mae_test:.3f})")
    else:
        print(f"‚úÖ Test 6: Predicciones razonables (MAE: {mae_test:.3f})")
        tests_passed += 1

    print("\n" + "=" * 40)
    print(f"üìä RESULTADO: {tests_passed}/{total_tests} tests pasaron")

    all_passed = tests_passed == total_tests
    if all_passed:
        print("üéâ ¬°TODOS LOS TESTS PASARON!")
    else:
        print("‚ö†Ô∏è Algunos tests fallaron, revisar arriba")

    return all_passed

# Run the integrated checks; tests_ok is True only if every check passed
tests_ok = run_tests()

In [None]:
# ========================================
# 10. GUARDAR MODELO Y RESULTADOS
# ========================================

import joblib
import pickle

# Model parameters: posterior-mean coefficients plus the mean/std constants
# that were used to standardize the continuous predictors
model_params = dict(
    intercept=intercept_mean,
    beta_discount=beta_discount_mean,
    beta_rating=beta_rating_mean,
    beta_weekend=beta_weekend_mean,
    X_discount_mean=X_discount.mean(),
    X_discount_std=X_discount.std(),
    X_rating_mean=X_rating.mean(),
    X_rating_std=X_rating.std(),
)

# Results bundle, assembled section by section (key order is preserved)
results = {'model_params': model_params}
results['metrics'] = dict(
    mae_train=mae_train,
    mae_test=mae_test,
    rmse_train=rmse_train,
    rmse_test=rmse_test,
    overfitting_mae=overfitting_mae,
    overfitting_rmse=overfitting_rmse,
)
results['effects'] = dict(
    discount_effect=discount_effect,
    rating_effect=rating_effect,
    weekend_effect=weekend_effect,
)
results['tests_passed'] = tests_ok

# Write both artifacts with pickle: the results bundle and the sampling trace
artifacts = (
    ('modelo_4_poisson_results.pkl', results),
    ('modelo_4_poisson_trace.pkl', trace),
)
for filename, payload in artifacts:
    with open(filename, 'wb') as f:
        pickle.dump(payload, f)

print("üíæ Modelo guardado exitosamente:")
for filename, _ in artifacts:
    print(f"- {filename}")
print("\nüéØ Modelo 4 completado exitosamente!")