# EDA y ETL MongoDB - Replicación Primario-Secundario
## Dataset: Brazilian E-commerce (Kaggle)

Este notebook realiza:
1. **Descarga automática** del dataset desde Kaggle usando kagglehub
2. **Análisis Exploratorio de Datos (EDA)** de los archivos CSV
3. **Extracción, Transformación y Carga (ETL)**
4. **Carga a MongoDB** con verificación de replicación

In [13]:
# Importar librerías necesarias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from pymongo import MongoClient
import warnings
import os
import kagglehub
from pathlib import Path

warnings.filterwarnings('ignore')

# Configurar estilo de gráficos
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("✅ Librerías importadas correctamente")

✅ Librerías importadas correctamente


## 1. Descarga del Dataset desde Kaggle

In [14]:
# Descargar dataset usando kagglehub
print("📥 Descargando dataset de Brazilian E-commerce...")

try:
    # Descargar el dataset
    path = kagglehub.dataset_download("olistbr/brazilian-ecommerce")
    print(f"✅ Dataset descargado en: {path}")
    
    # Listar archivos descargados
    files = list(Path(path).glob("*.csv"))
    print(f"\n📁 Archivos CSV encontrados ({len(files)}):")
    for file in files:
        print(f"  - {file.name}")
        
except Exception as e:
    print(f"❌ Error al descargar: {e}")
    print("💡 Asegúrate de tener kagglehub instalado: pip install kagglehub")

📥 Descargando dataset de Brazilian E-commerce...
✅ Dataset descargado en: C:\Users\axel_\.cache\kagglehub\datasets\olistbr\brazilian-ecommerce\versions\2

📁 Archivos CSV encontrados (9):
  - olist_customers_dataset.csv
  - olist_geolocation_dataset.csv
  - olist_orders_dataset.csv
  - olist_order_items_dataset.csv
  - olist_order_payments_dataset.csv
  - olist_order_reviews_dataset.csv
  - olist_products_dataset.csv
  - olist_sellers_dataset.csv
  - product_category_name_translation.csv


## 2. Carga y Exploración de los Datos

In [15]:
# Cargar todos los archivos CSV
print("📊 Cargando archivos CSV...")

dataframes = {}

for file in files:
    df_name = file.stem  # Nombre del archivo sin extensión
    print(f"\n📖 Cargando {file.name}...")
    
    try:
        df = pd.read_csv(file)
        dataframes[df_name] = df
        print(f"  ✅ Filas: {len(df)}, Columnas: {len(df.columns)}")
        print(f"  📋 Columnas: {list(df.columns)}")
        
    except Exception as e:
        print(f"  ❌ Error al cargar {file.name}: {e}")

print(f"\n🎉 Total de datasets cargados: {len(dataframes)}")

📊 Cargando archivos CSV...

📖 Cargando olist_customers_dataset.csv...
  ✅ Filas: 99441, Columnas: 5
  📋 Columnas: ['customer_id', 'customer_unique_id', 'customer_zip_code_prefix', 'customer_city', 'customer_state']

📖 Cargando olist_geolocation_dataset.csv...
  ✅ Filas: 1000163, Columnas: 5
  📋 Columnas: ['geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng', 'geolocation_city', 'geolocation_state']

📖 Cargando olist_orders_dataset.csv...
  ✅ Filas: 99441, Columnas: 8
  📋 Columnas: ['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date', 'order_delivered_customer_date', 'order_estimated_delivery_date']

📖 Cargando olist_order_items_dataset.csv...
  ✅ Filas: 112650, Columnas: 7
  📋 Columnas: ['order_id', 'order_item_id', 'product_id', 'seller_id', 'shipping_limit_date', 'price', 'freight_value']

📖 Cargando olist_order_payments_dataset.csv...
  ✅ Filas: 103886, Columnas: 5
  📋 Columnas: ['order_id', 'payme

## 3. Análisis Exploratorio de Datos (EDA)

In [16]:
# Mostrar información básica de cada dataset
print("🔍 ANÁLISIS EXPLORATORIO DE DATOS")
print("=" * 50)

for name, df in dataframes.items():
    print(f"\n📊 DATASET: {name.upper()}")
    print(f"Dimensiones: {df.shape}")
    print(f"\nPrimeras 3 filas:")
    display(df.head(3))
    
    print(f"\nInformación del dataset:")
    print(df.info())
    
    print(f"\nValores nulos:")
    null_counts = df.isnull().sum()
    if null_counts.sum() > 0:
        print(null_counts[null_counts > 0])
    else:
        print("✅ No hay valores nulos")
    
    print(f"\nEstadísticas descriptivas:")
    display(df.describe())
    
    print("-" * 50)

🔍 ANÁLISIS EXPLORATORIO DE DATOS

📊 DATASET: OLIST_CUSTOMERS_DATASET
Dimensiones: (99441, 5)

Primeras 3 filas:


Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP



Información del dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  int64 
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB
None

Valores nulos:
✅ No hay valores nulos

Estadísticas descriptivas:


Unnamed: 0,customer_zip_code_prefix
count,99441.0
mean,35137.474583
std,29797.938996
min,1003.0
25%,11347.0
50%,24416.0
75%,58900.0
max,99990.0


--------------------------------------------------

📊 DATASET: OLIST_GEOLOCATION_DATASET
Dimensiones: (1000163, 5)

Primeras 3 filas:


Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.64482,sao paulo,SP
2,1046,-23.546129,-46.642951,sao paulo,SP



Información del dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000163 entries, 0 to 1000162
Data columns (total 5 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   geolocation_zip_code_prefix  1000163 non-null  int64  
 1   geolocation_lat              1000163 non-null  float64
 2   geolocation_lng              1000163 non-null  float64
 3   geolocation_city             1000163 non-null  object 
 4   geolocation_state            1000163 non-null  object 
dtypes: float64(2), int64(1), object(2)
memory usage: 38.2+ MB
None

Valores nulos:
✅ No hay valores nulos

Estadísticas descriptivas:


Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng
count,1000163.0,1000163.0,1000163.0
mean,36574.17,-21.17615,-46.39054
std,30549.34,5.715866,4.269748
min,1001.0,-36.60537,-101.4668
25%,11075.0,-23.60355,-48.57317
50%,26530.0,-22.91938,-46.63788
75%,63504.0,-19.97962,-43.76771
max,99990.0,45.06593,121.1054


--------------------------------------------------

📊 DATASET: OLIST_ORDERS_DATASET
Dimensiones: (99441, 8)

Primeras 3 filas:


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00



Información del dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   order_id                       99441 non-null  object
 1   customer_id                    99441 non-null  object
 2   order_status                   99441 non-null  object
 3   order_purchase_timestamp       99441 non-null  object
 4   order_approved_at              99281 non-null  object
 5   order_delivered_carrier_date   97658 non-null  object
 6   order_delivered_customer_date  96476 non-null  object
 7   order_estimated_delivery_date  99441 non-null  object
dtypes: object(8)
memory usage: 6.1+ MB
None

Valores nulos:
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965
dtype: int64

Estadísticas descriptivas:


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
count,99441,99441,99441,99441,99281,97658,96476,99441
unique,99441,99441,8,98875,90733,81018,95664,459
top,66dea50a8b16d9b4dee7af250b4be1a5,edb027a75a1449115f6b43211ae02a24,delivered,2018-08-02 12:05:26,2018-02-27 04:31:10,2018-05-09 15:48:00,2018-05-08 19:36:48,2017-12-20 00:00:00
freq,1,1,96478,3,9,47,3,522


--------------------------------------------------

📊 DATASET: OLIST_ORDER_ITEMS_DATASET
Dimensiones: (112650, 7)

Primeras 3 filas:


Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.0,17.87



Información del dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   order_id             112650 non-null  object 
 1   order_item_id        112650 non-null  int64  
 2   product_id           112650 non-null  object 
 3   seller_id            112650 non-null  object 
 4   shipping_limit_date  112650 non-null  object 
 5   price                112650 non-null  float64
 6   freight_value        112650 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 6.0+ MB
None

Valores nulos:
✅ No hay valores nulos

Estadísticas descriptivas:


Unnamed: 0,order_item_id,price,freight_value
count,112650.0,112650.0,112650.0
mean,1.197834,120.653739,19.99032
std,0.705124,183.633928,15.806405
min,1.0,0.85,0.0
25%,1.0,39.9,13.08
50%,1.0,74.99,16.26
75%,1.0,134.9,21.15
max,21.0,6735.0,409.68


--------------------------------------------------

📊 DATASET: OLIST_ORDER_PAYMENTS_DATASET
Dimensiones: (103886, 5)

Primeras 3 filas:


Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39
2,25e8ea4e93396b6fa0d3dd708e76c1bd,1,credit_card,1,65.71



Información del dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103886 entries, 0 to 103885
Data columns (total 5 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   order_id              103886 non-null  object 
 1   payment_sequential    103886 non-null  int64  
 2   payment_type          103886 non-null  object 
 3   payment_installments  103886 non-null  int64  
 4   payment_value         103886 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 4.0+ MB
None

Valores nulos:
✅ No hay valores nulos

Estadísticas descriptivas:


Unnamed: 0,payment_sequential,payment_installments,payment_value
count,103886.0,103886.0,103886.0
mean,1.092679,2.853349,154.10038
std,0.706584,2.687051,217.494064
min,1.0,0.0,0.0
25%,1.0,1.0,56.79
50%,1.0,1.0,100.0
75%,1.0,4.0,171.8375
max,29.0,24.0,13664.08


--------------------------------------------------

📊 DATASET: OLIST_ORDER_REVIEWS_DATASET
Dimensiones: (99224, 7)

Primeras 3 filas:


Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17 00:00:00,2018-02-18 14:36:24



Información del dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99224 entries, 0 to 99223
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   review_id                99224 non-null  object
 1   order_id                 99224 non-null  object
 2   review_score             99224 non-null  int64 
 3   review_comment_title     11568 non-null  object
 4   review_comment_message   40977 non-null  object
 5   review_creation_date     99224 non-null  object
 6   review_answer_timestamp  99224 non-null  object
dtypes: int64(1), object(6)
memory usage: 5.3+ MB
None

Valores nulos:
review_comment_title      87656
review_comment_message    58247
dtype: int64

Estadísticas descriptivas:


Unnamed: 0,review_score
count,99224.0
mean,4.086421
std,1.347579
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


--------------------------------------------------

📊 DATASET: OLIST_PRODUCTS_DATASET
Dimensiones: (32951, 9)

Primeras 3 filas:


Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46.0,250.0,1.0,154.0,18.0,9.0,15.0



Información del dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32951 entries, 0 to 32950
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   product_id                  32951 non-null  object 
 1   product_category_name       32341 non-null  object 
 2   product_name_lenght         32341 non-null  float64
 3   product_description_lenght  32341 non-null  float64
 4   product_photos_qty          32341 non-null  float64
 5   product_weight_g            32949 non-null  float64
 6   product_length_cm           32949 non-null  float64
 7   product_height_cm           32949 non-null  float64
 8   product_width_cm            32949 non-null  float64
dtypes: float64(7), object(2)
memory usage: 2.3+ MB
None

Valores nulos:
product_category_name         610
product_name_lenght           610
product_description_lenght    610
product_photos_qty            610
product_weight_g              

Unnamed: 0,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
count,32341.0,32341.0,32341.0,32949.0,32949.0,32949.0,32949.0
mean,48.476949,771.495285,2.188986,2276.472488,30.815078,16.937661,23.196728
std,10.245741,635.115225,1.736766,4282.038731,16.914458,13.637554,12.079047
min,5.0,4.0,1.0,0.0,7.0,2.0,6.0
25%,42.0,339.0,1.0,300.0,18.0,8.0,15.0
50%,51.0,595.0,1.0,700.0,25.0,13.0,20.0
75%,57.0,972.0,3.0,1900.0,38.0,21.0,30.0
max,76.0,3992.0,20.0,40425.0,105.0,105.0,118.0


--------------------------------------------------

📊 DATASET: OLIST_SELLERS_DATASET
Dimensiones: (3095, 4)

Primeras 3 filas:


Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP
1,d1b65fc7debc3361ea86b5f14c68d2e2,13844,mogi guacu,SP
2,ce3ad9de960102d0677a81f5d0bb7b2d,20031,rio de janeiro,RJ



Información del dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3095 entries, 0 to 3094
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   seller_id               3095 non-null   object
 1   seller_zip_code_prefix  3095 non-null   int64 
 2   seller_city             3095 non-null   object
 3   seller_state            3095 non-null   object
dtypes: int64(1), object(3)
memory usage: 96.8+ KB
None

Valores nulos:
✅ No hay valores nulos

Estadísticas descriptivas:


Unnamed: 0,seller_zip_code_prefix
count,3095.0
mean,32291.059451
std,32713.45383
min,1001.0
25%,7093.5
50%,14940.0
75%,64552.5
max,99730.0


--------------------------------------------------

📊 DATASET: PRODUCT_CATEGORY_NAME_TRANSLATION
Dimensiones: (71, 2)

Primeras 3 filas:


Unnamed: 0,product_category_name,product_category_name_english
0,beleza_saude,health_beauty
1,informatica_acessorios,computers_accessories
2,automotivo,auto



Información del dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71 entries, 0 to 70
Data columns (total 2 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   product_category_name          71 non-null     object
 1   product_category_name_english  71 non-null     object
dtypes: object(2)
memory usage: 1.2+ KB
None

Valores nulos:
✅ No hay valores nulos

Estadísticas descriptivas:


Unnamed: 0,product_category_name,product_category_name_english
count,71,71
unique,71,71
top,beleza_saude,health_beauty
freq,1,1


--------------------------------------------------


## 4. Limpieza y Transformación de Datos (ETL)

In [8]:
# ETL: Combinar datasets para crear un dataset unificado de ventas
print("🔄 PROCESO ETL - COMBINANDO DATASETS")
print("=" * 50)

# Obtener los datasets principales
orders_df = dataframes.get('olist_orders_dataset', pd.DataFrame())
items_df = dataframes.get('olist_order_items_dataset', pd.DataFrame())
products_df = dataframes.get('olist_products_dataset', pd.DataFrame())
customers_df = dataframes.get('olist_customers_dataset', pd.DataFrame())
sellers_df = dataframes.get('olist_sellers_dataset', pd.DataFrame())

print(f"📦 Orders: {orders_df.shape}")
print(f"📦 Items: {items_df.shape}")
print(f"📦 Products: {products_df.shape}")
print(f"📦 Customers: {customers_df.shape}")
print(f"📦 Sellers: {sellers_df.shape}")

🔄 PROCESO ETL - COMBINANDO DATASETS
📦 Orders: (99441, 8)
📦 Items: (112650, 7)
📦 Products: (32951, 9)
📦 Customers: (99441, 5)
📦 Sellers: (3095, 4)


In [9]:
# Limpiar y transformar fechas
print("\n🕒 Transformando fechas...")

if not orders_df.empty:
    # Convertir columnas de fecha
    date_columns = ['order_purchase_date', 'order_approved_at', 'order_delivered_carrier_date', 
                   'order_delivered_customer_date', 'order_estimated_delivery_date']
    
    for col in date_columns:
        if col in orders_df.columns:
            orders_df[col] = pd.to_datetime(orders_df[col], errors='coerce')
    
    print("✅ Fechas transformadas")
    
    # Mostrar rango de fechas
    if 'order_purchase_date' in orders_df.columns:
        print(f"📅 Rango de fechas de compra: {orders_df['order_purchase_date'].min()} a {orders_df['order_purchase_date'].max()}")


🕒 Transformando fechas...
✅ Fechas transformadas


In [10]:
# Combinar datasets
print("\n🔗 Combinando datasets...")

try:
    # Merge 1: Orders + Items
    if not orders_df.empty and not items_df.empty:
        ventas_df = orders_df.merge(items_df, on='order_id', how='inner')
        print(f"✅ Orders + Items: {ventas_df.shape}")
    
    # Merge 2: + Products
    if not products_df.empty:
        ventas_df = ventas_df.merge(products_df, on='product_id', how='left')
        print(f"✅ + Products: {ventas_df.shape}")
    
    # Merge 3: + Customers
    if not customers_df.empty:
        ventas_df = ventas_df.merge(customers_df, on='customer_id', how='left')
        print(f"✅ + Customers: {ventas_df.shape}")
    
    # Merge 4: + Sellers
    if not sellers_df.empty:
        ventas_df = ventas_df.merge(sellers_df, on='seller_id', how='left')
        print(f"✅ + Sellers: {ventas_df.shape}")
    
    print(f"\n🎉 Dataset combinado final: {ventas_df.shape}")
    
except Exception as e:
    print(f"❌ Error al combinar: {e}")
    # Crear dataset de ejemplo si falla la combinación
    ventas_df = pd.DataFrame()


🔗 Combinando datasets...
✅ Orders + Items: (112650, 14)
✅ + Products: (112650, 22)
✅ + Customers: (112650, 26)
✅ + Sellers: (112650, 29)

🎉 Dataset combinado final: (112650, 29)


In [17]:
# Limpiar y preparar el dataset final
print("\n🧹 Limpiando dataset final...")

if not ventas_df.empty:
    # Seleccionar columnas relevantes y renombrar
    columnas_finales = {
        'order_id': 'pedido_id',
        'order_purchase_date': 'fecha_compra',
        'order_status': 'estado_pedido',
        'product_id': 'producto_id',
        'product_name_lenght': 'longitud_nombre_producto',
        'product_description_lenght': 'longitud_descripcion_producto',
        'product_photos_qty': 'cantidad_fotos_producto',
        'product_weight_g': 'peso_producto_g',
        'product_length_cm': 'longitud_producto_cm',
        'product_height_cm': 'altura_producto_cm',
        'product_width_cm': 'ancho_producto_cm',
        'price': 'precio',
        'freight_value': 'valor_flete',
        'customer_id': 'cliente_id',
        'customer_city': 'ciudad_cliente',
        'customer_state': 'estado_cliente',
        'seller_id': 'vendedor_id',
        'seller_city': 'ciudad_vendedor',
        'seller_state': 'estado_vendedor'
    }
    
    # Filtrar columnas que existen
    columnas_existentes = {k: v for k, v in columnas_finales.items() if k in ventas_df.columns}
    ventas_limpio = ventas_df[list(columnas_existentes.keys())].copy()
    ventas_limpio.rename(columns=columnas_existentes, inplace=True)
    
    # Agregar campos calculados
    if 'precio' in ventas_limpio.columns and 'valor_flete' in ventas_limpio.columns:
        ventas_limpio['precio_total'] = ventas_limpio['precio'] + ventas_limpio['valor_flete']
    
    # Agregar campo de stock simulado
    ventas_limpio['cantidad_stock'] = np.random.randint(0, 100, len(ventas_limpio))
    
    # Limpiar valores nulos
    ventas_limpio.dropna(subset=['fecha_compra', 'precio'], inplace=True)
    
    print(f"✅ Dataset limpio: {ventas_limpio.shape}")
    print(f"📋 Columnas finales: {list(ventas_limpio.columns)}")
    
else:
    print("⚠️ No se pudo crear el dataset combinado, usando datos de ejemplo")
    # Crear dataset de ejemplo
    ventas_limpio = pd.DataFrame({
        'pedido_id': range(1, 1001),
        'fecha_compra': pd.date_range('2023-01-01', periods=1000, freq='D'),
        'producto_id': np.random.randint(1, 101, 1000),
        'precio': np.random.uniform(10, 500, 1000),
        'cliente_id': np.random.randint(1, 201, 1000),
        'ciudad_cliente': np.random.choice(['São Paulo', 'Rio de Janeiro', 'Brasília', 'Salvador', 'Fortaleza'], 1000),
        'cantidad_stock': np.random.randint(0, 100, 1000)
    })


🧹 Limpiando dataset final...


KeyError: ['fecha_compra']

## 5. Visualizaciones del EDA

In [None]:
# Visualizaciones del EDA
print("📊 CREANDO VISUALIZACIONES")
print("=" * 50)

# Configurar subplots
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Análisis Exploratorio de Datos - Brazilian E-commerce', fontsize=16, fontweight='bold')

# 1. Distribución de precios
if 'precio' in ventas_limpio.columns:
    axes[0, 0].hist(ventas_limpio['precio'], bins=30, alpha=0.7, color='skyblue', edgecolor='black')
    axes[0, 0].set_title('Distribución de Precios')
    axes[0, 0].set_xlabel('Precio (R$)')
    axes[0, 0].set_ylabel('Frecuencia')
    axes[0, 0].grid(True, alpha=0.3)

# 2. Ventas por mes
if 'fecha_compra' in ventas_limpio.columns:
    ventas_por_mes = ventas_limpio.groupby(ventas_limpio['fecha_compra'].dt.to_period('M')).size()
    axes[0, 1].plot(range(len(ventas_por_mes)), ventas_por_mes.values, marker='o', linewidth=2, markersize=6)
    axes[0, 1].set_title('Ventas por Mes')
    axes[0, 1].set_xlabel('Mes')
    axes[0, 1].set_ylabel('Número de Ventas')
    axes[0, 1].grid(True, alpha=0.3)

# 3. Top ciudades por ventas
if 'ciudad_cliente' in ventas_limpio.columns:
    top_ciudades = ventas_limpio['ciudad_cliente'].value_counts().head(10)
    axes[1, 0].barh(range(len(top_ciudades)), top_ciudades.values, color='lightcoral')
    axes[1, 0].set_yticks(range(len(top_ciudades)))
    axes[1, 0].set_yticklabels(top_ciudades.index)
    axes[1, 0].set_title('Top 10 Ciudades por Ventas')
    axes[1, 0].set_xlabel('Número de Ventas')
    axes[1, 0].grid(True, alpha=0.3)

# 4. Distribución de stock
if 'cantidad_stock' in ventas_limpio.columns:
    axes[1, 1].hist(ventas_limpio['cantidad_stock'], bins=20, alpha=0.7, color='lightgreen', edgecolor='black')
    axes[1, 1].set_title('Distribución de Stock')
    axes[1, 1].set_xlabel('Cantidad en Stock')
    axes[1, 1].set_ylabel('Frecuencia')
    axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("✅ Visualizaciones creadas")

## 6. Conexión y Carga a MongoDB

In [None]:
# Configuración de MongoDB
MONGO_URI = "mongodb://admin:password123@localhost:27017/"
DB_NAME = "ventas_tienda_db"
COLLECTION_NAME = "ventas"

print("🔌 Conectando a MongoDB...")

try:
    client = MongoClient(MONGO_URI)
    client.admin.command('ping')
    print("✅ Conexión exitosa a MongoDB")
    
    db = client[DB_NAME]
    collection = db[COLLECTION_NAME]
    
    # Limpiar colección existente
    collection.delete_many({})
    print("🧹 Colección limpiada")
    
except Exception as e:
    print(f"❌ Error de conexión: {e}")
    print("💡 Asegúrate de que MongoDB esté ejecutándose con: docker-compose -f docker/docker-compose.yml up -d")

In [None]:
# Cargar datos a MongoDB
print("\n📤 Cargando datos a MongoDB...")

try:
    # Convertir DataFrame a documentos
    records = ventas_limpio.to_dict('records')
    
    # Procesar fechas
    for record in records:
        if isinstance(record.get('fecha_compra'), str):
            record['fecha_compra'] = pd.to_datetime(record['fecha_compra'])
    
    # Insertar en lotes para mejor rendimiento
    batch_size = 1000
    total_inserted = 0
    
    for i in range(0, len(records), batch_size):
        batch = records[i:i + batch_size]
        result = collection.insert_many(batch)
        total_inserted += len(result.inserted_ids)
        print(f"  📦 Lote {i//batch_size + 1}: {len(result.inserted_ids)} registros")
    
    print(f"\n🎉 Total de registros insertados: {total_inserted}")
    
    # Verificar inserción
    count = collection.count_documents({})
    print(f"📊 Documentos en la colección: {count}")
    
except Exception as e:
    print(f"❌ Error al cargar datos: {e}")

## 7. Verificación de Replicación

In [None]:
# Verificar replicación entre nodos
print("🔄 VERIFICANDO REPLICACIÓN")
print("=" * 50)

try:
    # Insertar documento de prueba en el primario
    test_doc = {
        'pedido_id': 'TEST-001',
        'fecha_compra': datetime.now(),
        'producto_id': 'PROD-TEST',
        'precio': 999.99,
        'cliente_id': 99999,
        'ciudad_cliente': 'Ciudad de Prueba',
        'cantidad_stock': 50,
        'test_replicacion': True
    }
    
    result = collection.insert_one(test_doc)
    print(f"✅ Documento de prueba insertado: {result.inserted_id}")
    
    # Esperar a que se replique
    import time
    print("⏳ Esperando replicación...")
    time.sleep(3)
    
    # Verificar en nodos secundarios
    secondary_ports = [27018, 27019]
    
    for port in secondary_ports:
        try:
            secondary_uri = f"mongodb://admin:password123@localhost:{port}/"
            secondary_client = MongoClient(secondary_uri)
            secondary_db = secondary_client[DB_NAME]
            secondary_collection = secondary_db[COLLECTION_NAME]
            
            # Buscar documento de prueba
            doc = secondary_collection.find_one({'pedido_id': 'TEST-001'})
            
            if doc:
                print(f"✅ Replicación exitosa en puerto {port}: {doc['producto_id']} - ${doc['precio']}")
            else:
                print(f"❌ Replicación falló en puerto {port}")
                
        except Exception as e:
            print(f"⚠️ No se pudo verificar puerto {port}: {e}")
    
    # Limpiar documento de prueba
    collection.delete_one({'pedido_id': 'TEST-001'})
    print("🧹 Documento de prueba eliminado")
    
except Exception as e:
    print(f"❌ Error en verificación: {e}")
    print("💡 Verifica que el replica set esté configurado correctamente")

## 8. Resumen del EDA y ETL

In [None]:
# Resumen final
print("📋 RESUMEN DEL PROCESO EDA Y ETL")
print("=" * 50)

print(f"\n📊 DATASET ORIGINAL:")
for name, df in dataframes.items():
    print(f"  - {name}: {df.shape}")

print(f"\n🔄 DATASET PROCESADO:")
print(f"  - Dimensiones: {ventas_limpio.shape}")
print(f"  - Columnas: {list(ventas_limpio.columns)}")

print(f"\n📈 ESTADÍSTICAS CLAVE:")
if 'precio' in ventas_limpio.columns:
    print(f"  - Precio promedio: R$ {ventas_limpio['precio'].mean():.2f}")
    print(f"  - Precio máximo: R$ {ventas_limpio['precio'].max():.2f}")
    print(f"  - Precio mínimo: R$ {ventas_limpio['precio'].min():.2f}")

if 'fecha_compra' in ventas_limpio.columns:
    print(f"  - Período: {ventas_limpio['fecha_compra'].min().date()} a {ventas_limpio['fecha_compra'].max().date()}")

if 'ciudad_cliente' in ventas_limpio.columns:
    print(f"  - Ciudades únicas: {ventas_limpio['ciudad_cliente'].nunique()}")

print(f"\n🗄️ MONGODB:")
try:
    count = collection.count_documents({})
    print(f"  - Documentos cargados: {count}")
    print(f"  - Base de datos: {DB_NAME}")
    print(f"  - Colección: {COLLECTION_NAME}")
except:
    print("  - No disponible")

print(f"\n🎉 ¡PROCESO COMPLETADO EXITOSAMENTE!")
print(f"💡 Ahora puedes ejecutar el notebook de Consultas CRUD para probar las operaciones")