In [1]:
import pandas as pd
import numpy as np
from pymongo import MongoClient
import matplotlib.pyplot as plt

In [2]:
# 1️⃣ Cargar dataset
df = pd.read_csv("src/pokemonDB_dataset.csv")
df.head()

Unnamed: 0,Pokemon,Type,Species,Height,Weight,Abilities,EV Yield,Catch Rate,Base Friendship,Base Exp,...,Defense Max,Special Attack Base,Special Attack Min,Special Attack Max,Special Defense Base,Special Defense Min,Special Defense Max,Speed Base,Speed Min,Speed Max
0,Abomasnow,"Grass, Ice",Frost Tree Pokémon,2.2 m (7′03″),135.5 kg (298.7 lbs),"1. Snow Warning, Soundproof (hidden ability)","1 Attack, 1 Sp. Atk","60 (7.8% with PokéBall, full HP)",50 (normal),173,...,273,92,170,311,85,157,295,60,112,240
1,Mega Abomasnow,"Grass, Ice",Frost Tree Pokémon,2.7 m (8′10″),185.0 kg (407.9 lbs),1. Snow Warning,"1 Attack, 1 Sp. Atk","60 (7.8% with PokéBall, full HP)",50 (normal),208,...,339,132,242,399,105,193,339,30,58,174
2,Abra,Psychic,Psi Pokémon,0.9 m (2′11″),19.5 kg (43.0 lbs),"1. Synchronize, 2. Inner Focus, Magic Guard (h...",1 Sp. Atk,"200 (26.1% with PokéBall, full HP)",50 (normal),62,...,141,105,193,339,55,103,229,90,166,306
3,Absol,Dark,Disaster Pokémon,1.2 m (3′11″),47.0 kg (103.6 lbs),"1. Pressure, 2. Super Luck, Justified (hidden ...",2 Attack,"30 (3.9% with PokéBall, full HP)",35 (lower than normal),163,...,240,75,139,273,60,112,240,75,139,273
4,Mega Absol,Dark,Disaster Pokémon,1.2 m (3′11″),49.0 kg (108.0 lbs),1. Magic Bounce,2 Attack,"30 (3.9% with PokéBall, full HP)",35 (lower than normal),198,...,240,115,211,361,60,112,240,115,211,361


In [3]:
# Inspección rápida
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1215 entries, 0 to 1214
Data columns (total 32 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Pokemon               1215 non-null   object
 1   Type                  1215 non-null   object
 2   Species               1215 non-null   object
 3   Height                1215 non-null   object
 4   Weight                1215 non-null   object
 5   Abilities             1215 non-null   object
 6   EV Yield              1215 non-null   object
 7   Catch Rate            1215 non-null   object
 8   Base Friendship       1215 non-null   object
 9   Base Exp              1215 non-null   object
 10  Growth Rate           1215 non-null   object
 11  Egg Groups            1215 non-null   object
 12  Gender                1215 non-null   object
 13  Egg Cycles            1215 non-null   object
 14  HP Base               1215 non-null   int64 
 15  HP Min                1215 non-null   

In [4]:
# Conexión a MongoDB y referencias a colecciones
client = MongoClient("mongodb://localhost:27017")
db = client["pokemon_db"]
raw = db["raw_pokemon"]
curated = db["curated_pokemon"]
analytics = db["analytics_pokemon"]
# Garantizar existencia de las colecciones (creación perezosa alternativa)
for name in ["raw_pokemon", "curated_pokemon", "analytics_pokemon"]:
    if name not in db.list_collection_names():
        db.create_collection(name)
print("Collections present:", db.list_collection_names())

Collections present: ['pokemon', 'raw_pokemon', 'curated_pokemon', 'analytics_pokemon']


In [None]:
# 2️⃣ ETL ligero según steps.MD
# Normalización de nombres
df.columns = df.columns.str.lower().str.replace(" ", "_")
# Conversión de tipos (intentos seguros)
if "height" in df.columns:
    df["height_m"] = df["height"].astype(str).str.replace(" m","").replace("nan","").replace("", np.nan)
    df["height_m"] = pd.to_numeric(df["height_m"], errors="coerce")
if "weight" in df.columns:
    df["weight_kg"] = df["weight"].astype(str).str.replace(" kg","").replace("nan","").replace("", np.nan)
    df["weight_kg"] = pd.to_numeric(df["weight_kg"], errors="coerce")
for col in ["catch_rate", "base_exp"]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")
# Gestión mínima de nulos
if ("pokemon" in df.columns) and ("type" in df.columns):
    df = df.dropna(subset=["pokemon","type"])
# Eliminación de duplicados por 'pokemon'
if "pokemon" in df.columns:
    df = df.drop_duplicates(subset="pokemon")
# Eliminar campos no deseados según nuevo preprocesamiento
for col in ["egg_group", "egg_groups", "gender", "bmi", "catch_rate", "base_friendship", "ev_yield", "egg_cycle"]:
    if col in df.columns:
        df.drop(columns=col, inplace=True)
df.shape

(1215, 34)

In [6]:
# 3️⃣ Insertar en RAW (re-ejecutable)
raw.delete_many({})
raw.insert_many(df.to_dict("records"))
print("Inserted raw count:", raw.count_documents({}))

Inserted raw count: 1215


In [7]:
# Fase 3 — CRUD: CREATE ejemplo
sample = {"pokemon":"Testmon", "type":["Normal"], "height_m":1.0}
raw.insert_one(sample)
raw.find_one({"pokemon":"Testmon"})

{'_id': ObjectId('69973e862e5e5d06cccbbcfd'),
 'pokemon': 'Testmon',
 'type': ['Normal'],
 'height_m': 1.0}

In [8]:
# CRUD: READ ejemplo (5 resultados máximos)
list(raw.find({}).limit(5))

[{'_id': ObjectId('69973e862e5e5d06cccbb83e'),
  'pokemon': 'Abomasnow',
  'type': 'Grass, Ice',
  'species': 'Frost Tree Pokémon',
  'height': '2.2 m (7′03″)',
  'weight': '135.5 kg (298.7 lbs)',
  'ev_yield': '1 Attack, 1 Sp. Atk',
  'catch_rate': nan,
  'base_friendship': '50 (normal)',
  'base_exp': 173.0,
  'growth_rate': 'Slow',
  'egg_groups': 'Grass, Monster',
  'gender': '50% male, 50% female',
  'egg_cycles': '20 (4,884–5,140 steps)',
  'hp_base': 90,
  'hp_min': 290,
  'hp_max': 384,
  'attack_base': 92,
  'attack_min': 170,
  'attack_max': 311,
  'defense_base': 75,
  'defense_min': 139,
  'defense_max': 273,
  'special_attack_base': 92,
  'special_attack_min': 170,
  'special_attack_max': 311,
  'special_defense_base': 85,
  'special_defense_min': 157,
  'special_defense_max': 295,
  'speed_base': 60,
  'speed_min': 112,
  'speed_max': 240,
  'height_m': nan,
  'weight_kg': nan},
 {'_id': ObjectId('69973e862e5e5d06cccbb83f'),
  'pokemon': 'Mega Abomasnow',
  'type': 'Grass

In [None]:
# CRUD: UPDATE ejemplo (no usa campos eliminados)
if "base_exp" in df.columns:
    raw.update_one({"pokemon":"Testmon"},{"$set":{"base_exp":200}})
else:
    raw.update_one({"pokemon":"Testmon"},{"$set":{"notes":"updated"}})
raw.find_one({"pokemon":"Testmon"})

{'_id': ObjectId('69973e862e5e5d06cccbbcfd'),
 'pokemon': 'Testmon',
 'type': ['Normal'],
 'height_m': 1.0,
 'catch_rate': 100}

In [10]:
# CRUD: DELETE ejemplo (limpiar marcador)
raw.delete_one({"pokemon":"Testmon"})
print("Exists after delete:", raw.count_documents({"pokemon":"Testmon"}))

Exists after delete: 0


In [None]:
# Fase 4 — CURATED: Transformaciones obligatorias
df_cur = df.copy()
# Gestión de nulos mínima
if "weight_kg" in df_cur.columns:
    df_cur["weight_kg"] = df_cur["weight_kg"].fillna(df_cur["weight_kg"].median())
if "height_m" in df_cur.columns:
    df_cur["height_m"] = df_cur["height_m"].fillna(df_cur["height_m"].median())
# Corrección de tipos
if "base_exp" in df_cur.columns:
    df_cur["base_exp"] = pd.to_numeric(df_cur["base_exp"], errors="coerce").fillna(0).astype(int)
# Columnas derivadas
base_cols = [c for c in df_cur.columns if c.endswith("_base")]
if base_cols:
    df_cur["total_base_stats"] = df_cur[[c for c in base_cols if c in df_cur.columns]].sum(axis=1)
else:
    df_cur["total_base_stats"] = 0
# Nota: se elimina el campo 'bmi' en preprocesamiento; no se recalcula aquí
df_cur["is_dual_type"] = df_cur["type"].astype(str).str.contains("/") | df_cur["type"].astype(str).str.contains(",")
# Guardar en MongoDB (re-ejecutable)
curated.delete_many({})
curated.insert_many(df_cur.to_dict("records"))
print("Inserted curated count:", curated.count_documents({}))

Inserted curated count: 1215


In [12]:
# Fase 5 — ANALYTICS: agregaciones y guardado en 'analytics_pokemon'
pipeline1 = [
    {"$project":{"type_primary":{"$arrayElemAt":["$type",0]}, "total_base_stats":1}},
    {"$group":{"_id":"$type_primary", "avg_stats":{"$avg":"$total_base_stats"}, "count":{"$sum":1}}}
]
res1 = list(curated.aggregate(pipeline1))
import pandas as pd
df_kpis = pd.DataFrame(res1)
df_kpis
# Guardar resultados de analytics (si hay datos)
analytics.delete_many({})
if not df_kpis.empty:
    analytics.insert_many(df_kpis.to_dict("records"))
print("Analytics stored count:", analytics.count_documents({}))

OperationFailure: Executor error during aggregate command on namespace: pokemon_db.curated_pokemon :: caused by :: $arrayElemAt's first argument must be an array, but is string, full error: {'ok': 0.0, 'errmsg': "Executor error during aggregate command on namespace: pokemon_db.curated_pokemon :: caused by :: $arrayElemAt's first argument must be an array, but is string", 'code': 28689, 'codeName': 'Location28689'}

In [None]:
# Fase 6 — Índices recomendados
curated.create_index("type_primary")
curated.create_index("total_base_stats")
print("Curated indexes:", curated.index_information())

In [None]:
# Fase 7 — Visualización simple desde analytics (si existe)
if 'df_kpis' in globals() and not df_kpis.empty:
    df_viz = df_kpis.sort_values("avg_stats", ascending=False)
    ax = df_viz.plot(kind="bar", x="_id", y="avg_stats", legend=False)
    ax.set_xlabel("type_primary")
    ax.set_ylabel("avg_total_base_stats")
    plt.tight_layout()
    plt.show()
else:
    print("No analytics results to plot. Run the analytics cell first.")