In [364]:
import pandas as pd
import numpy as np
from pymongo import MongoClient
from sklearn.impute import SimpleImputer
from datetime import datetime

In [365]:
# Load the Marvel comics dataset from a CSV in the working directory and
# preview its first rows.
df=pd.read_csv("Marvel_Comics.csv")
df.head()

Unnamed: 0,comic_name,active_years,issue_title,publish_date,issue_description,penciler,writer,cover_artist,Imprint,Format,Rating,Price
0,A Year of Marvels: April Infinite Comic (2016),(2016),A Year of Marvels: April Infinite Comic (2016) #1,"April 01, 2016",The Infinite Comic that will have everyone tal...,Yves Bigerel,Yves Bigerel,Jamal Campbell,Marvel Universe,Infinite Comic,Rated T+,Free
1,A Year of Marvels: August Infinite Comic (2016),(2016),A Year of Marvels: August Infinite Comic (2016...,"August 10, 2016","It’s August, and Nick Fury is just in time to ...",Jamal Campbell,"Chris Sims, Chad Bowers",,Marvel Universe,Infinite Comic,,Free
2,A Year of Marvels: February Infinite Comic (2016),(2016),A Year of Marvels: February Infinite Comic (20...,"February 10, 2016",Join us in a brand new Marvel comics adventure...,"Danilo S. Beyruth, M Mast",Ryan North,,Marvel Universe,Infinite Comic,Rated T+,Free
3,A Year of Marvels: July Infinite Comic (2016),(2016),A Year of Marvels: July Infinite Comic (2016) #1,"June 29, 2016",Celebrating the Fourth of July is complicated ...,Juanan Ramirez,Chuck Wendig,Jamal Campbell,Marvel Universe,Infinite Comic,,Free
4,A Year of Marvels: June Infinite Comic (2016),(2016),A Year of Marvels: June Infinite Comic (2016) #1,"June 15, 2016",Sam Alexander’s finding it hard to cope with t...,Diego Olortegui,Paul Allor,Jamal Campbell,Marvel Universe,Infinite Comic,,Free


In [366]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(34992, 12)

In [367]:
# Per-column dtype and non-null count; shows which fields have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34992 entries, 0 to 34991
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   comic_name         34992 non-null  object
 1   active_years       34992 non-null  object
 2   issue_title        34466 non-null  object
 3   publish_date       34466 non-null  object
 4   issue_description  30395 non-null  object
 5   penciler           25482 non-null  object
 6   writer             27595 non-null  object
 7   cover_artist       12255 non-null  object
 8   Imprint            11684 non-null  object
 9   Format             32894 non-null  object
 10  Rating             12619 non-null  object
 11  Price              32894 non-null  object
dtypes: object(12)
memory usage: 3.2+ MB


In [368]:
# Summary statistics per column (count / unique / top / freq for these text columns).
df.describe()

Unnamed: 0,comic_name,active_years,issue_title,publish_date,issue_description,penciler,writer,cover_artist,Imprint,Format,Rating,Price
count,34992,34992,34466,34466,30395,25482,27595,12255,11684,32894,12619,32894
unique,4935,412,33757,3317,29331,3915,3082,1010,39,11,36,53
top,Uncanny X-Men (1981 - 2011),(1968 - 1996),X-Men: The Complete Age of Apocalypse Epic Boo...,"November 30, -0001",Please note that these digital editions collec...,Sal Buscema,Brian Michael Bendis,Gil Kane,Marvel Universe,Comic,Rated T+,Free
freq,588,718,10,543,37,394,859,217,7786,31899,3204,15136


## FASE 1 Diseño de la base de datos 

In [369]:
# Connect to the local MongoDB server and select the project database.
client = MongoClient("mongodb://localhost:27017")
db = client['marvel']

# One collection per pipeline layer (raw, curated, analytics).
collections = ["raw_marvel", "curated_marvel", "analytics_marvel"]

# Ask the server for the existing collection names once, instead of issuing
# the same query on every loop iteration.
existing_collections = set(db.list_collection_names())

for coll in collections:
    if coll not in existing_collections:
        db.create_collection(coll)
        print(f"Colección '{coll}' creada")
    else:
        print(f"Colección '{coll}' ya existe")



Colección 'raw_marvel' ya existe
Colección 'curated_marvel' ya existe
Colección 'analytics_marvel' ya existe


## FASE 2 Carga de datos en RAW

In [370]:
# Work on a copy so the frame loaded from the CSV (df) stays untouched.
df_raw=df.copy()

In [371]:
# Drop the free-text columns. df_raw is rebuilt from df instead of being
# mutated in place, so this cell can be re-run safely: an in-place drop raises
# KeyError on the second run because the columns are already gone.
# Re-running this cell resets df_raw to this stage of the pipeline.
df_raw = df.drop(columns=["issue_description", "issue_title"])

In [372]:
# Column-name normalization (preview only): title-case the column names and print them.
# NOTE(review): `cols` is never assigned back to df_raw.columns, so the frame keeps
# its original names; the cells that follow index by those original names
# (e.g. "active_years", "publish_date", "Price").
cols=df_raw.columns.astype(str).str.title()
print(cols)

Index(['Comic_Name', 'Active_Years', 'Publish_Date', 'Penciler', 'Writer',
       'Cover_Artist', 'Imprint', 'Format', 'Rating', 'Price'],
      dtype='object')


In [373]:
# Keep the first four-digit year that follows the opening parenthesis in
# "active_years" and store it as an integer.
first_year = df_raw["active_years"].str.extract(r"\((\d{4})")[0]
df_raw["active_years"] = first_year.astype("int64")

In [374]:
# Convert "Price" from text to float in one chain:
#   1. trim surrounding whitespace,
#   2. turn the literal "Free" into "0",
#   3. remove "$" signs and commas,
#   4. cast to float64 (missing prices stay NaN).
df_raw["Price"] = (
    df_raw["Price"]
    .str.strip()
    .replace("Free", "0")
    .str.replace(r"[\$,]", "", regex=True)
    .astype("float64")
)

In [375]:
# Parse the publication dates. Strings that cannot be parsed become NaT
# (presumably including the "November 30, -0001" placeholder that
# df.describe() reports as the most common raw value — confirm).
df_raw["publish_date"] = pd.to_datetime(df_raw["publish_date"], errors="coerce")
datetime_cols = df_raw.select_dtypes(include=["datetime64[ns]"]).columns

# Impute missing dates with each column's most frequent value. Doing this with
# pandas keeps the datetime dtype throughout, so there is no cast to object,
# no scikit-learn round trip and no re-parsing afterwards. Series.mode()
# returns its values sorted, so ties resolve to the earliest date.
for col in datetime_cols:
    modes = df_raw[col].mode()
    if not modes.empty:  # a column with no valid date has nothing to impute from
        df_raw[col] = df_raw[col].fillna(modes.iloc[0])

# Remaining missing dates per column (expected to be 0).
print(df_raw[datetime_cols].isna().sum())


publish_date    0
dtype: int64


In [376]:
# Check the column dtypes after the year, price and date conversions.
df_raw.dtypes

comic_name              object
active_years             int64
publish_date    datetime64[ns]
penciler                object
writer                  object
cover_artist            object
Imprint                 object
Format                  object
Rating                  object
Price                  float64
dtype: object

In [377]:
# In the object (text) columns, replace NaN with None so that missing text is
# represented as None rather than a float NaN.
obj_cols = df_raw.select_dtypes(include="O").columns
text_frame = df_raw[obj_cols]
df_raw[obj_cols] = text_frame.where(text_frame.notna(), None)

In [378]:
# Preview the transformed frame.
df_raw.head()

Unnamed: 0,comic_name,active_years,publish_date,penciler,writer,cover_artist,Imprint,Format,Rating,Price
0,A Year of Marvels: April Infinite Comic (2016),2016,2016-04-01,Yves Bigerel,Yves Bigerel,Jamal Campbell,Marvel Universe,Infinite Comic,Rated T+,0.0
1,A Year of Marvels: August Infinite Comic (2016),2016,2016-08-10,Jamal Campbell,"Chris Sims, Chad Bowers",,Marvel Universe,Infinite Comic,,0.0
2,A Year of Marvels: February Infinite Comic (2016),2016,2016-02-10,"Danilo S. Beyruth, M Mast",Ryan North,,Marvel Universe,Infinite Comic,Rated T+,0.0
3,A Year of Marvels: July Infinite Comic (2016),2016,2016-06-29,Juanan Ramirez,Chuck Wendig,Jamal Campbell,Marvel Universe,Infinite Comic,,0.0
4,A Year of Marvels: June Infinite Comic (2016),2016,2016-06-15,Diego Olortegui,Paul Allor,Jamal Campbell,Marvel Universe,Infinite Comic,,0.0


In [379]:
# Number of missing values per column.
print(df_raw.isna().sum())

comic_name          0
active_years        0
publish_date        0
penciler         9510
writer           7397
cover_artist    22737
Imprint         23308
Format           2098
Rating          22373
Price            2098
dtype: int64


In [380]:
# Fraction of missing values per column (the mean of the boolean mask equals
# the missing count divided by the number of rows).
print(df_raw.isna().mean())

comic_name      0.000000
active_years    0.000000
publish_date    0.000000
penciler        0.271776
writer          0.211391
cover_artist    0.649777
Imprint         0.666095
Format          0.059957
Rating          0.639375
Price           0.059957
dtype: float64


In [381]:
# List the distinct raw values of "Rating" to see which spellings need unifying.
unique_values = df_raw["Rating"].unique()
print(unique_values)



[' Rated T+' None ' Rated T' ' ALL AGES' ' A' ' Parental Advisory'
 ' Marvel Psr' ' No Rating' ' MARVEL PSR' ' T' ' RATED T' ' Max'
 ' RATED T+' ' RATED A' ' All Ages' ' T+' ' Rated a' ' Rated A'
 ' Parental Advisory/Explicit Content' ' PARENTAL SUPERVISION'
 ' PARENTAL ADVISORY' ' Mature' ' MARVEL PSR+' ' EXPLICIT CONTENT'
 ' PARENTAL ADVISORYSLC' ' Parental AdvisorySLC' ' Parental Advisoryslc'
 ' Explicit Content' ' PARENTAL ADVISORY/EXPLICIT CONTENT' ' NO RATING'
 ' NOT IN ORACLE' ' Parental Guidance' ' Ages 10 & Up' ' Not in Oracle'
 ' MAX' ' Marvel Psr+' ' Ages 9+']


In [382]:
# Unify the rating categories.
# Keys are the lower-cased, stripped spellings found in the data; values are
# the canonical labels. "none" is the text produced by astype(str) for missing
# ratings. More variants can be added here if they show up.
rating_map = {
    "none": "No Rated",
    "no rating": "No Rated",
    "not in oracle": "No Rated",
    "t+": "T+",
    "rated t+": "T+",
    "rated t": "T",
    "t": "T",
    "all ages": "All Ages",
    "a": "A",
    "rated a": "A",
    "parental advisory": "Parental Advisory",
    "parental advisory/explicit content": "Parental Advisory",
    "parental advisoryslc": "Parental Advisory",
    "explicit content": "Explicit Content",
    "marvel psr": "Marvel PSR",
    "marvel psr+": "Marvel PSR+",
    "max": "Max",
    "mature": "Mature",
    "parental guidance": "Parental Advisory",
    "parental supervision": "Parental Advisory",
    "ages 10 & up": "All Ages",
    "ages 9+": "All Ages",
}

# Normalize into a standalone Series (no temporary column on the frame), then
# swap each normalized spelling for its canonical label.
normalized_rating = df_raw["Rating"].astype(str).str.lower().str.strip()
df_raw["Rating"] = normalized_rating.replace(rating_map)

# Show how many rows fall into each unified category.
print(df_raw["Rating"].value_counts())




Rating
No Rated             22502
T+                    4410
T                     3057
Parental Advisory     1613
All Ages              1199
Marvel PSR            1060
A                      903
Explicit Content       149
Mature                  44
Max                     34
Marvel PSR+             21
Name: count, dtype: int64


In [383]:
# Count fully duplicated rows (all columns equal to an earlier row).
num_duplicates = df_raw.duplicated().sum()
print(f"Hay {num_duplicates} registros duplicados")


Hay 771 registros duplicados


In [384]:
# Remove the duplicated rows, keeping the first occurrence of each.
# The original index labels are kept (the index is not reset here).
df_raw = df_raw.drop_duplicates()



In [385]:
# Add a sequential, 1-based "id" column in the frame's current row order.
df_raw["id"] = range(1, len(df_raw) + 1)


In [386]:
# Handle to the RAW-layer collection used by the load and CRUD cells below.
coleccion = db["raw_marvel"]

In [387]:
# Empty the collection (an empty filter matches every document) so that the
# bulk insert below does not pile up on top of documents from a previous run.
coleccion.delete_many({})
print("Colección 'raw_marvel' limpia")

Colección 'raw_marvel' limpia


In [388]:
# Bulk insert into the database: one dict per row, one document per dict.
data_to_insert = df_raw.to_dict(orient='records')

result = coleccion.insert_many(data_to_insert)

# Report how many documents the server acknowledged.
print(f"Se insertaron {len(result.inserted_ids)} documentos en 'raw_marvel'")


Se insertaron 34221 documentos en 'raw_marvel'


## FASE 3 CRUD completo sobre RAW

In [389]:
# Build a sample comic document with the same fields as the rows loaded into
# the RAW collection.
# NOTE(review): the id is hard-coded. The load cell assigns ids 1..len(df_raw)
# (34221 documents in the recorded run), so 34223 skips 34222 — confirm this
# is intentional.
nuevo_comic = dict(
    comic_name="Avengers: Future Fight (2026)",
    active_years=2026,
    publish_date=datetime(2026, 3, 1),
    penciler="Emily White",
    writer="Mark Black",
    cover_artist="Chris Sanders",
    Imprint="Marvel Universe",
    Format="Graphic Novel",
    Rating="T+",
    Price=5.99,
    id=34223,
)


### INSERCIÓN

In [390]:
# Insert the sample comic and print the _id MongoDB generated for it.
# NOTE(review): PyMongo is expected to add the generated `_id` to `nuevo_comic`
# in place, so re-running this cell without re-creating the dict would likely
# fail with a duplicate-key error — confirm.
result_create = coleccion.insert_one(nuevo_comic)
print(f"Documento insertado con _id: {result_create.inserted_id}")

Documento insertado con _id: 69976675c5b84e1c8cba16a5


### FIND

In [391]:
# Find with an equality filter: every document whose Rating is exactly "T+".
# The whole result set is materialized as a list before being counted.
t_plus_comics = list(coleccion.find({"Rating": "T+"}))
print(f"Comics con Rating 'T+': {len(t_plus_comics)}")

Comics con Rating 'T+': 4400


In [392]:
# Comics that have a price: a range filter ($gt) keeps documents with Price > 0.
priced_comics = list(coleccion.find({"Price": {"$gt": 0}}))
print(f"Comics con Price > 0: {len(priced_comics)}")

Comics con Price > 0: 17701


In [393]:
# Projection: return only comic_name and Price for every document and
# suppress the _id field; print the first five results.
projection_comics = list(coleccion.find({}, {"comic_name": 1, "Price": 1, "_id": 0}))
print("Proyección comic_name y Price (primeros 5):", projection_comics[:5])

Proyección comic_name y Price (primeros 5): [{'comic_name': 'A Year of Marvels: April Infinite Comic (2016)', 'Price': 0.0}, {'comic_name': 'A Year of Marvels: August Infinite Comic (2016)', 'Price': 0.0}, {'comic_name': 'A Year of Marvels: February Infinite Comic (2016)', 'Price': 0.0}, {'comic_name': 'A Year of Marvels: July Infinite Comic (2016)', 'Price': 0.0}, {'comic_name': 'A Year of Marvels: June Infinite Comic (2016)', 'Price': 0.0}]


In [394]:
# The three most expensive comics: sort by Price descending (-1) and keep 3.
# The full documents are printed, including every field.
top_priced = list(coleccion.find().sort("Price", -1).limit(3))
print("Top 3 comics más caros:", top_priced)

Top 3 comics más caros: [{'_id': ObjectId('69976674c5b84e1c8cb9bd6f'), 'comic_name': 'Howard the Duck Annual (1977)', 'active_years': 1977, 'publish_date': datetime.datetime(2008, 7, 23, 0, 0), 'penciler': None, 'writer': None, 'cover_artist': None, 'Imprint': ' MARVEL UNIVERSE', 'Format': ' Hardcover', 'Rating': 'T+', 'Price': 99.99, 'id': 11384}, {'_id': ObjectId('69976674c5b84e1c8cb9bd29'), 'comic_name': 'Howard the Duck (1976 - 1979)', 'active_years': 1976, 'publish_date': datetime.datetime(2008, 7, 23, 0, 0), 'penciler': None, 'writer': None, 'cover_artist': None, 'Imprint': ' MARVEL UNIVERSE', 'Format': ' Hardcover', 'Rating': 'T+', 'Price': 99.99, 'id': 11314}, {'_id': ObjectId('69976674c5b84e1c8cb9aa59'), 'comic_name': 'Daring Mystery Comics (1940 - 1942)', 'active_years': 1940, 'publish_date': datetime.datetime(2008, 1, 23, 0, 0), 'penciler': None, 'writer': None, 'cover_artist': None, 'Imprint': ' MARVEL UNIVERSE', 'Format': ' Hardcover', 'Rating': 'A', 'Price': 59.99, 'id': 

### UPDATE

In [395]:
# Update a single document: set Rating to "All Ages" on the first document
# whose id is 1. The returned UpdateResult is displayed as the cell output.
coleccion.update_one(
    {"id": 1},  # filter
    {"$set": {"Rating": "All Ages"}}  # modification
)

UpdateResult({'n': 1, 'nModified': 1, 'ok': 1.0, 'updatedExisting': True}, acknowledged=True)

In [396]:
# Update every document (empty filter) with a pipeline-style update that
# rewrites Rating as its own upper-cased value via $toUpper.
coleccion.update_many(
    {},  # empty filter: all documents
    [{"$set": {"Rating": {"$toUpper": "$Rating"}}}]  # list form: each stage can read the document's current fields
)

UpdateResult({'n': 34222, 'nModified': 25881, 'ok': 1.0, 'updatedExisting': True}, acknowledged=True)

### DELETE

In [397]:
# Delete a single document: the first one whose id is 1.
coleccion.delete_one({"id": 1})

DeleteResult({'n': 1, 'ok': 1.0}, acknowledged=True)

In [398]:
# Delete every document whose Price equals 0 (the free comics).
coleccion.delete_many({"Price": 0})

DeleteResult({'n': 14470, 'ok': 1.0}, acknowledged=True)

## FASE 4 CURATED (limpieza y transformación)

## FASE 5 ANALYTICS (agregaciones)

## FASE 6 Rendimiento y diseño

## FASE 7 Visualización y conclusiones 