In [None]:
import requests
import time
import boto3
import pandas as pd
import io
import json
from typing import List, Dict, Any
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from category_encoders import TargetEncoder
import re




In [None]:
# Accumulator for the raw card records; the paginated download loop extends
# this list with the "cards" entries of every page it fetches.
all_Cards = []

In [None]:
# Paginated download: follows the "next" relation of the HTTP Link header
# until the last page, appending every page's cards to all_Cards.
# NOTE(review): `url` must already hold the first page URL before this cell
# runs; it is not defined in any cell visible here -- confirm and define it.
MAX_CONSECUTIVE_FAILURES = 5  # stop retrying instead of looping forever
consecutive_failures = 0
last = False

while not last:
    try:
        response = requests.get(url, timeout=30)
        # Without this call a 4xx/5xx answer is never turned into HTTPError,
        # so the retry branch below would be unreachable.
        response.raise_for_status()

        # requests already parses the Link header; `links` is an empty dict
        # when the header is absent, so a single-page result ends cleanly.
        dict_rels = {
            rel: link["url"]
            for rel, link in response.links.items()
            if "url" in link
        }

        all_Cards.extend(response.json()["cards"])
        consecutive_failures = 0

        if "next" in dict_rels:
            url = dict_rels["next"]
            time.sleep(1)  # be polite with the API between pages
        else:
            last = True
    except requests.exceptions.HTTPError as e:
        consecutive_failures += 1
        if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
            raise
        print(f"Error HTTP ({e.response.status_code}). Reintentando en 5s...")
        time.sleep(5)
    except requests.exceptions.RequestException as e:
        consecutive_failures += 1
        if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
            raise
        print(f"Error de conexión: {e}. Reintentando en 5s...")
        time.sleep(5)
    except Exception as error:
        # Unexpected (non-network) errors are deterministic: retrying the same
        # page would spin forever, so report and propagate.
        print(f"Ha ocurrido otro error: {error}")
        raise

### DESCARGA DESDE BULK

In [None]:
# Bulk alternative: download one dated Scryfall "default cards" snapshot.
# NOTE(review): the URL pins a specific snapshot file; if it stops being
# served, raise_for_status() below fails loudly instead of letting .json()
# choke on an error page.
BULK_CARDS_URL = "https://data.scryfall.io/default-cards/default-cards-20251023090836.json"

bulk_response = requests.get(BULK_CARDS_URL, timeout=300)
bulk_response.raise_for_status()
all_Cards = bulk_response.json()

In [None]:
def get_spanish_value(card_data, key_to_extract):
    """Return a field of the card's Spanish translation entry.

    Looks at ``card_data["foreignNames"]`` and picks the first entry whose
    ``"language"`` is ``"Spanish"``.

    Args:
        card_data: Mapping describing one card.
        key_to_extract: Key to read from the Spanish entry.

    Returns:
        The requested value, or ``""`` when there are no translations, no
        Spanish entry, or the entry lacks the key.
    """
    translations = card_data.get("foreignNames") or ()
    spanish_entry = next(
        (entry for entry in translations if entry.get("language") == "Spanish"),
        None,
    )
    if spanish_entry is None:
        return ""
    return spanish_entry.get(key_to_extract, "")

def get_formats(legalities: Dict[str, str]) -> List[str]:
    """List the formats whose status is exactly ``"legal"``.

    Args:
        legalities: Mapping of format name to legality status.

    Returns:
        Format names with status ``"legal"``, in the mapping's iteration
        order; an empty list when ``legalities`` is not a dict.
    """
    if not isinstance(legalities, dict):
        return []

    legal_formats = []
    for format_name, status in legalities.items():
        if status == "legal":
            legal_formats.append(format_name)
    return legal_formats

# W = White, U = Blue, B = Black, R = Red, G = Green.
# Keys are frozensets so the lookup does not depend on the order in which the
# colours arrive (the data source is not guaranteed to use WUBRG order).
_COLOR_IDENTITY_NAMES = {
    # --- Mono-colour ---
    frozenset("W"): "Monowhite",
    frozenset("U"): "Monoblue",
    frozenset("B"): "Monoblack",
    frozenset("R"): "Monored",
    frozenset("G"): "Monogreen",

    # --- Two colours (guilds) ---
    frozenset("WU"): "Azorius",
    frozenset("WB"): "Orzhov",
    frozenset("BR"): "Rakdos",
    frozenset("BG"): "Golgari",
    frozenset("GW"): "Selesnya",
    frozenset("UB"): "Dimir",
    frozenset("UR"): "Izzet",
    frozenset("RG"): "Gruul",
    frozenset("RW"): "Boros",
    frozenset("GU"): "Simic",

    # --- Three colours: shards of Alara ---
    frozenset("WUB"): "Esper",
    frozenset("UBR"): "Grixis",
    frozenset("BRG"): "Jund",
    frozenset("RGW"): "Naya",
    frozenset("GWU"): "Bant",
    # --- Three colours: wedges of Tarkir ---
    frozenset("WBG"): "Abzan",
    frozenset("URW"): "Jeskai",
    frozenset("BGU"): "Sultai",
    frozenset("RWB"): "Mardu",
    frozenset("GUR"): "Temur",

    # --- Four colours (named after the Nephilim) ---
    frozenset("WUBR"): "Yore",   # without green
    frozenset("UBRG"): "Glint",  # without white
    frozenset("BRGW"): "Dune",   # without blue
    frozenset("RGWU"): "Ink",    # without black
    frozenset("GWUB"): "Witch",  # without red

    # --- Five colours ---
    frozenset("WUBRG"): "FiveColor",
}


def map_color_identity(color_list):
    """Translate a colour-identity list into its conventional name.

    Args:
        color_list: Collection of one-letter colour symbols, e.g.
            ``["U", "W"]``. Order and duplicates are irrelevant.

    Returns:
        ``"Colorless"`` when the input is empty or not a list/tuple/set,
        the combination's name (e.g. ``"Azorius"``) when it is known, and
        ``None`` for any combination containing unrecognised symbols.
    """
    # Missing values (None / NaN) and empty identities are colourless.
    if not isinstance(color_list, (list, tuple, set, frozenset)) or len(color_list) == 0:
        return 'Colorless'

    return _COLOR_IDENTITY_NAMES.get(frozenset(color_list))

def extract_generic_cost(cost_string):
    """Sum the generic (numeric) mana symbols of a mana-cost string.

    Args:
        cost_string: Mana cost in brace notation, e.g. ``"{2}{W}{W}"``.

    Returns:
        The sum of every ``{N}`` symbol as an int; ``0`` when there is none
        or when the input is not a string (``None`` / NaN for cards without
        a mana cost).
    """
    # Missing costs reach this function as None/NaN; re.findall would raise.
    if not isinstance(cost_string, str):
        return 0
    generic_matches = re.findall(r'\{(\d+)\}', cost_string)
    return sum(int(n) for n in generic_matches)

In [None]:
# Flatten each raw card record into the columns used downstream.
all_cards_cleaned = [{
        "Nombre": card.get("name"),
        "Texto": card.get("oracle_text"),
        "Coste": card.get("mana_cost", ""),
        "Identidad_Color": card.get("color_identity"),
        "Tipo": card.get("type_line"),
        "Set": card.get("set_name"),
        "Rareza": card.get("rarity"),

        "Fuerza": card.get("power"),
        "Resistencia": card.get("toughness"),

        "Cmc": card.get("cmc"),
        "Formatos_legales": get_formats(card.get("legalities", {})),
        "Reserved_list": card.get("reserved"),
        "Game_changer": card.get("game_changer"),
        "Promo": card.get("promo"),
        # A record without a "prices" object (or with a null one) must yield a
        # missing price, not a KeyError/TypeError that aborts the whole cell.
        "Precio ($)": (card.get("prices") or {}).get("usd"),
    } for card in all_Cards]

df_cards = pd.DataFrame(all_cards_cleaned)

# Prices arrive as text in the JSON; the regression target has to be numeric.
# Unparseable values become NaN and are dropped with the missing prices.
df_cards["Precio ($)"] = pd.to_numeric(df_cards["Precio ($)"], errors="coerce")
df_cards = df_cards.dropna(subset=["Precio ($)"])

# Replace the raw colour list with a single categorical label.
df_cards["Identidad_Color"] = df_cards["Identidad_Color"].apply(map_color_identity)

In [None]:
# Display the cleaned card table as the cell output for a visual check.
df_cards

In [None]:
# This is a good point to feed the database later on; from here the data is
# further processed for the model only.

# Work on an independent copy: a plain assignment would alias df_cards, and
# every model-specific column added afterwards would also show up in the
# frame intended for the database.
df_to_train = df_cards.copy()

In [None]:
# Remember which cards had no Cmc at all before filling the gaps with 0, so a
# genuine cost of 0 stays distinguishable from a missing value.
df_to_train["cmc_Nulo"] = df_to_train["Cmc"].isna().astype(int)
# Assign the result back instead of `df[col].fillna(..., inplace=True)`:
# the chained in-place form is deprecated and does not update the frame under
# pandas Copy-on-Write.
df_to_train["Cmc"] = df_to_train["Cmc"].fillna(0)

# Same idea for power/toughness: flag creatures, then fill the missing values
# of the card types that do not have these stats.
# NOTE(review): non-missing power/toughness values may be strings (e.g. "*");
# confirm how the later encoding step is expected to treat them.
df_to_train["es_criatura"] = df_to_train["Tipo"].str.contains("Creature", case=False, na=False).astype(int)
df_to_train["Fuerza"] = df_to_train["Fuerza"].fillna(0)
df_to_train["Resistencia"] = df_to_train["Resistencia"].fillna(0)

# Cards without rules text (for example some basic lands) get an empty string.
df_to_train["Texto"] = df_to_train["Texto"].fillna("")



In [None]:
# Boolean flags -> 0/1. Comparing against True (instead of astype(int) on the
# raw column) maps a missing flag (None/NaN, produced when the key is absent
# from a card record) to 0 rather than raising.
for flag_column in ["Promo", "Game_changer", "Reserved_list"]:
    df_to_train[flag_column] = df_to_train[flag_column].eq(True).astype(int)

# Low-cardinality text columns -> pandas categoricals.
for category_column in ["Identidad_Color", "Tipo", "Rareza"]:
    df_to_train[category_column] = df_to_train[category_column].astype("category")

In [None]:
mana_symbols = ["W", "U", "B", "R", "G", "C", "X"]

# Treat a missing cost as an empty string so every derived column is a count.
cost_text = df_to_train["Coste"].fillna("")

# Total generic mana, i.e. the sum of the numeric symbols such as {2}.
df_to_train["Coste_Incoloro"] = cost_text.apply(extract_generic_cost)

for symbol in mana_symbols:
    # str.count takes a regular expression, so the braces must be escaped;
    # re.escape avoids the invalid "\{" escape inside a non-raw f-string.
    symbol_count = cost_text.str.count(re.escape("{" + symbol + "}"))
    if symbol == "X":
        # For X only its presence matters, so no Coste_X count column is kept.
        df_to_train["Tiene_X_Coste"] = (symbol_count > 0).astype(int)
    else:
        df_to_train[f"Coste_{symbol}"] = symbol_count

### SEPARACIÓN DE DATOS EN X E Y PARA CONTINUAR EL TRATAMIENTO Y POSTERIOR ENTRENAMIENTO DEL MODELO

In [None]:
# Columns left out of the feature matrix: the target itself plus the card
# name and rules text.
non_feature_columns = ['Precio ($)', 'Nombre', 'Texto']

y = df_to_train['Precio ($)']
X = df_to_train.drop(columns=non_feature_columns)

# 80/20 hold-out split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)