## Installs and Imports

### Installs

In [46]:
%pip install seaborn --quiet
%pip install requests --quiet
%pip install scikit-learn --quiet


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Imports

In [223]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import re
import json
from sklearn.preprocessing import LabelEncoder
import pickle

# ETL

## Extract

In [48]:
def get_teams(log):
    """Extract both players' six-Pokémon rosters from a replay log.

    Only the text between the first '|clearpoke' marker and the following
    '|teampreview' marker is inspected. Every line there that starts with
    '|poke|' contributes the text of its fourth '|'-separated field, cut at
    the first comma.

    Parameters
    ----------
    log : str
        Raw replay log text.

    Returns
    -------
    tuple
        ``(team1, team2)``: two alphabetically sorted lists of six names.
        The first six '|poke|' lines are taken as team 1 and the last six as
        team 2 (the player field itself is not checked). ``(None, None)`` is
        returned when the section is missing, the input is not a string, or
        the section does not contain exactly twelve usable '|poke|' lines.
    """
    try:
        preview = log.split('|clearpoke')[1].split('|teampreview')[0]
    except (AttributeError, IndexError, TypeError):
        # Non-string input, or no '|clearpoke' section in the log.
        return None, None

    pokes = []
    for line in preview.split('\n'):
        if not line.startswith('|poke|'):
            continue
        fields = line.split('|')
        if len(fields) < 4:
            # Malformed '|poke|' line without a details field: skip it.
            continue
        pokes.append(fields[3].split(',')[0])

    if len(pokes) != 12:
        return None, None

    return sorted(pokes[:6]), sorted(pokes[6:12])


In [49]:
def get_leads(log):
    """Extract the two lead Pokémon of each player from a replay log.

    The text following the first '|start' marker is scanned for lines that
    begin with '|switch|'; the first four such lines are treated as the
    leads. For each one, the third '|'-separated field identifies the slot
    (prefix 'p1' or 'p2') and the fourth field, cut at the first comma, is
    the name.

    Parameters
    ----------
    log : str
        Raw replay log text.

    Returns
    -------
    tuple
        ``(p1_leads, p2_leads)``: two alphabetically sorted lists of two
        names each. ``(None, None)`` is returned when the input is not a
        string, there is no '|start' marker, fewer than four '|switch|'
        lines follow it, a switch line is malformed, or the first four
        switches are not split two-and-two between the players (which would
        otherwise mislabel the four lead columns downstream).
    """
    try:
        after_start = log.split('|start')[1]
    except (AttributeError, IndexError, TypeError):
        # Non-string input, or the battle never started.
        return None, None

    switch_lines = [l for l in after_start.split('\n') if l.startswith('|switch|')]
    if len(switch_lines) < 4:
        return None, None

    p1, p2 = [], []
    for line in switch_lines[:4]:
        pieces = line.split('|')
        if len(pieces) < 4:
            return None, None
        slot = pieces[2]
        name = pieces[3].split(',')[0]

        if slot.startswith('p1'):
            p1.append(name)
        elif slot.startswith('p2'):
            p2.append(name)
        else:
            # Unknown side: do not guess which player it belongs to.
            return None, None

    if len(p1) != 2 or len(p2) != 2:
        return None, None

    return sorted(p1), sorted(p2)

In [50]:
# Search endpoint for public replays; the literal 'PAGE' is replaced by the
# page number on each request.
public_replays_url = 'https://replay.pokemonshowdown.com/api/replays/search?username=&format=gen9vgc2025regi&page=PAGE'

replay_ids = []

for page in range(1, 100):
    # A timeout keeps a stalled connection from hanging the cell forever, and
    # raise_for_status surfaces HTTP errors instead of failing later in the
    # JSON parser.
    response = requests.get(public_replays_url.replace('PAGE', str(page)), timeout=30)
    response.raise_for_status()

    # The first character of the body is dropped before parsing, as in the
    # original cell — presumably a guard prefix in front of the JSON array;
    # confirm against the API.
    page_entries = json.loads(response.text[1:])
    if not page_entries:
        # An empty page means there is nothing further to fetch.
        break

    replay_ids.extend(metadata['id'] for metadata in page_entries)

print('Collected ', len(replay_ids), 'replay IDS')

KeyboardInterrupt: 

In [None]:
# Download the raw log of every collected replay.
logs = []
failed_ids = []  # replay ids whose log could not be fetched

for replay_id in replay_ids:
    try:
        # Timeout so one stalled request cannot hang the whole download.
        r = requests.get(f'https://replay.pokemonshowdown.com/{replay_id}.log', timeout=30)
        # Do not store an HTTP error page as if it were a battle log.
        r.raise_for_status()
    except requests.RequestException:
        failed_ids.append(replay_id)
        continue
    logs.append(r.text)

if failed_ids:
    print('Failed downloads:', len(failed_ids))

len(logs)

5049

In [None]:
# Sanity check: how many downloaded logs contain a '|clearpoke' marker.
with_clearpoke = list(filter(lambda raw: '|clearpoke' in raw, logs))
print("Logs con clearpoke:", len(with_clearpoke))

Logs con clearpoke: 5049


In [None]:
# Sanity check: how many logs contain '|poke|' entries for both players.
with_poke = [
    raw
    for raw in logs
    if all(marker in raw for marker in ("|poke|p1", "|poke|p2"))
]
print("Logs que sí tienen poke:", len(with_poke))

Logs que sí tienen poke: 5049


In [None]:
# Sanity check: how many logs contain a '|start' marker.
# NOTE: this rebinds `with_poke`, replacing the list built by the previous check.
with_poke = list(filter(lambda raw: "|start" in raw, logs))
print("Logs que sí tienen start:", len(with_poke))

Logs que sí tienen start: 5037


In [None]:
# Turn every usable log into one row: 6 + 6 roster names followed by 2 + 2 leads.
valid = 0
invalid = 0
data = []

for log in logs:
    team1, team2 = get_teams(log)
    lead1, lead2 = get_leads(log)

    # Both helpers signal failure with (None, None); test for it explicitly
    # rather than letting len(None) raise inside a catch-all handler.
    if team1 is None or team2 is None or lead1 is None or lead2 is None:
        invalid += 1
        continue

    # A row must hold exactly 6 + 6 + 2 + 2 names to line up with `cols`.
    if len(team1) != 6 or len(team2) != 6 or len(lead1) != 2 or len(lead2) != 2:
        invalid += 1
        continue

    data.append(team1 + team2 + lead1 + lead2)
    valid += 1

print("Logs válidos:", valid)
print("Logs inválidos:", invalid)


# j1_* / j2_* are the roster slots of each player, j*_l* their two leads.
cols = [
    'j1_1','j1_2','j1_3','j1_4','j1_5','j1_6',
    'j2_1','j2_2','j2_3','j2_4','j2_5','j2_6',
    'j1_l1','j1_l2','j2_l1','j2_l2'
]

df = pd.DataFrame(data, columns=cols)
df.head()

Logs válidos: 4935
Logs inválidos: 114


Unnamed: 0,j1_1,j1_2,j1_3,j1_4,j1_5,j1_6,j2_1,j2_2,j2_3,j2_4,j2_5,j2_6,j1_l1,j1_l2,j2_l1,j2_l2
0,Incineroar,Lunala,Raging Bolt,Rillaboom,Terapagos,Volcarona,Calyrex-Shadow,Incineroar,Ogerpon-Hearthflame,Raging Bolt,Rillaboom,Zamazenta-*,Lunala,Volcarona,Calyrex-Shadow,Zamazenta-Crowned
1,Incineroar,Lunala,Raging Bolt,Rillaboom,Terapagos,Volcarona,Calyrex-Shadow,Incineroar,Ogerpon-Hearthflame,Raging Bolt,Rillaboom,Zamazenta-*,Terapagos,Volcarona,Calyrex-Shadow,Incineroar
2,Brute Bonnet,Calyrex-Shadow,Chi-Yu,Indeedee-F,Koraidon,Whimsicott,Grimmsnarl,Incineroar,Lunala,Miraidon,Ursaluna,Urshifu-*,Chi-Yu,Koraidon,Incineroar,Lunala
3,Brute Bonnet,Calyrex-Shadow,Chi-Yu,Indeedee-F,Koraidon,Whimsicott,Grimmsnarl,Incineroar,Lunala,Miraidon,Ursaluna,Urshifu-*,Brute Bonnet,Indeedee-F,Incineroar,Miraidon
4,Brute Bonnet,Calyrex-Shadow,Chi-Yu,Indeedee-F,Koraidon,Whimsicott,Grimmsnarl,Incineroar,Lunala,Miraidon,Ursaluna,Urshifu-*,Calyrex-Shadow,Indeedee-F,Incineroar,Miraidon


In [None]:
# Persist the string-valued table (names, not label codes) without the index.
df.to_csv(path_or_buf="pokemons_vgc.csv", index=False)

In [None]:
# Fit a single label encoder on every distinct value found across `cols`,
# so the same name maps to the same integer code in every column.
all_vals = pd.unique(df[cols].to_numpy().ravel())
enc = LabelEncoder().fit(all_vals)

# Replace each column's values with their integer codes.
for col in cols:
    encoded_column = enc.transform(df[col])
    df[col] = encoded_column

df.head()

Unnamed: 0,j1_1,j1_2,j1_3,j1_4,j1_5,j1_6,j2_1,j2_2,j2_3,j2_4,j2_5,j2_6,j1_l1,j1_l2,j2_l1,j2_l2
0,240,291,394,412,504,560,57,240,350,394,412,582,291,560,57,583
1,240,291,394,412,504,560,57,240,350,394,412,582,504,560,57,240
2,51,57,69,242,268,570,208,240,291,327,535,539,69,268,240,291
3,51,57,69,242,268,570,208,240,291,327,535,539,51,242,240,327
4,51,57,69,242,268,570,208,240,291,327,535,539,57,242,240,327


In [None]:
# Persist the label-encoded table without the index.
df.to_csv(path_or_buf='pokemon_teams_and_leads.csv', index=False)

In [None]:
# Serialize the fitted encoder so the integer codes can be decoded later.
with open("encoder.pkl", "wb") as encoder_file:
    encoder_file.write(pickle.dumps(enc))

In [None]:
# Reload the label-encoded table from disk and preview it.
df = pd.read_csv(filepath_or_buffer='pokemon_teams_and_leads.csv')
df.head(n=5)

Unnamed: 0,j1_1,j1_2,j1_3,j1_4,j1_5,j1_6,j2_1,j2_2,j2_3,j2_4,j2_5,j2_6,j1_l1,j1_l2,j2_l1,j2_l2
0,240,291,394,412,504,560,57,240,350,394,412,582,291,560,57,583
1,240,291,394,412,504,560,57,240,350,394,412,582,504,560,57,240
2,51,57,69,242,268,570,208,240,291,327,535,539,69,268,240,291
3,51,57,69,242,268,570,208,240,291,327,535,539,51,242,240,327
4,51,57,69,242,268,570,208,240,291,327,535,539,57,242,240,327


In [None]:
# Restore the fitted encoder from disk. Unpickling executes code from the
# file, so only load a pickle whose origin is trusted.
with open("encoder.pkl", "rb") as encoder_file:
    enc = pickle.loads(encoder_file.read())

# Map every column's integer codes back to the original labels.
for col in df.columns:
    decoded_column = enc.inverse_transform(df[col])
    df[col] = decoded_column
df.head(n=5)

Unnamed: 0,j1_1,j1_2,j1_3,j1_4,j1_5,j1_6,j2_1,j2_2,j2_3,j2_4,j2_5,j2_6,j1_l1,j1_l2,j2_l1,j2_l2
0,Incineroar,Lunala,Raging Bolt,Rillaboom,Terapagos,Volcarona,Calyrex-Shadow,Incineroar,Ogerpon-Hearthflame,Raging Bolt,Rillaboom,Zamazenta-*,Lunala,Volcarona,Calyrex-Shadow,Zamazenta-Crowned
1,Incineroar,Lunala,Raging Bolt,Rillaboom,Terapagos,Volcarona,Calyrex-Shadow,Incineroar,Ogerpon-Hearthflame,Raging Bolt,Rillaboom,Zamazenta-*,Terapagos,Volcarona,Calyrex-Shadow,Incineroar
2,Brute Bonnet,Calyrex-Shadow,Chi-Yu,Indeedee-F,Koraidon,Whimsicott,Grimmsnarl,Incineroar,Lunala,Miraidon,Ursaluna,Urshifu-*,Chi-Yu,Koraidon,Incineroar,Lunala
3,Brute Bonnet,Calyrex-Shadow,Chi-Yu,Indeedee-F,Koraidon,Whimsicott,Grimmsnarl,Incineroar,Lunala,Miraidon,Ursaluna,Urshifu-*,Brute Bonnet,Indeedee-F,Incineroar,Miraidon
4,Brute Bonnet,Calyrex-Shadow,Chi-Yu,Indeedee-F,Koraidon,Whimsicott,Grimmsnarl,Incineroar,Lunala,Miraidon,Ursaluna,Urshifu-*,Calyrex-Shadow,Indeedee-F,Incineroar,Miraidon


# STATS y PERMUTACIONES

In [284]:
# Start the stats stage from the string-valued table saved earlier.
df = pd.read_csv(filepath_or_buffer='pokemons_vgc.csv')

In [285]:

# Read the pokedex JSON file into a dict.
with open("pokedex.json", encoding="utf-8") as pokedex_file:
    pokedex = json.loads(pokedex_file.read())

# Explicit name remappings applied by normalize_name. Keys are lowercased
# names, either as written (with '-' / '*') or already stripped of
# punctuation; values are the identifiers to use instead.
special_cases = {
    "urshifu-*": "urshifurapidstrike",
    "zamazenta-*": "zamazentacrowned",
    "zamazenta-crowned": "zamazentacrowned",
    "zacian-*": "zaciancrowned",
    "zacian-crowned": "zaciancrowned",
    "tatsugiridroopy":  "tatsugiri",
    "tatsugiristretchy":  "tatsugiri",
}

def normalize_name(name):
    """Convert a Pokémon name into a lowercase alphanumeric identifier.

    Rules, in order:

    1. Non-string values are returned unchanged.
    2. Any name whose lowercase form starts with "alcremie" becomes
       "alcremie".
    3. If the lowercase name, or its punctuation-free form, is a key of
       ``special_cases``, the mapped value is used. Checking both forms lets
       "Tatsugiri-Droopy" and "tatsugiridroopy" hit the same entry.
    4. Otherwise the name is lowercased and every character outside
       ``[a-z0-9]`` (spaces, '-', '*', ...) is removed.

    Parameters
    ----------
    name : Any
        Name to normalise; only ``str`` values are transformed.

    Returns
    -------
    Any
        The normalised identifier, or ``name`` itself when it is not a string.
    """
    if not isinstance(name, str):
        return name

    lower_name = name.lower()
    compact = re.sub(r"[^a-z0-9]", "", lower_name)

    if lower_name.startswith("alcremie"):
        return "alcremie"

    for key in (lower_name, compact):
        if key in special_cases:
            # Normalise the mapped value too, in case an entry is ever added
            # with punctuation or capitals.
            return re.sub(r"[^a-z0-9]", "", special_cases[key].lower())

    return compact

# Re-key the pokedex by normalised name; when two keys normalise to the same
# identifier, the entry that comes later in `pokedex` wins.
normalized_pokedex = dict(
    (normalize_name(raw_key), entry) for raw_key, entry in pokedex.items()
)

def get_stats(pokemon_name, lookup=None):
    """Return the base-stat dict for a normalised Pokémon name.

    Parameters
    ----------
    pokemon_name : str
        Identifier as produced by ``normalize_name``.
    lookup : dict, optional
        Mapping from normalised name to pokedex entry. Defaults to the
        module-level ``normalized_pokedex``.

    Returns
    -------
    dict
        The entry's ``'baseStats'`` value when the entry exists and has one.
        For "tatsugiri" without usable stats in the lookup, a hard-coded
        stat dict. Otherwise a dict with the keys hp/atk/def/spa/spd/spe all
        set to ``None``.
    """
    if lookup is None:
        lookup = normalized_pokedex

    poke_data = lookup.get(pokemon_name)

    if poke_data and 'baseStats' in poke_data:
        return poke_data['baseStats']

    # The fallback must test the *name*: poke_data is a pokedex entry (or
    # None) and can never equal the string "tatsugiri".
    if pokemon_name == "tatsugiri":
        return {'hp': 68, 'atk': 50, 'def': 60, 'spa': 120, 'spd': 95, 'spe': 82}

    return {'hp': None, 'atk': None, 'def': None, 'spa': None, 'spd': None, 'spe': None}


In [None]:
# Column names: six roster slots and two lead slots for each player.
player1_cols, player2_cols = (
    [f'j1_{i}' for i in range(1, 7)],
    [f'j2_{i}' for i in range(1, 7)],
)
player1_lead_cols, player2_lead_cols = (
    [f'j1_l{i}' for i in range(1, 3)],
    [f'j2_l{i}' for i in range(1, 3)],
)

def add_stats_columns(df, cols):
    """Normalise the given name columns and append their base-stat columns.

    For every column in ``cols`` the names are passed through
    ``normalize_name``; each normalised name is then looked up with
    ``get_stats`` and the resulting stat dict is expanded into columns named
    ``'<col>_<stat>'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cols : iterable of str
        Names of the columns holding Pokémon names.

    Returns
    -------
    pandas.DataFrame
        A new frame with the normalised name columns followed by the stat
        columns, in the order of ``cols``.
    """
    # Work on a copy so the caller's frame is not changed as a side effect.
    result = df.copy()
    stat_frames = []

    for col in cols:
        result[col] = result[col].apply(normalize_name)
        stats = result[col].apply(get_stats)
        stats_df = pd.json_normalize(stats.tolist()).add_prefix(f'{col}_')
        # json_normalize yields a fresh 0..n-1 index; restore the frame's own
        # index so the column-wise concat aligns row for row.
        stats_df.index = result.index
        stat_frames.append(stats_df)

    # One concat at the end instead of growing the frame inside the loop.
    return pd.concat([result] + stat_frames, axis=1)

# Attach base stats for player 1's roster, then for player 2's.
for roster_cols in (player1_cols, player2_cols):
    df = add_stats_columns(df, roster_cols)

# Lead columns only get their names normalised; no stat columns are added.
lead_cols_to_normalize = player1_lead_cols + player2_lead_cols

for col in lead_cols_to_normalize:
    normalized_leads = df[col].apply(normalize_name)
    df[col] = normalized_leads


In [287]:
# Preview the first rows of the frame with the stat columns attached.
df.head(n=5)

Unnamed: 0,j1_1,j1_2,j1_3,j1_4,j1_5,j1_6,j2_1,j2_2,j2_3,j2_4,...,j2_5_def,j2_5_spa,j2_5_spd,j2_5_spe,j2_6_hp,j2_6_atk,j2_6_def,j2_6_spa,j2_6_spd,j2_6_spe
0,incineroar,lunala,ragingbolt,rillaboom,terapagos,volcarona,calyrexshadow,incineroar,ogerponhearthflame,ragingbolt,...,90.0,60.0,70.0,85.0,92.0,120.0,140.0,80.0,140.0,128.0
1,incineroar,lunala,ragingbolt,rillaboom,terapagos,volcarona,calyrexshadow,incineroar,ogerponhearthflame,ragingbolt,...,90.0,60.0,70.0,85.0,92.0,120.0,140.0,80.0,140.0,128.0
2,brutebonnet,calyrexshadow,chiyu,indeedeef,koraidon,whimsicott,grimmsnarl,incineroar,lunala,miraidon,...,105.0,45.0,80.0,50.0,100.0,130.0,100.0,63.0,60.0,97.0
3,brutebonnet,calyrexshadow,chiyu,indeedeef,koraidon,whimsicott,grimmsnarl,incineroar,lunala,miraidon,...,105.0,45.0,80.0,50.0,100.0,130.0,100.0,63.0,60.0,97.0
4,brutebonnet,calyrexshadow,chiyu,indeedeef,koraidon,whimsicott,grimmsnarl,incineroar,lunala,miraidon,...,105.0,45.0,80.0,50.0,100.0,130.0,100.0,63.0,60.0,97.0


In [288]:
# Every column that holds a (normalised) Pokémon name.
cols = [*player1_cols, *player2_cols, *player1_lead_cols, *player2_lead_cols]

# Fit one encoder on all distinct names so codes are shared across columns.
all_vals = pd.unique(df[cols].to_numpy().ravel())
enc2 = LabelEncoder().fit(all_vals)

for col in cols:
    encoded_column = enc2.transform(df[col])
    df[col] = encoded_column


In [289]:
# Drop every row that has a missing value in any column, then save the result.
df = df.dropna(axis=0, how="any")
df.to_csv(path_or_buf="vgc_encoded.csv", index=False)

In [290]:
# Serialize the second encoder (fitted on normalised names).
with open("encoder2.pkl", "wb") as encoder_file:
    encoder_file.write(pickle.dumps(enc2))

In [291]:
# Restore the second encoder from disk. Unpickling executes code from the
# file, so only load a pickle whose origin is trusted.
with open("encoder2.pkl", "rb") as encoder_file:
    enc2 = pickle.loads(encoder_file.read())

In [292]:
# Map the integer codes in every name column back to the normalised names.
for col in [*player1_cols, *player2_cols, *player1_lead_cols, *player2_lead_cols]:
    decoded_column = enc2.inverse_transform(df[col])
    df[col] = decoded_column

df.head(n=5)

Unnamed: 0,j1_1,j1_2,j1_3,j1_4,j1_5,j1_6,j2_1,j2_2,j2_3,j2_4,...,j2_5_def,j2_5_spa,j2_5_spd,j2_5_spe,j2_6_hp,j2_6_atk,j2_6_def,j2_6_spa,j2_6_spd,j2_6_spe
0,incineroar,lunala,ragingbolt,rillaboom,terapagos,volcarona,calyrexshadow,incineroar,ogerponhearthflame,ragingbolt,...,90.0,60.0,70.0,85.0,92.0,120.0,140.0,80.0,140.0,128.0
1,incineroar,lunala,ragingbolt,rillaboom,terapagos,volcarona,calyrexshadow,incineroar,ogerponhearthflame,ragingbolt,...,90.0,60.0,70.0,85.0,92.0,120.0,140.0,80.0,140.0,128.0
2,brutebonnet,calyrexshadow,chiyu,indeedeef,koraidon,whimsicott,grimmsnarl,incineroar,lunala,miraidon,...,105.0,45.0,80.0,50.0,100.0,130.0,100.0,63.0,60.0,97.0
3,brutebonnet,calyrexshadow,chiyu,indeedeef,koraidon,whimsicott,grimmsnarl,incineroar,lunala,miraidon,...,105.0,45.0,80.0,50.0,100.0,130.0,100.0,63.0,60.0,97.0
4,brutebonnet,calyrexshadow,chiyu,indeedeef,koraidon,whimsicott,grimmsnarl,incineroar,lunala,miraidon,...,105.0,45.0,80.0,50.0,100.0,130.0,100.0,63.0,60.0,97.0


In [293]:
# Every column that holds a Pokémon name.
cols = [*player1_cols, *player2_cols, *player1_lead_cols, *player2_lead_cols]

# Refit the encoder on the names still present in `df` and encode them.
all_vals = pd.unique(df[cols].to_numpy().ravel())
enc2 = LabelEncoder().fit(all_vals)

for col in cols:
    encoded_column = enc2.transform(df[col])
    df[col] = encoded_column

# Rebuild the column-name lists (same values as when they were first defined).
player1_cols, player2_cols = (
    [f'j1_{i}' for i in range(1, 7)],
    [f'j2_{i}' for i in range(1, 7)],
)
player1_lead_cols, player2_lead_cols = (
    [f'j1_l{i}' for i in range(1, 3)],
    [f'j2_l{i}' for i in range(1, 3)],
)


from sklearn.model_selection import train_test_split
# 70% train; the remaining 30% is halved into validation and test.
df_train, df_holdout = train_test_split(df, test_size=0.3, random_state=42)
df_val, df_test = train_test_split(df_holdout, test_size=0.5, random_state=42)


In [294]:
# Only the final expression is displayed: the shape of the training split.
df.head(n=5)
df_train.shape

(3332, 88)

In [295]:
def _slot_column_groups(column_names, player, fallback_positions):
    """Return an array of column positions, one row per roster slot of `player`.

    Each row holds the position of the slot's name column ('<player>_<n>')
    followed by the positions of every column named '<player>_<n>_...'.
    When the frame does not have the six named slot columns, the given
    positional fallback is used with one column per slot.
    """
    bases = [f'{player}_{slot}' for slot in range(1, 7)]
    if not all(base in column_names for base in bases):
        return np.array([[position] for position in fallback_positions])

    groups = []
    for base in bases:
        positions = [column_names.index(base)]
        positions += [k for k, name in enumerate(column_names) if name.startswith(base + '_')]
        groups.append(positions)

    if len({len(group) for group in groups}) != 1:
        raise ValueError(
            f"Slots of {player} do not all have the same number of attached columns"
        )
    return np.array(groups)


def permute_teams(df, n_aug=3, seed=None):
    """Augment a dataset by shuffling the roster order of both players.

    The result contains the original rows followed by ``n_aug`` shuffled
    copies. In every copy, each row gets an independent random reordering of
    player 1's six slots and of player 2's six slots. A slot is moved as a
    unit: its name column together with every column named
    '<slot>_<suffix>', so per-slot attributes stay with their Pokémon. All
    other columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. If it has no columns named j1_1..j1_6 / j2_1..j2_6, the
        first six columns are treated as player 1's slots and the next six
        as player 2's.
    n_aug : int, default 3
        Number of shuffled copies to generate per row.
    seed : int, optional
        Seed for a dedicated random generator. When omitted, NumPy's global
        random state is used.

    Returns
    -------
    numpy.ndarray
        Array of shape ``((n_aug + 1) * len(df), df.shape[1])`` built from
        ``df.values``.

    Raises
    ------
    ValueError
        If the slots of one player have differing numbers of attached columns.
    """
    rng = np.random if seed is None else np.random.default_rng(seed)

    X_orig = df.values
    column_names = [str(name) for name in df.columns]

    slot_groups = (
        _slot_column_groups(column_names, 'j1', [0, 1, 2, 3, 4, 5]),
        _slot_column_groups(column_names, 'j2', [6, 7, 8, 9, 10, 11]),
    )

    # Collect the copies and stack once, instead of re-stacking every round.
    blocks = [X_orig.copy()]
    for _ in range(n_aug):
        X_perm = X_orig.copy()
        for i in range(X_orig.shape[0]):
            for groups in slot_groups:
                order = rng.permutation(len(groups))
                # Destination slot k receives all columns of slot order[k].
                X_perm[i, groups.ravel()] = X_orig[i, groups[order].ravel()]
        blocks.append(X_perm)

    return np.vstack(blocks)

# Usage: seed NumPy's global random state first so the augmented training set
# is reproducible, in line with the fixed random_state used for the split.
np.random.seed(42)
X_train_aug = permute_teams(df_train, n_aug=6)
print("Tamaño original:", df_train.shape)
print("Tamaño aumentado:", X_train_aug.shape)

# Wrap the augmented array back into a frame with the original column names.
df_train = pd.DataFrame(X_train_aug, columns=df_train.columns)

Tamaño original: (3332, 88)
Tamaño aumentado: (23324, 88)


In [296]:
# Write the three splits to disk, without the index column.
for split_frame, split_path in (
    (df_train, "vgc_encoded_augmentation_train.csv"),
    (df_val, "vgc_encoded_augmentation_val.csv"),
    (df_test, "vgc_encoded_augmentation_test.csv"),
):
    split_frame.to_csv(split_path, index=False)