## Installs and Imports

### Installs

In [10]:
%pip install seaborn --quiet
%pip install requests --quiet
%pip install scikit-learn --quiet

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Imports

In [46]:
import numpy as nu
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import re
import json
from sklearn.preprocessing import LabelEncoder
import pickle


# ETL

## Extract

In [40]:
def get_teams(log):
    """Extract both players' team-preview rosters from a battle log.

    Reads the ``|poke|PLAYER|DETAILS|`` lines located between the first
    ``|clearpoke`` marker and the following ``|teampreview`` marker. The name
    kept for each entry is the part of DETAILS before the first comma.

    Entries are assigned to a side using the PLAYER field (``p1`` / ``p2``)
    rather than their position in the log, so the result does not depend on
    the order in which the lines were emitted.

    Parameters
    ----------
    log : str
        Raw text of a replay log.

    Returns
    -------
    tuple
        ``(team1, team2)``, each an alphabetically sorted list of six names,
        or ``(None, None)`` when the log has no team-preview section or either
        side does not list exactly six entries.
    """
    try:
        preview = log.split('|clearpoke')[1].split('|teampreview')[0]
    except (AttributeError, IndexError, TypeError):
        # Not a string, or no '|clearpoke' section in this log.
        return None, None

    rosters = {'p1': [], 'p2': []}
    for line in preview.split('\n'):
        if not line.startswith('|poke|'):
            continue
        fields = line.split('|')
        # Skip malformed entries and any player slot other than p1/p2.
        if len(fields) < 4 or fields[2] not in rosters:
            continue
        rosters[fields[2]].append(fields[3].split(',')[0])

    if len(rosters['p1']) != 6 or len(rosters['p2']) != 6:
        return None, None

    return sorted(rosters['p1']), sorted(rosters['p2'])


In [41]:
def get_leads(log):
    """Extract each player's two lead Pokémon from a battle log.

    The first four ``|switch|SLOT|DETAILS|...`` lines after the first
    ``|start`` marker are taken as the leads. The side is read from the first
    two characters of SLOT (``p1`` / ``p2``) and the name is the part of
    DETAILS before the first comma.

    Parameters
    ----------
    log : str
        Raw text of a replay log.

    Returns
    -------
    tuple
        ``(leads1, leads2)``, each an alphabetically sorted list of two names,
        or ``(None, None)`` when the log has no ``|start`` marker, has fewer
        than four switch lines, contains a malformed or unknown slot, or the
        four lines do not split two per side.
    """
    try:
        after_start = log.split('|start')[1]
    except (AttributeError, IndexError, TypeError):
        # Not a string, or the battle never started.
        return None, None

    switch_lines = [ln for ln in after_start.split('\n') if ln.startswith('|switch|')]
    if len(switch_lines) < 4:
        return None, None

    leads = {'p1': [], 'p2': []}
    for line in switch_lines[:4]:
        fields = line.split('|')
        if len(fields) < 4:
            return None, None
        side = fields[2][:2]
        # Do not silently attribute an unknown slot to player 2.
        if side not in leads:
            return None, None
        leads[side].append(fields[3].split(',')[0])

    # An uneven split (e.g. 3 vs 1) means the first four switches were not the
    # opening leads of both sides, so the result would be mislabeled.
    if len(leads['p1']) != 2 or len(leads['p2']) != 2:
        return None, None

    return sorted(leads['p1']), sorted(leads['p2'])

In [14]:
# Replay search endpoint; the literal PAGE is replaced by the page number.
public_replays_url = 'https://replay.pokemonshowdown.com/api/replays/search?username=&format=gen9vgc2025regi&page=PAGE'

replay_ids = []

# One session for all pages so the underlying HTTP connection is reused.
with requests.Session() as session:
    for page in range(1, 100):
        # The timeout keeps a stalled connection from hanging the cell forever.
        response = session.get(public_replays_url.replace('PAGE', str(page)), timeout=30)
        # Fail with the HTTP error itself instead of a confusing JSON decode error.
        response.raise_for_status()

        # The first character of the body is skipped before parsing
        # (presumably a non-JSON prefix sent by the endpoint - TODO confirm).
        replay_ids.extend(metadata['id'] for metadata in json.loads(response.text[1:]))

print('Collected ', len(replay_ids), 'replay IDS')

Collected  5049 replay IDS


In [None]:
logs = []

# Thousands of sequential downloads: reuse one HTTP session, and bound each
# request so a single stalled connection cannot hang the cell.
with requests.Session() as session:
    for replay_id in replay_ids:
        r = session.get(f'https://replay.pokemonshowdown.com/{replay_id}.log', timeout=30)
        # The body is stored as-is whatever the status code, so logs[i] always
        # corresponds to replay_ids[i]; unusable bodies are filtered out later.
        logs.append(r.text)

len(logs)

#

5049

In [36]:
# Sanity check: count the downloaded logs that contain a '|clearpoke' marker.
with_clearpoke = list(filter(lambda text: '|clearpoke' in text, logs))
print("Logs con clearpoke:", len(with_clearpoke))

Logs con clearpoke: 5049


In [37]:
# Sanity check: count the logs that list team-preview entries for both players.
with_poke = [
    text for text in logs
    if all(tag in text for tag in ("|poke|p1", "|poke|p2"))
]
print("Logs que sí tienen poke:", len(with_poke))

Logs que sí tienen poke: 5049


In [38]:
# Sanity check: count the logs in which the battle actually started.
# Stored under its own name so it no longer overwrites `with_poke`, which
# holds the result of a different filter.
with_start = [log for log in logs if "|start" in log]
print("Logs que sí tienen start:", len(with_start))

Logs que sí tienen start: 5037


In [42]:
valid = 0
invalid = 0
data = []

for log in logs:
    team1, team2 = get_teams(log)
    lead1, lead2 = get_leads(log)

    # Both extractors return (None, None) for logs they cannot parse.
    if team1 is None or team2 is None or lead1 is None or lead2 is None:
        invalid += 1
        continue

    # A row fills the 16 columns positionally: 6 + 6 team slots, then 2 + 2
    # leads. A lopsided lead split (e.g. 1 vs 3) still yields 16 values but
    # would place a player-2 lead in a player-1 column, so it is rejected.
    if (len(team1), len(team2), len(lead1), len(lead2)) != (6, 6, 2, 2):
        invalid += 1
        continue

    data.append(team1 + team2 + lead1 + lead2)
    valid += 1

print("Logs válidos:", valid)
print("Logs inválidos:", invalid)


# j1_* / j2_*: team slots of player 1 / player 2; *_l1, *_l2: that player's leads.
cols = [
    'j1_1','j1_2','j1_3','j1_4','j1_5','j1_6',
    'j2_1','j2_2','j2_3','j2_4','j2_5','j2_6',
    'j1_l1','j1_l2','j2_l1','j2_l2'
]

df = pd.DataFrame(data, columns=cols)
df.head()

Logs válidos: 4935
Logs inválidos: 114


Unnamed: 0,j1_1,j1_2,j1_3,j1_4,j1_5,j1_6,j2_1,j2_2,j2_3,j2_4,j2_5,j2_6,j1_l1,j1_l2,j2_l1,j2_l2
0,Calyrex-Shadow,Rillaboom,Smeargle,Tornadus,Urshifu-*,Zamazenta-*,Incineroar,Lunala,Raging Bolt,Rillaboom,Terapagos,Volcarona,Tornadus,Urshifu-Rapid-Strike,Incineroar,Terapagos
1,Amoonguss,Excadrill,Ho-Oh,Kommo-o,Lunala,Tyranitar,Calyrex-Shadow,Lugia,Maushold,Muk-Alola,Smeargle,Ting-Lu,Amoonguss,Kommo-o,Muk-Alola,Smeargle
2,Gouging Fire,Grimmsnarl,Groudon,Lilligant-Hisui,Lunala,Raging Bolt,Amoonguss,Calyrex-Shadow,Incineroar,Ogerpon-Cornerstone,Rillaboom,Terapagos,Gouging Fire,Groudon,Calyrex-Shadow,Ogerpon-Cornerstone
3,Calyrex-Shadow,Grimmsnarl,Incineroar,Landorus,Ogerpon-Wellspring,Terapagos,Amoonguss,Excadrill,Ho-Oh,Kommo-o,Lunala,Tyranitar,Calyrex-Shadow,Ogerpon-Wellspring,Ho-Oh,Kommo-o
4,Amoonguss,Calyrex-Shadow,Chi-Yu,Miraidon,Raging Bolt,Tornadus,Amoonguss,Calyrex-Shadow,Flutter Mane,Gothitelle,Incineroar,Koraidon,Chi-Yu,Miraidon,Incineroar,Koraidon


In [44]:
# Encode every name as an integer with ONE encoder shared by all columns, so a
# given name maps to the same code wherever it appears.
#
# The encoding overwrites df in place, so this cell must not do its work twice:
# a second run would fit a new encoder on the integer codes instead of the
# names, and `enc` would no longer be able to translate codes back to names.
if all(pd.api.types.is_integer_dtype(df[c]) for c in cols):
    print("df is already label-encoded; skipping re-fit.")
else:
    enc = LabelEncoder()
    all_vals = pd.unique(df[cols].values.ravel())

    enc.fit(all_vals)

    for col in cols:
        df[col] = enc.transform(df[col])

df.head(50)

Unnamed: 0,j1_1,j1_2,j1_3,j1_4,j1_5,j1_6,j2_1,j2_2,j2_3,j2_4,j2_5,j2_6,j1_l1,j1_l2,j2_l1,j2_l2
0,56,406,462,508,530,573,236,286,388,406,495,551,508,531,236,495
1,9,144,226,262,286,524,56,285,306,331,462,501,9,262,331,462
2,200,206,209,280,286,388,9,56,236,343,406,495,200,209,56,343
3,56,206,236,269,345,495,9,144,226,262,286,524,56,345,226,262
4,9,56,67,321,388,508,9,56,161,198,236,263,67,321,236,263
5,200,206,209,280,286,388,50,56,67,263,556,560,209,280,50,556
6,40,68,93,112,170,337,50,56,67,263,556,560,93,112,67,263
7,85,293,361,372,446,466,218,298,339,428,449,526,361,372,339,449
8,85,293,361,372,446,466,218,298,339,428,449,526,446,466,298,428
9,68,143,343,508,530,573,56,112,209,388,522,561,68,574,112,522


In [45]:
# Write df to a CSV file in the working directory; index=False leaves the
# DataFrame's row index out of the file.
df.to_csv('pokemon_teams_and_leads.csv', index=False)

In [47]:
# Serialize the encoder object `enc` to encoder.pkl (binary mode).
# NOTE: pickle files can execute code when loaded; only load this file from a
# trusted location.
with open("encoder.pkl", mode="wb") as encoder_file:
    pickle.dump(enc, encoder_file)