# Pokémon Battle Winner Prediction Notebook


In [79]:
import pandas as pd
import numpy as np
from typing import Tuple
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

In [80]:
# Paths
base = './datasets/'
paths = {
    'combats': base + 'combats.csv',
    'type_chart': base + 'type_chart.csv',
    'pokemon': base + 'pokemon.csv',
    'team_combat': base + 'team_combat.csv',
    'teams': base + 'pokemon_id_each_team.csv'
}

# Load data
df_combat = pd.read_csv(paths['combats'])
df_types = pd.read_csv(paths['type_chart']).fillna('None')
df_pokemon = pd.read_csv(paths['pokemon']).set_index('#')
df_team_combat = pd.read_csv(paths['team_combat'])
df_teams = pd.read_csv(paths['teams']).set_index('#')


In [81]:
# Prepare 1v1 battles
df = (
    df_combat
    .join(df_pokemon, on='First_pokemon', rsuffix='_A')
    .join(df_pokemon, on='Second_pokemon', rsuffix='_B')
    .fillna('None')
)

# Map winner to binary
df['Winner'] = np.where(df['Winner'] == df['First_pokemon'], 0, 1)

In [82]:
df

Unnamed: 0,First_pokemon,Second_pokemon,Winner,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,...,Type 1_B,Type 2_B,HP_B,Attack_B,Defense_B,Sp. Atk_B,Sp. Def_B,Speed_B,Generation_B,Legendary_B
0,266,298,1,Larvitar,Rock,Ground,50,64,50,45,...,Grass,Dark,70,70,40,60,40,60,3,False
1,702,701,1,Virizion,Grass,Fighting,91,90,72,90,...,Rock,Fighting,91,129,90,72,90,108,5,True
2,191,668,1,Togetic,Fairy,Flying,55,40,85,80,...,Psychic,,75,75,75,125,95,40,5,False
3,237,683,1,Slugma,Fire,,40,40,40,70,...,Dragon,,77,120,90,60,90,48,5,False
4,151,231,0,Omastar,Rock,Water,70,60,125,115,...,Bug,Rock,20,10,230,10,230,5,2,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,707,126,0,Reshiram,Dragon,Fire,100,120,100,150,...,Water,,30,40,70,70,25,60,1,False
49996,589,664,0,Drilbur,Ground,,60,85,40,30,...,Electric,,35,55,40,45,40,60,5,False
49997,303,368,1,Pelipper,Water,Flying,60,50,100,85,...,Normal,,73,115,60,60,60,90,3,False
49998,109,89,0,Voltorb,Electric,,40,30,50,55,...,Electric,Steel,25,35,70,95,55,45,1,False


In [83]:
mult_dict = {
    (row['attack'], row['defense1']): row['multiplier']
    for _, row in df_types.iterrows()
}

def add_multiplier_fast(df: pd.DataFrame, atk: str, def1: str, col: str) -> pd.DataFrame:
    df[col] = df.apply(
        lambda row: mult_dict.get((row[atk], row[def1]), 1.0),
        axis=1
    )
    return df

df = add_multiplier_fast(df, 'Type 1', 'Type 1_B', 'mult_A_to_B')
df = add_multiplier_fast(df, 'Type 2', 'Type 1_B', 'mult2_A_to_B')
df = add_multiplier_fast(df, 'Type 1_B', 'Type 1', 'mult_B_to_A')
df = add_multiplier_fast(df, 'Type 2_B', 'Type 1', 'mult2_B_to_A')

In [84]:
stats = ['HP','Attack','Defense','Sp. Atk','Sp. Def','Speed']
for s in stats:
    df[f'delta_{s}'] = df[f'{s}'] - df[f'{s}_B']

In [85]:
df = df.drop(columns=[
    'Name', 'Name_B', 'Type 1', 'Type 2', 'Type 1_B', 'Type 2_B', 'Generation', 'Generation_B',
    'Sp. Atk', 'Sp. Atk_B', 'Sp. Def', 'Sp. Def_B', 'multiplier', 
    'First_pokemon', 'Second_pokemon', 'HP', 'Attack', 'Defense', 
    'HP_B', 'Attack_B', 'Defense_B', 'Speed', 'Speed_B'
], errors='ignore')

df

Unnamed: 0,Winner,Legendary,Legendary_B,mult_A_to_B,mult2_A_to_B,mult_B_to_A,mult2_B_to_A,delta_HP,delta_Attack,delta_Defense,delta_Sp. Atk,delta_Sp. Def,delta_Speed
0,1,False,False,1.0,0.5,2.0,0.5,-20,-6,10,-15,10,-19
1,1,True,True,2.0,1.0,1.0,0.5,0,-39,-18,18,39,0
2,1,False,False,1.0,1.0,0.5,1.0,-20,-35,10,-45,10,0
3,1,False,False,0.5,1.0,0.0,1.0,-37,-80,-50,10,-50,-28
4,0,False,False,2.0,1.0,0.5,1.0,50,50,-105,105,-160,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0,True,False,0.0,0.5,0.5,1.0,70,80,30,80,95,30
49996,0,False,False,2.0,1.0,0.0,1.0,25,30,0,-15,5,8
49997,1,False,False,1.0,1.0,1.0,1.0,-13,-65,40,25,10,-25
49998,0,False,False,0.5,1.0,0.5,1.0,15,-5,-20,-40,0,55


In [86]:
# Select features and target
features = [c for c in df.columns if c.startswith('delta_') or c.startswith('mult')]
features += ['Legendary', 'Legendary_B']
X = df[features]
y = df['Winner']

# Preprocessing
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), [c for c in X if X[c].dtype in [np.int64, np.float64]]),
    ('cat', OneHotEncoder(drop='first', sparse_output=False), ['Legendary','Legendary_B'])
])

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [87]:
# Models
models = {
    'Logistic L1': Pipeline([('pre', preprocessor), ('clf', LogisticRegression(penalty='l1', solver='liblinear'))]),
    'RandomForest': Pipeline([('pre', preprocessor), ('clf', RandomForestClassifier(n_estimators=100, random_state=42))]),
    'XGBoost': Pipeline([('pre', preprocessor), ('clf', XGBClassifier(eval_metric='logloss', random_state=42))])
}

# Train & evaluate
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(f"\n{name} accuracy: {accuracy_score(y_test, preds):.3f}")
    print(classification_report(y_test, preds))


Logistic L1 accuracy: 0.886
              precision    recall  f1-score   support

           0       0.89      0.87      0.88      4753
           1       0.89      0.90      0.89      5247

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000


RandomForest accuracy: 0.958
              precision    recall  f1-score   support

           0       0.95      0.96      0.96      4753
           1       0.96      0.95      0.96      5247

    accuracy                           0.96     10000
   macro avg       0.96      0.96      0.96     10000
weighted avg       0.96      0.96      0.96     10000


XGBoost accuracy: 0.956
              precision    recall  f1-score   support

           0       0.95      0.96      0.95      4753
           1       0.96      0.95      0.96      5247

    accuracy                           0.96     10000
   macro avg       0.96      0.96      0.96 