In [56]:

# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import xgboost as xgb

# Load data
pokemon = pd.read_csv("../data/pokemon.csv")
combats = pd.read_csv("../data/combats.csv")
type_chart = pd.read_csv("../data/Pokemon Type Chart.csv")
pokemon.rename(columns={"#": "ID"}, inplace=True)


In [57]:

def calculate_stat(base, level=50, iv=31, ev=0, nature=1.0, is_hp=False):
    if is_hp:
        return int((((2 * base + iv + (ev // 4)) * level) / 100) + level + 10)
    else:
        return int(((((2 * base + iv + (ev // 4)) * level) / 100) + 5) * nature)

def compute_type_effectiveness(attacker_types, defender_types, chart_df):
    effectiveness = 1.0
    for atk_type in attacker_types:
        for def_type in defender_types:
            try:
                multiplier = chart_df.loc[chart_df['Attacking'] == atk_type, def_type].values[0]
                effectiveness *= multiplier
            except (KeyError, IndexError):
                effectiveness *= 1.0
    return effectiveness


In [58]:

features = []
labels = []

for _, row in combats.iterrows():
    p1 = pokemon[pokemon['ID'] == row['First_pokemon']].iloc[0]
    p2 = pokemon[pokemon['ID'] == row['Second_pokemon']].iloc[0]

    stats_1 = [
        calculate_stat(p1["HP"], is_hp=True),
        calculate_stat(p1["Attack"]),
        calculate_stat(p1["Defense"]),
        calculate_stat(p1["Sp. Atk"]),
        calculate_stat(p1["Sp. Def"]),
    ]
    stats_2 = [
        calculate_stat(p2["HP"], is_hp=True),
        calculate_stat(p2["Attack"]),
        calculate_stat(p2["Defense"]),
        calculate_stat(p2["Sp. Atk"]),
        calculate_stat(p2["Sp. Def"]),
    ]

    bst1 = sum(stats_1)
    bst2 = sum(stats_2)

    t1 = [p1["Type 1"]] + ([p1["Type 2"]] if pd.notna(p1["Type 2"]) else [])
    t2 = [p2["Type 1"]] + ([p2["Type 2"]] if pd.notna(p2["Type 2"]) else [])
    type_diff = compute_type_effectiveness(t1, t2, type_chart) - compute_type_effectiveness(t2, t1, type_chart)

    features.append(stats_1 + stats_2 + [bst1, bst2, type_diff])
    labels.append(1 if row["Winner"] == row["First_pokemon"] else 0)

X = np.array(features)
y = np.array(labels)


In [59]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

lr = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

lr.fit(X_train_scaled, y_train)
rf.fit(X_train_scaled, y_train)
xgb_model.fit(X_train_scaled, y_train)

# Evaluation
lr_acc = accuracy_score(y_test, lr.predict(X_test_scaled))
rf_acc = accuracy_score(y_test, rf.predict(X_test_scaled))
xgb_acc = accuracy_score(y_test, xgb_model.predict(X_test_scaled))

print(f"Logistic Regression Accuracy: {lr_acc:.4f}")
print(f"Random Forest Accuracy: {rf_acc:.4f}")
print(f"XGBoost Accuracy: {xgb_acc:.4f}")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Logistic Regression Accuracy: 0.6969
Random Forest Accuracy: 0.8153
XGBoost Accuracy: 0.8439


In [60]:

accuracies = {'Logistic Regression': lr_acc, 'Random Forest': rf_acc, 'XGBoost': xgb_acc}
best_name = max(accuracies, key=accuracies.get)
best_model = {'Logistic Regression': lr, 'Random Forest': rf, 'XGBoost': xgb_model}[best_name]

print(f"Best model: {best_name} with accuracy: {accuracies[best_name]:.4f}")

joblib.dump(best_model, "../models/model.pkl")
joblib.dump(scaler, "../models/scaler.pkl")
print("Model and scaler saved.")


Best model: XGBoost with accuracy: 0.8439
Model and scaler saved.
