In [35]:
import warnings

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion

warnings.filterwarnings(action='ignore', category=FutureWarning, module='sklearn')

In [27]:
# The number of pokemon appearing in each dataset, times 2 (once per team).
# These values are used further down as the position where the categorical
# feature columns end and the numeric feature columns begin.
ouk = 1108
vgcSunk = 788

In [28]:
# Read the VGC and OU tables from their CSV files in the working directory.
vgcdf = pd.read_csv("vgcSun_hot.csv")
oudf = pd.read_csv("ou_hot.csv")

In [29]:
# Label column; every other column of each table is used as a model input.
target = "winner"
vgcfeatures = vgcdf.columns.drop(target)
oufeatures = oudf.columns.drop(target)

In [30]:
# Fixed seed so the 80/20 train/validation split -- and every score derived
# from it -- is identical after Restart Kernel -> Run All.
SPLIT_SEED = 42

ouXTrain, ouXValidate, ouYTrain, ouYValidate = train_test_split(
    oudf[oufeatures], oudf[target],
    train_size=0.8, test_size=0.2, random_state=SPLIT_SEED)

vgcXTrain, vgcXValidate, vgcYTrain, vgcYValidate = train_test_split(
    vgcdf[vgcfeatures], vgcdf[target],
    train_size=0.8, test_size=0.2, random_state=SPLIT_SEED)

In [31]:
# Split each feature list by position: the first `ouk` / `vgcSunk` columns are
# handled as categorical features, all remaining columns as numeric features.
categorical_oufeatures, numeric_oufeatures = oufeatures[:ouk], oufeatures[ouk:]
categorical_vgcfeatures, numeric_vgcfeatures = vgcfeatures[:vgcSunk], vgcfeatures[vgcSunk:]

In [38]:
# Feature Selection Pipelines
def makeFeatureSelector(categorical_cols, numeric_cols, k_categorical=291, k_numeric=90):
    """Build the per-column-group feature selector shared by both formats.

    Parameters
    ----------
    categorical_cols : column labels scored with the chi-squared test.
    numeric_cols : column labels scored with the ANOVA F-test.
    k_categorical : number of categorical columns to keep (default 291).
    k_numeric : number of numeric columns to keep (default 90).

    Returns
    -------
    An unfitted ColumnTransformer with a 'cats' and a 'nums' SelectKBest step.
    Columns not listed in either group are dropped (ColumnTransformer default).
    """
    return ColumnTransformer([
        ('cats', SelectKBest(score_func=chi2, k=k_categorical), categorical_cols),
        ('nums', SelectKBest(score_func=f_classif, k=k_numeric), numeric_cols)])

# chi2 and f_classif come from sklearn.feature_selection (see the imports cell).
oufeats = makeFeatureSelector(categorical_oufeatures, numeric_oufeatures)
vgcfeats = makeFeatureSelector(categorical_vgcfeatures, numeric_vgcfeatures)

In [39]:
# Fixed seed so the forests (bootstrap samples and per-split feature draws)
# are rebuilt identically on every run.
MODEL_SEED = 42

def makeModel(feature_selector):
    """Return an unfitted pipeline: feature selection followed by a random forest.

    `feature_selector` is the transformer used as the "feats" step; the "Rand"
    step is a 100-tree forest that considers 97% of the selected features at
    each split and trains on all available cores.
    """
    return Pipeline([
        ("feats", feature_selector),
        ("Rand", RandomForestClassifier(n_estimators=100, n_jobs=-1,
                                        max_features=0.97,
                                        random_state=MODEL_SEED))
    ])

ouModel = makeModel(oufeats)
vgcModel = makeModel(vgcfeats)

In [40]:
# Train each pipeline on its own training split. The VGC fit stays last so the
# cell still displays the fitted VGC pipeline.
ouModel.fit(X=ouXTrain, y=ouYTrain)
vgcModel.fit(X=vgcXTrain, y=vgcYTrain)

Pipeline(memory=None,
         steps=[('feats',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('cats',
                                                  SelectKBest(k=291,
                                                              score_func=<function chi2 at 0x0000013A87E953A8>),
                                                  Index(['Abomasnow_team_1', 'Accelgor_team_1', 'Aegislash_team_1',
       'Aerodactyl_team_1', 'Aggron_team_1', 'Alakazam_team_1',
       'Alomomola_team_...
                ('Rand',
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features=0.97, max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,


In [41]:
# Used to swap team 1 and team 2
# Layout of the per-team statistic groups that follow the k pokemon columns.
# Each entry is (offset from k, width): `width` team-1 columns immediately
# followed by `width` team-2 columns.
_TEAM_STAT_GROUPS = (
    (0, 19),    # types
    (38, 6),    # mean
    (50, 6),    # std
    (62, 18),   # typedef
    (98, 1),    # mean types
    (100, 1),   # immunities
    (102, 8),   # immunity
    (118, 1),   # superweaknesses
    (120, 6),   # max
    (132, 6),   # min
)
# Total number of statistic columns covered by the groups above.
_TEAM_STAT_WIDTH = 144


def switchTeams(x, k):
    """Return a copy of one feature row with team 1 and team 2 exchanged.

    Parameters
    ----------
    x : pandas Series, numpy array or list
        One row of features: `k` pokemon columns (team 1 in the first half,
        team 2 in the second half) followed by at least 144 statistic columns
        laid out as described by `_TEAM_STAT_GROUPS`. Anything after those 144
        columns is left untouched.
    k : int
        Number of pokemon columns; must be even.

    Returns
    -------
    An object of the same kind as `x` (a Series keeps its index, name and
    dtype) holding the swapped values. `x` itself is not modified.

    Raises
    ------
    ValueError
        If `k` is odd, so the pokemon columns cannot be split into two teams.
    IndexError
        If `x` has fewer than `k + 144` entries.
    """
    if k % 2:
        raise ValueError("k must be even (half of the columns per team), got %r" % (k,))
    if len(x) < k + _TEAM_STAT_WIDTH:
        raise IndexError("expected at least %d values, got %d"
                         % (k + _TEAM_STAT_WIDTH, len(x)))

    if isinstance(x, pd.Series):
        # Swap on the underlying array so every access is purely positional:
        # integer keys on a label-indexed Series are deprecated positional
        # fallbacks in pandas >= 2.1.
        swapped = switchTeams(np.asarray(x), k)
        return pd.Series(swapped, index=x.index, name=x.name, dtype=x.dtype)

    newx = x.copy()

    half = k // 2
    newx[half:k] = x[0:half]   # Team 1 Pokes
    newx[0:half] = x[half:k]   # Team 2 Pokes

    for offset, width in _TEAM_STAT_GROUPS:
        first = k + offset          # start of the team-1 columns
        second = first + width      # start of the team-2 columns
        end = second + width
        newx[second:end] = x[first:second]
        newx[first:second] = x[second:end]

    return newx

# Used to swap team 1 and team 2 within a dataframe
def switchTeamsPd(df, k):
    """Return a copy of `df` in which every row has team 1 and team 2 swapped.

    Parameters
    ----------
    df : pandas DataFrame
        Feature rows laid out as `switchTeams` expects.
    k : int
        Number of pokemon columns, passed through to `switchTeams`.

    Returns
    -------
    A new DataFrame with the same index and column labels as `df`. Each column
    takes the values (and dtype) of the column it is paired with; `df` is not
    modified.
    """
    if len(df) == 0:
        return df.copy()

    # The swap is the same column permutation for every row, so compute it once
    # on the column positions and apply it to the whole frame, instead of
    # rewriting the frame one row at a time.
    source_positions = switchTeams(np.arange(df.shape[1]), k)
    swapped = df.iloc[:, source_positions].copy()
    # Values have moved; the column labels stay in their original order.
    swapped.columns = df.columns
    return swapped

# Combine two lists of probabilities into a mean of probabilities
def combineProbs(p1, p2):
    """Average two-class probabilities from the original and the swapped input.

    Parameters
    ----------
    p1 : array-like of shape (n, 2)
        Probabilities for the rows in their original orientation.
    p2 : array-like of shape (n, 2)
        Probabilities for the same rows with the two teams swapped, so its
        columns are in the opposite order to those of `p1`.

    Returns
    -------
    For each row, `[(p1[0] + p2[1]) / 2, (p1[1] + p2[0]) / 2]`. A numpy array
    is returned when `p1` is a numpy array, otherwise a list of lists.
    Neither input is modified.

    Raises
    ------
    ValueError
        If the inputs are not both of shape (n, 2).
    """
    if len(p1) == 0 and len(p2) == 0:
        return p1.copy()

    first = np.asarray(p1)
    second = np.asarray(p2)
    if first.ndim != 2 or first.shape[1] != 2 or first.shape != second.shape:
        raise ValueError("expected two inputs of shape (n, 2), got %s and %s"
                         % (first.shape, second.shape))

    # Reverse p2's columns so both operands are in p1's column order. True
    # division yields floats even when the inputs are integer arrays.
    combined = (first + second[:, ::-1]) / 2
    return combined if isinstance(p1, np.ndarray) else combined.tolist()

In [42]:
# Build a mirrored copy of the validation inputs (team 1 <-> team 2)
ouTeSw = switchTeamsPd(ouXValidate, ouk)
# Score both orientations with the same model, then average the two results
predProbs = combineProbs(
    ouModel.predict_proba(ouXValidate),
    ouModel.predict_proba(ouTeSw),
)
# First probability above .5 is read as a team_1 win, anything else as team_2
pred = [ "team_1" if row[0] > .5 else "team_2" for row in predProbs ]
# prediction accuracy (true labels first, predictions second)
accuracy_score(ouYValidate, pred)

0.6089334548769371

In [43]:
# Build a mirrored copy of the validation inputs (team 1 <-> team 2)
vgcTeSw = switchTeamsPd(vgcXValidate, vgcSunk)
# Score both orientations with the same model, then average the two results
predProbs = combineProbs(
    vgcModel.predict_proba(vgcXValidate),
    vgcModel.predict_proba(vgcTeSw),
)
# First probability above .5 is read as a team_1 win, anything else as team_2
pred = [ "team_1" if row[0] > .5 else "team_2" for row in predProbs ]
# prediction accuracy (true labels first, predictions second)
accuracy_score(vgcYValidate, pred)

0.625

In [44]:
from joblib import dump

# Save both fitted pipelines to the working directory. The VGC dump stays
# last so the cell still displays the list of files it wrote.
dump(value=ouModel, filename='ouModel.joblib')
dump(value=vgcModel, filename='vgcModel.joblib')

['vgcModel.joblib']