In [1]:
import numpy as np
import pandas as pd
import pystan
import matplotlib.pyplot as plt
import random
import time
%matplotlib inline

In [2]:
skill_model = """
data {
  int<lower=1> N;             # Total number of players
  int<lower=1> E;             # number of games
  real<lower=0> scale;        # scale value for probability computation
  int<lower=0,upper=1> win[E]; # PA wins vs PB
  int PA[E];                  # player info between each game
  int PB[E];                  # 
}
parameters {
  vector [N] skill;           # skill values for each player
}

model{
  for (i in 1:N){ skill[i]~normal(0,3); }
  for (i in 1:E){
    win[i] ~ bernoulli_logit( (scale)*(skill[PA[i]]-skill[PB[i]]) );
  }   # win probability is a logit function of skill difference
}
"""

In [3]:
import pickle
try:     # load it if already compiled
    sm = pickle.load(open('skill_model.pkl', 'rb'))
except:  # ow, compile and save compiled model
    sm = pystan.StanModel(model_code = skill_model)
    with open('skill_model.pkl', 'wb') as f: pickle.dump(sm, f)

In [4]:
# load data
def load_data(file_name):
    data=pd.read_csv(file_name,index_col=False,
                     names=['date', 'p1', 'p1_outcome', 'score', 'p2', 'p2_outcome', 'p1_race', 'p2_race', 'addon', 'type'])
    #drop other columns for now
    data.drop(columns=['date','score','p2_outcome','p1_race', 'p2_race', 'addon', 'type'], inplace=True)
    data['p1_outcome'].replace({"[loser]":0,"[winner]":1},inplace=True)
    return data

In [5]:
train_data = load_data('train.csv')
train_data

Unnamed: 0,p1,p1_outcome,p2
0,MC,0,Stats
1,MC,0,Dark
2,MC,0,INnoVation
3,MC,0,TRUE
4,MC,1,Super
...,...,...,...
193069,Keiras,0,Harpner
193070,Keiras,1,Harpner
193071,Keiras,0,maTTzour
193072,Keiras,0,nukestrike


In [6]:
class Player:
    def __init__(self, id, name):
        self.id = id
        self.name = name
        self.games = []
        self.skill = None
        
    def add_match(self, name, outcome):
        self.games.append((name, outcome))

In [7]:
# list of players
player_names = np.unique(np.concatenate((train_data['p1'], train_data['p2'])))

players = dict()
for i in range(len(player_names)):
    name = player_names[i]
    players[name] = Player(i+1, name)
    
for game in train_data.to_numpy():
    p1, win, p2 = game
    players[p1].add_match(p2, win)

In [8]:
def create_skill_data(players, n_games, scale, verbose = False):
    n_player = len(players)
    sample_size = round(n_games / n_player)
    if verbose:
        print("Number of players:", n_player)
        print("Number of games to consider for each player:", sample_size)
    win = []
    pa = []
    pb = []
    for p in players.values():
        games = p.games if len(p.games) <= sample_size \
                        else [p.games[i] for i in np.random.choice(len(p.games), sample_size, replace = False)]
        for g in games:
            win.append(g[1])
            pa.append(p.id)
            pb.append(players[g[0]].id)
            
    if verbose:
        print("Number of games fetched:", len(win))
        print("Number of p1:", len(pa))
        print("Number of p2:", len(pb))
     
    return {
        'N': n_player,
        'E': len(win),
        'scale': scale,
        'win': win,
        'PA': pa,
        'PB': pb
    }

In [9]:
skill_data = create_skill_data(players, 50000, 0.5, True)

Number of players: 999
Number of games to consider for each player: 50
Number of games fetched: 47254
Number of p1: 47254
Number of p2: 47254


In [10]:
import time as tm
def grid_search(data, iter_range, chain_range):
    fits = {}
    times = []
    for x in iter_range:
        for y in chain_range:
            start = tm.perf_counter()
            fits[(x,y)] = sm.sampling(data=data, iter=x, chains=y)
            times.append(tm.perf_counter() - start)
            print((x,y), times[-1])
    return fits, times

In [11]:
fits, time = grid_search(skill_data, [1000], [4])

(1000, 4) 732.5755654999999


# Validation

In [12]:
valid_data = load_data('valid.csv')
valid_data

Unnamed: 0,p1,p1_outcome,p2
0,MC,1,NaTuRal
1,MC,0,Cure
2,MC,1,Journey
3,MC,0,Trust
4,MC,0,Billowy
...,...,...,...
94002,Keiras,0,Dragon
94003,Keiras,0,nExt
94004,Keiras,0,EJK
94005,Keiras,0,DeViL


In [13]:
playerid = {}
for p in players:
    playerid[p] = players[p].id
    
valid_data['p1'].replace(playerid,inplace=True)
valid_data['p2'].replace(playerid,inplace=True)
valid_games=[tuple((r[0],r[2],r[1])) for r in valid_data.to_numpy()]
#valid_games

In [14]:
def logit(z): return 1./(1.+np.exp(-z))
def validate(fits, valid_games, n_games):
    accuracy = []
    for fit in fits:
        samples = fits[fit].extract()
        acc = 0
        n = n_games
        for g in [valid_games[i] for i in np.random.choice(len(valid_games), n, replace = False)]:
            try:
                i,j,result=int(g[0]),int(g[1]),int(g[2])
            except:
                n-=1
                continue

            prob = logit( skill_data['scale']*(samples['skill'][:,i-1]-samples['skill'][:,j-1]) ).mean()
            pred = np.random.choice([0,1], p=[prob, 1-prob])
            acc += (pred==result)
        accuracy.append(acc/n)
        print(fit, accuracy[-1], n)
    return accuracy

In [15]:
accuracy = validate(fits, valid_games, 10000)
accuracy

(1000, 4) 0.4084 10000


[0.4084]