In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import networkx as nx
import csv
import random
import time
import pystan
import pandas as pd


In [2]:
def logit(z): return 1./(1.+np.exp(-z))

In [3]:
def convert_race(race):
    if race == 'Z':
        return 1
    elif race == 'P':
        return 2
    elif race == 'T':
        return 3
    else:
        return 4
    

def load_data_stan(file_name, pk=1, edge=3, nk=5):
    with open(file_name, encoding='utf-8') as f: lines = f.read().split('\n')
    p = 0; playerid = {};
    for i in range(len(lines)):
        csv = lines[i].split(',')
        if len(csv) != 10: continue   # parse error or blank line
        player0,player1 = csv[1],csv[4]
        if player0 not in playerid: playerid[player0]=p; p+=1
        if player1 not in playerid: playerid[player1]=p; p+=1

    nplayers = len(playerid)
    playername = ['']*nplayers
    for player in playerid:
        playername[ playerid[player] ]=player;  # id to name lookup


    pKeep = pk   # fraction of edges to consider (immed. throw out 1-p edges)
    nEdge = edge     # try to keep nEdge opponents per player (may be more; asymmetric)
    nKeep = nk     # keep at most nKeep games per opponent pairs (play each other multiple times)

    count = 0
    nplays, nwins = np.zeros( (nplayers,nplayers) ), np.zeros( (nplayers,nplayers) );
    player_A = []
    player_B = []
    wins = []
    RA = []
    RB = []
    for i in range(len(lines)):
        csv = lines[i].split(',')
        if len(csv) != 10: continue;   # parse error or blank line
        a,b = playerid[csv[1]],playerid[csv[4]];
        aw,bw = csv[2]=='[winner]',csv[5]=='[winner]';
        if (np.random.rand() < pKeep):
            if (nplays[a,b] < nKeep) and ( ((nplays[a,:]>0).sum() < nEdge) or ((nplays[:,b]>0).sum() < nEdge) ):
                count += 1
                nplays[a,b] += 1; nplays[b,a]+=1; nwins[a,b] += aw; nwins[b,a] += bw;
                player_A.append(a+1)
                player_B.append(b+1)
                wins.append(aw)
                RA.append(convert_race(csv[6]))
                RB.append(convert_race(csv[7]))

    win_re = []
    print(wins)
    for i in wins:
        if i:
            win_re.append(1)
        else:
            win_re.append(0)
    return player_A,player_B,win_re,RA,RB, playerid, playername

In [4]:

# skill_model = """
# data {
#   int<lower=1> N;             # Total number of players
#   int<lower=1> E;             # number of games
#   real<lower=0> scale;        # scale value for probability computation
#   int<lower=0,upper=1> win[E]; # PA wins vs PB
#   int PA[E];                  # player info between each game
#   int PB[E];                  # 
# }
# parameters {
#   vector [N] skill;           # skill values for each player
# }

# model{
#   for (i in 1:N){ skill[i]~normal(0,10); }
#   for (i in 1:E){
#     win[i] ~ bernoulli_logit( (scale)*(skill[PA[i]]-skill[PB[i]]) );
#   }   # win probability is a logit function of skill difference
# }
# """

skill_model = """
data {
  int<lower=1> N;             # Total number of players
  int<lower=1> E;             # number of games
  real<lower=0> scale;        # scale value for probability computation
  int<lower=0,upper=1> win[E]; # PA wins vs PB
  int PA[E];                  # player info between each game
  int PB[E];                  # 
  int RA[E];
  int RB[E];
}
parameters {
  vector [N] skill;           # skill values for each player
  vector [6] R;
}

model{
  for (i in 1:N){ skill[i]~normal(25,8); }
  for (i in 1:E){
  if(RA[i] == 1 && RB[i] == 2)
    win[i] ~ bernoulli_logit( (scale)*(skill[PA[i]]-skill[PB[i]] + R[1]) );
  else if(RA[i] == 1 && RB[i] == 3)
    win[i] ~ bernoulli_logit( (scale)*(skill[PA[i]]-skill[PB[i]] + R[2]) );
  else if(RA[i] == 2 && RB[i] == 3)
    win[i] ~ bernoulli_logit( (scale)*(skill[PA[i]]-skill[PB[i]] + R[3]) );
  else if(RA[i] == 2 && RB[i] == 1)
    win[i] ~ bernoulli_logit( (scale)*(skill[PA[i]]-skill[PB[i]] + R[4]) );
  else if(RA[i] == 3 && RB[i] == 1)
    win[i] ~ bernoulli_logit( (scale)*(skill[PA[i]]-skill[PB[i]] + R[5]) );
  else if(RA[i] == 3 && RB[i] == 2)
    win[i] ~ bernoulli_logit( (scale)*(skill[PA[i]]-skill[PB[i]] + R[6]) );
  else
    win[i] ~ bernoulli_logit( (scale)*(skill[PA[i]]-skill[PB[i]]) );
    
  }   # win probability is a logit function of skill difference
}
"""

In [5]:
import pickle

try:     # load it if already compiled
    sm = pickle.load(open('skill_model_race.pkl', 'rb'))
except:  # ow, compile and save compiled model
    sm = pystan.StanModel(model_code = skill_model)
    with open('skill_model_race.pkl', 'wb') as f: pickle.dump(sm, f)



In [6]:

player_A,player_B,win_re,RA,RB,playerid, playername = load_data_stan("train.csv",1,32,5)



[False, False, False, False, True, True, True, False, False, True, False, False, True, False, True, True, True, False, True, True, False, True, True, False, True, False, False, True, True, False, False, True, False, False, False, False, True, True, True, True, False, False, True, True, True, False, True, False, False, True, True, False, True, True, False, False, False, False, True, False, True, False, False, True, False, False, False, True, True, False, False, True, True, False, True, True, True, True, True, True, True, False, False, True, True, False, True, False, True, True, False, False, True, True, False, True, True, False, True, False, False, True, True, True, True, False, True, True, True, False, False, False, True, True, True, False, True, False, True, False, True, False, True, True, False, True, True, True, True, False, True, True, False, True, True, True, False, True, False, True, True, True, False, True, False, True, True, True, False, True, True, True, True, False, True, Tru

In [7]:
# skill_data = {
#     'N': 999,
#     'E': 19825,
#     'scale': 0.5,
#     'win':win_re,
#     'PA': player_A,
#     'PB': player_B
# }

skill_data = {
    'N': 999,
    'E': len(RA),
    'scale': 0.5,
    'win':win_re,
    'PA': player_A,
    'PB': player_B,
    'RA': RA,
    'RB': RB
}


In [8]:
fit = sm.sampling(data=skill_data, iter=200, chains=2)
samples = fit.extract()



To run all diagnostics call pystan.check_hmc_diagnostics(fit)


In [9]:
def prediction_accuracy_2(skill_data, samples, valid_data, scale=0.3):
    
    #valid_games = generate_games(true_players, num_valid_game, scale, style='pystan')

    acc=0
    for g in range(valid_data.shape[1]):
        p1,p2,result=valid_data[0][g],valid_data[1][g],valid_data[2][g]

        win_rate = logit( skill_data['scale']*(samples['skill'][:,p1-1]-samples['skill'][:,p2-1]) ).mean()
        
        predict_result = random.choices([1,0],weights=[win_rate,1-win_rate])

        if predict_result == result:
            acc += 1
    return acc/valid_data.shape[1]

In [10]:
def load_data(file_name):
    data=pd.read_csv(file_name,index_col=False,
                     names=['date', 'p1', 'p1_outcome', 'score', 'p2', 'p2_outcome', 'p1_race', 'p2_race', 'addon', 'type'])
    #drop other columns for now
    data.drop(columns=['date','score','p2_outcome', 'addon', 'type'], inplace=True)
    data['p1_outcome'].replace({"[loser]":0,"[winner]":1},inplace=True)
    return data

valid_data = load_data('valid.csv')
print(valid_data)
valid_data['p1'].replace(playerid,inplace=True)
valid_data['p2'].replace(playerid,inplace=True)
valid_games=[tuple((r[0],r[2],r[1],r[3],r[4])) for r in valid_data.to_numpy()]
#valid_games


           p1  p1_outcome       p2 p1_race p2_race
0          MC           1  NaTuRal       P       T
1          MC           0     Cure       P       T
2          MC           1  Journey       P       T
3          MC           0    Trust       P       P
4          MC           0  Billowy       P       P
...       ...         ...      ...     ...     ...
94002  Keiras           0   Dragon       Z       T
94003  Keiras           0     nExt       Z       Z
94004  Keiras           0      EJK       Z       T
94005  Keiras           0    DeViL       Z       Z
94006  Keiras           0      HuK       Z       P

[94007 rows x 5 columns]


In [11]:

def logit(z): return 1./(1.+np.exp(-z))
def validate(fits, valid_games, n_games):
    accuracy = []
    for fit in fits:
        samples = fits[fit].extract()
        acc, acc_simulate = 0, 0
        n = n_games
        #for g in valid_games[:n_games]:
        for g in [valid_games[i] for i in np.random.choice(len(valid_games), n, replace = False)]:
            try:
                i,j,result,ra,rb=int(g[0]),int(g[1]),int(g[2]),convert_race(g[3]),convert_race(g[4])
            except:
                n-=1
                continue
            if(ra == 1 and rb == 2):
                prob = logit( skill_data['scale']*(samples['skill'][:,i]-samples['skill'][:,j] + samples['R'][:,0]) ).mean()
            elif(ra == 1 and rb == 3):
                prob = logit( skill_data['scale']*(samples['skill'][:,i]-samples['skill'][:,j] + samples['R'][:,1]) ).mean()
            elif(ra == 2 and rb == 3):
                prob = logit( skill_data['scale']*(samples['skill'][:,i]-samples['skill'][:,j] + samples['R'][:,2]) ).mean()
            elif(ra == 2 and rb == 1):
                prob = logit( skill_data['scale']*(samples['skill'][:,i]-samples['skill'][:,j] + samples['R'][:,3]) ).mean()
            elif(ra == 3 and rb == 1):
                prob = logit( skill_data['scale']*(samples['skill'][:,i]-samples['skill'][:,j] + samples['R'][:,4]) ).mean()
            elif(ra == 3 and rb == 2):
                prob = logit( skill_data['scale']*(samples['skill'][:,i]-samples['skill'][:,j] + samples['R'][:,5]) ).mean()
            else:
                prob = logit( skill_data['scale']*(samples['skill'][:,i]-samples['skill'][:,j]) ).mean()
                #print(f'{ra}{rb}*', end='')
            pred = 1 if prob > 0.5 else 0 #
            pred_simulate = np.random.choice([1,0], p=[prob, 1-prob])
            acc += (pred==result)
            acc_simulate += (pred_simulate==result)
        accuracy.append((acc/n, acc_simulate/n))
        print(fit, accuracy[-1], n)
    return accuracy


In [12]:
accuracy = validate({1:fit}, valid_games, len(valid_games))
accuracy

1 (0.6804280532300786, 0.604103949705873) 94007


[(0.6804280532300786, 0.604103949705873)]

In [14]:
samples['R'].mean(0)

array([ 0.63671826,  0.90989793,  0.34666899, -0.45990961, -0.54462867,
       -0.13790172])

In [15]:
samples['skill'].mean(0)

array([28.6417091 , 30.02427354, 29.55644712, 30.99362098, 28.0724826 ,
       27.52021755, 29.10097641, 28.0748666 , 30.15687766, 30.02815245,
       26.96961531, 27.79214148, 28.32243778, 27.56563401, 27.65058437,
       26.04786878, 24.10302068, 26.75651083, 27.33327844, 28.30605413,
       26.19163881, 27.98292979, 27.7723244 , 26.61723207, 30.17776653,
       28.25125669, 27.92420362, 26.58380475, 26.98656443, 26.07517609,
       29.25456947, 23.06772916, 27.79222619, 29.11244701, 28.83169912,
       29.69232261, 26.16440951, 27.32782418, 30.163799  , 29.23680516,
       25.72970149, 29.44294595, 28.80010869, 29.34879058, 26.1017902 ,
       29.96327674, 30.34347377, 27.62634857, 28.78590393, 29.84771163,
       27.60010325, 25.79899876, 24.23872823, 24.92821648, 24.45398966,
       27.04863581, 27.30837275, 25.12520797, 25.7504148 , 29.78766781,
       29.74423732, 27.42849268, 27.7853708 , 25.75340382, 28.08974558,
       23.61392561, 26.81988469, 26.9512184 , 29.11586286, 28.53