In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import networkx as nx
import csv
import random
import time
import pystan
import pandas as pd


In [2]:
def logit(z): return 1./(1.+np.exp(-z))

In [17]:
def convert_race(race):
    if race == 'Z':
        return 1
    elif race == 'P':
        return 2
    elif race == 'T':
        return 3
    else:
        return 4
    

def load_data_stan(file_name, pk=1, edge=3, nk=5):
    with open(file_name, encoding='utf-8') as f: lines = f.read().split('\n')
    p = 0; playerid = {};
    for i in range(len(lines)):
        csv = lines[i].split(',')
        if len(csv) != 10: continue   # parse error or blank line
        player0,player1 = csv[1],csv[4]
        if player0 not in playerid: playerid[player0]=p; p+=1
        if player1 not in playerid: playerid[player1]=p; p+=1

    nplayers = len(playerid)
    playername = ['']*nplayers
    for player in playerid:
        playername[ playerid[player] ]=player;  # id to name lookup


    pKeep = pk   # fraction of edges to consider (immed. throw out 1-p edges)
    nEdge = edge     # try to keep nEdge opponents per player (may be more; asymmetric)
    nKeep = nk     # keep at most nKeep games per opponent pairs (play each other multiple times)

    count = 0
    nplays, nwins = np.zeros( (nplayers,nplayers) ), np.zeros( (nplayers,nplayers) );
    player_A = []
    player_B = []
    wins = []
    RA = []
    RB = []
    for i in range(len(lines)):
        csv = lines[i].split(',')
        if len(csv) != 10: continue;   # parse error or blank line
        a,b = playerid[csv[1]],playerid[csv[4]];
        aw,bw = csv[2]=='[winner]',csv[5]=='[winner]';
        if (np.random.rand() < pKeep):
            if (nplays[a,b] < nKeep) and ( ((nplays[a,:]>0).sum() < nEdge) or ((nplays[:,b]>0).sum() < nEdge) ):
                count += 1
                nplays[a,b] += 1; nplays[b,a]+=1; nwins[a,b] += aw; nwins[b,a] += bw;
                player_A.append(a+1)
                player_B.append(b+1)
                wins.append(aw)
                RA.append(convert_race(csv[6]))
                RB.append(convert_race(csv[7]))

    win_re = []
    print(wins)
    for i in wins:
        if i:
            win_re.append(1)
        else:
            win_re.append(0)
    return player_A,player_B,win_re,RA,RB, playerid, playername

In [34]:

# skill_model = """
# data {
#   int<lower=1> N;             # Total number of players
#   int<lower=1> E;             # number of games
#   real<lower=0> scale;        # scale value for probability computation
#   int<lower=0,upper=1> win[E]; # PA wins vs PB
#   int PA[E];                  # player info between each game
#   int PB[E];                  # 
# }
# parameters {
#   vector [N] skill;           # skill values for each player
# }

# model{
#   for (i in 1:N){ skill[i]~normal(0,10); }
#   for (i in 1:E){
#     win[i] ~ bernoulli_logit( (scale)*(skill[PA[i]]-skill[PB[i]]) );
#   }   # win probability is a logit function of skill difference
# }
# """

skill_model = """
data {
  int<lower=1> N;             # Total number of players
  int<lower=1> E;             # number of games
  real<lower=0> scale;        # scale value for probability computation
  int<lower=0,upper=1> win[E]; # PA wins vs PB
  int PA[E];                  # player info between each game
  int PB[E];                  # 
  int RA[E];
  int RB[E];
}
parameters {
  vector [N] skill;           # skill values for each player
  int R12;
  int R13;
  int R23;
  int R21;
  int R31;
  int R32
}

model{
  for (i in 1:N){ skill[i]~normal(25,8); }
  for (i in 1:E){
  if(RA[i] == 1 && RB[i] == 2)
    win[i] ~ bernoulli_logit( (scale)*(skill[PA[i]]-skill[PB[i]] + R12) );
  else if(RA[i] == 1 && RB[i] == 3)
    win[i] ~ bernoulli_logit( (scale)*(skill[PA[i]]-skill[PB[i]] + R13) );
  else if(RA[i] == 2 && RB[i] == 3)
    win[i] ~ bernoulli_logit( (scale)*(skill[PA[i]]-skill[PB[i]] + R23) );
  else if(RA[i] == 2 && RB[i] == 1)
    win[i] ~ bernoulli_logit( (scale)*(skill[PA[i]]-skill[PB[i]] + R21) );
  else if(RA[i] == 3 && RB[i] == 1)
    win[i] ~ bernoulli_logit( (scale)*(skill[PA[i]]-skill[PB[i]] + R31) );
  else if(RA[i] == 3 && RB[i] == 2)
    win[i] ~ bernoulli_logit( (scale)*(skill[PA[i]]-skill[PB[i]] + R32) );
  else
    win[i] ~ bernoulli_logit( (scale)*(skill[PA[i]]-skill[PB[i]]) );
    
  }   # win probability is a logit function of skill difference
}
"""

In [35]:
import pickle

try:     # load it if already compiled
    sm = pickle.load(open('skill_model.pkl', 'rb'))
except:  # ow, compile and save compiled model
    sm = pystan.StanModel(model_code = skill_model)
    with open('skill_model.pkl', 'wb') as f: pickle.dump(sm, f)



In [36]:

player_A,player_B,win_re,RA,RB,playerid, playername = load_data_stan("train.csv",1,10,8)



[False, False, False, False, True, True, True, False, False, True, False, False, True, False, True, True, True, False, True, True, False, True, True, False, True, False, False, True, True, False, False, True, False, False, False, False, True, True, True, True, False, False, True, True, True, False, True, False, False, True, True, False, True, True, False, False, False, False, True, False, True, False, False, True, False, False, False, True, True, False, False, True, True, False, True, True, True, True, True, True, True, False, False, True, True, False, True, False, True, True, False, False, True, True, False, True, True, False, True, False, False, True, True, True, True, False, True, True, True, False, False, False, True, True, True, False, True, False, True, False, True, False, True, True, False, True, True, True, True, False, True, True, False, True, True, True, False, True, False, True, True, True, False, True, False, True, True, True, False, True, True, True, True, True, True, Fals

In [37]:
# skill_data = {
#     'N': 999,
#     'E': 19825,
#     'scale': 0.5,
#     'win':win_re,
#     'PA': player_A,
#     'PB': player_B
# }

skill_data = {
    'N': 999,
    'E': 19825,
    'scale': 0.5,
    'win':win_re,
    'PA': player_A,
    'PB': player_B,
    'RA': RA,
    'RB': RB
}


In [38]:
fit = sm.sampling(data=skill_data, iter=200, chains=2)
samples = fit.extract()



In [None]:
def prediction_accuracy_2(skill_data, samples, valid_data, scale=0.3):
    
    #valid_games = generate_games(true_players, num_valid_game, scale, style='pystan')

    acc=0
    for g in range(valid_data.shape[1]):
        p1,p2,result=valid_data[0][g],valid_data[1][g],valid_data[2][g]

        win_rate = logit( skill_data['scale']*(samples['skill'][:,p1-1]-samples['skill'][:,p2-1]) ).mean()
        
        predict_result = random.choices([1,0],weights=[win_rate,1-win_rate])

        if predict_result == result:
            acc += 1
    return acc/valid_data.shape[1]

In [39]:
def load_data(file_name):
    data=pd.read_csv(file_name,index_col=False,
                     names=['date', 'p1', 'p1_outcome', 'score', 'p2', 'p2_outcome', 'p1_race', 'p2_race', 'addon', 'type'])
    #drop other columns for now
    data.drop(columns=['date','score','p2_outcome', 'addon', 'type'], inplace=True)
    data['p1_outcome'].replace({"[loser]":0,"[winner]":1},inplace=True)
    return data

valid_data = load_data('valid.csv')
print(valid_data)
valid_data['p1'].replace(playerid,inplace=True)
valid_data['p2'].replace(playerid,inplace=True)
valid_games=[tuple((r[0],r[2],r[1],r[3],[4])) for r in valid_data.to_numpy()]
#valid_games


           p1  p1_outcome       p2 p1_race p2_race
0          MC           1  NaTuRal       P       T
1          MC           0     Cure       P       T
2          MC           1  Journey       P       T
3          MC           0    Trust       P       P
4          MC           0  Billowy       P       P
...       ...         ...      ...     ...     ...
94002  Keiras           0   Dragon       Z       T
94003  Keiras           0     nExt       Z       Z
94004  Keiras           0      EJK       Z       T
94005  Keiras           0    DeViL       Z       Z
94006  Keiras           0      HuK       Z       P

[94007 rows x 5 columns]


In [40]:

def logit(z): return 1./(1.+np.exp(-z))
def validate(fits, valid_games, n_games):
    accuracy = []
    for fit in fits:
        samples = fits[fit].extract()
        acc, acc_simulate = 0, 0
        n = n_games
        #for g in valid_games[:n_games]:
        for g in [valid_games[i] for i in np.random.choice(len(valid_games), n, replace = False)]:
            try:
                i,j,result,ra,rb=int(g[0]),int(g[1]),int(g[2]),convert_race(g[3]),convert_race(g[4])
            except:
                n-=1
                continue
            if(ra == 1 and rb == 2):
                prob = logit( skill_data['scale']*(samples['skill'][:,i]-samples['skill'][:,j] + samples['R12']) ).mean()
            if(ra == 1 and rb == 3):
                prob = logit( skill_data['scale']*(samples['skill'][:,i]-samples['skill'][:,j] + samples['R13']) ).mean()
            if(ra == 2 and rb == 3):
                prob = logit( skill_data['scale']*(samples['skill'][:,i]-samples['skill'][:,j] + samples['R23']) ).mean()
            if(ra == 2 and rb == 1):
                prob = logit( skill_data['scale']*(samples['skill'][:,i]-samples['skill'][:,j] + samples['R21']) ).mean()
            if(ra == 3 and rb == 1):
                prob = logit( skill_data['scale']*(samples['skill'][:,i]-samples['skill'][:,j] + samples['R31']) ).mean()
            if(ra == 3 and rb == 2):
                prob = logit( skill_data['scale']*(samples['skill'][:,i]-samples['skill'][:,j] + samples['R32']) ).mean()
            else:
                prob = logit( skill_data['scale']*(samples['skill'][:,i]-samples['skill'][:,j]) ).mean()
            pred = 1 if prob > 0.5 else 0 #
            pred_simulate = np.random.choice([1,0], p=[prob, 1-prob])
            acc += (pred==result)
            acc_simulate += (pred_simulate==result)
        accuracy.append((acc/n, acc_simulate/n))
        print(fit, accuracy[-1], n)
    return accuracy


In [41]:
accuracy = validate({1:fit}, valid_games, len(valid_games))
accuracy

1 (0.6598657546778431, 0.588179603646537) 94007


[(0.6598657546778431, 0.588179603646537)]