In [73]:
import numpy as np 
import matplotlib.pyplot as plt
import networkx as nx
import csv
import random
import time
import pystan


In [86]:
def load_data_stan(file_name, pKeep=1.0, nEdge=5, nKeep=8):
    """Read game records from a comma-separated file into arrays for the Stan model.

    A line is used only if splitting it on ',' yields exactly 10 fields.
    Field 1 and field 4 are taken as the two player names (player A and
    player B), and player A is recorded as the winner when field 2 equals
    the literal string '[winner]'. All other lines are skipped.

    Players are numbered in order of first appearance over the usable lines;
    the returned ids are 1-based (the Stan model indexes from 1).

    Parameters
    ----------
    file_name : str
        Path of the UTF-8 encoded text file to read.
    pKeep : float, optional
        A game is considered only when a fresh ``np.random.rand()`` draw is
        below this value; 1.0 (the default) considers every game.
    nEdge : int, optional
        A game is kept only if player A's row or player B's column of the
        pair-count matrix has fewer than ``nEdge`` non-zero entries.
    nKeep : int, optional
        Maximum number of games kept for an ordered (A, B) pair.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(3, n_games)``: row 0 holds player-A ids,
        row 1 player-B ids, row 2 is 1 where player A won and 0 otherwise.
        When no game is kept the shape is ``(3, 0)``.
    """
    with open(file_name, encoding='utf-8') as f:
        lines = f.read().split('\n')

    # Parse each line once.  The split result is called `fields` so that the
    # module-level `csv` import is not shadowed inside this function.
    games = []
    for line in lines:
        fields = line.split(',')
        if len(fields) != 10:
            continue  # parse error or blank line
        games.append((fields[1], fields[4], fields[2] == '[winner]'))

    # Assign 0-based ids in order of first appearance (player A before B).
    playerid = {}
    for player0, player1, _ in games:
        for player in (player0, player1):
            if player not in playerid:
                playerid[player] = len(playerid)

    nplayers = len(playerid)
    nplays = np.zeros((nplayers, nplayers))  # games kept so far per pair
    player_A, player_B, wins = [], [], []
    for player0, player1, a_won in games:
        a, b = playerid[player0], playerid[player1]
        # One draw per usable line, even when pKeep == 1, so the global
        # NumPy random stream is consumed the same way for any pKeep.
        if not (np.random.rand() < pKeep):
            continue
        if not (nplays[a, b] < nKeep):
            continue
        a_has_room = (nplays[a, :] > 0).sum() < nEdge
        b_has_room = (nplays[:, b] > 0).sum() < nEdge
        if not (a_has_room or b_has_room):
            continue
        nplays[a, b] += 1
        nplays[b, a] += 1
        player_A.append(a + 1)   # shift to 1-based ids
        player_B.append(b + 1)
        wins.append(1 if a_won else 0)

    # An explicit integer dtype keeps the empty case integer-typed as well.
    return np.array([player_A, player_B, wins], dtype=int)


In [87]:
# Load the training games as a (3, n_games) array:
# row 0 = player-A ids, row 1 = player-B ids, row 2 = 1 if player A won else 0.
train_data = load_data_stan("train.csv")
print(train_data.shape[1])  # number of games kept by the loader

9243


In [64]:
def logit(z):
    """Return 1 / (1 + exp(-z)), evaluated element-wise via NumPy.

    Despite its name this is the logistic (sigmoid) function, i.e. the
    inverse of the logit: it maps real inputs to values between 0 and 1.
    """
    denominator = 1.0 + np.exp(-z)
    return 1.0 / denominator

In [65]:
# Stan program for the pairwise skill model: every player has a latent skill,
# and the log-odds that PA beats PB is scale * (skill[PA] - skill[PB]).
# Stan comments use `//` (the `#` form is deprecated in Stan), and PA/PB are
# bounded to 1..N so an out-of-range player id is rejected when data is read.
skill_model = """
data {
  int<lower=1> N;                 // total number of players
  int<lower=1> E;                 // number of games
  real<lower=0> scale;            // scale value for probability computation
  int<lower=0,upper=1> win[E];    // 1 if PA won the game against PB, else 0
  int<lower=1,upper=N> PA[E];     // 1-based id of the first player of each game
  int<lower=1,upper=N> PB[E];     // 1-based id of the second player of each game
}
parameters {
  vector[N] skill;                // skill value for each player
}

model {
  skill ~ normal(25, 8);          // same prior on every player's skill
  // win probability is the inverse-logit of the scaled skill difference
  win ~ bernoulli_logit(scale * (skill[PA] - skill[PB]));
}
"""

In [66]:
import pickle

# Reuse a previously compiled model when one is cached on disk; otherwise
# compile `skill_model` and cache the result.
# NOTE: the cache is not tied to the text of `skill_model` -- delete
# 'skill_model.pkl' after editing the Stan program, or the old model is reused.
# SECURITY: pickle.load can execute arbitrary code; only load a cache file
# that this notebook wrote itself.
try:
    with open('skill_model.pkl', 'rb') as f:   # context manager closes the handle
        sm = pickle.load(f)
except Exception:
    # Missing, unreadable or incompatible cache: compile and save.  Catching
    # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
    sm = pystan.StanModel(model_code=skill_model)
    with open('skill_model.pkl', 'wb') as f:
        pickle.dump(sm, f)

In [88]:
# Data dictionary handed to the Stan model; keys match its `data` block.
skill_data = {
    # NOTE(review): the player count is hard-coded because load_data_stan does
    # not return it; it must be at least the largest id in PA/PB -- confirm
    # whenever the input file changes.
    'N': 999,
    # Derived from the data so it always equals the length of win/PA/PB.
    'E': int(train_data.shape[1]),
    'scale': 0.5,
    'win': train_data[2],
    'PA': train_data[0],
    'PB': train_data[1]
}


In [81]:
# Draw posterior samples.  A fixed seed is passed so the sampler's random
# stream is repeatable across runs instead of being seeded differently each time.
fit = sm.sampling(data=skill_data, iter=10000, chains=4, seed=0)
# Dictionary of posterior draws keyed by parameter name (e.g. samples['skill']).
samples = fit.extract()

In [70]:
def prediction_accuracy_2(skill_data, samples, valid_data, scale=0.3):
    """Fraction of validation games whose sampled prediction matches the result.

    For each game the win probability of player 1 is the mean, over the
    posterior draws in ``samples['skill']``, of ``logit(scale * skill gap)``
    (``logit`` here is this notebook's logistic-sigmoid helper).  The
    prediction is then *sampled*: 1 with that probability, 0 otherwise, using
    the ``random`` module, so the returned accuracy varies between calls
    unless ``random`` is seeded.

    Parameters
    ----------
    skill_data : dict
        Only ``skill_data['scale']`` is read.
    samples : dict
        ``samples['skill']`` is indexed as ``[:, player_id - 1]``, i.e. one
        column per player and one row per draw.
    valid_data : array-like with ``.shape``
        Row 0 / row 1 hold 1-based player ids, row 2 the outcome (1 if the
        row-0 player won, else 0); one column per game.
    scale : float, optional
        Not used -- the scale always comes from ``skill_data['scale']``.
        Kept so existing calls that pass it keep working.

    Returns
    -------
    float
        Number of matching predictions divided by the number of games.

    Raises
    ------
    ZeroDivisionError
        If ``valid_data`` contains no games.
    """
    n_games = valid_data.shape[1]
    acc = 0
    for g in range(n_games):
        p1, p2, result = valid_data[0][g], valid_data[1][g], valid_data[2][g]

        # Posterior-mean probability that p1 beats p2 (ids are 1-based).
        skill_gap = samples['skill'][:, p1 - 1] - samples['skill'][:, p2 - 1]
        win_rate = logit(skill_data['scale'] * skill_gap).mean()

        # random.choices returns a one-element list; unwrap it so the
        # comparison below is scalar-to-scalar rather than list-to-scalar
        # (which only worked when `result` happened to be a NumPy scalar).
        predict_result = random.choices([1, 0], weights=[win_rate, 1 - win_rate])[0]

        if predict_result == result:
            acc += 1
    return acc / n_games

In [82]:
# NOTE(review): get_data is not defined in any cell visible in this notebook
# export; it must already exist in the kernel for this cell to run -- confirm
# where it comes from (it presumably returns the same 3-row layout as
# load_data_stan, since prediction_accuracy_2 indexes rows 0, 1 and 2).
valid_data = get_data("valid.csv")
# prediction_accuracy_2 takes the scale from skill_data['scale']; the scale=
# keyword passed here is accepted but not used by that function.
rate = prediction_accuracy_2(skill_data, samples, valid_data, scale=0.5)


999 94008


In [75]:
# Validation accuracy returned by prediction_accuracy_2 (fraction of games
# predicted correctly).  Predictions there are sampled with the random module,
# so this number changes from run to run.
print(rate)

0.5425755033557047


In [83]:
# Validation accuracy returned by prediction_accuracy_2 (fraction of games
# predicted correctly).  Predictions there are sampled with the random module,
# so this number changes from run to run.
print(rate)

0.5750838926174496
