In [1]:
import pandas as pd
import math
import csv
import random
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import cross_val_score

In [11]:
df1  =  pd.read_csv("15-16Miscellaneous_Stat.csv")
df2  =   pd.read_csv("15-16Opponent_Per_Game_Stat.csv")
df3  =   pd.read_csv("15-16Team_Per_Game_Stat.csv")
schedule = pd.read_csv("16-17Schedule.csv")
pastresult   = pd.read_csv("2015-2016_result.csv")    

In [12]:
base_elo = 1600
team_elos = {} 
team_stats = {}
X = []
y = []
folder = 'data'

In [4]:
def initialize_data(Mstat, Ostat, Tstat):
    new_Mstat = Mstat.drop(['Rk', 'Arena'], axis=1)
    new_Ostat = Ostat.drop(['Rk', 'G', 'MP'], axis=1)
    new_Tstat = Tstat.drop(['Rk', 'G', 'MP'], axis=1)

    team_stats1 = pd.merge(new_Mstat, new_Ostat, how='left', on='Team')
    team_stats1 = pd.merge(team_stats1, new_Tstat, how='left', on='Team')
    return team_stats1.set_index('Team', inplace=False, drop=True)

In [5]:
def get_elo(team):
    try:
        return team_elos[team]
    except:
        # 当最初没有elo时，给每个队伍最初赋base_elo
        team_elos[team] = base_elo
        return team_elos[team]

In [6]:
def calc_elo(win_team, lose_team):
    winner_rank = get_elo(win_team)
    loser_rank = get_elo(lose_team)

    rank_diff = winner_rank - loser_rank
    exp = (rank_diff  * -1) / 400
    odds = 1 / (1 + math.pow(10, exp))
    # 根据rank级别修改K值
    if winner_rank < 2100:
        k = 32
    elif winner_rank >= 2100 and winner_rank < 2400:
        k = 24
    else:
        k = 16
    
    # 更新 rank 数值
    new_winner_rank = round(winner_rank + (k * (1 - odds)))      
    new_loser_rank = round(loser_rank + (k * (0 - odds)))
    return new_winner_rank, new_loser_rank

In [7]:
def  build_dataSet(all_data):
    print("Building data set..")
    X = []
    skip = 0
    for index, row in all_data.iterrows():

        Wteam = row['WTeam']
        Lteam = row['LTeam']

        #获取最初的elo或是每个队伍最初的elo值
        team1_elo = get_elo(Wteam)
        team2_elo = get_elo(Lteam)

        # 给主场比赛的队伍加上100的elo值
        if row['WLoc'] == 'H':
            team1_elo += 100
        else:
            team2_elo += 100

        # 把elo当为评价每个队伍的第一个特征值
        team1_features = [team1_elo]
        team2_features = [team2_elo]

        # 添加我们从basketball reference.com获得的每个队伍的统计信息
        for key, value in team_stats.loc[Wteam].iteritems():
            team1_features.append(value)
        for key, value in team_stats.loc[Lteam].iteritems():
            team2_features.append(value)

        # 将两支队伍的特征值随机的分配在每场比赛数据的左右两侧
        # 并将对应的0/1赋给y值
        if np.random.random() > 0.5:
            X.append(team1_features + team2_features)
            y.append(0)
        else:
            X.append(team2_features + team1_features)
            y.append(1)

        if skip == 0:
            print('X',X)
            skip = 1

        # 根据这场比赛的数据更新队伍的elo值
        new_winner_rank, new_loser_rank = calc_elo(Wteam, Lteam)
        team_elos[Wteam] = new_winner_rank
        team_elos[Lteam] = new_loser_rank

    return np.nan_to_num(X), y

In [15]:
if __name__ == '__main__':

    Mstat = df1
    Ostat = df2
    Tstat = df3
    team_stats = initialize_data(Mstat, Ostat, Tstat)
    print(team_stats)
    result_data = pastresult
    X, y = build_dataSet(result_data)

    # 训练网络模型
    #print("Fitting on %d game samples.." % len(X))

    model = linear_model.LogisticRegression()
    model.fit(X, y)

    # 利用10折交叉验证计算训练正确率
    #print("Doing cross-validation..")
    #print(cross_val_score(model, X, y, cv = 10, scoring='accuracy', n_jobs=-1).mean())

1,                          Age   W   L  PW  PL    MOV   SOS    SRS   ORtg  \
Team                                                                      
Golden State Warriors   27.4  73   9  65  17  10.76 -0.38  10.38  114.5   
San Antonio Spurs       30.3  67  15  67  15  10.63 -0.36  10.28  110.3   
Oklahoma City Thunder   25.8  55  27  59  23   7.28 -0.19   7.09  113.1   
Cleveland Cavaliers     28.1  57  25  57  25   6.00 -0.55   5.45  110.9   
Los Angeles Clippers    29.7  53  29  53  29   4.28 -0.15   4.13  108.3   
Toronto Raptors         26.3  56  26  53  29   4.50 -0.42   4.08  110.0   
Atlanta Hawks           28.2  48  34  51  31   3.61 -0.12   3.49  105.1   
Boston Celtics          25.2  48  34  50  32   3.21 -0.37   2.84  106.8   
Charlotte Hornets       26.0  48  34  49  33   2.72 -0.36   2.36  107.1   
Utah Jazz               24.2  40  42  46  36   1.79  0.05   1.84  105.9   
Indiana Pacers          26.9  45  37  46  36   1.71 -0.09   1.62  104.6   
Miami Heat            



ValueError: Found input variables with inconsistent numbers of samples: [1316, 3948]

In [9]:
def predict_winner(team_1, team_2, model):
    features = []

    # team 1，客场队伍
    features.append(get_elo(team_1))
    for key, value in team_stats.loc[team_1].iteritems():
        features.append(value)

    # team 2，主场队伍
    features.append(get_elo(team_2) + 100)
    for key, value in team_stats.loc[team_2].iteritems():
        features.append(value)

    features = np.nan_to_num(features)
    return model.predict_proba([features])

In [10]:
# 利用训练好的model在16-17年的比赛中进行预测

print('Predicting on new schedule..')
schedule1617 = schedule
result = []
for index, row in schedule1617.iterrows():
    team1 = row['Vteam']
    team2 = row['Hteam']
    pred = predict_winner(team1, team2, model)
    prob = pred[0][0]
    if prob > 0.5:
        winner = team1
        loser = team2
        result.append([winner, loser, prob])
    else:
        winner = team2
        loser = team1
        result.append([winner, loser, 1 - prob])

with open('16-17Result.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['win', 'lose', 'probability'])
    writer.writerows(result)
    print('done.')
pd.read_csv('16-17Result.csv',header=0)

Predicting on new schedule..
done.


Unnamed: 0,win,lose,probability
0,Cleveland Cavaliers,New York Knicks,0.863064
1,Golden State Warriors,San Antonio Spurs,0.514101
2,Portland Trail Blazers,Utah Jazz,0.525198
3,Boston Celtics,Brooklyn Nets,0.904891
4,Indiana Pacers,Dallas Mavericks,0.583867
5,Houston Rockets,Los Angeles Lakers,0.824441
6,Memphis Grizzlies,Minnesota Timberwolves,0.669173
7,Charlotte Hornets,Milwaukee Bucks,0.758661
8,Denver Nuggets,New Orleans Pelicans,0.508109
9,Miami Heat,Orlando Magic,0.688704
