In [1]:
##### Retrospective evaluation #####

In [2]:
import pandas as pd
import numpy as np

In [3]:
np.set_printoptions(legacy='1.25')

In [4]:
matches = pd.read_csv("mens_stats.csv", index_col=0)

In [5]:
matches.head()

Unnamed: 0,matchid,team,opponent,result,sets won,sets lost,set1,set2,set3,set4,set5,attacks,blocks,serves,digs,receives,sets
1,11700,France,Bulgaria,W,3,0,27,25,25,,,39,7,4,13,54,74
2,11700,Bulgaria,France,L,0,3,25,21,23,,,36,5,4,11,56,78
3,11701,Germany,Australia,W,3,0,25,25,25,,,42,5,5,34,40,71
4,11701,Australia,Germany,L,0,3,19,18,16,,,28,3,0,26,53,60
5,11702,Japan,Iran,W,3,0,25,25,26,,,45,7,2,11,50,94


In [6]:
# matches.shape
# matches["team"].value_counts()
# matches.dtypes

In [7]:
matches["opp_code"] = matches["opponent"].astype("category").cat.codes
matches["target"] = (matches["result"] == "W").astype("int")

In [8]:
matches

Unnamed: 0,matchid,team,opponent,result,sets won,sets lost,set1,set2,set3,set4,set5,attacks,blocks,serves,digs,receives,sets,opp_code,target
1,11700,France,Bulgaria,W,3,0,27,25,25,,,39,7,4,13,54,74,3,1
2,11700,Bulgaria,France,L,0,3,25,21,23,,,36,5,4,11,56,78,9,0
3,11701,Germany,Australia,W,3,0,25,25,25,,,42,5,5,34,40,71,1,1
4,11701,Australia,Germany,L,0,3,19,18,16,,,28,3,0,26,53,60,10,0
5,11702,Japan,Iran,W,3,0,25,25,26,,,45,7,2,11,50,94,11,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1200,21550,Slovenia,Italy,L,1,3,22,25,21,18.0,,52,9,2,64,79,106,12,0
1201,21551,Brazil,Slovenia,W,3,1,23,25,25,25.0,,48,11,5,60,68,110,21,1
1202,21551,Slovenia,Brazil,L,1,3,25,20,23,19.0,,49,7,2,68,80,122,2,0
1203,21552,Poland,Italy,W,3,0,25,25,25,,,37,10,3,55,43,80,12,1


In [9]:
from sklearn.ensemble import RandomForestClassifier

In [10]:
# play around with parameters based on dataset
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [11]:
# edit this line to change what subset of the data to train on
train = matches[matches["matchid"] < 21000]
# edit this line to change what subset of the data to test
test = matches[matches["matchid"] > 21000]

In [12]:
predictors = ["opp_code"]

In [13]:
rf.fit(train[predictors], train["target"])

0,1,2
,n_estimators,50
,criterion,'gini'
,max_depth,
,min_samples_split,10
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [14]:
preds = rf.predict(test[predictors])

In [15]:
from sklearn.metrics import accuracy_score

In [16]:
acc = accuracy_score(test["target"], preds)

In [17]:
acc

0.6206896551724138

In [18]:
combined = pd.DataFrame(dict(actual=test["target"], prediction=preds))
pd.crosstab(index=combined["actual"], columns=combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,75,41
1,47,69


In [19]:
from sklearn.metrics import precision_score

In [20]:
precision_score(test["target"], preds)

0.6272727272727273

In [21]:
# one dataframe for every team in the dataset
# rolling averages; "how did team A do in the previous x matches?" to predict how they will do on their next match
grouped_matches = matches.groupby("team")

In [22]:
# group = grouped_matches.get_group("Poland")
# group

In [23]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("matchid")
    rolling_stats = group[cols].rolling(5, closed='left').mean() # closed='left' means take the current match out when creating rolling averages
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols) # removes rows that have missing values
    return group

In [24]:
cols = ["sets won", "sets lost", "attacks", "blocks", "serves", "digs", "receives", "sets"]
new_cols = [f"{c}_rolling" for c in cols]
new_cols

['sets won_rolling',
 'sets lost_rolling',
 'attacks_rolling',
 'blocks_rolling',
 'serves_rolling',
 'digs_rolling',
 'receives_rolling',
 'sets_rolling']

In [25]:
# rolling_averages(group, cols, new_cols)

In [None]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

In [27]:
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,matchid,team,opponent,result,sets won,sets lost,set1,set2,set3,set4,...,opp_code,target,sets won_rolling,sets lost_rolling,attacks_rolling,blocks_rolling,serves_rolling,digs_rolling,receives_rolling,sets_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Argentina,82,11740,Argentina,Netherlands,W,3,0,25,25,25,,...,15,1,0.8,2.8,40.6,8.4,1.6,46.0,69.8,92.4
Argentina,105,11752,Argentina,Italy,L,0,3,28,21,20,,...,12,0,1.4,2.2,41.6,7.8,1.4,47.2,65.8,93.4
Argentina,119,11759,Argentina,Australia,W,3,0,25,25,25,,...,1,1,1.4,2.2,43.8,8.6,1.2,48.2,66.6,99.6
Argentina,134,11766,Argentina,Japan,W,3,1,30,25,25,25.0,...,13,1,1.8,1.6,46.4,8.0,2.0,46.8,62.0,96.6
Argentina,149,11774,Argentina,Bulgaria,W,3,1,25,16,25,25.0,...,3,1,1.8,1.4,44.6,8.2,2.8,49.2,59.2,94.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ukraine,1108,21504,Ukraine,Bulgaria,W,3,1,20,25,26,25.0,...,3,1,2.0,2.2,52.4,10.8,5.6,57.0,74.8,103.4
Ukraine,1127,21514,Ukraine,Netherlands,W,3,2,25,20,20,26.0,...,15,1,2.2,1.8,51.2,10.8,4.8,57.8,72.4,104.4
Ukraine,1143,21522,Ukraine,Italy,L,2,3,15,20,25,25.0,...,12,0,2.4,1.6,50.8,9.2,4.6,60.8,70.0,101.8
Ukraine,1155,21528,Ukraine,Serbia,L,0,3,22,19,17,,...,20,0,2.8,1.6,54.8,9.6,5.4,64.8,75.4,111.0


In [28]:
matches_rolling = matches_rolling.droplevel('team')
matches_rolling

Unnamed: 0,matchid,team,opponent,result,sets won,sets lost,set1,set2,set3,set4,...,opp_code,target,sets won_rolling,sets lost_rolling,attacks_rolling,blocks_rolling,serves_rolling,digs_rolling,receives_rolling,sets_rolling
82,11740,Argentina,Netherlands,W,3,0,25,25,25,,...,15,1,0.8,2.8,40.6,8.4,1.6,46.0,69.8,92.4
105,11752,Argentina,Italy,L,0,3,28,21,20,,...,12,0,1.4,2.2,41.6,7.8,1.4,47.2,65.8,93.4
119,11759,Argentina,Australia,W,3,0,25,25,25,,...,1,1,1.4,2.2,43.8,8.6,1.2,48.2,66.6,99.6
134,11766,Argentina,Japan,W,3,1,30,25,25,25.0,...,13,1,1.8,1.6,46.4,8.0,2.0,46.8,62.0,96.6
149,11774,Argentina,Bulgaria,W,3,1,25,16,25,25.0,...,3,1,1.8,1.4,44.6,8.2,2.8,49.2,59.2,94.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1108,21504,Ukraine,Bulgaria,W,3,1,20,25,26,25.0,...,3,1,2.0,2.2,52.4,10.8,5.6,57.0,74.8,103.4
1127,21514,Ukraine,Netherlands,W,3,2,25,20,20,26.0,...,15,1,2.2,1.8,51.2,10.8,4.8,57.8,72.4,104.4
1143,21522,Ukraine,Italy,L,2,3,15,20,25,25.0,...,12,0,2.4,1.6,50.8,9.2,4.6,60.8,70.0,101.8
1155,21528,Ukraine,Serbia,L,0,3,22,19,17,,...,20,0,2.8,1.6,54.8,9.6,5.4,64.8,75.4,111.0


In [29]:
matches_rolling.index = range(matches_rolling.shape[0])
matches_rolling

Unnamed: 0,matchid,team,opponent,result,sets won,sets lost,set1,set2,set3,set4,...,opp_code,target,sets won_rolling,sets lost_rolling,attacks_rolling,blocks_rolling,serves_rolling,digs_rolling,receives_rolling,sets_rolling
0,11740,Argentina,Netherlands,W,3,0,25,25,25,,...,15,1,0.8,2.8,40.6,8.4,1.6,46.0,69.8,92.4
1,11752,Argentina,Italy,L,0,3,28,21,20,,...,12,0,1.4,2.2,41.6,7.8,1.4,47.2,65.8,93.4
2,11759,Argentina,Australia,W,3,0,25,25,25,,...,1,1,1.4,2.2,43.8,8.6,1.2,48.2,66.6,99.6
3,11766,Argentina,Japan,W,3,1,30,25,25,25.0,...,13,1,1.8,1.6,46.4,8.0,2.0,46.8,62.0,96.6
4,11774,Argentina,Bulgaria,W,3,1,25,16,25,25.0,...,3,1,1.8,1.4,44.6,8.2,2.8,49.2,59.2,94.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1076,21504,Ukraine,Bulgaria,W,3,1,20,25,26,25.0,...,3,1,2.0,2.2,52.4,10.8,5.6,57.0,74.8,103.4
1077,21514,Ukraine,Netherlands,W,3,2,25,20,20,26.0,...,15,1,2.2,1.8,51.2,10.8,4.8,57.8,72.4,104.4
1078,21522,Ukraine,Italy,L,2,3,15,20,25,25.0,...,12,0,2.4,1.6,50.8,9.2,4.6,60.8,70.0,101.8
1079,21528,Ukraine,Serbia,L,0,3,22,19,17,,...,20,0,2.8,1.6,54.8,9.6,5.4,64.8,75.4,111.0


In [30]:
def make_predictions(data, predictors):
    train = data[data["matchid"] < 21000]
    test = data[data["matchid"] > 21000]
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    precision = precision_score(test["target"], preds)
    return combined, precision

In [31]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols)

In [32]:
precision

0.5522388059701493

In [33]:
combined

Unnamed: 0,actual,predicted
53,1,0
54,1,1
55,0,1
56,0,0
57,1,0
...,...,...
1076,1,1
1077,1,1
1078,0,1
1079,0,1


In [34]:
combined = combined.merge(matches_rolling[["matchid", "team", "opponent", "result"]], left_index=True, right_index=True)
combined

Unnamed: 0,actual,predicted,matchid,team,opponent,result
53,1,0,21442,Argentina,France,W
54,1,1,21450,Argentina,Canada,W
55,0,1,21454,Argentina,Bulgaria,L
56,0,0,21468,Argentina,Italy,L
57,1,0,21476,Argentina,Netherlands,W
...,...,...,...,...,...,...
1076,1,1,21504,Ukraine,Bulgaria,W
1077,1,1,21514,Ukraine,Netherlands,W
1078,0,1,21522,Ukraine,Italy,L
1079,0,1,21528,Ukraine,Serbia,L


In [35]:
merged = combined.merge(combined, left_on=["matchid", "team"], right_on=["matchid", "opponent"])
merged

Unnamed: 0,actual_x,predicted_x,matchid,team_x,opponent_x,result_x,actual_y,predicted_y,team_y,opponent_y,result_y
0,1,0,21442,Argentina,France,W,0,1,France,Argentina,L
1,1,1,21450,Argentina,Canada,W,0,0,Canada,Argentina,L
2,0,1,21454,Argentina,Bulgaria,L,1,0,Bulgaria,Argentina,W
3,0,0,21468,Argentina,Italy,L,1,1,Italy,Argentina,W
4,1,0,21476,Argentina,Netherlands,W,0,1,Netherlands,Argentina,L
...,...,...,...,...,...,...,...,...,...,...,...
227,1,1,21504,Ukraine,Bulgaria,W,0,1,Bulgaria,Ukraine,L
228,1,1,21514,Ukraine,Netherlands,W,0,0,Netherlands,Ukraine,L
229,0,1,21522,Ukraine,Italy,L,1,1,Italy,Ukraine,W
230,0,1,21528,Ukraine,Serbia,L,1,0,Serbia,Ukraine,W


In [36]:
merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] == 0)]["actual_x"].value_counts()

actual_x
1    39
0    25
Name: count, dtype: int64

In [37]:
##### Predictive Tool #####

In [38]:
matches

Unnamed: 0,matchid,team,opponent,result,sets won,sets lost,set1,set2,set3,set4,set5,attacks,blocks,serves,digs,receives,sets,opp_code,target
1,11700,France,Bulgaria,W,3,0,27,25,25,,,39,7,4,13,54,74,3,1
2,11700,Bulgaria,France,L,0,3,25,21,23,,,36,5,4,11,56,78,9,0
3,11701,Germany,Australia,W,3,0,25,25,25,,,42,5,5,34,40,71,1,1
4,11701,Australia,Germany,L,0,3,19,18,16,,,28,3,0,26,53,60,10,0
5,11702,Japan,Iran,W,3,0,25,25,26,,,45,7,2,11,50,94,11,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1200,21550,Slovenia,Italy,L,1,3,22,25,21,18.0,,52,9,2,64,79,106,12,0
1201,21551,Brazil,Slovenia,W,3,1,23,25,25,25.0,,48,11,5,60,68,110,21,1
1202,21551,Slovenia,Brazil,L,1,3,25,20,23,19.0,,49,7,2,68,80,122,2,0
1203,21552,Poland,Italy,W,3,0,25,25,25,,,37,10,3,55,43,80,12,1


In [39]:
# compute rolling averages of stats grouped by team
def add_rolling_averages(df, predictors, window=5):
    df_sorted = df.sort_values(["team", "matchid"])  # sort
    for col in predictors:
        df_sorted[col + "_rolling"] = (
            df_sorted.groupby("team")[col]
            .transform(lambda x: x.rolling(window, min_periods=1).mean().shift(1))
        )
    return df_sorted

In [40]:
# merge by matchid and merge the two team rows into one row
def build_matchup_data(df, predictors):
    df = df.copy()
    df["target"] = (df["result"] == "W").astype(int)

    winners = df[df["target"] == 1].copy()
    losers = df[df["target"] == 0].copy()

    matchups = winners.merge(losers, on="matchid", suffixes=("_win", "_lose"))

    # Generate two rows per matchup
    rows = []

    for _, row in matchups.iterrows():
        feat_win = {col + "_diff": row[col + "_rolling_win"] - row[col + "_rolling_lose"] for col in predictors}
        feat_lose = {col + "_diff": row[col + "_rolling_lose"] - row[col + "_rolling_win"] for col in predictors}

        rows.append((feat_win, 1))  # winner perspective
        rows.append((feat_lose, 0))  # loser perspective

    X = pd.DataFrame([r[0] for r in rows])
    y = [r[1] for r in rows]

    return X, y, matchups

In [41]:
predictors = ["attacks", "blocks", "serves", "digs", "receives", "sets", "sets won", "sets lost"]

In [42]:
# add rolling averages
matches_with_roll = add_rolling_averages(matches, predictors)

In [43]:
# build training data
X, y, matchups = build_matchup_data(matches_with_roll, predictors)

In [44]:
# train model
rf = RandomForestClassifier(n_estimators=200, min_samples_split=5, random_state=1)

In [45]:
rf.fit(X, y)

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [46]:
def predict_match(team_a, team_b, df, model, predictors):
    team_a_row = df[df["team"] == team_a].sort_values("matchid").iloc[-1]
    team_b_row = df[df["team"] == team_b].sort_values("matchid").iloc[-1]

    # Build both feature differences
    features_a = [team_a_row[col + "_rolling"] - team_b_row[col + "_rolling"] for col in predictors]
    features_b = [team_b_row[col + "_rolling"] - team_a_row[col + "_rolling"] for col in predictors]

    # Get both probabilities
    prob_a = model.predict_proba([features_a])[0][1]  # prob team_a wins
    prob_b = model.predict_proba([features_b])[0][1]  # prob team_b wins

    # Pick the higher-confidence outcome
    if prob_a > prob_b:
        return team_a, [1-prob_a, prob_a]
    else:
        return team_b, [1-prob_b, prob_b]

In [None]:
winner, prob = predict_match("Italy", "Poland", matches_with_roll, rf, predictors)
prob = [round(p, 3) for p in prob]

In [54]:
prob

[0.452, 0.548]

In [55]:
print(f"Predicted winner: {winner}, Probabilities: {prob}")

Predicted winner: Poland, Probabilities: [0.452, 0.548]
