# **Machine Learning**

## Dự đoán đội bóng rổ thắng hay thua trong trận kế tiếp

In [1]:
import pandas as pd
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score


In [2]:
pd.set_option("display.max_rows", 50)

In [3]:
df = pd.read_csv("nba_games.csv", index_col=0)

In [4]:
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240,37,99,0.374,10,33,0.303,15,23,0.652,...,43.6,40.4,300,98,BRK,125,1,2021,12/22/2020,False
1,240,42,92,0.457,15,35,0.429,26,32,0.813,...,50.0,32.1,267,120,GSW,99,0,2021,12/22/2020,True
2,240,44,93,0.473,14,40,0.350,14,19,0.737,...,100.0,35.9,166,118,LAL,109,1,2021,12/22/2020,True
3,240,38,81,0.469,9,29,0.310,24,31,0.774,...,20.9,40.2,154,114,LAC,116,0,2021,12/22/2020,False
4,240,46,90,0.511,14,35,0.400,15,18,0.833,...,33.3,39.2,203,126,BOS,122,1,2021,12/23/2020,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5185,240,33,80,0.413,9,37,0.243,13,19,0.684,...,33.3,28.1,203,106,PHI,96,0,2023,4/22/2023,False
5186,240,41,84,0.488,9,22,0.409,21,27,0.778,...,29.1,35.8,140,128,LAC,100,1,2023,4/22/2023,True
5187,240,40,92,0.435,12,37,0.324,8,10,0.800,...,25.8,30.3,152,117,PHO,112,0,2023,4/22/2023,False
5188,240,38,85,0.447,15,39,0.385,8,12,0.667,...,37.5,37.3,236,113,MIA,121,1,2023,4/22/2023,False


### Data Cleaning

In [5]:
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240,37,99,0.374,10,33,0.303,15,23,0.652,...,43.6,40.4,300,98,BRK,125,1,2021,12/22/2020,False
1,240,42,92,0.457,15,35,0.429,26,32,0.813,...,50.0,32.1,267,120,GSW,99,0,2021,12/22/2020,True
2,240,44,93,0.473,14,40,0.350,14,19,0.737,...,100.0,35.9,166,118,LAL,109,1,2021,12/22/2020,True
3,240,38,81,0.469,9,29,0.310,24,31,0.774,...,20.9,40.2,154,114,LAC,116,0,2021,12/22/2020,False
4,240,46,90,0.511,14,35,0.400,15,18,0.833,...,33.3,39.2,203,126,BOS,122,1,2021,12/23/2020,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5185,240,33,80,0.413,9,37,0.243,13,19,0.684,...,33.3,28.1,203,106,PHI,96,0,2023,4/22/2023,False
5186,240,41,84,0.488,9,22,0.409,21,27,0.778,...,29.1,35.8,140,128,LAC,100,1,2023,4/22/2023,True
5187,240,40,92,0.435,12,37,0.324,8,10,0.800,...,25.8,30.3,152,117,PHO,112,0,2023,4/22/2023,False
5188,240,38,85,0.447,15,39,0.385,8,12,0.667,...,37.5,37.3,236,113,MIA,121,1,2023,4/22/2023,False


Để dự đoán kết quả của trận đấu tiếp theo, chúng ta có thể tạo một thuộc tính tên là 'target' cho từng đội bằng cách đẩy dữ liệu của cột 'won' xuống một hàng

In [6]:
def add_target(group):
    group["target"] = group["won"].shift(-1)
    return group



df = df.groupby("team", group_keys=False).apply(add_target)

In [7]:
df.tail(20)

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
5170,240,45,84,0.536,16,36,0.444,16,23,0.696,...,33.4,250,126,MIL,138,1,2023,4/19/2023,False,True
5171,240,53,99,0.535,25,49,0.51,7,8,0.875,...,33.7,159,142,MIA,122,0,2023,4/19/2023,True,False
5172,240,40,82,0.488,13,31,0.419,9,13,0.692,...,42.4,229,125,BRK,97,1,2023,4/20/2023,True,True
5173,240,34,79,0.43,10,30,0.333,19,22,0.864,...,29.4,150,116,PHI,102,0,2023,4/20/2023,False,False
5174,240,35,92,0.38,11,47,0.234,16,23,0.696,...,30.7,159,102,GSW,114,1,2023,4/20/2023,False,
5175,240,40,100,0.4,16,50,0.32,18,23,0.783,...,32.1,147,124,SAC,97,0,2023,4/20/2023,True,
5176,240,42,88,0.477,10,27,0.37,35,46,0.761,...,35.1,214,128,LAC,124,1,2023,4/20/2023,True,True
5177,240,45,83,0.542,15,35,0.429,19,25,0.76,...,28.5,212,127,PHO,129,0,2023,4/20/2023,False,False
5178,240,45,92,0.489,21,48,0.438,11,16,0.688,...,32.4,178,125,ATL,130,1,2023,4/21/2023,False,
5179,240,51,91,0.56,15,34,0.441,13,16,0.813,...,32.2,273,135,BOS,122,0,2023,4/21/2023,True,


Ở đội có mã là PHI, ta thấy 'target' ở thời gian gần nhất là null bởi vì ta chưa biết được kết quả tiếp theo của đội đó

In [8]:
df[df["team"] == "PHI"]

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
21,240,41,87,0.471,8,28,0.286,23,30,0.767,...,36.6,207,115,WAS,107,0,2021,12/23/2020,True,True
46,240,41,88,0.466,11,31,0.355,16,22,0.727,...,35.9,139,125,NYK,89,1,2021,12/26/2020,True,False
62,240,33,79,0.418,12,37,0.324,16,22,0.727,...,34.0,200,106,CLE,118,1,2021,12/27/2020,False,True
101,240,32,84,0.381,8,31,0.258,28,33,0.848,...,30.0,152,108,TOR,93,0,2021,12/29/2020,True,True
126,240,44,91,0.484,15,33,0.455,13,15,0.867,...,30.5,131,117,ORL,92,1,2021,12/31/2020,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5100,240,53,101,0.525,14,32,0.438,14,19,0.737,...,36.2,123,131,BRK,105,1,2023,4/9/2023,True,True
5145,240,42,89,0.472,21,43,0.488,16,16,1.000,...,42.6,233,146,BRK,101,0,2023,4/15/2023,True,True
5157,240,36,80,0.450,11,35,0.314,13,16,0.813,...,26.6,119,117,BRK,84,0,2023,4/17/2023,True,True
5172,240,40,82,0.488,13,31,0.419,9,13,0.692,...,42.4,229,125,BRK,97,1,2023,4/20/2023,True,True


In [9]:
df["target"][pd.isnull(df["target"])] = 2 # với target là null, ta sẽ cho tương đương với giá trị là 2
df["target"] = df["target"].astype(int, errors="ignore") # target là false thì tương đương với giá trị là 0, true là 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["target"][pd.isnull(df["target"])] = 2 # với target là null, ta sẽ cho tương đương với giá trị là 2


In [10]:
df[df["team"] == "PHI"]

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
21,240,41,87,0.471,8,28,0.286,23,30,0.767,...,36.6,207,115,WAS,107,0,2021,12/23/2020,True,1
46,240,41,88,0.466,11,31,0.355,16,22,0.727,...,35.9,139,125,NYK,89,1,2021,12/26/2020,True,0
62,240,33,79,0.418,12,37,0.324,16,22,0.727,...,34.0,200,106,CLE,118,1,2021,12/27/2020,False,1
101,240,32,84,0.381,8,31,0.258,28,33,0.848,...,30.0,152,108,TOR,93,0,2021,12/29/2020,True,1
126,240,44,91,0.484,15,33,0.455,13,15,0.867,...,30.5,131,117,ORL,92,1,2021,12/31/2020,True,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5100,240,53,101,0.525,14,32,0.438,14,19,0.737,...,36.2,123,131,BRK,105,1,2023,4/9/2023,True,1
5145,240,42,89,0.472,21,43,0.488,16,16,1.000,...,42.6,233,146,BRK,101,0,2023,4/15/2023,True,1
5157,240,36,80,0.450,11,35,0.314,13,16,0.813,...,26.6,119,117,BRK,84,0,2023,4/17/2023,True,1
5172,240,40,82,0.488,13,31,0.419,9,13,0.692,...,42.4,229,125,BRK,97,1,2023,4/20/2023,True,1


In [11]:
df["won"].value_counts()

False    2595
True     2595
Name: won, dtype: int64

In [12]:
df["target"].value_counts()

0    2580
1    2580
2      30
Name: target, dtype: int64

Loại bỏ các cột không có giá trị

In [13]:
nulls = pd.isnull(df).sum()

In [14]:
nulls = nulls[nulls > 0]

In [15]:
valid_columns = df.columns[~df.columns.isin(nulls.index)]

In [16]:
valid_columns

Index(['mp', 'fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%',
       ...
       'usg%_max_opp', 'ortg_max_opp', 'drtg_max_opp', 'team_opp', 'total_opp',
       'home_opp', 'season', 'date', 'won', 'target'],
      dtype='object', length=142)

In [17]:
df = df[valid_columns].copy()

In [18]:
df


Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,240,37,99,0.374,10,33,0.303,15,23,0.652,...,40.4,300,98,BRK,125,1,2021,12/22/2020,False,0
1,240,42,92,0.457,15,35,0.429,26,32,0.813,...,32.1,267,120,GSW,99,0,2021,12/22/2020,True,1
2,240,44,93,0.473,14,40,0.350,14,19,0.737,...,35.9,166,118,LAL,109,1,2021,12/22/2020,True,1
3,240,38,81,0.469,9,29,0.310,24,31,0.774,...,40.2,154,114,LAC,116,0,2021,12/22/2020,False,1
4,240,46,90,0.511,14,35,0.400,15,18,0.833,...,39.2,203,126,BOS,122,1,2021,12/23/2020,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5185,240,33,80,0.413,9,37,0.243,13,19,0.684,...,28.1,203,106,PHI,96,0,2023,4/22/2023,False,2
5186,240,41,84,0.488,9,22,0.409,21,27,0.778,...,35.8,140,128,LAC,100,1,2023,4/22/2023,True,2
5187,240,40,92,0.435,12,37,0.324,8,10,0.800,...,30.3,152,117,PHO,112,0,2023,4/22/2023,False,2
5188,240,38,85,0.447,15,39,0.385,8,12,0.667,...,37.3,236,113,MIA,121,1,2023,4/22/2023,False,2


### Machine Learning

Chúng sẽ không muốn sử dụng hết các cột để luyện model vì một số lý do như:
<br>    - Các cột có thể tương quan với nhau nên một số thuật toán không thể xử lý tốt
<br>    - Một số cột có thể gây ra overfitting hay nhiều vấn đề khác cho model
<br> Nên trước khi luyện một model, chúng ta có thể sử dụng FeatureSelector để chọn một set các cột nhất định để luyện model

In [19]:
rr = RidgeClassifier(alpha=1)

split = TimeSeriesSplit(n_splits=3) # chia data thành các bộ nhỏ hơn để học máy (bộ dùng để train và bộ dùng để test)

sfs = SequentialFeatureSelector(rr, 
                                n_features_to_select=30, 
                                direction="forward",
                                cv=split,
                                n_jobs=1
                               )

RidgeClassifier hoạt động tốt hơn nếu như chúng ta scale dữ liệu sao cho các cột nằm trải dài từ giá trị 0 đến 1

In [20]:

removed_columns = ["season", "date", "won", "target", "team", "team_opp"] # những cột không nên scale
selected_columns = df.columns[~df.columns.isin(removed_columns)]

In [21]:
scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])

In [22]:
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,0.0,0.350,0.611111,0.236585,0.296296,0.433962,0.372323,0.307692,0.422222,0.497110,...,0.234917,1.000000,0.168831,BRK,0.611765,1.0,2021,12/22/2020,False,0
1,0.0,0.475,0.481481,0.439024,0.481481,0.471698,0.579901,0.589744,0.622222,0.729769,...,0.128370,0.829897,0.454545,GSW,0.305882,0.0,2021,12/22/2020,True,1
2,0.0,0.525,0.500000,0.478049,0.444444,0.566038,0.449753,0.282051,0.333333,0.619942,...,0.177150,0.309278,0.428571,LAL,0.423529,1.0,2021,12/22/2020,True,1
3,0.0,0.375,0.277778,0.468293,0.259259,0.358491,0.383855,0.538462,0.600000,0.673410,...,0.232349,0.247423,0.376623,LAC,0.505882,0.0,2021,12/22/2020,False,1
4,0.0,0.575,0.444444,0.570732,0.444444,0.471698,0.532125,0.307692,0.311111,0.758671,...,0.219512,0.500000,0.532468,BOS,0.576471,1.0,2021,12/23/2020,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5185,0.0,0.250,0.259259,0.331707,0.259259,0.509434,0.273476,0.256410,0.333333,0.543353,...,0.077022,0.500000,0.272727,PHI,0.270588,0.0,2023,4/22/2023,False,2
5186,0.0,0.450,0.333333,0.514634,0.259259,0.226415,0.546952,0.461538,0.511111,0.679191,...,0.175866,0.175258,0.558442,LAC,0.317647,1.0,2023,4/22/2023,True,2
5187,0.0,0.425,0.481481,0.385366,0.370370,0.509434,0.406919,0.128205,0.133333,0.710983,...,0.105263,0.237113,0.415584,PHO,0.458824,0.0,2023,4/22/2023,False,2
5188,0.0,0.375,0.351852,0.414634,0.481481,0.547170,0.507414,0.128205,0.177778,0.518786,...,0.195122,0.670103,0.363636,MIA,0.564706,1.0,2023,4/22/2023,False,2


Chọn 30 thuộc tính tốt nhất để học máy

In [23]:
sfs.fit(df[selected_columns], df["target"])

In [24]:
predictors = list(selected_columns[sfs.get_support()])

In [25]:
predictors

['fg%',
 '3p',
 '3pa',
 '3p%',
 'pts',
 'efg%',
 'ast%',
 'usg%',
 'ortg',
 'stl_max',
 'blk_max',
 '+/-_max',
 'ts%_max',
 'efg%_max',
 'ftr_max',
 'stl%_max',
 'ortg_max',
 'drtg_max',
 'total',
 'fg_opp',
 'tov_opp',
 'blk%_opp',
 'usg%_opp',
 'drtg_opp',
 'ft%_max_opp',
 'orb_max_opp',
 '+/-_max_opp',
 '3par_max_opp',
 'orb%_max_opp',
 'tov%_max_opp']

Chia dữ liệu theo mùa và dùng dữ liệu từ các mùa cũ để dự đoán mùa mới
<br> Ví dụ nếu có các mùa từ 2016-2023 thì chúng ta sẽ dùng dữ liệu mùa 2016-2017 đề dự đoán cho mùa 2018. Sau đó sẽ dùng 2016, 2017, 2018 để dự đoán cho 2019 và cứ tiếp tục như thế cho đến mùa 2023

In [26]:
def backtest(data, model, predictors, start=2, step=1):
    all_predictions = []
    
    seasons = sorted(data["season"].unique())
    
    for i in range(start, len(seasons), step):
        season = seasons[i]
        train = data[data["season"] < season]
        test = data[data["season"] == season]
        
        model.fit(train[predictors], train["target"])
        
        preds = model.predict(test[predictors]) # dự đoán bộ test
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["target"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        
        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [27]:
predictions = backtest(df, rr, predictors)

In [28]:
predictions

Unnamed: 0,actual,prediction
4988,0,1
4989,1,1
4990,1,0
4991,0,1
4992,1,0
...,...,...
5185,2,1
5186,2,1
5187,2,0
5188,2,0


In [29]:
result_df = pd.concat([df, predictions], axis=1)

result_df.loc[:, ['team', 'team_opp', 'date', 'season', 'prediction', 'actual']].tail(20)

Unnamed: 0,team,team_opp,date,season,prediction,actual
5170,MIA,MIL,4/19/2023,2023,0.0,1.0
5171,MIL,MIA,4/19/2023,2023,1.0,0.0
5172,PHI,BRK,4/20/2023,2023,1.0,1.0
5173,BRK,PHI,4/20/2023,2023,1.0,0.0
5174,SAC,GSW,4/20/2023,2023,0.0,2.0
5175,GSW,SAC,4/20/2023,2023,1.0,2.0
5176,PHO,LAC,4/20/2023,2023,1.0,1.0
5177,LAC,PHO,4/20/2023,2023,0.0,0.0
5178,BOS,ATL,4/21/2023,2023,0.0,2.0
5179,ATL,BOS,4/21/2023,2023,1.0,2.0


In [30]:
accuracy_score(predictions["actual"], predictions["prediction"])

0.4158415841584158

#### Cải thiện model

Nếu để ý thì trong thể thao, các trận chơi ở sân nhà thì có cơ hội thắng cao hơn chơi ở sân khách

In [31]:

df.groupby(["home"]).apply(lambda x: x[x["won"] == 1].shape[0] / x.shape[0]) # lấy số lượng trận thắng/thua chia cho số lượng tổng

home
0.0    0.452023
1.0    0.547977
dtype: float64

In [32]:
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,0.0,0.350,0.611111,0.236585,0.296296,0.433962,0.372323,0.307692,0.422222,0.497110,...,0.234917,1.000000,0.168831,BRK,0.611765,1.0,2021,12/22/2020,False,0
1,0.0,0.475,0.481481,0.439024,0.481481,0.471698,0.579901,0.589744,0.622222,0.729769,...,0.128370,0.829897,0.454545,GSW,0.305882,0.0,2021,12/22/2020,True,1
2,0.0,0.525,0.500000,0.478049,0.444444,0.566038,0.449753,0.282051,0.333333,0.619942,...,0.177150,0.309278,0.428571,LAL,0.423529,1.0,2021,12/22/2020,True,1
3,0.0,0.375,0.277778,0.468293,0.259259,0.358491,0.383855,0.538462,0.600000,0.673410,...,0.232349,0.247423,0.376623,LAC,0.505882,0.0,2021,12/22/2020,False,1
4,0.0,0.575,0.444444,0.570732,0.444444,0.471698,0.532125,0.307692,0.311111,0.758671,...,0.219512,0.500000,0.532468,BOS,0.576471,1.0,2021,12/23/2020,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5185,0.0,0.250,0.259259,0.331707,0.259259,0.509434,0.273476,0.256410,0.333333,0.543353,...,0.077022,0.500000,0.272727,PHI,0.270588,0.0,2023,4/22/2023,False,2
5186,0.0,0.450,0.333333,0.514634,0.259259,0.226415,0.546952,0.461538,0.511111,0.679191,...,0.175866,0.175258,0.558442,LAC,0.317647,1.0,2023,4/22/2023,True,2
5187,0.0,0.425,0.481481,0.385366,0.370370,0.509434,0.406919,0.128205,0.133333,0.710983,...,0.105263,0.237113,0.415584,PHO,0.458824,0.0,2023,4/22/2023,False,2
5188,0.0,0.375,0.351852,0.414634,0.481481,0.547170,0.507414,0.128205,0.177778,0.518786,...,0.195122,0.670103,0.363636,MIA,0.564706,1.0,2023,4/22/2023,False,2


Để ý với dataframe hiện tại của chúng ta, khi chúng ta đưa ra dự đoán, chúng ta chỉ nhìn trận đấu hiện tại để dự đoán kết quả tiếp theo. Tuy nhiên trong thực tế, ở một trận đấu cụ thể, đội chơi đó có thể chơi tốt hơn hoặc tệ hơn so với cách chơi thông thường của đội bóng đó. Vì vậy, chúng ta nên sử dụng trung bình các trận đấu gần đây của một đội để dự đoán kết quả tiếp theo

In [33]:
df_rolling = df[list(selected_columns) + ["won", "team", "season"]]

In [34]:
df_rolling

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,total_opp,home_opp,won,team,season
0,0.0,0.350,0.611111,0.236585,0.296296,0.433962,0.372323,0.307692,0.422222,0.497110,...,0.074,0.408805,0.234917,1.000000,0.168831,0.611765,1.0,False,GSW,2021
1,0.0,0.475,0.481481,0.439024,0.481481,0.471698,0.579901,0.589744,0.622222,0.729769,...,0.127,0.475891,0.128370,0.829897,0.454545,0.305882,0.0,True,BRK,2021
2,0.0,0.525,0.500000,0.478049,0.444444,0.566038,0.449753,0.282051,0.333333,0.619942,...,0.039,1.000000,0.177150,0.309278,0.428571,0.423529,1.0,True,LAC,2021
3,0.0,0.375,0.277778,0.468293,0.259259,0.358491,0.383855,0.538462,0.600000,0.673410,...,0.045,0.170860,0.232349,0.247423,0.376623,0.505882,0.0,False,LAL,2021
4,0.0,0.575,0.444444,0.570732,0.444444,0.471698,0.532125,0.307692,0.311111,0.758671,...,0.070,0.300839,0.219512,0.500000,0.532468,0.576471,1.0,False,MIL,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5185,0.0,0.250,0.259259,0.331707,0.259259,0.509434,0.273476,0.256410,0.333333,0.543353,...,0.035,0.300839,0.077022,0.500000,0.272727,0.270588,0.0,False,BRK,2023
5186,0.0,0.450,0.333333,0.514634,0.259259,0.226415,0.546952,0.461538,0.511111,0.679191,...,0.040,0.256813,0.175866,0.175258,0.558442,0.317647,1.0,True,PHO,2023
5187,0.0,0.425,0.481481,0.385366,0.370370,0.509434,0.406919,0.128205,0.133333,0.710983,...,0.072,0.222222,0.105263,0.237113,0.415584,0.458824,0.0,False,LAC,2023
5188,0.0,0.375,0.351852,0.414634,0.481481,0.547170,0.507414,0.128205,0.177778,0.518786,...,0.055,0.344864,0.195122,0.670103,0.363636,0.564706,1.0,False,MIL,2023


In [35]:
def find_team_averages(team):
    rolling = team.rolling(5).mean()
    return rolling

df_rolling = df_rolling.groupby(["team", "season"], group_keys=False).apply(find_team_averages)

  rolling = team.rolling(5).mean()


In [36]:
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [37]:

df_rolling


Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,drb,trb,ast,stl,blk,tov,pf,pts,ts%,efg%,3par,ftr,orb%,drb%,trb%,ast%,stl%,blk%,tov%,usg%,ortg,drtg,fg_max,fga_max,fg%_max,3p_max,3pa_max,3p%_max,ft_max,fta_max,ft%_max,orb_max,drb_max,trb_max,ast_max,stl_max,blk_max,tov_max,pf_max,...,efg%_opp,3par_opp,ftr_opp,orb%_opp,drb%_opp,trb%_opp,ast%_opp,stl%_opp,blk%_opp,tov%_opp,usg%_opp,ortg_opp,drtg_opp,fg_max_opp,fga_max_opp,fg%_max_opp,3p_max_opp,3pa_max_opp,3p%_max_opp,ft_max_opp,fta_max_opp,ft%_max_opp,orb_max_opp,drb_max_opp,trb_max_opp,ast_max_opp,stl_max_opp,blk_max_opp,tov_max_opp,pf_max_opp,pts_max_opp,+/-_max_opp,ts%_max_opp,efg%_max_opp,3par_max_opp,ftr_max_opp,orb%_max_opp,drb%_max_opp,trb%_max_opp,ast%_max_opp,stl%_max_opp,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,total_opp,home_opp,won,season
0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5185,0.0,0.280,0.229630,0.396098,0.348148,0.452830,0.436244,0.312821,0.360000,0.619364,0.224,0.345,0.269565,0.338095,0.245455,0.221053,0.406897,0.362963,0.258824,0.405467,0.428340,0.563542,0.359736,0.293817,0.388913,0.302778,0.61504,0.274766,0.251370,0.492437,0.0,0.335399,0.522527,0.431579,0.444444,0.503597,0.309091,0.252632,0.540267,0.227273,0.224,0.9112,0.123077,0.233333,0.215385,0.218182,0.20,0.22,0.288889,0.45,...,0.489879,0.463542,0.235314,0.611514,0.706183,0.697222,0.47456,0.446729,0.332877,0.441176,0.0,0.522527,0.335399,0.378947,0.355556,0.685252,0.363636,0.368421,0.623200,0.272727,0.232,1.0000,0.323077,0.377778,0.315385,0.309091,0.32,0.26,0.400000,0.55,0.290196,0.541667,0.4128,0.4126,0.59968,0.062596,0.267559,0.354162,0.192739,0.249524,0.0728,0.0894,0.374633,0.082670,0.380412,0.348052,0.432941,0.4,0.0,2023.0
5186,0.0,0.495,0.400000,0.520976,0.288889,0.298113,0.495881,0.492308,0.542222,0.706069,0.392,0.420,0.426087,0.433333,0.363636,0.315789,0.337931,0.481481,0.524706,0.502961,0.495951,0.316667,0.476238,0.519403,0.508742,0.545000,0.54976,0.378505,0.364384,0.368908,0.0,0.525626,0.471037,0.473684,0.459259,0.647842,0.218182,0.231579,0.633333,0.345455,0.288,1.0000,0.200000,0.311111,0.284615,0.327273,0.28,0.26,0.400000,0.60,...,0.463158,0.422569,0.382838,0.491258,0.480597,0.455000,0.38368,0.303738,0.278082,0.470588,0.0,0.471037,0.525626,0.557895,0.503704,0.807194,0.254545,0.294737,0.582133,0.263636,0.256,0.8188,0.230769,0.322222,0.269231,0.272727,0.20,0.20,0.355556,0.60,0.478431,0.361111,0.4680,0.4594,1.00000,0.041827,0.135942,0.229916,0.141034,0.216190,0.0322,0.1052,0.428302,0.178434,0.409278,0.529870,0.475294,0.4,0.6,2023.0
5187,0.0,0.475,0.437037,0.468780,0.318519,0.400000,0.420099,0.405128,0.440000,0.688728,0.408,0.355,0.378261,0.319048,0.290909,0.263158,0.441379,0.570370,0.475294,0.454670,0.463158,0.422569,0.382838,0.491258,0.480597,0.455000,0.38368,0.303738,0.278082,0.470588,0.0,0.471037,0.525626,0.557895,0.503704,0.807194,0.254545,0.294737,0.582133,0.263636,0.256,0.8188,0.230769,0.322222,0.269231,0.272727,0.20,0.20,0.355556,0.60,...,0.495951,0.316667,0.476238,0.519403,0.508742,0.545000,0.54976,0.378505,0.364384,0.368908,0.0,0.525626,0.471037,0.473684,0.459259,0.647842,0.218182,0.231579,0.633333,0.345455,0.288,1.0000,0.200000,0.311111,0.284615,0.327273,0.28,0.26,0.400000,0.60,0.435294,0.463889,0.5022,0.4790,1.00000,0.111312,0.249434,0.249940,0.145655,0.243810,0.0550,0.1146,0.212159,0.084981,0.480412,0.490909,0.524706,0.6,0.4,2023.0
5188,0.0,0.505,0.440741,0.493659,0.548148,0.626415,0.506096,0.215385,0.240000,0.681214,0.304,0.365,0.330435,0.495238,0.263636,0.168421,0.468966,0.340741,0.489412,0.510706,0.548583,0.643056,0.214521,0.382516,0.628998,0.452778,0.63488,0.269159,0.206164,0.511765,0.0,0.461263,0.602145,0.357895,0.303704,0.635971,0.272727,0.284211,0.635467,0.163636,0.160,1.0000,0.200000,0.255556,0.261538,0.363636,0.18,0.18,0.400000,0.60,...,0.680972,0.477083,0.285479,0.371002,0.617484,0.547222,0.54304,0.430841,0.226712,0.466387,0.0,0.602145,0.461263,0.421053,0.325926,0.928058,0.309091,0.242105,0.822400,0.209091,0.200,1.0000,0.092308,0.322222,0.223077,0.254545,0.28,0.14,0.288889,0.35,0.368627,0.538889,0.6008,0.6046,0.89312,0.075095,0.254789,0.341375,0.241144,0.269048,0.0616,0.0742,0.398113,0.198716,0.411340,0.488312,0.625882,0.4,0.2,2023.0


Kết hợp dataframe trung bình với dataframe của chúng ta

In [38]:
rolling_cols = [f"{col}_5" for col in df_rolling.columns]
df_rolling.columns = rolling_cols
df = pd.concat([df, df_rolling], axis=1)

In [39]:
df = df.dropna()

In [40]:
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,drb,trb,ast,stl,blk,tov,pf,pts,ts%,efg%,3par,ftr,orb%,drb%,trb%,ast%,stl%,blk%,tov%,usg%,ortg,drtg,fg_max,fga_max,fg%_max,3p_max,3pa_max,3p%_max,ft_max,fta_max,ft%_max,orb_max,drb_max,trb_max,ast_max,stl_max,blk_max,tov_max,pf_max,...,efg%_opp_5,3par_opp_5,ftr_opp_5,orb%_opp_5,drb%_opp_5,trb%_opp_5,ast%_opp_5,stl%_opp_5,blk%_opp_5,tov%_opp_5,usg%_opp_5,ortg_opp_5,drtg_opp_5,fg_max_opp_5,fga_max_opp_5,fg%_max_opp_5,3p_max_opp_5,3pa_max_opp_5,3p%_max_opp_5,ft_max_opp_5,fta_max_opp_5,ft%_max_opp_5,orb_max_opp_5,drb_max_opp_5,trb_max_opp_5,ast_max_opp_5,stl_max_opp_5,blk_max_opp_5,tov_max_opp_5,pf_max_opp_5,pts_max_opp_5,+/-_max_opp_5,ts%_max_opp_5,efg%_max_opp_5,3par_max_opp_5,ftr_max_opp_5,orb%_max_opp_5,drb%_max_opp_5,trb%_max_opp_5,ast%_max_opp_5,stl%_max_opp_5,blk%_max_opp_5,tov%_max_opp_5,usg%_max_opp_5,ortg_max_opp_5,drtg_max_opp_5,total_opp_5,home_opp_5,won_5,season_5
109,0.0,0.650,0.462963,0.636585,0.518519,0.547170,0.548600,0.230769,0.222222,0.793353,0.36,0.400,0.391304,0.452381,0.772727,0.578947,0.517241,0.518519,0.623529,0.635535,0.657895,0.555556,0.191419,0.464819,0.394456,0.436111,0.4496,0.757009,0.626712,0.550420,0.0,0.533969,0.318236,0.631579,0.407407,1.000000,0.545455,0.368421,0.600000,0.181818,0.16,1.0,0.230769,0.166667,0.230769,0.090909,0.4,0.4,0.333333,0.50,...,0.540081,0.440625,0.345545,0.446482,0.510448,0.512222,0.43648,0.346729,0.347945,0.576471,0.0,0.467223,0.452443,0.410526,0.392593,0.676259,0.254545,0.294737,0.800000,0.190909,0.176,1.0000,0.169231,0.300000,0.284615,0.272727,0.26,0.30,0.533333,0.60,0.337255,0.469444,0.5064,0.4828,0.76000,0.053233,0.134089,0.191074,0.144334,0.278333,0.0504,0.0902,0.489937,0.147882,0.447423,0.472727,0.482353,0.4,0.6,2021.0
111,0.0,0.775,0.629630,0.641463,0.629630,0.584906,0.635914,0.384615,0.422222,0.686416,0.32,0.375,0.347826,0.523810,0.272727,0.157895,0.241379,0.629630,0.847059,0.658314,0.676113,0.522569,0.316832,0.375267,0.287846,0.294444,0.4576,0.261682,0.195205,0.243697,0.0,0.724672,0.680572,0.421053,0.629630,1.000000,0.454545,0.421053,1.000000,0.363636,0.28,1.0,0.307692,0.388889,0.346154,0.318182,0.2,0.1,0.333333,0.75,...,0.357490,0.409375,0.350825,0.596588,0.573134,0.559444,0.58848,0.311215,0.273973,0.471429,0.0,0.367580,0.487247,0.378947,0.407407,0.632374,0.236364,0.315789,0.777867,0.254545,0.296,1.0000,0.276923,0.322222,0.261538,0.254545,0.20,0.18,0.333333,0.75,0.305882,0.369444,0.4576,0.4626,0.84000,0.068436,0.155098,0.269240,0.175138,0.410952,0.0488,0.0784,0.509015,0.174069,0.419588,0.490909,0.451765,0.4,0.6,2021.0
115,0.0,0.600,0.537037,0.531707,0.592593,0.528302,0.654036,0.333333,0.400000,0.605491,0.40,0.500,0.500000,0.523810,0.590909,0.315789,0.310345,0.666667,0.647059,0.551253,0.582996,0.505208,0.320132,0.456290,0.782516,0.594444,0.5888,0.598131,0.541096,0.323529,0.0,0.606675,0.336114,0.315789,0.259259,1.000000,0.272727,0.315789,1.000000,0.272727,0.24,1.0,0.076923,0.388889,0.230769,0.227273,0.3,0.3,0.222222,0.50,...,0.448988,0.574653,0.502970,0.416205,0.591898,0.506111,0.48608,0.321495,0.197260,0.486555,0.0,0.446961,0.472944,0.284211,0.303704,0.766187,0.309091,0.357895,0.511200,0.345455,0.320,0.9428,0.215385,0.255556,0.230769,0.263636,0.18,0.14,0.288889,0.50,0.243137,0.425000,0.4436,0.4400,0.90400,0.108234,0.185170,0.375875,0.223982,0.425476,0.0486,0.0954,0.656604,0.351220,0.312371,0.485714,0.428235,0.4,0.8,2021.0
116,0.0,0.425,0.666667,0.280488,0.518519,0.584906,0.515651,0.230769,0.200000,0.888728,0.60,0.375,0.500000,0.428571,0.636364,0.315789,0.413793,0.666667,0.411765,0.300683,0.344130,0.508681,0.146865,0.545842,0.522388,0.430556,0.6080,0.635514,0.479452,0.411765,0.0,0.348033,0.474374,0.368421,0.296296,0.401079,0.272727,0.210526,0.556000,0.136364,0.08,1.0,0.384615,0.444444,0.346154,0.363636,0.3,0.1,0.333333,0.75,...,0.487854,0.544444,0.353795,0.348401,0.448188,0.371667,0.44000,0.291589,0.354795,0.514286,0.0,0.418594,0.538498,0.357895,0.348148,0.604317,0.290909,0.294737,0.730133,0.218182,0.200,1.0000,0.138462,0.311111,0.238462,0.245455,0.18,0.20,0.244444,0.40,0.294118,0.397222,0.4706,0.4056,0.84448,0.078058,0.313491,0.239083,0.184598,0.270714,0.0352,0.1082,0.498532,0.244159,0.365979,0.568831,0.475294,0.8,0.4,2021.0
118,0.0,0.575,0.296296,0.692683,0.444444,0.301887,0.759473,0.307692,0.355556,0.638728,0.24,0.600,0.500000,0.404762,0.181818,0.157895,0.551724,0.629630,0.564706,0.676538,0.698381,0.361111,0.339934,0.339019,0.724947,0.630556,0.4368,0.186916,0.164384,0.621849,0.0,0.551847,0.383790,0.421053,0.481481,1.000000,0.454545,0.157895,1.000000,0.227273,0.24,1.0,0.153846,0.222222,0.192308,0.272727,0.1,0.2,0.444444,0.25,...,0.438866,0.481597,0.293729,0.347548,0.579957,0.386667,0.48736,0.468224,0.271233,0.407563,0.0,0.379976,0.513945,0.368421,0.422222,0.633453,0.272727,0.284211,0.725333,0.209091,0.184,1.0000,0.200000,0.200000,0.146154,0.245455,0.26,0.16,0.422222,0.70,0.313725,0.341667,0.4058,0.4500,0.77600,0.085365,0.173635,0.238601,0.146975,0.419524,0.0630,0.0722,0.261635,0.288062,0.412371,0.545455,0.421176,0.2,0.6,2021.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5185,0.0,0.250,0.259259,0.331707,0.259259,0.509434,0.273476,0.256410,0.333333,0.543353,0.16,0.425,0.304348,0.285714,0.136364,0.315789,0.310345,0.259259,0.176471,0.293850,0.340081,0.614583,0.330033,0.200426,0.377399,0.258333,0.5376,0.158879,0.308219,0.386555,0.0,0.274136,0.380215,0.263158,0.296296,0.640288,0.181818,0.157895,0.333333,0.090909,0.16,1.0,0.153846,0.333333,0.307692,0.181818,0.1,0.4,0.333333,0.25,...,0.489879,0.463542,0.235314,0.611514,0.706183,0.697222,0.47456,0.446729,0.332877,0.441176,0.0,0.522527,0.335399,0.378947,0.355556,0.685252,0.363636,0.368421,0.623200,0.272727,0.232,1.0000,0.323077,0.377778,0.315385,0.309091,0.32,0.26,0.400000,0.55,0.290196,0.541667,0.4128,0.4126,0.59968,0.062596,0.267559,0.354162,0.192739,0.249524,0.0728,0.0894,0.374633,0.082670,0.380412,0.348052,0.432941,0.4,0.0,2023.0
5186,0.0,0.450,0.333333,0.514634,0.259259,0.226415,0.546952,0.461538,0.511111,0.679191,0.52,0.475,0.543478,0.404762,0.454545,0.421053,0.413793,0.259259,0.458824,0.489749,0.487854,0.265625,0.466997,0.742004,0.605544,0.772222,0.5440,0.495327,0.496575,0.457983,0.0,0.518474,0.365912,0.368421,0.407407,0.228417,0.181818,0.157895,0.666667,0.409091,0.36,1.0,0.230769,0.388889,0.346154,0.318182,0.3,0.3,0.444444,0.25,...,0.463158,0.422569,0.382838,0.491258,0.480597,0.455000,0.38368,0.303738,0.278082,0.470588,0.0,0.471037,0.525626,0.557895,0.503704,0.807194,0.254545,0.294737,0.582133,0.263636,0.256,0.8188,0.230769,0.322222,0.269231,0.272727,0.20,0.20,0.355556,0.60,0.478431,0.361111,0.4680,0.4594,1.00000,0.041827,0.135942,0.229916,0.141034,0.216190,0.0322,0.1052,0.428302,0.178434,0.409278,0.529870,0.475294,0.4,0.6,2023.0
5187,0.0,0.425,0.481481,0.385366,0.370370,0.509434,0.406919,0.128205,0.133333,0.710983,0.32,0.200,0.195652,0.261905,0.318182,0.263158,0.413793,0.555556,0.317647,0.341686,0.402834,0.508681,0.117162,0.394456,0.257996,0.227778,0.3280,0.345794,0.277397,0.457983,0.0,0.365912,0.518474,0.736842,0.703704,0.550360,0.181818,0.263158,0.666667,0.136364,0.16,1.0,0.153846,0.166667,0.192308,0.090909,0.1,0.1,0.333333,0.75,...,0.495951,0.316667,0.476238,0.519403,0.508742,0.545000,0.54976,0.378505,0.364384,0.368908,0.0,0.525626,0.471037,0.473684,0.459259,0.647842,0.218182,0.231579,0.633333,0.345455,0.288,1.0000,0.200000,0.311111,0.284615,0.327273,0.28,0.26,0.400000,0.60,0.435294,0.463889,0.5022,0.4790,1.00000,0.111312,0.249434,0.249940,0.145655,0.243810,0.0550,0.1146,0.212159,0.084981,0.480412,0.490909,0.524706,0.6,0.4,2023.0
5188,0.0,0.375,0.351852,0.414634,0.481481,0.547170,0.507414,0.128205,0.177778,0.518786,0.28,0.325,0.282609,0.333333,0.318182,0.157895,0.586207,0.296296,0.305882,0.407745,0.473684,0.607639,0.169967,0.345416,0.582090,0.380556,0.4944,0.336449,0.202055,0.655462,0.0,0.321812,0.592372,0.263158,0.296296,0.401079,0.272727,0.210526,1.000000,0.136364,0.12,1.0,0.230769,0.166667,0.230769,0.181818,0.2,0.1,0.444444,1.00,...,0.680972,0.477083,0.285479,0.371002,0.617484,0.547222,0.54304,0.430841,0.226712,0.466387,0.0,0.602145,0.461263,0.421053,0.325926,0.928058,0.309091,0.242105,0.822400,0.209091,0.200,1.0000,0.092308,0.322222,0.223077,0.254545,0.28,0.14,0.288889,0.35,0.368627,0.538889,0.6008,0.6046,0.89312,0.075095,0.254789,0.341375,0.241144,0.269048,0.0616,0.0742,0.398113,0.198716,0.411340,0.488312,0.625882,0.4,0.2,2023.0


In [41]:
df.loc[:, ['target', 'date', 'season', 'team', 'team_opp']].tail(20)

Unnamed: 0,target,date,season,team,team_opp
5170,1,4/19/2023,2023,MIA,MIL
5171,0,4/19/2023,2023,MIL,MIA
5172,1,4/20/2023,2023,PHI,BRK
5173,0,4/20/2023,2023,BRK,PHI
5174,2,4/20/2023,2023,SAC,GSW
5175,2,4/20/2023,2023,GSW,SAC
5176,1,4/20/2023,2023,PHO,LAC
5177,0,4/20/2023,2023,LAC,PHO
5178,2,4/21/2023,2023,BOS,ATL
5179,2,4/21/2023,2023,ATL,BOS


Điều tiếp theo chúng ta có thể làm để tăng độ chính xác của model là cho thuật toán của chúng ta biết về thông tin mà chúng ta biết ở thực tế. Những gì mà chúng ta biết ví dụ như trận đó đối thủ là ai, chơi ở sân nhà hay khách.

Tương tự như tạo target, chúng ta sẽ tạo thêm các cột mới (liên quan đến trận đấu tiếp theo), ở đó chúng ta sẽ lấy dữ liệu từ các cột hiện tại và lùi xuống 1 hàng

In [42]:
def shift_col(team, col_name):
    next_col = team[col_name].shift(-1)
    return next_col

def add_col(df, col_name):
    return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))

# Ở đây, các cột mới bao gồm home_next, team_opp_next, date_next
df["home_next"] = add_col(df, "home")
df["team_opp_next"] = add_col(df, "team_opp")
df["date_next"] = add_col(df, "date")

In [43]:
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,drb,trb,ast,stl,blk,tov,pf,pts,ts%,efg%,3par,ftr,orb%,drb%,trb%,ast%,stl%,blk%,tov%,usg%,ortg,drtg,fg_max,fga_max,fg%_max,3p_max,3pa_max,3p%_max,ft_max,fta_max,ft%_max,orb_max,drb_max,trb_max,ast_max,stl_max,blk_max,tov_max,pf_max,...,orb%_opp_5,drb%_opp_5,trb%_opp_5,ast%_opp_5,stl%_opp_5,blk%_opp_5,tov%_opp_5,usg%_opp_5,ortg_opp_5,drtg_opp_5,fg_max_opp_5,fga_max_opp_5,fg%_max_opp_5,3p_max_opp_5,3pa_max_opp_5,3p%_max_opp_5,ft_max_opp_5,fta_max_opp_5,ft%_max_opp_5,orb_max_opp_5,drb_max_opp_5,trb_max_opp_5,ast_max_opp_5,stl_max_opp_5,blk_max_opp_5,tov_max_opp_5,pf_max_opp_5,pts_max_opp_5,+/-_max_opp_5,ts%_max_opp_5,efg%_max_opp_5,3par_max_opp_5,ftr_max_opp_5,orb%_max_opp_5,drb%_max_opp_5,trb%_max_opp_5,ast%_max_opp_5,stl%_max_opp_5,blk%_max_opp_5,tov%_max_opp_5,usg%_max_opp_5,ortg_max_opp_5,drtg_max_opp_5,total_opp_5,home_opp_5,won_5,season_5,home_next,team_opp_next,date_next
109,0.0,0.650,0.462963,0.636585,0.518519,0.547170,0.548600,0.230769,0.222222,0.793353,0.36,0.400,0.391304,0.452381,0.772727,0.578947,0.517241,0.518519,0.623529,0.635535,0.657895,0.555556,0.191419,0.464819,0.394456,0.436111,0.4496,0.757009,0.626712,0.550420,0.0,0.533969,0.318236,0.631579,0.407407,1.000000,0.545455,0.368421,0.600000,0.181818,0.16,1.0,0.230769,0.166667,0.230769,0.090909,0.4,0.4,0.333333,0.50,...,0.446482,0.510448,0.512222,0.43648,0.346729,0.347945,0.576471,0.0,0.467223,0.452443,0.410526,0.392593,0.676259,0.254545,0.294737,0.800000,0.190909,0.176,1.0000,0.169231,0.300000,0.284615,0.272727,0.26,0.30,0.533333,0.60,0.337255,0.469444,0.5064,0.4828,0.76000,0.053233,0.134089,0.191074,0.144334,0.278333,0.0504,0.0902,0.489937,0.147882,0.447423,0.472727,0.482353,0.4,0.6,2021.0,0.0,DET,1/1/2021
111,0.0,0.775,0.629630,0.641463,0.629630,0.584906,0.635914,0.384615,0.422222,0.686416,0.32,0.375,0.347826,0.523810,0.272727,0.157895,0.241379,0.629630,0.847059,0.658314,0.676113,0.522569,0.316832,0.375267,0.287846,0.294444,0.4576,0.261682,0.195205,0.243697,0.0,0.724672,0.680572,0.421053,0.629630,1.000000,0.454545,0.421053,1.000000,0.363636,0.28,1.0,0.307692,0.388889,0.346154,0.318182,0.2,0.1,0.333333,0.75,...,0.596588,0.573134,0.559444,0.58848,0.311215,0.273973,0.471429,0.0,0.367580,0.487247,0.378947,0.407407,0.632374,0.236364,0.315789,0.777867,0.254545,0.296,1.0000,0.276923,0.322222,0.261538,0.254545,0.20,0.18,0.333333,0.75,0.305882,0.369444,0.4576,0.4626,0.84000,0.068436,0.155098,0.269240,0.175138,0.410952,0.0488,0.0784,0.509015,0.174069,0.419588,0.490909,0.451765,0.4,0.6,2021.0,1.0,ATL,1/1/2021
115,0.0,0.600,0.537037,0.531707,0.592593,0.528302,0.654036,0.333333,0.400000,0.605491,0.40,0.500,0.500000,0.523810,0.590909,0.315789,0.310345,0.666667,0.647059,0.551253,0.582996,0.505208,0.320132,0.456290,0.782516,0.594444,0.5888,0.598131,0.541096,0.323529,0.0,0.606675,0.336114,0.315789,0.259259,1.000000,0.272727,0.315789,1.000000,0.272727,0.24,1.0,0.076923,0.388889,0.230769,0.227273,0.3,0.3,0.222222,0.50,...,0.416205,0.591898,0.506111,0.48608,0.321495,0.197260,0.486555,0.0,0.446961,0.472944,0.284211,0.303704,0.766187,0.309091,0.357895,0.511200,0.345455,0.320,0.9428,0.215385,0.255556,0.230769,0.263636,0.18,0.14,0.288889,0.50,0.243137,0.425000,0.4436,0.4400,0.90400,0.108234,0.185170,0.375875,0.223982,0.425476,0.0486,0.0954,0.656604,0.351220,0.312371,0.485714,0.428235,0.4,0.8,2021.0,0.0,UTA,1/1/2021
116,0.0,0.425,0.666667,0.280488,0.518519,0.584906,0.515651,0.230769,0.200000,0.888728,0.60,0.375,0.500000,0.428571,0.636364,0.315789,0.413793,0.666667,0.411765,0.300683,0.344130,0.508681,0.146865,0.545842,0.522388,0.430556,0.6080,0.635514,0.479452,0.411765,0.0,0.348033,0.474374,0.368421,0.296296,0.401079,0.272727,0.210526,0.556000,0.136364,0.08,1.0,0.384615,0.444444,0.346154,0.363636,0.3,0.1,0.333333,0.75,...,0.348401,0.448188,0.371667,0.44000,0.291589,0.354795,0.514286,0.0,0.418594,0.538498,0.357895,0.348148,0.604317,0.290909,0.294737,0.730133,0.218182,0.200,1.0000,0.138462,0.311111,0.238462,0.245455,0.18,0.20,0.244444,0.40,0.294118,0.397222,0.4706,0.4056,0.84448,0.078058,0.313491,0.239083,0.184598,0.270714,0.0352,0.1082,0.498532,0.244159,0.365979,0.568831,0.475294,0.8,0.4,2021.0,1.0,CHI,1/1/2021
118,0.0,0.575,0.296296,0.692683,0.444444,0.301887,0.759473,0.307692,0.355556,0.638728,0.24,0.600,0.500000,0.404762,0.181818,0.157895,0.551724,0.629630,0.564706,0.676538,0.698381,0.361111,0.339934,0.339019,0.724947,0.630556,0.4368,0.186916,0.164384,0.621849,0.0,0.551847,0.383790,0.421053,0.481481,1.000000,0.454545,0.157895,1.000000,0.227273,0.24,1.0,0.153846,0.222222,0.192308,0.272727,0.1,0.2,0.444444,0.25,...,0.347548,0.579957,0.386667,0.48736,0.468224,0.271233,0.407563,0.0,0.379976,0.513945,0.368421,0.422222,0.633453,0.272727,0.284211,0.725333,0.209091,0.184,1.0000,0.200000,0.200000,0.146154,0.245455,0.26,0.16,0.422222,0.70,0.313725,0.341667,0.4058,0.4500,0.77600,0.085365,0.173635,0.238601,0.146975,0.419524,0.0630,0.0722,0.261635,0.288062,0.412371,0.545455,0.421176,0.2,0.6,2021.0,0.0,SAS,1/1/2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5185,0.0,0.250,0.259259,0.331707,0.259259,0.509434,0.273476,0.256410,0.333333,0.543353,0.16,0.425,0.304348,0.285714,0.136364,0.315789,0.310345,0.259259,0.176471,0.293850,0.340081,0.614583,0.330033,0.200426,0.377399,0.258333,0.5376,0.158879,0.308219,0.386555,0.0,0.274136,0.380215,0.263158,0.296296,0.640288,0.181818,0.157895,0.333333,0.090909,0.16,1.0,0.153846,0.333333,0.307692,0.181818,0.1,0.4,0.333333,0.25,...,0.611514,0.706183,0.697222,0.47456,0.446729,0.332877,0.441176,0.0,0.522527,0.335399,0.378947,0.355556,0.685252,0.363636,0.368421,0.623200,0.272727,0.232,1.0000,0.323077,0.377778,0.315385,0.309091,0.32,0.26,0.400000,0.55,0.290196,0.541667,0.4128,0.4126,0.59968,0.062596,0.267559,0.354162,0.192739,0.249524,0.0728,0.0894,0.374633,0.082670,0.380412,0.348052,0.432941,0.4,0.0,2023.0,,,
5186,0.0,0.450,0.333333,0.514634,0.259259,0.226415,0.546952,0.461538,0.511111,0.679191,0.52,0.475,0.543478,0.404762,0.454545,0.421053,0.413793,0.259259,0.458824,0.489749,0.487854,0.265625,0.466997,0.742004,0.605544,0.772222,0.5440,0.495327,0.496575,0.457983,0.0,0.518474,0.365912,0.368421,0.407407,0.228417,0.181818,0.157895,0.666667,0.409091,0.36,1.0,0.230769,0.388889,0.346154,0.318182,0.3,0.3,0.444444,0.25,...,0.491258,0.480597,0.455000,0.38368,0.303738,0.278082,0.470588,0.0,0.471037,0.525626,0.557895,0.503704,0.807194,0.254545,0.294737,0.582133,0.263636,0.256,0.8188,0.230769,0.322222,0.269231,0.272727,0.20,0.20,0.355556,0.60,0.478431,0.361111,0.4680,0.4594,1.00000,0.041827,0.135942,0.229916,0.141034,0.216190,0.0322,0.1052,0.428302,0.178434,0.409278,0.529870,0.475294,0.4,0.6,2023.0,,,
5187,0.0,0.425,0.481481,0.385366,0.370370,0.509434,0.406919,0.128205,0.133333,0.710983,0.32,0.200,0.195652,0.261905,0.318182,0.263158,0.413793,0.555556,0.317647,0.341686,0.402834,0.508681,0.117162,0.394456,0.257996,0.227778,0.3280,0.345794,0.277397,0.457983,0.0,0.365912,0.518474,0.736842,0.703704,0.550360,0.181818,0.263158,0.666667,0.136364,0.16,1.0,0.153846,0.166667,0.192308,0.090909,0.1,0.1,0.333333,0.75,...,0.519403,0.508742,0.545000,0.54976,0.378505,0.364384,0.368908,0.0,0.525626,0.471037,0.473684,0.459259,0.647842,0.218182,0.231579,0.633333,0.345455,0.288,1.0000,0.200000,0.311111,0.284615,0.327273,0.28,0.26,0.400000,0.60,0.435294,0.463889,0.5022,0.4790,1.00000,0.111312,0.249434,0.249940,0.145655,0.243810,0.0550,0.1146,0.212159,0.084981,0.480412,0.490909,0.524706,0.6,0.4,2023.0,,,
5188,0.0,0.375,0.351852,0.414634,0.481481,0.547170,0.507414,0.128205,0.177778,0.518786,0.28,0.325,0.282609,0.333333,0.318182,0.157895,0.586207,0.296296,0.305882,0.407745,0.473684,0.607639,0.169967,0.345416,0.582090,0.380556,0.4944,0.336449,0.202055,0.655462,0.0,0.321812,0.592372,0.263158,0.296296,0.401079,0.272727,0.210526,1.000000,0.136364,0.12,1.0,0.230769,0.166667,0.230769,0.181818,0.2,0.1,0.444444,1.00,...,0.371002,0.617484,0.547222,0.54304,0.430841,0.226712,0.466387,0.0,0.602145,0.461263,0.421053,0.325926,0.928058,0.309091,0.242105,0.822400,0.209091,0.200,1.0000,0.092308,0.322222,0.223077,0.254545,0.28,0.14,0.288889,0.35,0.368627,0.538889,0.6008,0.6046,0.89312,0.075095,0.254789,0.341375,0.241144,0.269048,0.0616,0.0742,0.398113,0.198716,0.411340,0.488312,0.625882,0.4,0.2,2023.0,,,


In [44]:
df.loc[:, ['target', 'team', 'team_opp_next', 'date_next', 'home_next']].tail(20)

Unnamed: 0,target,team,team_opp_next,date_next,home_next
5170,1,MIA,MIL,4/22/2023,1.0
5171,0,MIL,MIA,4/22/2023,0.0
5172,1,PHI,BRK,4/22/2023,0.0
5173,0,BRK,PHI,4/22/2023,1.0
5174,2,SAC,,,
5175,2,GSW,,,
5176,1,PHO,LAC,4/22/2023,0.0
5177,0,LAC,PHO,4/22/2023,1.0
5178,2,BOS,,,
5179,2,ATL,,,


In [45]:
df_team_BOS = df.loc[df['team'] == 'BOS', ['target', 'team', 'team_opp_next', 'date_next', 'home_next']]
df_team_BOS

Unnamed: 0,target,team,team_opp_next,date_next,home_next
109,0,BOS,DET,1/1/2021,0.0
142,1,BOS,DET,1/3/2021,0.0
170,1,BOS,TOR,1/4/2021,0.0
198,1,BOS,MIA,1/6/2021,0.0
216,1,BOS,WAS,1/8/2021,1.0
...,...,...,...,...,...
4984,0,BOS,GSW,6/16/2022,1.0
4987,0,BOS,ATL,4/15/2023,1.0
5141,1,BOS,ATL,4/18/2023,1.0
5161,0,BOS,ATL,4/21/2023,0.0


In [46]:
df_team_ATL = df.loc[df['team'] == 'ATL', ['target', 'team', 'team_opp_next', 'date_next', 'home_next']]
df_team_ATL

Unnamed: 0,target,team,team_opp_next,date_next,home_next
134,0,ATL,CLE,1/2/2021,1.0
155,0,ATL,NYK,1/4/2021,1.0
183,0,ATL,CHO,1/6/2021,1.0
211,0,ATL,CHO,1/9/2021,0.0
262,1,ATL,PHI,1/11/2021,1.0
...,...,...,...,...,...
5098,1,ATL,MIA,4/11/2023,0.0
5130,0,ATL,BOS,4/15/2023,0.0
5140,0,ATL,BOS,4/18/2023,0.0
5160,1,ATL,BOS,4/21/2023,1.0


In [47]:
df.loc[5178,"team_opp_next"]='ATL'
df.loc[5178,"date_next"]='4/23/2023'
df.loc[5178,"home_next"]= 1.0

In [48]:
df.loc[5179,"team_opp_next"]='BOS'
df.loc[5179,"date_next"]='4/23/2023'
df.loc[5179,"home_next"]= 0.0

Lấy stat của đối thủ tiếp theo

In [49]:
full = df.merge(
    df[rolling_cols + ["team_opp_next", "date_next", "team"]], 
    left_on=["team", "date_next"], 
    right_on=["team_opp_next", "date_next"])


In [50]:
full.loc[:, ['won','target', 'team_x', 'team_opp', 'team_opp_next_x', 'date_next']].tail(20)


Unnamed: 0,won,target,team_x,team_opp,team_opp_next_x,date_next
4758,False,0,BRK,PHI,PHI,4/20/2023
4759,True,1,PHI,BRK,BRK,4/20/2023
4760,False,1,GSW,SAC,SAC,4/20/2023
4761,True,0,SAC,GSW,GSW,4/20/2023
4762,False,1,ATL,BOS,BOS,4/21/2023
4763,True,0,BOS,ATL,ATL,4/21/2023
4764,False,1,NYK,CLE,CLE,4/21/2023
4765,True,0,CLE,NYK,NYK,4/21/2023
4766,False,0,LAC,PHO,PHO,4/20/2023
4767,True,1,PHO,LAC,LAC,4/20/2023


In [51]:
full[["team_x", "team_opp_next_x", "team_y", "team_opp_next_y", "date_next"]].tail(20)

Unnamed: 0,team_x,team_opp_next_x,team_y,team_opp_next_y,date_next
4758,BRK,PHI,PHI,BRK,4/20/2023
4759,PHI,BRK,BRK,PHI,4/20/2023
4760,GSW,SAC,SAC,GSW,4/20/2023
4761,SAC,GSW,GSW,SAC,4/20/2023
4762,ATL,BOS,BOS,ATL,4/21/2023
4763,BOS,ATL,ATL,BOS,4/21/2023
4764,NYK,CLE,CLE,NYK,4/21/2023
4765,CLE,NYK,NYK,CLE,4/21/2023
4766,LAC,PHO,PHO,LAC,4/20/2023
4767,PHO,LAC,LAC,PHO,4/20/2023


In [52]:
removed_columns = list(full.columns[full.dtypes == "object"]) + removed_columns

In [53]:
removed_columns

['team_x',
 'team_opp',
 'date',
 'team_opp_next_x',
 'date_next',
 'team_opp_next_y',
 'team_y',
 'season',
 'date',
 'won',
 'target',
 'team',
 'team_opp']

In [54]:
selected_columns = full.columns[~full.columns.isin(removed_columns)]
sfs.fit(full[selected_columns], full["target"])

In [55]:
predictors = list(selected_columns[sfs.get_support()])

In [56]:
predictors

['usg%',
 'stl%_max',
 'usg%_opp',
 'fg%_max_opp',
 'ft%_max_opp',
 'mp_5_x',
 'fg_5_x',
 'drb%_5_x',
 'usg%_5_x',
 '+/-_max_5_x',
 'mp_opp_5_x',
 '3p_opp_5_x',
 'orb%_opp_5_x',
 'usg%_opp_5_x',
 'stl_max_opp_5_x',
 'trb%_max_opp_5_x',
 'ortg_max_opp_5_x',
 'won_5_x',
 'home_next',
 'ft_5_y',
 'ast_5_y',
 'usg%_5_y',
 '+/-_max_5_y',
 'stl%_max_5_y',
 'blk%_max_5_y',
 '3p%_opp_5_y',
 'blk_opp_5_y',
 'usg%_opp_5_y',
 '3p%_max_opp_5_y',
 'efg%_max_opp_5_y']

In [57]:
predictions = backtest(full, rr, predictors)

In [58]:
predictions.tail(20)

Unnamed: 0,actual,prediction
4758,0,0
4759,1,0
4760,1,1
4761,0,0
4762,1,0
4763,0,0
4764,1,1
4765,0,0
4766,0,1
4767,1,0


In [59]:
accuracy_score(predictions["actual"], predictions["prediction"])

0.5614035087719298

In [60]:
result_df = pd.concat([full, predictions], axis=1)

result_df.loc[:, ['team_x', 'team_opp_next_x', 'date_next', 'season', 'prediction', 'actual']].tail(20)

Unnamed: 0,team_x,team_opp_next_x,date_next,season,prediction,actual
4758,BRK,PHI,4/20/2023,2023,0.0,0.0
4759,PHI,BRK,4/20/2023,2023,0.0,1.0
4760,GSW,SAC,4/20/2023,2023,1.0,1.0
4761,SAC,GSW,4/20/2023,2023,0.0,0.0
4762,ATL,BOS,4/21/2023,2023,0.0,1.0
4763,BOS,ATL,4/21/2023,2023,0.0,0.0
4764,NYK,CLE,4/21/2023,2023,1.0,1.0
4765,CLE,NYK,4/21/2023,2023,0.0,0.0
4766,LAC,PHO,4/20/2023,2023,1.0,0.0
4767,PHO,LAC,4/20/2023,2023,0.0,1.0
