In [210]:
import pandas as pd

In [211]:
# Source: https://www.basketball-reference.com/wnba/years/2023_games.html
# Load the exported game list and keep only the middle columns:
# the slice 1:-2 drops the first column and the last TWO columns.
schedule = pd.read_csv("reg_season.csv").iloc[:, 1:-2]
schedule.head()

Unnamed: 0,Visitor/Neutral,PTS,Home/Neutral,PTS.1
0,Connecticut Sun,70,Indiana Fever,61
1,Phoenix Mercury,71,Los Angeles Sparks,94
2,Chicago Sky,77,Minnesota Lynx,66
3,New York Liberty,64,Washington Mystics,80
4,Atlanta Dream,78,Dallas Wings,85


In [212]:
# Source: https://www.basketball-reference.com/wnba/years/2023.html
# Order matters here: columns that are entirely NaN are discarded first,
# and only then are the first and last of the remaining columns removed.
advanced_stats = (
    pd.read_csv("advanced_stats.csv")
    .dropna(axis=1, how='all')
    .iloc[:, 1:-1]
)
advanced_stats.head()


Unnamed: 0,Team,Age,W,L,PW,PL,MOV,SOS,SRS,ORtg,...,3PAr,TS%,eFG%,TOV%,ORB%,FT/FGA,eFG%.1,TOV%.1,DRB%,FT/FGA.1
0,Las Vegas Aces,29.0,34.0,6.0,35,5,12.55,-1.09,11.46,114.8,...,0.36,0.597,0.553,12.5,18.9,0.236,0.48,14.4,78.2,0.171
1,New York Liberty,28.8,32.0,8.0,32,8,8.68,-0.53,8.14,111.8,...,0.424,0.577,0.539,14.9,25.8,0.198,0.476,13.1,78.5,0.184
2,Connecticut Sun,29.1,27.0,13.0,26,14,3.7,-0.27,3.43,105.4,...,0.294,0.54,0.498,14.0,23.9,0.224,0.486,16.7,75.8,0.236
3,Dallas Wings,26.0,22.0,18.0,25,15,2.95,-0.31,2.64,109.5,...,0.292,0.535,0.489,13.7,32.8,0.224,0.504,15.0,77.2,0.237
4,Washington Mystics,28.8,19.0,21.0,19,21,-0.35,0.03,-0.32,101.5,...,0.342,0.533,0.486,13.9,18.7,0.222,0.496,16.3,78.0,0.238


In [213]:
# Attach the per-team advanced stats to every game twice: once for the
# visiting team and once for the home team.
#
# The first merge has no overlapping column names, so the visitor's stats keep
# their plain names. In the second merge they collide with the home team's
# stats, so pandas applies its default suffixes: `_x` = visitor, `_y` = home.
#
# validate="many_to_one" raises pandas.errors.MergeError if a team name occurs
# more than once in `advanced_stats`, which would otherwise duplicate games.
df = pd.merge(
    schedule,
    advanced_stats,
    left_on="Visitor/Neutral",
    right_on="Team",
    validate="many_to_one",
)
df = pd.merge(
    df,
    advanced_stats,
    left_on="Home/Neutral",
    right_on="Team",
    validate="many_to_one",
)
# The join keys duplicate Visitor/Neutral and Home/Neutral, so drop them.
df = df.drop(['Team_x', 'Team_y'], axis=1)

# Both merges are inner joins: a team name that is missing or spelled
# differently in `advanced_stats` would silently remove its games.
assert len(df) == len(schedule), (
    f"{len(schedule) - len(df)} game(s) lost in the merge; "
    "check that every team name in `schedule` appears in advanced_stats['Team']"
)
df.head()

Unnamed: 0,Visitor/Neutral,PTS,Home/Neutral,PTS.1,Age_x,W_x,L_x,PW_x,PL_x,MOV_x,...,3PAr_y,TS%_y,eFG%_y,TOV%_y,ORB%_y,FT/FGA_y,eFG%.1_y,TOV%.1_y,DRB%_y,FT/FGA.1_y
0,Connecticut Sun,70,Indiana Fever,61,29.1,27.0,13.0,26,14,3.7,...,0.29,0.532,0.491,15.6,26.4,0.204,0.509,13.9,76.8,0.248
1,Connecticut Sun,88,Indiana Fever,72,29.1,27.0,13.0,26,14,3.7,...,0.29,0.532,0.491,15.6,26.4,0.204,0.509,13.9,76.8,0.248
2,Phoenix Mercury,85,Indiana Fever,82,27.8,9.0,31.0,8,32,-8.3,...,0.29,0.532,0.491,15.6,26.4,0.204,0.509,13.9,76.8,0.248
3,Phoenix Mercury,71,Indiana Fever,72,27.8,9.0,31.0,8,32,-8.3,...,0.29,0.532,0.491,15.6,26.4,0.204,0.509,13.9,76.8,0.248
4,Chicago Sky,89,Indiana Fever,87,27.2,18.0,22.0,17,23,-1.63,...,0.29,0.532,0.491,15.6,26.4,0.204,0.509,13.9,76.8,0.248


In [214]:
# Label every game in one vectorised step instead of looping over rows.
# Going by the column order of the schedule, `PTS` follows the visitor column
# and `PTS.1` follows the home column.
#   Home_Winner = 0.0 when PTS > PTS.1, otherwise 1.0
# The comparison is negated (rather than using <=) so that any row where the
# "greater than" test is False -- including ties or missing scores -- is
# labelled 1.0, exactly as the original row-by-row if/else did.
# The column is kept as float to match the 0.0 / 1.0 values used downstream.
df['Home_Winner'] = (~df['PTS'].gt(df['PTS.1'])).astype(float)

df.head()

Unnamed: 0,Visitor/Neutral,PTS,Home/Neutral,PTS.1,Age_x,W_x,L_x,PW_x,PL_x,MOV_x,...,TS%_y,eFG%_y,TOV%_y,ORB%_y,FT/FGA_y,eFG%.1_y,TOV%.1_y,DRB%_y,FT/FGA.1_y,Home_Winner
0,Connecticut Sun,70,Indiana Fever,61,29.1,27.0,13.0,26,14,3.7,...,0.532,0.491,15.6,26.4,0.204,0.509,13.9,76.8,0.248,0.0
1,Connecticut Sun,88,Indiana Fever,72,29.1,27.0,13.0,26,14,3.7,...,0.532,0.491,15.6,26.4,0.204,0.509,13.9,76.8,0.248,0.0
2,Phoenix Mercury,85,Indiana Fever,82,27.8,9.0,31.0,8,32,-8.3,...,0.532,0.491,15.6,26.4,0.204,0.509,13.9,76.8,0.248,0.0
3,Phoenix Mercury,71,Indiana Fever,72,27.8,9.0,31.0,8,32,-8.3,...,0.532,0.491,15.6,26.4,0.204,0.509,13.9,76.8,0.248,1.0
4,Chicago Sky,89,Indiana Fever,87,27.2,18.0,22.0,17,23,-1.63,...,0.532,0.491,15.6,26.4,0.204,0.509,13.9,76.8,0.248,0.0


In [215]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier

# Estimator that the feature selector evaluates candidate feature sets with.
rr = RidgeClassifier(alpha=1.0)

# Backward selection down to 10 features; the selector is only configured
# here and is fitted in a later cell.
sfs = SequentialFeatureSelector(
    estimator=rr,
    direction='backward',
    n_features_to_select=10,
)

In [216]:
# Columns that must not be used as model inputs: team identifiers, the final
# scores, and the target label derived from those scores.
remove_cols = ["Home/Neutral", "Visitor/Neutral", "Home_Winner", "PTS", "PTS.1"]

# Every remaining column, in its original order, is a candidate feature.
is_feature = ~df.columns.isin(remove_cols)
selected_cols = df.columns[is_feature].tolist()
df[selected_cols].head()

Unnamed: 0,Age_x,W_x,L_x,PW_x,PL_x,MOV_x,SOS_x,SRS_x,ORtg_x,DRtg_x,...,3PAr_y,TS%_y,eFG%_y,TOV%_y,ORB%_y,FT/FGA_y,eFG%.1_y,TOV%.1_y,DRB%_y,FT/FGA.1_y
0,29.1,27.0,13.0,26,14,3.7,-0.27,3.43,105.4,100.7,...,0.29,0.532,0.491,15.6,26.4,0.204,0.509,13.9,76.8,0.248
1,29.1,27.0,13.0,26,14,3.7,-0.27,3.43,105.4,100.7,...,0.29,0.532,0.491,15.6,26.4,0.204,0.509,13.9,76.8,0.248
2,27.8,9.0,31.0,8,32,-8.3,0.55,-7.75,99.1,109.9,...,0.29,0.532,0.491,15.6,26.4,0.204,0.509,13.9,76.8,0.248
3,27.8,9.0,31.0,8,32,-8.3,0.55,-7.75,99.1,109.9,...,0.29,0.532,0.491,15.6,26.4,0.204,0.509,13.9,76.8,0.248
4,27.2,18.0,22.0,17,23,-1.63,0.11,-1.52,103.3,105.3,...,0.29,0.532,0.491,15.6,26.4,0.204,0.509,13.9,76.8,0.248


In [217]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every candidate feature with a MinMaxScaler and write the result
# back into `df`, overwriting the unscaled values.
# NOTE(review): the scaler is fitted on all rows, before any train/test split.
scalar = MinMaxScaler()
scaled_features = scalar.fit_transform(df[selected_cols])
df[selected_cols] = scaled_features
df.head()

Unnamed: 0,Visitor/Neutral,PTS,Home/Neutral,PTS.1,Age_x,W_x,L_x,PW_x,PL_x,MOV_x,...,TS%_y,eFG%_y,TOV%_y,ORB%_y,FT/FGA_y,eFG%.1_y,TOV%.1_y,DRB%_y,FT/FGA.1_y,Home_Winner
0,Connecticut Sun,70,Indiana Fever,61,1.0,0.72,0.28,0.666667,0.333333,0.57554,...,0.177215,0.225,0.659574,0.546099,0.392405,0.891892,0.216216,0.690909,0.9625,0.0
1,Connecticut Sun,88,Indiana Fever,72,1.0,0.72,0.28,0.666667,0.333333,0.57554,...,0.177215,0.225,0.659574,0.546099,0.392405,0.891892,0.216216,0.690909,0.9625,0.0
2,Phoenix Mercury,85,Indiana Fever,82,0.59375,0.0,1.0,0.0,1.0,0.0,...,0.177215,0.225,0.659574,0.546099,0.392405,0.891892,0.216216,0.690909,0.9625,0.0
3,Phoenix Mercury,71,Indiana Fever,72,0.59375,0.0,1.0,0.0,1.0,0.0,...,0.177215,0.225,0.659574,0.546099,0.392405,0.891892,0.216216,0.690909,0.9625,1.0
4,Chicago Sky,89,Indiana Fever,87,0.40625,0.36,0.64,0.333333,0.666667,0.319904,...,0.177215,0.225,0.659574,0.546099,0.392405,0.891892,0.216216,0.690909,0.9625,0.0


In [218]:

# Run the sequential feature selection over all candidate features, using the
# home-win label as the target.
sfs.fit(X=df[selected_cols], y=df['Home_Winner'])

In [219]:
# Keep the names of the columns the fitted selector marked as selected;
# get_support() yields one boolean per entry of `selected_cols`.
predictors = [
    col for col, kept in zip(selected_cols, sfs.get_support()) if kept
]
df[predictors].head()

Unnamed: 0,ORB%_x,eFG%.1_x,TOV%.1_x,L_y,TS%_y,eFG%_y,ORB%_y,FT/FGA_y,eFG%.1_y,TOV%.1_y
0,0.368794,0.27027,0.972973,0.84,0.177215,0.225,0.546099,0.392405,0.891892,0.216216
1,0.368794,0.27027,0.972973,0.84,0.177215,0.225,0.546099,0.392405,0.891892,0.216216
2,0.156028,0.810811,0.081081,0.84,0.177215,0.225,0.546099,0.392405,0.891892,0.216216
3,0.156028,0.810811,0.081081,0.84,0.177215,0.225,0.546099,0.392405,0.891892,0.216216
4,0.404255,0.621622,0.405405,0.84,0.177215,0.225,0.546099,0.392405,0.891892,0.216216


In [284]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def prediction(test_size=0.2, random_state=None, show_games=False):
    """Fit and score a logistic-regression home-win model on one random split.

    Reads the notebook-level ``df`` (features plus the ``Home_Winner`` label)
    and ``predictors`` (the feature column names to use).

    NOTE(review): the feature scaling and feature selection in the earlier
    cells were fitted on all rows of ``df``, so the held-out rows used here
    were already seen by those steps; the reported accuracy may be optimistic.

    Parameters
    ----------
    test_size : float or int, default 0.2
        Passed to ``train_test_split``; share (or count) of games held out.
    random_state : int or None, default None
        Passed to ``train_test_split``. ``None`` gives a different split on
        every call (the original behaviour); pass an integer to make the
        reported accuracy reproducible.
    show_games : bool, default False
        When True, also print the held-out games with the model's prediction
        and whether it was correct.

    Returns
    -------
    None
        The accuracy is printed as ``Accuracy: <value>``.
    """
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        df[predictors],
        df['Home_Winner'],
        test_size=test_size,
        random_state=random_state,
    )

    # Train a logistic regression model on the training data
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Predict the labels for the test data
    y_pred = model.predict(X_test)

    # Evaluate the accuracy of the model on the test data
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")

    if show_games:
        # X_test keeps the row labels of `df`, so they can be used to look up
        # the teams and scores of the held-out games.
        game_cols = ['Visitor/Neutral', 'PTS', 'Home/Neutral', 'PTS.1', 'Home_Winner']
        test_df = df.loc[X_test.index, game_cols].copy()
        test_df['Predicted'] = y_pred
        # determine if the prediction was correct
        test_df['Correct'] = test_df['Home_Winner'] == test_df['Predicted']
        print(test_df)

In [285]:
prediction()

Accuracy: 0.7291666666666666
        Visitor/Neutral  PTS        Home/Neutral  PTS.1  Home_Winner  \
93        Indiana Fever   73       Atlanta Dream     82          1.0   
68        Atlanta Dream   85      Minnesota Lynx     91          1.0   
20      Connecticut Sun   88  Washington Mystics     81          0.0   
36         Dallas Wings   97  Washington Mystics     84          0.0   
62      Phoenix Mercury   64      Minnesota Lynx     75          1.0   
184    New York Liberty   85     Phoenix Mercury     63          0.0   
146       Atlanta Dream   72      Las Vegas Aces     93          1.0   
107       Atlanta Dream   90  Los Angeles Sparks     79          0.0   
161     Connecticut Sun   79         Chicago Sky     73          0.0   
178       Seattle Storm   83         Chicago Sky     74          0.0   
118       Seattle Storm   85  Los Angeles Sparks     92          1.0   
64          Chicago Sky   77      Minnesota Lynx     66          0.0   
216  Los Angeles Sparks   79       