In [1]:
import pandas as pd
from sklearn.svm import OneClassSVM
import matplotlib.pyplot as plt
import numpy as np
from numpy import where
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import  median_absolute_error, explained_variance_score, max_error, r2_score, mean_squared_error,mean_absolute_error

# Dataset Preparation

## Player data in the regular season

Loading the data

In [2]:
data = pd.read_csv('databasebasketball/player_data_reg_season.csv', sep=';')
df = pd.DataFrame(data[['firstname', 'lastname', 'gp', 'minutes', 'pts', 'reb', 'asts', 'stl', 'blk', 'turnover', 'fga', 'fgm', 'fta', 'ftm', 'tpa', 'tpm']])
df.head()


Unnamed: 0,firstname,lastname,gp,minutes,pts,reb,asts,stl,blk,turnover,fga,fgm,fta,ftm,tpa,tpm
0,Alaa,Abdelnaby,256,3200,1465,846,85,71,69,247,1236,620,321,225,3,0
1,Kareem,Abdul-jabbar,1560,57446,38387,17440,5660,1160,3189,2527,28307,15837,9304,6712,18,1
2,Mahmo,Abdul-rauf,586,15633,8553,1087,2079,487,46,963,7943,3514,1161,1051,1339,474
3,Tariq,Abdul-wahad,236,4808,1830,776,266,184,82,309,1726,720,529,372,76,18
4,Shareef,Abdur-rahim,672,24862,13338,5474,1847,718,556,1911,10215,4789,4427,3614,477,146


removing players who didnt play at least the average amount of minutes in the regular season

In [3]:
avg_minutes = np.average(df['minutes'])

for idx in df.index:
    if df['minutes'][idx] < avg_minutes:
        df.drop(idx, inplace=True)

df.head()

Unnamed: 0,firstname,lastname,gp,minutes,pts,reb,asts,stl,blk,turnover,fga,fgm,fta,ftm,tpa,tpm
1,Kareem,Abdul-jabbar,1560,57446,38387,17440,5660,1160,3189,2527,28307,15837,9304,6712,18,1
2,Mahmo,Abdul-rauf,586,15633,8553,1087,2079,487,46,963,7943,3514,1161,1051,1339,474
4,Shareef,Abdur-rahim,672,24862,13338,5474,1847,718,556,1911,10215,4789,4427,3614,477,146
9,Mark,Acres,375,5982,1343,1525,180,137,104,235,1016,514,463,308,13,7
11,Alvan,Adams,988,27203,13910,6937,4012,1289,808,2194,11464,5709,3160,2490,15,2


drop rows with missing values

In [4]:
df.dropna(axis=0, inplace=True)

Creating the desired attributes using exsiting attributes

In [5]:
fg_percent = [] # field goal percentage -> field goals made / field goals attempted
ft_percent = [] # free throw percentage -> free throws made / free throws attempted
p3_percent = [] # three pointer percentage -> three pointers made / three pointers attempted
astpg = [] # assists per game -> assists / games played
blkpg = [] # blocks per game -> blocks / games played
ptspg = [] # points per game -> total points / games played
stlpg = [] # steals per game -> total steals / games played
topg = [] # turnovers per game -> total turnovers / games played

for idx in df.index:
    fg_percent.append(df['fgm'][idx]/df['fga'][idx])
    ft_percent.append(df['ftm'][idx]/df['fta'][idx])
    p3_percent.append(df['tpm'][idx]/df['tpa'][idx])
    astpg.append(df['asts'][idx]/df['gp'][idx])
    blkpg.append(df['blk'][idx]/df['gp'][idx])
    ptspg.append(df['pts'][idx]/df['gp'][idx])
    stlpg.append(df['stl'][idx]/df['gp'][idx])
    topg.append(df['turnover'][idx]/df['gp'][idx])

df['FG%'] = fg_percent
df['FT%'] = ft_percent
df['3P%'] = p3_percent
df['ASTPG'] = astpg
df['BLKPG'] = blkpg
df['PTSPG'] = ptspg
df['STLPG'] = stlpg
df['TOPG'] = topg

df.head()

  del sys.path[0]


Unnamed: 0,firstname,lastname,gp,minutes,pts,reb,asts,stl,blk,turnover,...,tpa,tpm,FG%,FT%,3P%,ASTPG,BLKPG,PTSPG,STLPG,TOPG
1,Kareem,Abdul-jabbar,1560,57446,38387,17440,5660,1160,3189,2527,...,18,1,0.559473,0.72141,0.055556,3.628205,2.044231,24.607051,0.74359,1.619872
2,Mahmo,Abdul-rauf,586,15633,8553,1087,2079,487,46,963,...,1339,474,0.442402,0.905254,0.353996,3.547782,0.078498,14.595563,0.831058,1.643345
4,Shareef,Abdur-rahim,672,24862,13338,5474,1847,718,556,1911,...,477,146,0.46882,0.816354,0.30608,2.748512,0.827381,19.848214,1.068452,2.84375
9,Mark,Acres,375,5982,1343,1525,180,137,104,235,...,13,7,0.505906,0.665227,0.538462,0.48,0.277333,3.581333,0.365333,0.626667
11,Alvan,Adams,988,27203,13910,6937,4012,1289,808,2194,...,15,2,0.497994,0.787975,0.133333,4.060729,0.817814,14.078947,1.304656,2.220648


creating a new dataframe to serve as input to the svm

In [6]:
df_input = pd.DataFrame(df[['FG%','FT%', '3P%', 'ASTPG','BLKPG', 'PTSPG', 'STLPG', 'TOPG']])
df_input.head()

Unnamed: 0,FG%,FT%,3P%,ASTPG,BLKPG,PTSPG,STLPG,TOPG
1,0.559473,0.72141,0.055556,3.628205,2.044231,24.607051,0.74359,1.619872
2,0.442402,0.905254,0.353996,3.547782,0.078498,14.595563,0.831058,1.643345
4,0.46882,0.816354,0.30608,2.748512,0.827381,19.848214,1.068452,2.84375
9,0.505906,0.665227,0.538462,0.48,0.277333,3.581333,0.365333,0.626667
11,0.497994,0.787975,0.133333,4.060729,0.817814,14.078947,1.304656,2.220648


Removing any rows with a "NaN" value, which will come from division involving 0.

In [7]:
df_input.dropna(inplace=True)
np.any(np.isnan(df_input))
df_input.head()

Unnamed: 0,FG%,FT%,3P%,ASTPG,BLKPG,PTSPG,STLPG,TOPG
1,0.559473,0.72141,0.055556,3.628205,2.044231,24.607051,0.74359,1.619872
2,0.442402,0.905254,0.353996,3.547782,0.078498,14.595563,0.831058,1.643345
4,0.46882,0.816354,0.30608,2.748512,0.827381,19.848214,1.068452,2.84375
9,0.505906,0.665227,0.538462,0.48,0.277333,3.581333,0.365333,0.626667
11,0.497994,0.787975,0.133333,4.060729,0.817814,14.078947,1.304656,2.220648


## Player data in playoffs

loading dataframe

In [8]:
data = pd.read_csv('databasebasketball/player_playoffs.txt', sep=',')
df = pd.DataFrame(data[['firstname', 'lastname', 'gp', 'minutes', 'pts', 'reb', 'asts', 'stl', 'blk', 'turnover', 'fga', 'fgm', 'fta', 'ftm', 'tpa', 'tpm']])
df.head()

Unnamed: 0,firstname,lastname,gp,minutes,pts,reb,asts,stl,blk,turnover,fga,fgm,fta,ftm,tpa,tpm
0,Paul,Armstrong,3,0,9,0,6,0,0,0,22,4,4,1,0,0
1,Cliff,Barker,6,0,34,0,13,0,0,0,31,12,15,10,0,0
2,Leo,Barnhorst,2,0,22,0,4,0,0,0,25,8,6,6,0,0
3,Ralph,Beard,5,0,66,0,22,0,0,0,70,22,28,22,0,0
4,Charlie,Black,8,0,57,0,17,0,0,0,61,18,29,21,0,0


dropping rows with incomplete data

In [9]:
df.dropna(axis=0, inplace=True)

Creating the desired values

In [10]:
fg_percent = [] # field goal percentage -> field goals made / field goals attempted
ft_percent = [] # free throw percentage -> free throws made / free throws attempted
p3_percent = [] # three pointer percentage -> three pointers made / three pointers attempted
astpg = [] # assists per game -> assists / games played
blkpg = [] # blocks per game -> blocks / games played
ptspg = [] # points per game -> total points / games played
stlpg = [] # steals per game -> total steals / games played
topg = [] # turnovers per game -> total turnovers / games played

for idx in df.index:
    fg_percent.append(df['fgm'][idx]/df['fga'][idx])
    ft_percent.append(df['ftm'][idx]/df['fta'][idx])
    p3_percent.append(df['tpm'][idx]/df['tpa'][idx])
    astpg.append(df['asts'][idx]/df['gp'][idx])
    blkpg.append(df['blk'][idx]/df['gp'][idx])
    ptspg.append(df['pts'][idx]/df['gp'][idx])
    stlpg.append(df['stl'][idx]/df['gp'][idx])
    topg.append(df['turnover'][idx]/df['gp'][idx])

df['FG%'] = fg_percent
df['FT%'] = ft_percent
df['3P%'] = p3_percent
df['ASTPG'] = astpg
df['BLKPG'] = blkpg
df['PTSPG'] = ptspg
df['STLPG'] = stlpg
df['TOPG'] = topg

df.head(len(df))

  del sys.path[0]
  if sys.path[0] == '':
  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':
  del sys.path[0]


Unnamed: 0,firstname,lastname,gp,minutes,pts,reb,asts,stl,blk,turnover,...,tpa,tpm,FG%,FT%,3P%,ASTPG,BLKPG,PTSPG,STLPG,TOPG
0,Paul,Armstrong,3,0,9,0,6,0,0,0,...,0,0,0.181818,0.250000,,2.000000,0.000000,3.000000,0.000000,0.000000
1,Cliff,Barker,6,0,34,0,13,0,0,0,...,0,0,0.387097,0.666667,,2.166667,0.000000,5.666667,0.000000,0.000000
2,Leo,Barnhorst,2,0,22,0,4,0,0,0,...,0,0,0.320000,1.000000,,2.000000,0.000000,11.000000,0.000000,0.000000
3,Ralph,Beard,5,0,66,0,22,0,0,0,...,0,0,0.314286,0.785714,,4.400000,0.000000,13.200000,0.000000,0.000000
4,Charlie,Black,8,0,57,0,17,0,0,0,...,0,0,0.295082,0.724138,,2.125000,0.000000,7.125000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7538,Delonte,West,7,115,29,9,4,7,1,2,...,11,5,0.523810,0.500000,0.454545,0.571429,0.142857,4.142857,1.000000,0.285714
7539,Damien,Wilkins,7,136,39,18,3,10,1,8,...,11,3,0.444444,0.444444,0.272727,0.428571,0.142857,5.571429,1.428571,1.142857
7540,Corliss,Williamson,5,40,26,6,3,1,2,4,...,1,0,0.375000,0.777778,0.000000,0.600000,0.400000,5.200000,0.200000,0.800000
7541,Jason,Williams,4,114,68,9,21,6,0,8,...,21,10,0.528302,1.000000,0.476190,5.250000,0.000000,17.000000,1.500000,2.000000


creating the input for the svm

In [11]:
df_input = pd.DataFrame(df[['FG%','FT%', '3P%', 'ASTPG','BLKPG', 'PTSPG', 'STLPG', 'TOPG']])
df_input.head()

Unnamed: 0,FG%,FT%,3P%,ASTPG,BLKPG,PTSPG,STLPG,TOPG
0,0.181818,0.25,,2.0,0.0,3.0,0.0,0.0
1,0.387097,0.666667,,2.166667,0.0,5.666667,0.0,0.0
2,0.32,1.0,,2.0,0.0,11.0,0.0,0.0
3,0.314286,0.785714,,4.4,0.0,13.2,0.0,0.0
4,0.295082,0.724138,,2.125,0.0,7.125,0.0,0.0


Removing incomplete data

In [12]:
df_input.dropna(inplace=True)
np.any(np.isnan(df_input))
df_input.head()

Unnamed: 0,FG%,FT%,3P%,ASTPG,BLKPG,PTSPG,STLPG,TOPG
1538,0.46,0.668317,0.176471,2.875,0.0,28.75,0.0,4.9375
1683,0.302083,0.806452,0.107143,3.0,0.0,7.818182,0.0,1.909091
2957,0.505618,0.880952,0.5,7.888889,0.0,14.222222,1.111111,4.222222
2960,0.363636,1.0,0.25,2.5,0.166667,5.5,0.166667,1.666667
2962,0.525424,0.785714,0.285714,4.0,0.333333,25.0,1.666667,5.0


# SVM models

## Player outlier detection in the regular season

creating and training an svm model with 5% of the data being outliers

In [None]:
model = OneClassSVM(kernel = 'rbf', gamma='auto', nu=0.05).fit(df_input)

ValueError: ignored

model specification
model = OneClassSVM(kernel = 'rbf', gamma = 0.001, nu = 0.03).fit(df)

In [None]:
y_pred = model.predict(df_input)

Displaying the final output of the model

In [None]:
outlier_index = np.where(y_pred == -1)
outlier_values = df_input.iloc[outlier_index]
outlier_values = outlier_values.join(df['firstname'])
outlier_values = outlier_values.join(df['lastname'])

final_output = outlier_values[['firstname', 'lastname', 'FG%','FT%', '3P%', 'ASTPG','BLKPG', 'PTSPG', 'STLPG', 'TOPG']]
final_output

## Player outlier detection in the playoffs

In [None]:
model = OneClassSVM(kernel = 'rbf', gamma='auto', nu=0.05).fit(df_input)

In [None]:
y_pred = model.predict(df_input)
outlier_index = np.where(y_pred == -1)
outlier_values = df_input.iloc[outlier_index]
outlier_values = outlier_values.join(df['firstname'])
outlier_values = outlier_values.join(df['lastname'])

final_output = outlier_values[['firstname', 'lastname', 'FG%','FT%', '3P%', 'ASTPG','BLKPG', 'PTSPG', 'STLPG', 'TOPG']]


In [None]:
final_output

#Predicting an outcome between two teams

In [13]:
#need to use the win rate to predict 
# Unsupervised or use win rate as labels and use supervised??

In [14]:
#Reading teamseason.txt file into dataframe
season_df= pd.read_csv('/content/databasebasketball/team_season.txt', header=0)
print(season_df)

     team  year leag  o_fgm  o_fga  o_ftm  o_fta  o_oreb  o_dreb  o_reb  ...  \
0     BOS  1946    N   1397   5133    811   1375       0       0      0  ...   
1     CH1  1946    N   1879   6309    939   1550       0       0      0  ...   
2     CL1  1946    N   1674   5699    903   1428       0       0      0  ...   
3     DE1  1946    N   1437   5843    923   1494       0       0      0  ...   
4     NYK  1946    N   1465   5255    951   1438       0       0      0  ...   
...   ...   ...  ...    ...    ...    ...    ...     ...     ...    ...  ...   
1182  SAS  2004    N   2923   6450   1535   2120     987    2489   3476  ...   
1183  SEA  2004    N   2882   6498   1683   2131    1041    2311   3352  ...   
1184  TOR  2004    N   2952   6656   1626   2101     844    2444   3288  ...   
1185  UTA  2004    N   2828   6301   1719   2272    1047    2243   3290  ...   
1186  WAS  2004    N   2966   6794   1795   2476    1133    2374   3507  ...   

      d_pf  d_stl  d_to  d_blk  d_3pm  

In [15]:
#work with teams above 1999 since there are no 3pa stats for offensive (o) and defensive (d) before that
season_df.drop(season_df[season_df.year<1999].index, inplace=True)
season_df.reset_index(inplace=True, drop=True)

In [16]:
#function to get the winrate of a team_season entry
#win rate = wins/no.matches
#matches= wins+losses

def winRate(seasonEntry):
  return seasonEntry["won"] / (seasonEntry["won"] + seasonEntry["lost"])

label=pd.DataFrame(columns=["WinRate"])
label["WinRate"]= season_df.apply(winRate,axis=1)

In [17]:
print(label)

      WinRate
0    0.341463
1    0.426829
2    0.597561
3    0.207317
4    0.390244
..        ...
170  0.719512
171  0.634146
172  0.402439
173  0.317073
174  0.548780

[175 rows x 1 columns]


In [18]:
#DataFrame with features only
feature_df= season_df.drop(["team", "year", "leag", "won", "lost"], axis=1)
print(feature_df)

     o_fgm  o_fga  o_ftm  o_fta  o_oreb  o_dreb  o_reb  o_asts  o_pf  o_stl  \
0     3000   6807   1477   1987    1146    2570   3716    1548  1718    500   
1     3054   6879   1621   2175    1108    2420   3528    1741  2223    794   
2     2935   6533   1863   2458     884    2635   3519    2023  1670    732   
3     2565   6180   1482   2089    1032    2323   3356    1636  1908    646   
4     2977   6734   1653   2205    1010    2499   3510    1940  2219    713   
..     ...    ...    ...    ...     ...     ...    ...     ...   ...    ...   
170   2923   6450   1535   2120     987    2489   3476    1771  1716    613   
171   2882   6498   1683   2131    1041    2311   3352    1487  1943    553   
172   2952   6656   1626   2101     844    2444   3288    1670  1875    621   
173   2828   6301   1719   2272    1047    2243   3290    1826  2189    541   
174   2966   6794   1795   2476    1133    2374   3507    1563  1806    716   

     ...  d_reb  d_asts  d_pf  d_stl  d_to  d_blk  

Splitting training and testing data

In [19]:
#splitting with a 90% training size and 10% testing
fTrain, fTest, lblTrain, lblTest = train_test_split(feature_df, label, train_size=0.90, random_state=1)

In [20]:
#normalize the data
scaler=StandardScaler()

scaler.fit(fTrain)
norm_train=pd.DataFrame(scaler.transform(fTrain), columns=fTrain.columns)
norm_test= pd.DataFrame(scaler.transform(fTest), columns=fTest.columns)

MLP Model

In [24]:
MLP_model = MLPRegressor(hidden_layer_sizes=(100, 100, 100), solver="lbfgs", activation="tanh", random_state=1,max_iter=1000)

In [25]:
MLP_model.fit(norm_train,np.ravel(lblTrain))
lblPredictions= MLP_model.predict(norm_test)


SVR Model

In [26]:
SVR_model= SVR()

In [27]:
SVR_model.fit(norm_train,np.ravel(lblTrain))
lblPredictions_svr= SVR_model.predict(norm_test)

Comparing the Models

In [28]:
#using regression metrics from sklearn

print("MLP Model Evalutation")
print()
print("---------------------")
print("Explained Variance:", explained_variance_score(lblTest,lblPredictions))
print("Max Error:",max_error(lblTest, lblPredictions))
print("Mean Absolute Error:",mean_absolute_error(lblTest, lblPredictions))
print("Mean Squared Error:",mean_squared_error(lblTest, lblPredictions))
print("Median Absolute Error:",median_absolute_error(lblTest, lblPredictions))
print("r2 score:",r2_score(lblTest, lblPredictions))
print()
print("---------------------")
print()
print("SVR Model Evalutation")
print()
print("---------------------")
print("Explained Variance:", explained_variance_score(lblTest,lblPredictions_svr))
print("Max Error:",max_error(lblTest, lblPredictions_svr))
print("Mean Absolute Error:",mean_absolute_error(lblTest, lblPredictions_svr))
print("Mean Squared Error:",mean_squared_error(lblTest, lblPredictions_svr))
print("Median Absolute Error:",median_absolute_error(lblTest, lblPredictions_svr))
print("r2 score:",r2_score(lblTest, lblPredictions_svr))
print("---------------------")
print()

MLP Model Evalutation

---------------------
Explained Variance: 0.6267929858961028
Max Error: 0.2189322765986984
Mean Absolute Error: 0.0816470695701228
Mean Squared Error: 0.011005691163470109
Median Absolute Error: 0.058325269340641334
r2 score: 0.34718104355946255

---------------------

SVR Model Evalutation

---------------------
Explained Variance: 0.6027669731408921
Max Error: 0.19214048030605185
Mean Absolute Error: 0.06530594618002676
Mean Squared Error: 0.006886938727982405
Median Absolute Error: 0.05322470796603945
r2 score: 0.5914909761965531
---------------------



Predicting the Outcome of 2 teams given the feature vectors

In [77]:
#get 2 random teams from the feature vector dataframe
#Each row represents a team
team_Features= feature_df.sample(n=2)
print(team_Features)

     o_fgm  o_fga  o_ftm  o_fta  o_oreb  o_dreb  o_reb  o_asts  o_pf  o_stl  \
87    2859   6434   1594   2011     937    2558   3495    1679  1783    611   
133   2772   6605   1454   1937    1091    2418   3509    1716  1716    708   

     ...  d_reb  d_asts  d_pf  d_stl  d_to  d_blk  d_3pm  d_3pa  d_pts  \
87   ...   3492    1771  1668    738  1020    399    446   1243   8006   
133  ...   3449    1790  1734    613  1204    429    444   1301   7537   

          pace  
87   93.235313  
133  90.800003  

[2 rows x 31 columns]


In [78]:
#predict the win probability

#SVR keeps predicting 0.5 for each win probability

#We have decided to choose the MLPRegressor as the final model

predictions= MLP_model.predict(team_Features)

#winProb for team A
winProb_A= predictions[0]
print(winProb_A)
#winProb for team B
winProb_B= predictions[1]
print(winProb_B)

0.6870928923158964
0.6001766911342991


In [79]:
#the win probability is only relative to the team so we normalize the probabilities
norm_winProb_A = winProb_A/(winProb_B+ winProb_B )
norm_winProb_B = winProb_B/(winProb_B+ winProb_B )
print("Probability of A beating B: ",norm_winProb_A )
print("Probability of B beating A: ",norm_winProb_B )

Probability of A beating B:  0.5724088443166052
Probability of B beating A:  0.5


In [80]:
if (norm_winProb_A>norm_winProb_B):
  print("Winner is Team A")
elif (norm_winProb_B>norm_winProb_A):
  print("Winner is Team B")
else:
  print("Draw")

Winner is Team A
