In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
#training set and prediction set(2021-2022 season)
train_df = pd.read_csv('../CSV/NBA-season-data-processed.csv')
curr_df = pd.read_csv('../CSV/NBA-2022-team-data.csv')

#split data by Conference
train_east_df = pd.read_csv('../CSV/NBA-teams-data-east.csv')
curr_east_df = pd.read_csv('../CSV/NBA-2022-team-data-east.csv')

train_west_df = pd.read_csv('../CSV/NBA-teams-data-west.csv')
curr_west_df = pd.read_csv('../CSV/NBA-2022-team-data-west.csv')

In [3]:
X = train_df.drop(['Team','Year','Conf','made_playoff'], axis = 1)
y = train_df['made_playoff']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = .30, random_state = 42)

In [5]:
logReg = LogisticRegression()
logReg.fit(X_train,y_train)

LogisticRegression()

In [6]:
print('Logistic Regression Score w/o splitting data by conference')
print(f"Training Data Score: {logReg.score(X_train, y_train)}")
print(f"Testing Data Score: {logReg.score(X_test, y_test)}")

Logistic Regression Score w/o splitting data by conference
Training Data Score: 0.9275184275184275
Testing Data Score: 0.9342857142857143


In [7]:
teams = curr_df['Team']
conf = curr_df['Conf']
curr_df = curr_df.drop(['Team','Year','Conf','made_playoff'], axis = 1)

In [8]:
predictions = logReg.predict(curr_df)
predictions_prob = logReg.predict_proba(curr_df)[:,1]

In [9]:
results = pd.DataFrame()
results['Team'] = teams
results['predictions'] = predictions
results['prediction probability'] = predictions_prob
results['Conference'] = conf
print(results.sort_values(['Conference','prediction probability'], ascending = False))

                      Team  predictions  prediction probability Conference
0    Golden State Warriors            1                0.999997       West
2             Phoenix Suns            1                0.999977       West
1                Utah Jazz            1                0.994473       West
22        Dallas Mavericks            1                0.972879       West
20    Los Angeles Clippers            1                0.927364       West
4   Portland Trail Blazers            1                0.759349       West
24          Denver Nuggets            1                0.757737       West
6        Memphis Grizzlies            1                0.583178       West
7       Los Angeles Lakers            1                0.552610       West
15  Minnesota Timberwolves            1                0.526726       West
28   Oklahoma City Thunder            0                0.031153       West
9         Sacramento Kings            0                0.027588       West
16       San Antonio Spur

In [10]:
X_east = train_east_df.drop(['Team','Year','Conf','made_playoff'], axis = 1)
y_east = train_east_df['made_playoff']

X_west = train_west_df.drop(['Team','Year','Conf','made_playoff'], axis = 1)
y_west = train_west_df['made_playoff']

In [11]:
X_train_east, X_test_east, y_train_east, y_test_east = train_test_split(X_east, y_east, test_size = .30,random_state=42)
X_train_west, X_test_west, y_train_west, y_test_west = train_test_split(X_west, y_west, test_size = .30, random_state=42)

In [12]:
logReg_east = LogisticRegression()
logReg_east.fit(X_train_east,y_train_east)

logReg_west = LogisticRegression()
logReg_west.fit(X_train_west,y_train_west)

LogisticRegression()

In [13]:
print("logistic Regression East")
print(f"Training Data Score: {logReg_east.score(X_train_east, y_train_east)}")
print(f"Testing Data Score: {logReg_east.score(X_test_east, y_test_east)}")

print("logistic Regression West")
print(f"Training Data Score: {logReg_west.score(X_train_west, y_train_west)}")
print(f"Testing Data Score: {logReg_west.score(X_test_west, y_test_west)}")

logistic Regression East
Training Data Score: 0.9370277078085643
Testing Data Score: 0.9298245614035088
logistic Regression West
Training Data Score: 0.9217171717171717
Testing Data Score: 0.9117647058823529


In [14]:
teams_east = curr_east_df['Team']
teams_west = curr_west_df['Team']
curr_east_df = curr_east_df.drop(['Team','Year','Conf','made_playoff'], axis = 1)
curr_west_df = curr_west_df.drop(['Team','Year','Conf','made_playoff'], axis = 1)

In [15]:
scaler = StandardScaler()
curr_east_df = pd.DataFrame(scaler.fit_transform(curr_east_df), columns = curr_east_df.columns)
curr_west_df = pd.DataFrame(scaler.fit_transform(curr_west_df), columns = curr_west_df.columns)

In [16]:
predictions_east = logReg_east.predict(curr_east_df)
predictions_east_prob = logReg_east.predict_proba(curr_east_df)[:,1]
predictions_west = logReg_west.predict(curr_west_df)
predictions_west_prob = logReg_west.predict_proba(curr_west_df)[:,1]

In [17]:
results_east = pd.DataFrame()
results_east['Team'] = teams_east
results_east['predictions'] = predictions_east
results_east['prediction probability'] = predictions_east_prob
print(results_east.sort_values('prediction probability', ascending = False))

                   Team  predictions  prediction probability
11        Brooklyn Nets            1                0.999768
9            Miami Heat            1                0.997013
3    Washington Wizards            1                0.994376
0         Chicago Bulls            1                0.982397
13      Milwaukee Bucks            1                0.943266
8        Boston Celtics            1                0.938563
6       New York Knicks            1                0.921020
14    Charlotte Hornets            1                0.895131
5    Philadelphia 76ers            1                0.847330
12        Atlanta Hawks            1                0.765363
2   Cleveland Cavaliers            0                0.460685
4       Toronto Raptors            0                0.308939
7        Indiana Pacers            0                0.093415
10      Detroit Pistons            0                0.000448
1         Orlando Magic            0                0.000095


In [18]:
results_west = pd.DataFrame()
results_west['Team'] = teams_west
results_west['predictions'] = predictions_west
results_west['prediction probability'] = predictions_west_prob
print(results_west.sort_values('prediction probability', ascending = False))

                      Team  predictions  prediction probability
14   Golden State Warriors            1                0.999918
5             Phoenix Suns            1                0.999730
1                Utah Jazz            1                0.977996
6         Dallas Mavericks            1                0.906323
7     Los Angeles Clippers            1                0.828306
13  Portland Trail Blazers            1                0.713329
4           Denver Nuggets            1                0.658463
8        Memphis Grizzlies            1                0.649719
10  Minnesota Timberwolves            1                0.575745
12      Los Angeles Lakers            0                0.433892
0         Sacramento Kings            0                0.051032
11   Oklahoma City Thunder            0                0.030585
9        San Antonio Spurs            0                0.017628
3     New Orleans Pelicans            0                0.001680
2          Houston Rockets            0 

In [19]:
LinSVC_east = LinearSVC(max_iter = 10000, random_state = 42, C = 1)
LinSVC_east.fit(X_train_east, y_train_east)
LinSVC_west = LinearSVC(max_iter = 10000, random_state = 42, C = 1)
LinSVC_west.fit(X_train_west,y_train_west)

LinearSVC(C=1, max_iter=10000, random_state=42)

In [20]:
print("LinSVC East")
print(f"Training Data Score: {LinSVC_east.score(X_train_east, y_train_east)}")
print(f"Testing Data Score: {LinSVC_east.score(X_test_east, y_test_east)}")

print("LinSVC West")
print(f"Training Data Score: {LinSVC_west.score(X_train_west, y_train_west)}")
print(f"Testing Data Score: {LinSVC_west.score(X_test_west, y_test_west)}")

LinSVC East
Training Data Score: 0.9445843828715366
Testing Data Score: 0.9181286549707602
LinSVC West
Training Data Score: 0.9318181818181818
Testing Data Score: 0.9176470588235294


In [21]:
coeff_df = pd.DataFrame(train_east_df.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(LinSVC_east.coef_[0])

coeff_df.sort_values(by='Correlation', ascending=False)

Unnamed: 0,Feature,Correlation
20,top_PTS,2.468343
4,3P%,0.990451
0,FGA,0.913893
9,FTA,0.605837
14,STL,0.131236
12,DRB,0.122146
18,PTS,0.067584
21,top_5_PTS,0.060264
25,top_5_stats_sum,0.05807
16,TOV,0.055294


In [22]:
coeff_df = pd.DataFrame(train_west_df.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(LinSVC_west.coef_[0])
coeff_df.sort_values(by='Correlation', ascending=False)

Unnamed: 0,Feature,Correlation
20,top_PTS,1.723019
19,W%,0.536536
6,2PA,0.516352
3,3PA,0.505658
10,FT%,0.449534
8,FT,0.176705
25,top_5_stats_sum,0.16763
11,ORB,0.153284
14,STL,0.146034
15,BLK,0.090565


In [23]:
drop_cols_east = ['DRB','top_5_PTS','top_5_stats_sum','TOV','ORB','2P','AST','Team','PF','top_5_DRB'
            ,'2PA','FT','top_5_ORB','3P','FG%','BLK','2P%','FT%','Year','Conf','made_playoff']
drop_cols_west = ['FT','top_5_stats_sum','ORB','STL','BLK','PTS','top_5_PTS','TOV',
                  'top_5_ORB','AST','top_5_DRB','2P%','PF','top_5_AST','Team','2P','DRB','FGA','FG%',
                  '3P%','FTA','3P','Year','Conf','made_playoff']

In [24]:
X_east = train_east_df.drop(drop_cols_east, axis = 1)
y_east = train_east_df['made_playoff']

X_west = train_west_df.drop(drop_cols_west, axis = 1)
y_west = train_west_df['made_playoff']

In [25]:
X_train_east, X_test_east, y_train_east, y_test_east = train_test_split(X_east, y_east, test_size = .30,random_state=42)
X_train_west, X_test_west, y_train_west, y_test_west = train_test_split(X_west, y_west,test_size = .30, random_state=42)

In [26]:
LinSVC_east = LinearSVC(max_iter = 10000, random_state = 42, C = 1)
LinSVC_east.fit(X_train_east, y_train_east)
LinSVC_west = LinearSVC(max_iter = 10000, random_state = 42, C = 1)
LinSVC_west.fit(X_train_west,y_train_west)

LinearSVC(C=1, max_iter=10000, random_state=42)

In [27]:
print("LinSVC East")
print(f"Training Data Score: {LinSVC_east.score(X_train_east, y_train_east)}")
print(f"Testing Data Score: {LinSVC_east.score(X_test_east, y_test_east)}")

print("LinSVC West")
print(f"Training Data Score: {LinSVC_west.score(X_train_west, y_train_west)}")
print(f"Testing Data Score: {LinSVC_west.score(X_test_west, y_test_west)}")

LinSVC East
Training Data Score: 0.9395465994962217
Testing Data Score: 0.9298245614035088
LinSVC West
Training Data Score: 0.9141414141414141
Testing Data Score: 0.9058823529411765


In [28]:
#exc drop columns for curr_season 
drop_cols_east = ['DRB','top_5_PTS','top_5_stats_sum','TOV','ORB','2P','AST','PF','top_5_DRB'
            ,'2PA','FT','top_5_ORB','3P','FG%','BLK','2P%','FT%']
drop_cols_west = ['FT','top_5_stats_sum','ORB','STL','BLK','PTS','top_5_PTS','TOV',
                  'top_5_ORB','AST','top_5_DRB','2P%','PF','top_5_AST','2P','DRB','FGA','FG%',
                  '3P%','FTA','3P']

In [29]:
curr_east_df = curr_east_df.drop(drop_cols_east, axis = 1)
curr_west_df = curr_west_df.drop(drop_cols_west, axis = 1)

In [30]:
predictions_east = LinSVC_east.predict(curr_east_df)
results_east = pd.DataFrame()
results_east['Team'] = teams_east
results_east['predictions'] = predictions_east
print(results_east.sort_values('predictions', ascending = False))

                   Team  predictions
0         Chicago Bulls            1
3    Washington Wizards            1
5    Philadelphia 76ers            1
6       New York Knicks            1
8        Boston Celtics            1
9            Miami Heat            1
11        Brooklyn Nets            1
12        Atlanta Hawks            1
13      Milwaukee Bucks            1
14    Charlotte Hornets            1
1         Orlando Magic            0
2   Cleveland Cavaliers            0
4       Toronto Raptors            0
7        Indiana Pacers            0
10      Detroit Pistons            0


In [31]:
predictions_west = LinSVC_west.predict(curr_west_df)
results_west = pd.DataFrame()
results_west['Team'] = teams_west
results_west['predictions'] = predictions_west
print(results_west.sort_values('predictions', ascending = False))

                      Team  predictions
1                Utah Jazz            1
4           Denver Nuggets            1
5             Phoenix Suns            1
6         Dallas Mavericks            1
7     Los Angeles Clippers            1
8        Memphis Grizzlies            1
10  Minnesota Timberwolves            1
12      Los Angeles Lakers            1
13  Portland Trail Blazers            1
14   Golden State Warriors            1
0         Sacramento Kings            0
2          Houston Rockets            0
3     New Orleans Pelicans            0
9        San Antonio Spurs            0
11   Oklahoma City Thunder            0


In [32]:
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf'],
             'probability':[True]}
 
grid_east = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)
grid_east.fit(X_train_east,y_train_east)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ...C=0.1, gamma=1, kernel=rbf, probability=True; total time=   0.0s
[CV 2/5] END ...C=0.1, gamma=1, kernel=rbf, probability=True; total time=   0.0s
[CV 3/5] END ...C=0.1, gamma=1, kernel=rbf, probability=True; total time=   0.0s
[CV 4/5] END ...C=0.1, gamma=1, kernel=rbf, probability=True; total time=   0.0s
[CV 5/5] END ...C=0.1, gamma=1, kernel=rbf, probability=True; total time=   0.0s
[CV 1/5] END .C=0.1, gamma=0.1, kernel=rbf, probability=True; total time=   0.0s
[CV 2/5] END .C=0.1, gamma=0.1, kernel=rbf, probability=True; total time=   0.0s
[CV 3/5] END .C=0.1, gamma=0.1, kernel=rbf, probability=True; total time=   0.0s
[CV 4/5] END .C=0.1, gamma=0.1, kernel=rbf, probability=True; total time=   0.0s
[CV 5/5] END .C=0.1, gamma=0.1, kernel=rbf, probability=True; total time=   0.0s
[CV 1/5] END C=0.1, gamma=0.01, kernel=rbf, probability=True; total time=   0.0s
[CV 2/5] END C=0.1, gamma=0.01, kernel=rbf, pro

[CV 3/5] END ..C=1000, gamma=1, kernel=rbf, probability=True; total time=   0.0s
[CV 4/5] END ..C=1000, gamma=1, kernel=rbf, probability=True; total time=   0.0s
[CV 5/5] END ..C=1000, gamma=1, kernel=rbf, probability=True; total time=   0.0s
[CV 1/5] END C=1000, gamma=0.1, kernel=rbf, probability=True; total time=   0.0s
[CV 2/5] END C=1000, gamma=0.1, kernel=rbf, probability=True; total time=   0.0s
[CV 3/5] END C=1000, gamma=0.1, kernel=rbf, probability=True; total time=   0.0s
[CV 4/5] END C=1000, gamma=0.1, kernel=rbf, probability=True; total time=   0.0s
[CV 5/5] END C=1000, gamma=0.1, kernel=rbf, probability=True; total time=   0.0s
[CV 1/5] END C=1000, gamma=0.01, kernel=rbf, probability=True; total time=   0.0s
[CV 2/5] END C=1000, gamma=0.01, kernel=rbf, probability=True; total time=   0.0s
[CV 3/5] END C=1000, gamma=0.01, kernel=rbf, probability=True; total time=   0.0s
[CV 4/5] END C=1000, gamma=0.01, kernel=rbf, probability=True; total time=   0.0s
[CV 5/5] END C=1000, gam

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf'], 'probability': [True]},
             verbose=3)

In [33]:
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf'],
             'probability':[True]}
 
grid_west = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)
grid_west.fit(X_train_west,y_train_west)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ...C=0.1, gamma=1, kernel=rbf, probability=True; total time=   0.0s
[CV 2/5] END ...C=0.1, gamma=1, kernel=rbf, probability=True; total time=   0.0s
[CV 3/5] END ...C=0.1, gamma=1, kernel=rbf, probability=True; total time=   0.0s
[CV 4/5] END ...C=0.1, gamma=1, kernel=rbf, probability=True; total time=   0.0s
[CV 5/5] END ...C=0.1, gamma=1, kernel=rbf, probability=True; total time=   0.0s
[CV 1/5] END .C=0.1, gamma=0.1, kernel=rbf, probability=True; total time=   0.0s
[CV 2/5] END .C=0.1, gamma=0.1, kernel=rbf, probability=True; total time=   0.0s
[CV 3/5] END .C=0.1, gamma=0.1, kernel=rbf, probability=True; total time=   0.0s
[CV 4/5] END .C=0.1, gamma=0.1, kernel=rbf, probability=True; total time=   0.0s
[CV 5/5] END .C=0.1, gamma=0.1, kernel=rbf, probability=True; total time=   0.0s
[CV 1/5] END C=0.1, gamma=0.01, kernel=rbf, probability=True; total time=   0.0s
[CV 2/5] END C=0.1, gamma=0.01, kernel=rbf, pro

[CV 2/5] END ..C=1000, gamma=1, kernel=rbf, probability=True; total time=   0.0s
[CV 3/5] END ..C=1000, gamma=1, kernel=rbf, probability=True; total time=   0.0s
[CV 4/5] END ..C=1000, gamma=1, kernel=rbf, probability=True; total time=   0.0s
[CV 5/5] END ..C=1000, gamma=1, kernel=rbf, probability=True; total time=   0.0s
[CV 1/5] END C=1000, gamma=0.1, kernel=rbf, probability=True; total time=   0.0s
[CV 2/5] END C=1000, gamma=0.1, kernel=rbf, probability=True; total time=   0.0s
[CV 3/5] END C=1000, gamma=0.1, kernel=rbf, probability=True; total time=   0.0s
[CV 4/5] END C=1000, gamma=0.1, kernel=rbf, probability=True; total time=   0.0s
[CV 5/5] END C=1000, gamma=0.1, kernel=rbf, probability=True; total time=   0.0s
[CV 1/5] END C=1000, gamma=0.01, kernel=rbf, probability=True; total time=   0.0s
[CV 2/5] END C=1000, gamma=0.01, kernel=rbf, probability=True; total time=   0.0s
[CV 3/5] END C=1000, gamma=0.01, kernel=rbf, probability=True; total time=   0.0s
[CV 4/5] END C=1000, gamm

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf'], 'probability': [True]},
             verbose=3)

In [34]:
print("grid East")
print(f"Training Data Score: {grid_east.score(X_train_east, y_train_east)}")
print(f"Testing Data Score: {grid_east.score(X_test_east, y_test_east)}")

print("grid West")
print(f"Training Data Score: {grid_west.score(X_train_west, y_train_west)}")
print(f"Testing Data Score: {grid_west.score(X_test_west, y_test_west)}")

grid East
Training Data Score: 0.9319899244332494
Testing Data Score: 0.9239766081871345
grid West
Training Data Score: 0.9191919191919192
Testing Data Score: 0.9235294117647059


In [35]:
predictions_east = grid_east.predict(curr_east_df)
predictions_east_prob = grid_east.predict_proba(curr_east_df)[:,1]
predictions_west = grid_west.predict(curr_west_df)
predictions_west_prob = grid_west.predict_proba(curr_west_df)[:,1]

In [36]:
results_east = pd.DataFrame()
results_east['Team'] = teams_east
results_east['predictions'] = predictions_east
results_east['prediction probability'] = predictions_east_prob
print(results_east.sort_values('prediction probability', ascending = False))

                   Team  predictions  prediction probability
11        Brooklyn Nets            1                1.000000
9            Miami Heat            1                0.999985
0         Chicago Bulls            1                0.995903
3    Washington Wizards            1                0.995613
13      Milwaukee Bucks            1                0.985843
14    Charlotte Hornets            1                0.979542
6       New York Knicks            1                0.936616
8        Boston Celtics            1                0.883962
5    Philadelphia 76ers            1                0.866430
12        Atlanta Hawks            1                0.823911
2   Cleveland Cavaliers            0                0.462902
4       Toronto Raptors            0                0.248036
7        Indiana Pacers            0                0.062876
10      Detroit Pistons            0                0.000031
1         Orlando Magic            0                0.000025


In [37]:
results_west = pd.DataFrame()
results_west['Team'] = teams_west
results_west['predictions'] = predictions_west
results_west['prediction probability'] = predictions_west_prob
print(results_west.sort_values('prediction probability', ascending = False))

                      Team  predictions  prediction probability
14   Golden State Warriors            1                1.000000
5             Phoenix Suns            1                1.000000
1                Utah Jazz            1                0.979747
6         Dallas Mavericks            1                0.887562
7     Los Angeles Clippers            1                0.876461
13  Portland Trail Blazers            1                0.751176
8        Memphis Grizzlies            1                0.729147
12      Los Angeles Lakers            1                0.698882
4           Denver Nuggets            1                0.650810
10  Minnesota Timberwolves            1                0.480218
0         Sacramento Kings            0                0.074431
11   Oklahoma City Thunder            0                0.016409
9        San Antonio Spurs            0                0.006150
3     New Orleans Pelicans            0                0.002592
2          Houston Rockets            0 

In [38]:
train_east_df = pd.read_csv('../CSV/NBA-teams-data-east.csv')
curr_east_df = pd.read_csv('../CSV/NBA-2022-team-data-east.csv')

train_west_df = pd.read_csv('../CSV/NBA-teams-data-west.csv')
curr_west_df = pd.read_csv('../CSV/NBA-2022-team-data-west.csv')

In [39]:
X_east = train_east_df.drop(['Team','Year','Conf','made_playoff'], axis = 1)
y_east = train_east_df['made_playoff']

X_west = train_west_df.drop(['Team','Year','Conf','made_playoff'], axis = 1)
y_west = train_west_df['made_playoff']

In [40]:
X_train_east, X_test_east, y_train_east, y_test_east = train_test_split(X_east, y_east, test_size = .30,random_state=42)
X_train_west, X_test_west, y_train_west, y_test_west = train_test_split(X_west, y_west,test_size = .30, random_state=42)

In [41]:
random_forest_east = RandomForestClassifier(oob_score = True)
random_forest_east.fit(X_train_east,y_train_east)

random_forest_west = RandomForestClassifier(oob_score = True)
random_forest_west.fit(X_train_west,y_train_west)

RandomForestClassifier(oob_score=True)

In [42]:
print("random_forest East")
print(f"Training Data Score: {random_forest_east.score(X_train_east,y_train_east)}")
print(f"Testing Data Score: {random_forest_east.score(X_test_east, y_test_east)}")

print("random_forest West")
print(f"Training Data Score: {random_forest_west.score(X_train_west, y_train_west)}")
print(f"Testing Data Score: {random_forest_west.score(X_test_west, y_test_west)}")

random_forest East
Training Data Score: 1.0
Testing Data Score: 0.9122807017543859
random_forest West
Training Data Score: 1.0
Testing Data Score: 0.9176470588235294


In [43]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

In [44]:
RandomForest = RandomForestClassifier()
#Create a randomSearchCV to find better parameters
RF_random_search_east = RandomizedSearchCV(estimator = RandomForest, param_distributions = random_grid,n_iter = 20, 
                                        cv = 3, verbose=2, random_state=42, n_jobs = -1)
RF_random_search_west = RandomizedSearchCV(estimator = RandomForest, param_distributions = random_grid,n_iter = 20, 
                                           cv = 3, verbose=2, random_state=42, n_jobs = -1)

In [45]:
RF_random_search_east.fit(X_train_east,y_train_east)
RF_random_search_west.fit(X_train_west,y_train_west)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Fitting 3 folds for each of 20 candidates, totalling 60 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=20,
                   n_jobs=-1,
                   param_distributions={'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [46]:
print("RFrandomSearchCV East")
print(f"Training Data Score: {RF_random_search_east.score(X_train_east,y_train_east)}")
print(f"Testing Data Score: {RF_random_search_east.score(X_test_east, y_test_east)}")
print("RFrandomSearchCV West")
print(f"Training Data Score: {RF_random_search_west.score(X_train_west,y_train_west)}")
print(f"Testing Data Score: {RF_random_search_west.score(X_test_west, y_test_west)}")

RFrandomSearchCV East
Training Data Score: 0.964735516372796
Testing Data Score: 0.9064327485380117
RFrandomSearchCV West
Training Data Score: 0.9949494949494949
Testing Data Score: 0.9117647058823529


In [47]:
teams_east = curr_east_df['Team']
teams_west = curr_west_df['Team']
curr_east_df = curr_east_df.drop(['Team','Year','Conf','made_playoff'], axis = 1)
curr_west_df = curr_west_df.drop(['Team','Year','Conf','made_playoff'], axis = 1)

In [48]:
predictions_east = RF_random_search_east.predict(curr_east_df)
predictions_east_prob = RF_random_search_east.predict_proba(curr_east_df)[:,1]
predictions_west = RF_random_search_west.predict(curr_west_df)
predictions_west_prob = RF_random_search_west.predict_proba(curr_west_df)[:,1]

In [49]:
results_east = pd.DataFrame()
results_east['Team'] = teams_east
results_east['predictions'] = predictions_east
results_east['prediction probability'] = predictions_east_prob
print(results_east.sort_values('prediction probability', ascending = False))

                   Team  predictions  prediction probability
3    Washington Wizards            1                0.938047
5    Philadelphia 76ers            1                0.937756
11        Brooklyn Nets            1                0.933249
13      Milwaukee Bucks            1                0.917532
8        Boston Celtics            1                0.910696
9            Miami Heat            1                0.905168
12        Atlanta Hawks            1                0.881185
14    Charlotte Hornets            1                0.856454
0         Chicago Bulls            1                0.836135
6       New York Knicks            1                0.738234
2   Cleveland Cavaliers            1                0.541523
4       Toronto Raptors            0                0.460168
7        Indiana Pacers            0                0.344210
1         Orlando Magic            0                0.073333
10      Detroit Pistons            0                0.044813


In [50]:
results_west = pd.DataFrame()
results_west['Team'] = teams_west
results_west['predictions'] = predictions_west
results_west['prediction probability'] = predictions_west_prob
print(results_west.sort_values('prediction probability', ascending = False))

                      Team  predictions  prediction probability
5             Phoenix Suns            1                0.975133
1                Utah Jazz            1                0.964067
14   Golden State Warriors            1                0.953031
6         Dallas Mavericks            1                0.789850
7     Los Angeles Clippers            1                0.775600
8        Memphis Grizzlies            1                0.709744
13  Portland Trail Blazers            1                0.673978
12      Los Angeles Lakers            1                0.661817
4           Denver Nuggets            1                0.596402
10  Minnesota Timberwolves            0                0.427735
9        San Antonio Spurs            0                0.411475
0         Sacramento Kings            0                0.138062
2          Houston Rockets            0                0.122952
11   Oklahoma City Thunder            0                0.100506
3     New Orleans Pelicans            0 