In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
#training set and prediction set(2021-2022 season)
train_df = pd.read_csv('../CSV/NBA-season-data-processed.csv')
curr_df = pd.read_csv('../CSV/NBA-2022-team-data.csv')

train_east_df = pd.read_csv('../CSV/NBA-teams-data-east.csv')
curr_east_df = pd.read_csv('../CSV/NBA-2022-team-data-east.csv')

train_west_df = pd.read_csv('../CSV/NBA-teams-data-west.csv')
curr_west_df = pd.read_csv('../CSV/NBA-2022-team-data-west.csv')

In [3]:
X = train_df.drop(['Team','Year','Conf','made_playoff'], axis = 1)
y = train_df['made_playoff']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = .30, random_state = 42)

In [5]:
logReg = LogisticRegression()
logReg.fit(X_train,y_train)

LogisticRegression()

In [6]:
print('Logistic Regression Score w/o splitting data by conference')
print(f"Training Data Score: {logReg.score(X_train, y_train)}")
print(f"Testing Data Score: {logReg.score(X_test, y_test)}")

Logistic Regression Score w/o splitting data by conference
Training Data Score: 0.9275184275184275
Testing Data Score: 0.9342857142857143


In [7]:
teams = curr_df['Team']
conf = curr_df['Conf']
curr_df = curr_df.drop(['Team','Year','Conf','made_playoff'], axis = 1)

In [8]:
predictions = logReg.predict(curr_df)
predictions_prob = logReg.predict_proba(curr_df)[:,1]

In [9]:
results = pd.DataFrame()
results['Team'] = teams
results['predictions'] = predictions
results['prediction probability'] = predictions_prob
results['Conference'] = conf
print(results.sort_values(['Conference','prediction probability'], ascending = False))

                      Team  predictions  prediction probability Conference
0    Golden State Warriors            1                0.999997       West
2             Phoenix Suns            1                0.999977       West
1                Utah Jazz            1                0.994473       West
22        Dallas Mavericks            1                0.972879       West
20    Los Angeles Clippers            1                0.927364       West
4   Portland Trail Blazers            1                0.759349       West
24          Denver Nuggets            1                0.757737       West
6        Memphis Grizzlies            1                0.583178       West
7       Los Angeles Lakers            1                0.552610       West
15  Minnesota Timberwolves            1                0.526726       West
28   Oklahoma City Thunder            0                0.031153       West
9         Sacramento Kings            0                0.027588       West
16       San Antonio Spur

In [10]:
X_east = train_east_df.drop(['Team','Year','Conf','made_playoff'], axis = 1)
y_east = train_east_df['made_playoff']

X_west = train_west_df.drop(['Team','Year','Conf','made_playoff'], axis = 1)
y_west = train_west_df['made_playoff']

In [11]:
X_train_east, X_test_east, y_train_east, y_test_east = train_test_split(X_east, y_east, test_size = .20,random_state=42)
X_train_west, X_test_west, y_train_west, y_test_west = train_test_split(X_west, y_west,test_size = .20, random_state=42)

In [12]:
logReg_east = LogisticRegression()
logReg_east.fit(X_train_east,y_train_east)

logReg_west = LogisticRegression()
logReg_west.fit(X_train_west,y_train_west)

LogisticRegression()

In [13]:
print("logReg East")
print(f"Training Data Score: {logReg_east.score(X_train_east, y_train_east)}")
print(f"Testing Data Score: {logReg_east.score(X_test_east, y_test_east)}")

print("logReg West")
print(f"Training Data Score: {logReg_west.score(X_train_west, y_train_west)}")
print(f"Testing Data Score: {logReg_west.score(X_test_west, y_test_west)}")

logReg East
Training Data Score: 0.9427312775330396
Testing Data Score: 0.9473684210526315
logReg West
Training Data Score: 0.9314159292035398
Testing Data Score: 0.8859649122807017


In [14]:
coeff_df = pd.DataFrame(train_east_df.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(logReg_east.coef_[0])

coeff_df.sort_values(by='Correlation', ascending=False)

Unnamed: 0,Feature,Correlation
20,top_PTS,4.899748
4,3P%,0.542085
13,AST,0.373146
11,ORB,0.355433
0,FGA,0.35464
15,BLK,0.271351
25,top_5_stats_sum,0.240686
12,DRB,0.220794
8,FT,0.220715
9,FTA,0.220454


In [15]:
coeff_df = pd.DataFrame(train_west_df.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(logReg_west.coef_[0])

coeff_df.sort_values(by='Correlation', ascending=False)

Unnamed: 0,Feature,Correlation
20,top_PTS,3.969952
15,BLK,0.483335
19,W%,0.400215
21,top_5_PTS,0.394403
25,top_5_stats_sum,0.376841
13,AST,0.333184
18,PTS,0.315922
14,STL,0.29224
10,FT%,0.26785
16,TOV,0.220349


In [16]:
curr_east_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 31 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   FG               15 non-null     float64
 1   FGA              15 non-null     float64
 2   FG%              15 non-null     float64
 3   3P               15 non-null     float64
 4   3PA              15 non-null     float64
 5   3P%              15 non-null     float64
 6   2P               15 non-null     float64
 7   2PA              15 non-null     float64
 8   2P%              15 non-null     float64
 9   FT               15 non-null     float64
 10  FTA              15 non-null     float64
 11  FT%              15 non-null     float64
 12  ORB              15 non-null     float64
 13  DRB              15 non-null     float64
 14  AST              15 non-null     float64
 15  STL              15 non-null     float64
 16  BLK              15 non-null     float64
 17  TOV              1

In [17]:
teams_east = curr_east_df['Team']
teams_west = curr_west_df['Team']
curr_east_df = curr_east_df.drop(['Team','Year','Conf','made_playoff'], axis = 1)
curr_west_df = curr_west_df.drop(['Team','Year','Conf','made_playoff'], axis = 1)

In [18]:
predictions_east = logReg_east.predict(curr_east_df)
predictions_east_prob = logReg_east.predict_proba(curr_east_df)[:,1]
predictions_west = logReg_west.predict(curr_west_df)
predictions_west_prob = logReg_west.predict_proba(curr_west_df)[:,1]

In [19]:
results_east = pd.DataFrame()
results_east['Team'] = teams_east
results_east['predictions'] = predictions_east
results_east['prediction probability'] = predictions_east_prob
print(results_east.sort_values('prediction probability', ascending = False))

                   Team  predictions  prediction probability
11        Brooklyn Nets            1                0.998408
9            Miami Heat            1                0.994422
3    Washington Wizards            1                0.990722
0         Chicago Bulls            1                0.982349
8        Boston Celtics            1                0.943589
13      Milwaukee Bucks            1                0.929806
6       New York Knicks            1                0.900448
14    Charlotte Hornets            1                0.866302
5    Philadelphia 76ers            1                0.792615
12        Atlanta Hawks            1                0.773762
2   Cleveland Cavaliers            1                0.593244
4       Toronto Raptors            1                0.574954
7        Indiana Pacers            0                0.149425
10      Detroit Pistons            0                0.004082
1         Orlando Magic            0                0.000782


In [20]:
results_west = pd.DataFrame()
results_west['Team'] = teams_west
results_west['predictions'] = predictions_west
results_west['prediction probability'] = predictions_west_prob
print(results_west.sort_values('prediction probability', ascending = False))

                      Team  predictions  prediction probability
14   Golden State Warriors            1                0.999986
5             Phoenix Suns            1                0.999934
1                Utah Jazz            1                0.988774
6         Dallas Mavericks            1                0.953178
7     Los Angeles Clippers            1                0.827064
13  Portland Trail Blazers            1                0.755313
4           Denver Nuggets            1                0.694647
8        Memphis Grizzlies            1                0.655409
10  Minnesota Timberwolves            1                0.552457
12      Los Angeles Lakers            0                0.427328
0         Sacramento Kings            0                0.029421
11   Oklahoma City Thunder            0                0.008255
9        San Antonio Spurs            0                0.004187
3     New Orleans Pelicans            0                0.000316
2          Houston Rockets            0 

In [21]:
scaler = StandardScaler()
curr_east_df = pd.DataFrame(scaler.fit_transform(curr_east_df), columns = curr_east_df.columns)
curr_west_df = pd.DataFrame(scaler.fit_transform(curr_west_df), columns = curr_west_df.columns)

In [22]:
predictions_east = logReg_east.predict(curr_east_df)
predictions_east_prob = logReg_east.predict_proba(curr_east_df)[:,1]
predictions_west = logReg_west.predict(curr_west_df)
predictions_west_prob = logReg_west.predict_proba(curr_west_df)[:,1]

In [23]:
results_east = pd.DataFrame()
results_east['Team'] = teams_east
results_east['predictions'] = predictions_east
results_east['prediction probability'] = predictions_east_prob
print(results_east.sort_values('prediction probability', ascending = False))

                   Team  predictions  prediction probability
11        Brooklyn Nets            1                0.999683
9            Miami Heat            1                0.997065
3    Washington Wizards            1                0.994121
0         Chicago Bulls            1                0.991051
13      Milwaukee Bucks            1                0.942429
8        Boston Celtics            1                0.936757
6       New York Knicks            1                0.907048
14    Charlotte Hornets            1                0.893661
5    Philadelphia 76ers            1                0.819328
12        Atlanta Hawks            1                0.774137
2   Cleveland Cavaliers            0                0.455174
4       Toronto Raptors            0                0.427932
7        Indiana Pacers            0                0.063009
10      Detroit Pistons            0                0.000404
1         Orlando Magic            0                0.000066


In [24]:
results_west = pd.DataFrame()
results_west['Team'] = teams_west
results_west['predictions'] = predictions_west
results_west['prediction probability'] = predictions_west_prob
print(results_west.sort_values('prediction probability', ascending = False))

                      Team  predictions  prediction probability
14   Golden State Warriors            1                0.999944
5             Phoenix Suns            1                0.999798
1                Utah Jazz            1                0.984719
6         Dallas Mavericks            1                0.942490
7     Los Angeles Clippers            1                0.824967
13  Portland Trail Blazers            1                0.747283
4           Denver Nuggets            1                0.729345
8        Memphis Grizzlies            1                0.697896
10  Minnesota Timberwolves            1                0.663054
12      Los Angeles Lakers            1                0.503502
0         Sacramento Kings            0                0.051116
11   Oklahoma City Thunder            0                0.021237
9        San Antonio Spurs            0                0.011289
3     New Orleans Pelicans            0                0.001089
2          Houston Rockets            0 

In [25]:
results_east['Conf'] = 'East'
results_west['Conf'] = 'West'

In [26]:
results = pd.concat([results_east,results_west])

In [27]:
results.to_csv('../Results/Logistic-Regression-Results.csv')