# NBA Games Prediction
## Imports and Load Data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from itertools import chain # to unlist nested lists

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [54]:
games = pd.read_csv("nba_games_2024.csv")
games.replace([np.inf, -np.inf], np.nan,inplace=True)
games.head()

Unnamed: 0,season,date,home_team,home_total,away_team,away_total,mp,home_fg,home_fga,home_fg%,...,away_drb%,away_trb%,away_ast%,away_stl%,away_blk%,away_tov%,away_usg%,away_ortg,away_drtg,home_won
0,2024,2023-10-24,GSW,104,PHO,108,240.0,36.0,101.0,0.356,...,70.5,55.0,54.8,4.9,12.1,15.6,100.0,106.2,102.3,False
1,2024,2023-10-24,LAL,107,DEN,119,240.0,41.0,90.0,0.456,...,71.7,48.8,60.4,9.4,9.8,10.3,100.0,124.8,112.3,False
2,2024,2023-10-24,DEN,119,LAL,107,240.0,48.0,91.0,0.527,...,77.5,51.2,56.1,5.2,7.0,10.0,100.0,112.3,124.8,True
3,2024,2023-10-24,PHO,108,GSW,104,240.0,42.0,95.0,0.442,...,64.6,45.0,52.8,10.8,9.7,8.8,100.0,102.3,106.2,True
4,2024,2023-10-25,ORL,116,HOU,86,240.0,42.0,87.0,0.483,...,58.5,35.2,59.4,6.4,5.7,17.3,100.0,91.8,123.8,True


## Exploratory Data Analysis

In [None]:
# Shape of dataframe
games.shape

In [None]:
# Distribution of total points for home team
sns.histplot(data=games, x='home_total')

In [None]:
# Scatterplot of home and away points
sns.regplot(data=games, x='home_total',y='away_total')

In [None]:
# Checking the distribution of basis games statistics
basic_stats = ['home_fg%','home_3p%','home_ft%','home_trb','home_ast','home_stl','home_blk','home_tov']

plt.figure(figsize=(15,10))
for index,column in enumerate(basic_stats):
    plt.subplot(2,4,index+1)
    sns.histplot(data=games, x=column)
    plt.title(f"{column} Distribution",fontweight="black",size=20,pad=10)
    plt.tight_layout()


## EDA by Teams

In [None]:
# Checking stats by team
stats_by_team = games.groupby('home_team')[basic_stats].mean().reset_index()
stats_by_team['home_won'] = games[games['home_won']==True].groupby('home_team')['home_won'].count().reset_index()['home_won']
for col in stats_by_team.columns:
    replacement = col.replace('home_','')
    stats_by_team.rename(columns={col: replacement},inplace=True)
stats_by_team

In [None]:
plt.figure(figsize=(25,20))
for index,column in enumerate(stats_by_team):
    if(index!=0): # index!=0 is not plotting the team stat
        plt.subplot(5,2,index)
        sns.barplot(stats_by_team,
                    x='team',
                    y=column,
                    order=stats_by_team.sort_values(column).team,
                    palette='crest')
        plt.title(f"{column}",fontweight="black",size=20,pad=10)
        plt.tight_layout()

In [None]:
################################################################################
### Creating dataframe that displays the top n teams in each basic statistic ###
################################################################################

# Setting top n teams
n = 5

# Initializing lists
stat_index = []
team_index = []
values = []

# Appending lists
for col in stats_by_team.columns[1:]:
    stat_index.append([col]*n)
    team_index.append(list(stats_by_team.nlargest(n,'3p%')['team']))
    values.append(list(stats_by_team.nlargest(n,'3p%')['3p%']))

# Unlisting nested lists
stat_index = list(chain.from_iterable(stat_index))
team_index = list(chain.from_iterable(team_index))
values = list(chain.from_iterable(values))

# Creating dataframe
top_teams = pd.DataFrame([stat_index,team_index,values]).T
top_teams.columns = ['Stat','Team','Average']
top_teams.set_index(['Stat','Team'],inplace=True)
top_teams

In [None]:
########################################################
### Custom Ranking of Teams based on all basic stats ###
########################################################

stats_by_team['overall'] = stats_by_team[['fg%','3p%','ft%','trb','ast','stl','blk','won']].sum(axis=1) - stats_by_team['tov']
team_overall = stats_by_team[['team','overall']].sort_values(by='overall',ascending=False).reset_index(drop=True)
team_overall.index += 1
team_overall

## Data Processing

In [55]:
# Creating a ranking_difference column between the home team against the away team
def overall_difference(df):
    overall = []
    home_overall = float(team_overall[team_overall['team']==df.home_team]['overall'])
    away_overall = float(team_overall[team_overall['team']==df.away_team]['overall'])
    overall.append(home_overall-away_overall)
    return overall

games['ranking_difference'] = games.apply(overall_difference,axis=1)
games = games.explode('ranking_difference')

In [56]:
# Creating a home_last_10_wins and away_last_10_wins columns
def add_home_last_10_wins(group):
    total = group["home_won"].shift(1) + group["home_won"].shift(2) +\
            group["home_won"].shift(3) + group["home_won"].shift(4) +\
            group["home_won"].shift(5) + group["home_won"].shift(6) +\
            group["home_won"].shift(7) + group["home_won"].shift(8) +\
            group["home_won"].shift(9) + group["home_won"].shift(10)
    group["home_last_10_wins"] = total

    return group

def add_away_last_10_wins(group):
    total = group["home_won"].shift(1) + group["home_won"].shift(2) +\
            group["home_won"].shift(3) + group["home_won"].shift(4) +\
            group["home_won"].shift(5) + group["home_won"].shift(6) +\
            group["home_won"].shift(7) + group["home_won"].shift(8) +\
            group["home_won"].shift(9) + group["home_won"].shift(10)
    group["home_last_10_wins"] = 10 - total

    return group

games = games.groupby("home_team", group_keys=False).apply(add_home_last_10_wins)
games = games.groupby("away_team", group_keys=False).apply(add_away_last_10_wins)

In [57]:
# Add target variable which is the result of the next game for the home team
def add_target(group):
    group["target"] = group["home_won"].shift(-1)
    return group

games = games.groupby("home_team", group_keys=False).apply(add_target)

# Changing the 'won' column from True/False to 1's and 0's
games['target'] = games['target']*1

In [58]:
# Separating the date column into 3 columns: year, month, day
#games['date'] = pd.to_datetime(games['date'])

#games['day'] = games['date'].dt.day
#games['month'] = games['date'].dt.month
#games['year'] = games['date'].dt.year
#games = games.drop("date",axis=1)

# Creating dummy variables for categorical variables
games = pd.get_dummies(games, columns=['home_team', 'away_team'], drop_first=True)

In [59]:
# Dropping heuristically unnecessary columns
games.drop(['mp','home_fga','home_3pa','home_fta','home_+/-','home_pts',
            'away_fga','away_3pa','away_fta','away_+/-','away_pts',],axis=1,inplace=True)

# Dropping last round of games where target values are NaN
games.dropna(inplace=True)

## Model Building

In [60]:
# Splitting test and train
test = games.tail(280)
train = games[~games.index.isin(test.index)]

# Splitting X and y
X_train = train.drop(['target','date','season'],axis=1)
y_train = train['target'].astype('int')

X_test = test.drop(['target','date','season'],axis=1)
y_test = test['target'].astype('int')

# Fitting a MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

#scaler = MinMaxScaler()
#X_train[X_train.columns] = scaler.fit_transform(X_train[X_train.columns])
#X_test[X_test.columns] = scaler.fit_transform(X_test[X_test.columns])


In [None]:
df_rolling = games

def find_team_averages(team):
    rolling = team.rolling(10).mean()
    return rolling

df_rolling = df_rolling.groupby(["home_team"], group_keys=False).apply(find_team_averages)

rolling_cols = [f"{col}_10" for col in df_rolling.columns]
df_rolling.columns = rolling_cols
games = pd.concat([games, df_rolling], axis=1)

In [61]:
model_name = []
model_accuracy = []

def model_builder(model, name):
    model.fit(X_train,y_train)
    predictions = model.predict(X_test)

    model_name.append(name)
    model_accuracy.append(accuracy_score(y_test,predictions))

    print("Confusion matrix:")
    print(confusion_matrix(y_test,predictions))
    print("\n")

    print("Classification report:")
    print(classification_report(y_test,predictions))

    print("Accuracy score:")
    print(accuracy_score(y_test, predictions))

In [62]:
model_builder(LogisticRegression(),"Logistic Regression")

Confusion matrix:
[[80 61]
 [48 91]]


Classification report:
              precision    recall  f1-score   support

           0       0.62      0.57      0.59       141
           1       0.60      0.65      0.63       139

    accuracy                           0.61       280
   macro avg       0.61      0.61      0.61       280
weighted avg       0.61      0.61      0.61       280

Accuracy score:
0.6107142857142858


In [63]:
model_builder(GaussianNB(),"Gaussian NB")

Confusion matrix:
[[76 65]
 [50 89]]


Classification report:
              precision    recall  f1-score   support

           0       0.60      0.54      0.57       141
           1       0.58      0.64      0.61       139

    accuracy                           0.59       280
   macro avg       0.59      0.59      0.59       280
weighted avg       0.59      0.59      0.59       280

Accuracy score:
0.5892857142857143


In [64]:
model_builder(DecisionTreeClassifier(), "Decision")

Confusion matrix:
[[73 68]
 [75 64]]


Classification report:
              precision    recall  f1-score   support

           0       0.49      0.52      0.51       141
           1       0.48      0.46      0.47       139

    accuracy                           0.49       280
   macro avg       0.49      0.49      0.49       280
weighted avg       0.49      0.49      0.49       280

Accuracy score:
0.48928571428571427
