# NBA Games Prediction
Here is the best model setup I've found so far:
- Latest games: February 9th
- Columns added: `ranking_difference`,`last_10_wins`
- Dummy columns: `home_team`, `away_team`
- Columns dropped:
    - `mp`,`home_fga`,`home_3pa`,`home_fta`,`home_+/-`,`home_pts`,
            `away_fga`,`away_3pa`,`away_fta`,`away_+/-`,`away_pts`,`date`, `season`
- Model: Logistic Regression
- Accuracy: 0.739286

## Imports and Load Data

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from itertools import chain # to unlist nested lists

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [149]:
games = pd.read_csv("nba_games_2024.csv")
games.replace([np.inf, -np.inf], np.nan,inplace=True)

## Data Processing

In [150]:
# Defining basic statistics
basic_stats = ['home_fg%','home_3p%','home_ft%','home_trb','home_ast','home_stl','home_blk','home_tov']

In [151]:
# Creating a stats_by_team dataframe that displays the average basic statistics of each team
stats_by_team = games.groupby('home_team')[basic_stats].mean().reset_index()

# Counting the number of games each team has won
stats_by_team['home_won'] = games[games['home_won']==True].groupby('home_team')['home_won'].count().reset_index()['home_won']

# Renaming columns to not have 'home_' in the beginning
stats_by_team.columns = stats_by_team.columns.str.lstrip('home_')

# Creating a ranking of all teams
stats_by_team['ranking'] = stats_by_team[['fg%','3p%','ft%','trb','ast','stl','blk','won']].sum(axis=1) - stats_by_team['tov']
stats_by_team = stats_by_team.sort_values(by='ranking',ascending=False).reset_index(drop=True)
stats_by_team.index += 1

In [152]:
# Creating a ranking_difference column between the home team against the away team
def overall_difference(df):
    overall = []
    home_overall = float(stats_by_team[stats_by_team['team']==df.home_team]['ranking'])
    away_overall = float(stats_by_team[stats_by_team['team']==df.away_team]['ranking'])
    overall.append(home_overall-away_overall)
    return overall

games['ranking_difference'] = games.apply(overall_difference,axis=1)
games = games.explode('ranking_difference')

In [153]:
# Creating a home_last_10_wins and away_last_10_wins columns
def add_home_last_10_wins(group):
    total = group["home_won"].shift(1) + group["home_won"].shift(2) +\
            group["home_won"].shift(3) + group["home_won"].shift(4) +\
            group["home_won"].shift(5) + group["home_won"].shift(6) +\
            group["home_won"].shift(7) + group["home_won"].shift(8) +\
            group["home_won"].shift(9) + group["home_won"].shift(10)
    group["home_last_10_wins"] = total

    return group

def add_away_last_10_wins(group):
    total = group["home_won"].shift(1) + group["home_won"].shift(2) +\
            group["home_won"].shift(3) + group["home_won"].shift(4) +\
            group["home_won"].shift(5) + group["home_won"].shift(6) +\
            group["home_won"].shift(7) + group["home_won"].shift(8) +\
            group["home_won"].shift(9) + group["home_won"].shift(10)
    group["away_last_10_wins"] = 5 - total

    return group

games = games.groupby("home_team", group_keys=False).apply(add_home_last_10_wins)
games = games.groupby("away_team", group_keys=False).apply(add_away_last_10_wins)

In [128]:
# Creating columns for the average basic statistics of the last 5 games for each team in each game
def find_team_averages(team):
    rolling = team.drop('team',axis=1).rolling(5).mean()
    return rolling

# Making a list of basic stats of away teams
away_basic_stats = []
for stat in basic_stats:
    away_basic_stats.append(stat.replace('home_','away_'))

# df_rolling_ are dataframes to store the rolling averages
df_rolling_home = games[basic_stats + ['home_team']]
df_rolling_away = games[basic_stats + ['away_team']]

# Renaming for streamline
df_rolling_home.rename(columns={'home_team':'team'},inplace=True)
df_rolling_away.rename(columns={'away_team':'team'},inplace=True)

# Running the function
df_rolling_home = df_rolling_home.groupby(["team"], group_keys=False).apply(find_team_averages)
df_rolling_away = df_rolling_away.groupby(["team"], group_keys=False).apply(find_team_averages)

# Concatenating to the games dataframe
df_rolling_home.columns = [f"{col}_10" for col in df_rolling_home.columns]
df_rolling_away.columns = [f"{col}_10" for col in df_rolling_away.columns]
games = pd.concat([games, df_rolling_home, df_rolling_away], axis=1)

In [129]:
# Separating the date column into 3 columns: year, month, day
games['date'] = pd.to_datetime(games['date'])

games['day'] = games['date'].dt.day
games['month'] = games['date'].dt.month
games['year'] = games['date'].dt.year
games = games.drop("date",axis=1)

In [142]:
# Creating dummy variables for categorical variables
games = pd.get_dummies(games, columns=['home_team', 'away_team'], drop_first=True)

In [155]:
# Dropping heuristically unnecessary columns
games.drop(['mp','home_fga','home_3pa','home_fta','home_+/-','home_pts',
            'away_fga','away_3pa','away_fta','away_+/-','away_pts'],axis=1,inplace=True)

# Dropping last round of games where target values are NaN
games.dropna(inplace=True)

In [154]:
# Add target variable which is the result of the next game for the home team
def add_target(group):
    group["target"] = group["home_won"].shift(-1)
    return group

games = games.groupby("home_team", group_keys=False).apply(add_target)

# Changing the 'won' column from True/False to 1's and 0's
games['target'] = games['target']*1

## Model Building

In [156]:
# Splitting test and train
test = games.tail(280)
train = games[~games.index.isin(test.index)]

# Splitting X and y
X_train = train.drop(['date','season','home_team','away_team'],axis=1)
y_train = train['target'].astype('int')

X_test = test.drop(['date','season','home_team','away_team'],axis=1)
y_test = test['target'].astype('int')

# Fitting a MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

#scaler = MinMaxScaler()
#X_train[X_train.columns] = scaler.fit_transform(X_train[X_train.columns])
#X_test[X_test.columns] = scaler.fit_transform(X_test[X_test.columns])


In [66]:
# Function to store different model names and their respective accuracies
model_name = []
model_accuracy = []

def model_builder(model, name):
    model.fit(X_train,y_train)
    predictions = model.predict(X_test)

    model_name.append(name)
    model_accuracy.append(accuracy_score(y_test,predictions))

In [157]:
# Running different models
model_builder(LogisticRegression(),"Logistic Regression")
model_builder(GaussianNB(),"Gaussian NB")
model_builder(DecisionTreeClassifier(), "Decision Tree")

In [158]:
# Display model results
model_results = pd.DataFrame({'Model': model_name, 'Accuracy': model_accuracy})
model_results

Unnamed: 0,Model,Accuracy
0,Logistic Regression,0.610714
1,Gaussian NB,0.582143
2,Decision Tree,0.514286
3,Logistic Regression,0.646429
4,Gaussian NB,1.0
5,Decision Tree,1.0
6,Logistic Regression,0.857143
7,Gaussian NB,1.0
8,Decision Tree,1.0
9,Logistic Regression,0.657143
