# NBA Games Prediction
Here is the best model setup I've found so far:
- Latest games: February 9th
- Columns added: `home_last_10_wins`, `away_last_10_wins`, `day_of_week`, `month`
- Dummy columns: `home_team`, `away_team`
- Columns dropped:
    - `mp`,`home_fga`,`home_3pa`,`home_fta`,`home_+/-`,`home_pts`,
            `away_fga`,`away_3pa`,`away_fta`,`away_+/-`,`away_pts`, `season`
- Model: Logistic Regression
- Accuracy: 0.621429

## Imports and Load Data

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from itertools import chain # to unlist nested lists

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
#import xgboost as xgb
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [None]:
# Load the game log, remove the 'Unnamed: 0' column (presumably a saved
# index -- confirm against the CSV), and treat +/-inf as missing values.
games = (
    pd.read_csv("nba_games_2024.csv")
    .drop(columns='Unnamed: 0')
    .replace([np.inf, -np.inf], np.nan)
)

## Data Processing

In [None]:
# # Defining basic statistics
# basic_stats = ['home_fg%','home_3p%','home_ft%','home_trb','home_ast','home_stl','home_blk','home_tov']

In [None]:
# # Creating a stats_by_team dataframe that displays the average basic statistics of each team
# stats_by_team = games.groupby('home_team')[basic_stats].mean().reset_index()

# # Counting the number of games each team has won
# stats_by_team['home_won'] = games[games['home_won']==True].groupby('home_team')['home_won'].count().reset_index()['home_won']

# # Renaming columns to not have 'home_' in the beginning
# stats_by_team.columns = stats_by_team.columns.str.lstrip('home_')

# # Creating a ranking of all teams
# stats_by_team['ranking'] = stats_by_team[['fg%','3p%','ft%','trb','ast','stl','blk','won']].sum(axis=1) - stats_by_team['tov']
# stats_by_team = stats_by_team.sort_values(by='ranking',ascending=False).reset_index(drop=True)
# stats_by_team.index += 1

In [None]:
# # Creating a ranking_difference column between the home team against the away team
# def ranking_difference(df):
#     overall = []
#     home_overall = float(stats_by_team[stats_by_team['team']==df.home_team]['ranking'])
#     away_overall = float(stats_by_team[stats_by_team['team']==df.away_team]['ranking'])
#     overall.append(home_overall-away_overall)
#     return overall

# games['ranking_difference'] = games.apply(ranking_difference,axis=1)
# games = games.explode('ranking_difference')
# games['ranking_difference'] = games['ranking_difference'].astype(float)

In [None]:
# Creating a home_last_10_wins and away_last_10_wins columns
def add_home_last_10_wins(group, n):
    """Add ``home_last_10_wins``: home wins over the previous ``n`` rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows sharing one home team, with a ``home_won`` column. The rows
        are assumed to be in chronological order (not checked here).
    n : int
        Number of preceding games in the rolling window.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with the new column. The first row is NaN
        because it has no earlier game; rows with fewer than ``n`` earlier
        games sum whatever history exists.
    """
    # shift(1) keeps the current game's own result out of its feature.
    earlier_results = group['home_won'].astype(float).shift(1)
    # The window is n (not n-1): the shift already excludes the current row,
    # so a window of n covers exactly the n previous games.
    wins = earlier_results.rolling(n, min_periods=1).sum()
    # assign() returns a new frame rather than mutating the object that
    # groupby.apply passed in.
    return group.assign(home_last_10_wins=wins)

def add_away_last_10_wins(group, n):
    """Add ``away_last_10_wins``: away wins over the previous ``n`` rows.

    An away win is a row where ``home_won`` is false, so the count is the
    rolling sum of ``1 - home_won`` rather than ``n`` minus the home wins
    (which over-counts whenever fewer than ``n`` earlier games exist).

    Parameters
    ----------
    group : pandas.DataFrame
        Rows sharing one away team, with a ``home_won`` column. The rows
        are assumed to be in chronological order (not checked here).
    n : int
        Number of preceding games in the rolling window.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with the new column. The first row is NaN
        because it has no earlier game.
    """
    # shift(1) keeps the current game's own result out of its feature.
    earlier_home_wins = group['home_won'].astype(float).shift(1)
    earlier_away_wins = 1.0 - earlier_home_wins  # NaN stays NaN
    wins = earlier_away_wins.rolling(n, min_periods=1).sum()
    # assign() returns a new frame rather than mutating the object that
    # groupby.apply passed in.
    return group.assign(away_last_10_wins=wins)

# Run each rolling-win helper within its own team grouping: the home helper
# per home_team first, then the away helper per away_team.
for team_col, add_wins in (
    ("home_team", add_home_last_10_wins),
    ("away_team", add_away_last_10_wins),
):
    games = games.groupby(team_col, group_keys=False).apply(add_wins, n=10)

In [None]:
# # Creating columns for the average basic statistics of the last 5 games for each team in each game
# def find_team_averages(team):
#     rolling = team.drop('team',axis=1).rolling(5).mean()
#     return rolling

# # Making a list of basic stats of away teams
# away_basic_stats = []
# for stat in basic_stats:
#     away_basic_stats.append(stat.replace('home_','away_'))

# # df_rolling_ are dataframes to store the rolling averages
# df_rolling_home = games[basic_stats + ['home_team']]
# df_rolling_away = games[basic_stats + ['away_team']]

# # Renaming for streamline
# df_rolling_home.rename(columns={'home_team':'team'},inplace=True)
# df_rolling_away.rename(columns={'away_team':'team'},inplace=True)

# # Running the function
# df_rolling_home = df_rolling_home.groupby(["team"], group_keys=False).apply(find_team_averages)
# df_rolling_away = df_rolling_away.groupby(["team"], group_keys=False).apply(find_team_averages)

# # Concatenating to the games dataframe
# df_rolling_home.columns = [f"{col}_10" for col in df_rolling_home.columns]
# df_rolling_away.columns = [f"{col}_10" for col in df_rolling_away.columns]
# games = pd.concat([games, df_rolling_home, df_rolling_away], axis=1)

In [None]:
# Derive two calendar features from the date column -- day of week
# (pandas numbers Monday as 0) and month -- then remove the raw date.
game_dates = pd.to_datetime(games['date'])
games = games.drop(columns='date').assign(
    day_of_week=game_dates.dt.dayofweek,
    month=game_dates.dt.month,
)

In [None]:
# Add target variable which is the result of the next game for the home team
def add_target(group):
    """Attach a ``target`` column holding the next row's ``home_won`` value.

    The final row of ``group`` has no following row, so its target is NaN.
    The column is written onto ``group`` itself, which is also returned.
    """
    following_result = group["home_won"].shift(periods=-1)
    group["target"] = following_result
    return group

home_groups = games.groupby("home_team", group_keys=False)
games = home_groups.apply(add_target)

# Turn True/False targets into 1/0 (missing targets stay missing)
games['target'] = games['target'].mul(1)

In [None]:
# One-hot encode both team columns; drop_first leaves out the first level
# of each so the indicators are not redundant.
team_columns = ['home_team', 'away_team']
games = pd.get_dummies(games, columns=team_columns, drop_first=True)

In [None]:
# Columns judged unnecessary for the model
unused_columns = [
    'mp', 'home_fga', 'home_3pa', 'home_fta', 'home_+/-', 'home_pts',
    'away_fga', 'away_3pa', 'away_fta', 'away_+/-', 'away_pts',
]

# Remove those columns, then discard every row that still has a missing
# value -- this includes the rows whose target is NaN.
games = games.drop(columns=unused_columns).dropna()

## Model Building

In [None]:
# Hold out the final 280 rows for testing; train on every other row
test = games.tail(280)
in_test = games.index.isin(test.index)
train = games.loc[~in_test]

# Features are all columns except the season label and the target itself
non_feature_cols = ['season', 'target']

X_train = train.drop(columns=non_feature_cols)
y_train = train['target'].astype('int')

X_test = test.drop(columns=non_feature_cols)
y_test = test['target'].astype('int')


In [None]:
# Parallel lists: the i-th accuracy belongs to the i-th model name
model_name = []
model_accuracy = []


def model_builder(model, name):
    """Fit ``model`` on the training split and record its test accuracy.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``; appends ``name`` to ``model_name`` and the accuracy score
    to ``model_accuracy``. Returns nothing.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    model_name.append(name)
    score = accuracy_score(y_test, y_pred)
    model_accuracy.append(score)

In [None]:
# Fit and score each candidate classifier, in this order
candidates = [
    (LogisticRegression(), "Logistic Regression"),
    (GaussianNB(), "Gaussian NB"),
    (DecisionTreeClassifier(), "Decision Tree"),
    # (xgb.XGBClassifier(), "XGB"),
]
for estimator, label in candidates:
    model_builder(estimator, label)

In [None]:
# Tabulate the recorded accuracies, one row per model
model_results = pd.DataFrame(dict(Model=model_name, Accuracy=model_accuracy))
model_results

# Testing out on new games

In [None]:
# Columns that must not be used as features. 'date' has already been
# removed from `games` during preprocessing, so a strict drop of it raises
# KeyError; errors='ignore' drops only the listed columns that are present.
non_features = ['season', 'target', 'date']

X_train = train.drop(columns=non_features, errors='ignore')
y_train = train['target'].astype('int')

# Refit logistic regression on the training split and predict the
# held-out rows.
model = LogisticRegression()
model.fit(X_train, y_train)
model.predict(test.drop(columns=non_features, errors='ignore'))