# NBA Games Prediction
## Imports and Load Data

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from itertools import chain # to unlist nested lists

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [None]:
# Read the game log and turn +/-infinity into NaN so those cells count as missing
games = pd.read_csv("nba_games_2024.csv").replace([np.inf, -np.inf], np.nan)

## Data Processing

In [None]:
# Home-side box-score columns that are averaged per team
basic_stats = ['home_fg%','home_3p%','home_ft%','home_trb','home_ast','home_stl','home_blk','home_tov']

# Mean of every basic statistic per team, taken over that team's home games
stats_by_team = games.groupby('home_team')[basic_stats].mean()

# Number of home games each team has won. Both sides are indexed by team at
# this point, so the assignment aligns on the team label: a team without any
# home win gets 0 instead of shifting every later team's count by one row.
stats_by_team['home_won'] = games['home_won'].eq(True).groupby(games['home_team']).sum()
stats_by_team = stats_by_team.reset_index()

# Strip 'home_' from the column names (home_team -> team, home_won -> won, ...)
stats_by_team = stats_by_team.rename(columns=lambda col: col.replace('home_', ''))

# Ranking score: averaged stats plus the win count, minus average turnovers.
# NOTE(review): the terms are summed unweighted although they may be on
# different scales (percentages vs. counts) - confirm this is intended.
stats_by_team['ranking'] = stats_by_team[['fg%','3p%','ft%','trb','ast','stl','blk','won']].sum(axis=1) - stats_by_team['tov']

# Teams ordered from highest to lowest score, labelled 1..n
team_overall = stats_by_team[['team','ranking']].sort_values(by='ranking',ascending=False).reset_index(drop=True)
team_overall.index += 1

In [None]:
# Creating a ranking_difference column between the home team against the away team
def overall_difference(df):
    """Return the home-minus-away ranking score for a single game.

    Parameters
    ----------
    df : one row of the games table (it is called through
        ``games.apply(..., axis=1)``); must expose ``home_team`` and
        ``away_team``.

    Returns
    -------
    list
        A one-element list holding ``home ranking - away ranking``, looked up
        in the module-level ``team_overall`` table.

    Raises
    ------
    IndexError
        If either team has no row in ``team_overall``.
    """
    # float() on a one-element Series is deprecated in recent pandas, so the
    # scalar is taken explicitly with .iloc[0] before converting.
    home_overall = float(team_overall.loc[team_overall['team'] == df.home_team, 'ranking'].iloc[0])
    away_overall = float(team_overall.loc[team_overall['team'] == df.away_team, 'ranking'].iloc[0])
    return [home_overall - away_overall]

# overall_difference returns a one-element list per row; explode unwraps it
games['ranking_difference'] = games.apply(overall_difference,axis=1)
games = games.explode('ranking_difference')
# explode leaves the column as object dtype; make it a real numeric column
games['ranking_difference'] = games['ranking_difference'].astype(float)

In [None]:
# Creating a home_last_10_wins and away_last_10_wins columns
def add_home_last_10_wins(group):
    """Return a copy of `group` with a ``home_last_10_wins`` column.

    For every row the new column holds the sum of ``home_won`` over the ten
    rows that precede it in `group` (the row itself is not counted). Rows with
    fewer than ten predecessors get NaN.

    The input frame is left untouched: pandas does not support functions that
    mutate the object handed to them by ``groupby.apply``.
    """
    # Cast first: shifting a boolean column introduces NaN and would otherwise
    # leave the running total as an object-dtype column.
    won = group["home_won"].astype(float)
    total = sum(won.shift(lag) for lag in range(1, 11))
    return group.assign(home_last_10_wins=total)

def add_away_last_10_wins(group):
    """Return a copy of `group` with an ``away_last_10_wins`` column.

    For every row the new column is ``10`` minus the sum of ``home_won`` over
    the ten rows that precede it in `group`, i.e. the number of those ten games
    the home side did not win. Rows with fewer than ten predecessors get NaN.

    The input frame is left untouched: pandas does not support functions that
    mutate the object handed to them by ``groupby.apply``.
    """
    # Cast first: shifting a boolean column introduces NaN and would otherwise
    # leave the running total as an object-dtype column.
    won = group["home_won"].astype(float)
    home_wins = sum(won.shift(lag) for lag in range(1, 11))
    return group.assign(away_last_10_wins=10 - home_wins)

# Add the rolling win counts: first per home team, then per away team
for team_key, add_wins in (("home_team", add_home_last_10_wins),
                           ("away_team", add_away_last_10_wins)):
    games = games.groupby(team_key, group_keys=False).apply(add_wins)

In [None]:
# Plain assignment: df_rolling refers to the same object as games (no copy is made)
df_rolling = games

def find_team_averages(team):
    """Return the 10-row rolling mean of the numeric columns of `team`.

    Each output row is the mean over that row and the nine rows before it; the
    first nine rows are NaN. The result keeps the index of `team`.

    Only numeric and boolean columns are averaged. Text columns have no mean,
    and newer pandas versions raise on them instead of silently dropping them.
    """
    numeric = team.select_dtypes(include=["number", "bool"])
    return numeric.rolling(10).mean()

# Rolling averages are computed separately for each home team
per_home_team = df_rolling.groupby(["home_team"], group_keys=False)
df_rolling = per_home_team.apply(find_team_averages)

# Tag every averaged column with a "_10" suffix so it cannot clash with the raw column
rolling_cols = ["{}_10".format(col) for col in df_rolling.columns]
df_rolling.columns = rolling_cols

# Put the averaged columns next to the raw ones (rows are matched on the index)
games = pd.concat((games, df_rolling), axis="columns")

In [None]:
# Add target variable which is the result of the next game for the home team
def add_target(group):
    """Return a copy of `group` with a ``target`` column.

    ``target`` is the ``home_won`` value of the following row in `group`; the
    last row has no following row and gets NaN.

    The input frame is left untouched: pandas does not support functions that
    mutate the object handed to them by ``groupby.apply``.
    """
    return group.assign(target=group["home_won"].shift(-1))

# Attach the next-game result, computed within each home team's own games
by_home_team = games.groupby("home_team", group_keys=False)
games = by_home_team.apply(add_target)

# Multiplying by 1 turns True/False into 1/0 (missing targets stay NaN)
games['target'] = games['target'].mul(1)

In [None]:
# Separating the date column into 3 columns: year, month, day
#games['date'] = pd.to_datetime(games['date'])

#games['day'] = games['date'].dt.day
#games['month'] = games['date'].dt.month
#games['year'] = games['date'].dt.year
#games = games.drop("date",axis=1)

# One-hot encode both team columns; the first level of each is dropped
team_columns = ['home_team', 'away_team']
games = pd.get_dummies(games, columns=team_columns, drop_first=True)

In [None]:
# Columns judged unnecessary for the model
unused_columns = [
    'mp',
    'home_fga', 'home_3pa', 'home_fta', 'home_+/-', 'home_pts',
    'away_fga', 'away_3pa', 'away_fta', 'away_+/-', 'away_pts',
]
games = games.drop(columns=unused_columns)

# Keep only complete rows. This removes the rows whose target is NaN and also
# every other row that has a missing value in any column.
games = games.dropna()

## Model Building

In [None]:
# Hold out the last 280 rows for testing; train on every row whose index label
# is not part of that hold-out set
test = games.tail(280)
train = games.loc[~games.index.isin(test.index)]

# Columns that are not model inputs: the label plus 'date' and 'season'
non_features = ['target', 'date', 'season']

X_train, y_train = train.drop(columns=non_features), train['target'].astype('int')
X_test, y_test = test.drop(columns=non_features), test['target'].astype('int')

# Optional min-max scaling (currently disabled)
from sklearn.preprocessing import MinMaxScaler

#scaler = MinMaxScaler()
#X_train[X_train.columns] = scaler.fit_transform(X_train[X_train.columns])
#X_test[X_test.columns] = scaler.transform(X_test[X_test.columns])  # reuse the fit from the training data


In [None]:
# Parallel lists filled by model_builder: one name and one accuracy per fitted model
model_name, model_accuracy = [], []


def model_builder(model, name):
    """Fit `model` on the training split and record its test accuracy.

    `name` is appended to ``model_name`` and the accuracy on
    ``X_test``/``y_test`` to ``model_accuracy``. Nothing is returned.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    model_name.append(name)
    score = accuracy_score(y_test, y_pred)
    model_accuracy.append(score)

    # Optional diagnostics, disabled by default:
    #print("Confusion matrix:")
    #print(confusion_matrix(y_test, y_pred))
    #print("\n")
    #print("Classification report:")
    #print(classification_report(y_test, y_pred))
    #print("Accuracy score:")
    #print(score)

In [None]:
# Fit and score each candidate classifier in turn
candidates = [
    (LogisticRegression(), "Logistic Regression"),
    (GaussianNB(), "Gaussian NB"),
    (DecisionTreeClassifier(), "Decision Tree"),
]
for estimator, label in candidates:
    model_builder(estimator, label)

In [None]:
# Tabulate the recorded accuracies, one row per model
model_results = pd.DataFrame(dict(Model=model_name, Accuracy=model_accuracy))
model_results