# In [None]:
# Import Libraries
import pandas as pd

# Load the scraped NBA games; the first CSV column is used as the index.
df = pd.read_csv("nba_games.csv", index_col=0)

# Order the games by date, then renumber the rows so the earliest game
# has the lowest index.
df = df.sort_values("date").reset_index(drop=True)

# Remove the repeated columns in a single call.
df = df.drop(columns=["mp.1", "mp_opp.1", "index_opp"])

# Function To Add A Target
def add_target(team):
    """Attach the result of each game's following game as a ``target`` column.

    Args:
        team: DataFrame with one row per game and a ``won`` column. Rows are
            assumed to already be in chronological order, because the shift
            is purely positional.

    Returns:
        A new DataFrame with the same rows as ``team`` plus a ``target``
        column holding the ``won`` value of the next row. The last row has
        no following game, so its ``target`` is null.
    """
    # assign() builds a new frame instead of writing into the object passed
    # in; mutating the group handed over by groupby.apply is not supported
    # by pandas and can give surprising results.
    return team.assign(target=team["won"].shift(-1))

# Build the target per team: the frame is split by team, add_target runs on
# each piece, and the pieces are combined again without adding group keys.
df = df.groupby("team", group_keys=False).apply(add_target)

# Spot check on the Washington Wizards: target is True when Washington wins
# its next game and False when it loses it.
df.loc[df["team"] == "WAS"]

# Rows with a null target are marked with the placeholder value 2.
df.loc[df["target"].isna(), "target"] = 2

# Turn False into 0 and True into 1 (a failed conversion is ignored).
df["target"] = df["target"].astype(int, errors="ignore")

# Show the whole frame, every team included.
df

# Wins and losses should be balanced.
df["won"].value_counts()

# Target counts will not be balanced, because of the 2 that stands for N/A
# when the next game occurs in the next season.
df["target"].value_counts()

# Drop the inconsistent rows: one game has a null ft% and 15 games have a
# null +/-_max. A row is removed if any of the four columns is null.
df = df.dropna(subset=["ft%", "ft%_opp", "+/-_max", "+/-_max_opp"])

# Count the nulls remaining in every column.
nulls = df.isna().sum()

# Keep only the columns that still contain at least one null.
nulls = nulls[nulls > 0]

# Show the columns with null values.
nulls

# Valid columns are the ones that do not appear in the null report.
valid_columns = df.columns[~df.columns.isin(nulls.index)]

# Show the valid columns.
valid_columns

# Continue with an independent copy limited to the valid columns.
df = df.loc[:, valid_columns].copy()

df

# Machine learning pieces from scikit-learn:
# - TimeSeriesSplit: cross-validation splitter, used so past data predicts
#   future data and never the other way around.
# - SequentialFeatureSelector: picks which columns of the dataset are used
#   to make predictions.
# - RidgeClassifier: the model that predicts the winner of a game.
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier

# Model and cross-validation splitter shared by the rest of the script.
rr = RidgeClassifier(alpha=1)
split = TimeSeriesSplit(n_splits=3)

# Forward selection: start from zero features, repeatedly add the one that
# improves the cross-validated score the most, and stop at 30 features.
# The time-series splitter is used for the cross-validation.
sfs = SequentialFeatureSelector(rr, n_features_to_select=30, direction="forward", cv=split) 

# Columns that are neither scaled nor offered to the selector
# (identifiers, dates and the outcome columns).
removed_columns = ["season", "date", "won", "target", "team", "team_opp"]

# Every other column is scaled and considered as a candidate feature.
selected_columns = df.columns[~df.columns.isin(removed_columns)]

# Scaling utility from scikit-learn.
from sklearn.preprocessing import MinMaxScaler

# Rescale the selected columns to values between 0 and 1.
# NOTE(review): the scaler is fitted on every row, including seasons that
# backtest() later uses as test data -- confirm this is acceptable.
scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])

df

# Fit the feature selector against the target.
# NOTE(review): rows whose target is the placeholder 2 are still present
# here -- confirm they are meant to take part in feature selection.
sfs.fit(df[selected_columns], df["target"])

# Names of the columns the selector kept (30, per n_features_to_select).
predictors = list(selected_columns[sfs.get_support()])

predictors

# Function To Make Predictions For NBA Games 
# Start = 2 (Gets 2 Seasons Of Data First Before Making Predictions Meaning 2018 Season Is The First Season Predictions Are Made For)
# Step = 1 (Makes A Prediction For One Season At A Time) 
def backtest(data, model, predictors, start=2, step=1):
    """Walk forward through the seasons, predicting each from earlier ones.

    For every tested season the model is refit on all rows from strictly
    earlier seasons and then asked to predict the rows of that season.

    Args:
        data: DataFrame with a ``season`` column, a ``target`` column and
            every column named in ``predictors``.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        predictors: Column names used as model inputs.
        start: Position (in the sorted list of distinct seasons) of the
            first season to predict; seasons before it are training-only.
        step: Stride between tested season positions.

    Returns:
        DataFrame with columns ``actual`` (the true target) and
        ``predictions``, indexed like the tested rows of ``data``. When no
        season sits at or after ``start`` the result is an empty DataFrame
        with those two columns instead of an error.
    """
    all_predictions = []  # One actual/prediction frame per tested season
    seasons = sorted(data["season"].unique())

    for i in range(start, len(seasons), step):
        season = seasons[i]
        train = data[data["season"] < season]  # Everything before this season
        test = data[data["season"] == season]  # The season being predicted

        # NOTE(review): no target values are filtered out before fitting, so
        # any placeholder target the caller stored is trained on as a class
        # of its own -- confirm that is intended.
        model.fit(train[predictors], train["target"])

        # Re-index the raw predictions so they line up with the test rows.
        preds = pd.Series(model.predict(test[predictors]), index=test.index)

        combined = pd.concat([test["target"], preds], axis=1)
        combined.columns = ["actual", "predictions"]
        all_predictions.append(combined)

    # pd.concat raises on an empty list; return an empty, correctly shaped
    # frame when there were too few seasons to test anything.
    if not all_predictions:
        return pd.DataFrame(columns=["actual", "predictions"])

    return pd.concat(all_predictions)

# Predict game outcomes with the walk-forward backtest.
predictions = backtest(df, rr, predictors)

predictions

# Accuracy metric from scikit-learn.
from sklearn.metrics import accuracy_score

# Leave out the games whose target is 2, since 2 will not be predicted.
predictions = predictions.loc[predictions["actual"] != 2]

# Share of games where the prediction matches the actual result.
accuracy_score(predictions["actual"], predictions["predictions"])

# Winning percentage for teams at home versus teams away.
df.groupby("home")["won"].mean()

# Subset of the frame used for rolling averages: the previously selected
# columns plus the won, team and season columns.
df_rolling = df[[*selected_columns, "won", "team", "season"]]

df_rolling

# Function To Find The Average Of A Teams Stat Over The Previous 3, 5, 10, 15, And 20 Games
def find_team_averages(team, games_back=(3, 5, 10, 15, 20)):
    """Compute rolling means of a team's numeric stats over several windows.

    Args:
        team: DataFrame of one team's games, one row per game.
        games_back: Window sizes, in games. The default is a tuple so the
            default value cannot be modified between calls.

    Returns:
        DataFrame indexed like ``team`` with one block of columns per
        window size. Each column is the rolling mean of a numeric column of
        ``team``, named ``<column>_<g>g``. Rows with fewer than ``g`` games
        available so far are NaN for that window.
    """
    windows = [
        # Non-numeric columns are skipped; the suffix records the window size.
        team.rolling(g).mean(numeric_only=True).add_suffix(f"_{g}g")
        for g in games_back
    ]

    # Place the per-window blocks side by side.
    return pd.concat(windows, axis=1)

# Compute the rolling averages separately for each team within each season.
df_rolling = df_rolling.groupby(["team", "season"], group_keys=False).apply(find_team_averages)

df_rolling

# Keep the rolling column names as a plain list of strings and write them
# back as the column labels.
rolling_cols = [str(col) for col in df_rolling.columns]
df_rolling.columns = rolling_cols

# Put the rolling columns next to the original columns.
df = pd.concat((df, df_rolling), axis="columns")

df

# Drop every row that contains a NaN value.
df = df.dropna(axis=0, how="any")

df

# Function To Shift The Dataframes Columns To Give The Algorithm More Information
def shift_col(team, col_name):
    """Return ``team[col_name]`` moved up one row.

    Each row of the result holds the value from the following row of
    ``team``; the last row has nothing after it and becomes null.
    """
    return team[col_name].shift(-1)

# Function To Add Another Column Based On The shift_col Function
def add_col(df, col_name):
    """Return, for every row, the value ``col_name`` takes in that team's next row.

    Args:
        df: DataFrame with a ``team`` column and a ``col_name`` column.
        col_name: Name of the column to look one row ahead in.

    Returns:
        Series indexed like ``df``. Each entry is the ``col_name`` value of
        the next row belonging to the same team; the last row of each team
        is null.
    """
    # The built-in grouped shift replaces a Python-level apply per team: it
    # is vectorised, always returns a Series aligned with df's index, and
    # does not pass the grouping column through a user function.
    return df.groupby("team")[col_name].shift(-1)

# Work on an independent copy to avoid SettingWithCopyWarning.
df = df.copy()

# Everything added here is known in advance (who is at home, who the next
# opponent is, when the team plays next), so the algorithm only receives
# information that would be available before the game occurs.
# For each source column, store the team's next-game value in "<column>_next".
for known_ahead in ("home", "team_opp", "date"):
    df.loc[:, f"{known_ahead}_next"] = add_col(df, known_ahead)

df

# Bring in the opponent's rolling information: each row is matched with the
# row where the upcoming opponent lists this team as its own next opponent
# on the same date. Where both sides share a column name, the left side
# gets _x (the original team) and the right side gets _y (the opponent).
full = pd.merge(
    df,
    df[[*rolling_cols, "team_opp_next", "date_next", "team"]],
    left_on=["team", "date_next"],
    right_on=["team_opp_next", "date_next"],
)

full

full[["team_x", "team_opp_next_x", "team_y", "team_opp_next_y", "date_next"]]

# Object-typed columns are not numeric and cannot be processed, so they go
# in front of the previously removed columns.
removed_columns = [*full.columns[full.dtypes == "object"], *removed_columns]

removed_columns

# Candidate features are all columns that are not in the removed list.
selected_columns = full.columns[~full.columns.isin(removed_columns)]

# Fit the feature selector against the target on the merged frame.
sfs.fit(full[selected_columns], full["target"])

# Names of the columns the selector kept.
predictors = selected_columns[sfs.get_support()].tolist()

predictors

# Predict game outcomes again, now with the merged frame.
predictions = backtest(full, rr, predictors)

# Leave out the games whose target is 2, since 2 will not be predicted.
predictions = predictions.loc[predictions["actual"] != 2]

# Share of games where the prediction matches the actual result.
accuracy_score(predictions["actual"], predictions["predictions"])