<a href="https://colab.research.google.com/github/Chandramani05/NBA-Games-Data-Analysis-and-Match-Prediction/blob/main/Predicting_NBA_Results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
import requests
import pandas as pd
import time

from datetime import date
from datetime import timedelta

pd.set_option("display.max_columns", None)

##Functions

In [30]:
def make_request(endpoint, params=None, record_path=None, verbose=False):
    root = "https://www.balldontlie.io/api/v1/"
    response = requests.get(root + endpoint, params=params)
    print(response)
    if response.status_code != 200:
        print(response.status_code)
        return response
    if verbose: 
        print("Success!")  
    df = pd.json_normalize(response.json(), record_path=record_path)
   
    # If the request ends up being a multi page request, get all the pages
    # and then complile the results into one dataframe
    n_pages = response.json()["meta"]["total_pages"] 
    if n_pages > 1:
        for page_num in range(2, n_pages + 1):
            # Make sure not to exceed the 60 request per second limit
            time.sleep(1)
        # The code is slightly different depending on whether the query paramerters were passed
        # as a dictionary or as a list of tuples
            if isinstance(params, dict):
                params.update({"page": page_num})
                response = requests.get(root + endpoint, params=params)
                page_n = pd.json_normalize(response.json(), record_path=record_path)
                df = df.append(page_n)
            if isinstance(params, list):
                params.append(("page", page_num))
                response = requests.get(root + endpoint, params=params)
                page_n = pd.json_normalize(response.json(), record_path=record_path)
                df = df.append(page_n)
                params.pop()
            
    return df



In [31]:
def get_recent_games(home_team_id, away_team_id):
    """
    Get a list game ids for the 20 most recent games played for each team specified.
    ---Params---
    home_team_id: int
    away_team_id: int
    ---Returns---
     a tuple of 2 lists. ---> ([home team game ids], [away team game ids])
    """

    # Ensure that the ids are integers
    home_team_id = int(home_team_id)
    away_team_id = int(away_team_id)

    # Get todays date
    today = date.today()                                                           # Get today
    today = f"{today.year}-{today.month}-{today.day}"                              # Convert to format yyyy-mm-dd
    one_year_ago = date.today() - timedelta(days=365)                              # Get last-year-today
    one_year_ago = f"{one_year_ago.year}-{one_year_ago.month}-{one_year_ago.day}"  # convert to format yyyy-mm-dd

    # get home team recent games
    recent_games_home = pd.DataFrame()
    res = make_request("games", record_path="data", params={"end_date": today,
                                                            "start_date": one_year_ago,
                                                            "team_ids[]": [home_team_id],
                                                            "page": 1,
                                                            "per_page": "100"})
    res = res.sort_values("date", ascending=False)
    res = res[res["home_team.id"].eq(home_team_id)]

    recent_games_home = recent_games_home.append(res)
    recent_games_home = recent_games_home.head(20)
    game_ids_home = list(recent_games_home["id"].values)

    # get away team recent games
    recent_games_away = pd.DataFrame()
    res = make_request("games", record_path="data", params={"end_date": "2021-11-09",
                                                            "start_date": "2020-11-09",
                                                            "team_ids[]": [away_team_id],
                                                            "page": 1,
                                                            "per_page": "100"})

    res = res.sort_values("date", ascending=False)
    res = res[res["visitor_team.id"].eq(away_team_id)]

    recent_games_away = recent_games_away.append(res)
    recent_games_away = recent_games_away.head(20)
    game_ids_away = list(recent_games_away["id"].values)


    return recent_games_home, recent_games_away




In [32]:
def clean_stats(df):
    # drop columns with superfluous information
    df.drop(["id", "game.period", "game.postseason", "game.status", "game.time", "player.height_feet", "player.height_inches",
            "player.weight_pounds", "team.abbreviation", "team.city", "team.conference", "team.division", "team.name",
            "player.first_name", "player.last_name", "player.position", "team.full_name", "player.team_id"],
          axis=1, inplace=True)
    
    # Some responses have a mysterious "player" column with all null values
    # It's important to remove this column if it exists, otherwise the next block
    # of code will drop every single row and will produce errors
    try: 
        df.drop("player", axis=1, inplace=True)
    except KeyError:
        pass
    
    # drop rows with any null values
    # a null value generally indicates that the player did not play in that game
    df.dropna(axis=0, how="any", inplace=True)
    
    
    ### Dealing with time
    # clean time column to get a consitent format. ("mm:ss" or "m:ss")
    
    df["min"] = df["min"].astype(str)

    # drop the row if the player didn't play in the game
    df.reset_index(drop=True, inplace=True)  # The next line of code depends on unique indices!!!!
    played_0min = df[df["min"].eq("0:00") | df["min"].eq("") | df["min"].str.startswith("0")].index
    df.drop(played_0min, axis=0, inplace=True)

    # Convert times like "27.0" to "27:0"
    df["min"] = df["min"].str.replace(".",":", regex=False)

    # convert times like "27" to "27:00"
    minutes_only_times = df["min"][~df["min"].str.contains(":")].index
    df["min"].loc[minutes_only_times] += ":00"


    minutes = [time[0] for time in df["min"].str.split(":").values]
    seconds = [time[1] for time in df["min"].str.split(":").values]

    # convert times like "27:0" to "27:00"
    for i, second in enumerate(seconds):
        if len(second) == 1:
            seconds[i] = second + "0"

    # convert times like "8:60" to "9:00"
    for i, second in enumerate(seconds):        
        if second == "60":
            seconds[i] = "00"
            minutes[i] = str(int(minutes[i]) + 1)  # increment minutes by 1

    times = [":".join(list(item)) for item in list(zip(minutes,seconds))]

    df["min"] = times

    return df




def aggregate_stats(df):
    # Convert game date to datetime
    df["game.date"] = pd.to_datetime(df["game.date"]).dt.tz_localize(None)

    # Convert string to timedelta
    df["min"] = [pd.Timedelta(minutes=int(time[0]), seconds=int(time[1])) for time in df["min"].str.split(":").values]

    agg_map = {"ast": "sum", 
           "blk": "sum", 
           "dreb": "sum", 
           "fg3_pct": "mean", 
           "fg3a": "sum", 
           "fg3m": "sum", 
           "fg_pct": "mean",
          "fga": "sum",
          "fgm": "sum",
          "ft_pct": "mean",
          "fta": "sum",
          "ftm": "sum",
          "min": "sum",
          "oreb": "sum",
          "pf": "sum",
          "pts": "sum",
          "reb": "sum",
          "stl": "sum",
          "turnover": "sum",
          "game.id": "first",
          "game.date": "first",
          "game.season": "first",
          "game.home_team_id": "first",
          "game.home_team_score": "first",
          "game.visitor_team_id": "first",
          "game.visitor_team_score": "first",
          "player.id": "first",
          "team.id": "first",}

    df = df.groupby("game.id").agg(agg_map)

    return df




def get_stats(game_ids_home, game_ids_away):
    """
    This function makes a request to balldontlie API for stats from specific games.
    The arguments for this function should be:
    1. a list of the 20 most recent game ids for the home team
    2. a list of the 20 most recent game ids for the away team
    
    The order matters. Putting the away team as the first argument and home team as the
    second will produce inaccurate results.
    
    The function returns a Numpy array that the model is expecting as input.
    """
    
    
    def format_params(game_ids):
        """ 
        Format query paramaters in a format the balldontlie API accepts
        e.g. ?game_ids[]=345686&game_ids[]=234356&gameids[]=3456356...
        """
        params = "game_ids[] " * len(game_ids)
        params = list(zip(params.split(" "), game_ids))
        params.append(("per_page", 100))
        return params
    
    stats_cols = ["ast","blk","dreb","fg3_pct","fg3a","fg3m","fg_pct","fga","fgm","ft_pct","fta","ftm","oreb",
              "pf","pts","reb","stl","turnover"]
    
    # Get pandas Series of home team stats
    params_home = format_params(game_ids_home)                                 # Get param list
    stats_home = make_request("stats", record_path="data", params=params_home) # Make request with said param list
    stats_home = clean_stats(stats_home)                                       # clean the data
    stats_home = stats_home[stats_home["team.id"].eq(stats_home["game.home_team_id"])]  # Filter for stats of players that played for the home team
    stats_home = aggregate_stats(stats_home)                                   # aggregate individual player stats into team stats
    stats_home = stats_home[stats_cols]                                        # Drop the columns that aren't basketball stats
    stats_home = stats_home.mean()                                             # average the stats
    
    # Get pandas Series of away team stats
    params_away = format_params(game_ids_away)
    stats_away = make_request("stats", record_path="data", params=params_away)
    stats_away = clean_stats(stats_away)
    stats_away = stats_away[stats_away["team.id"].eq(stats_away["game.visitor_team_id"])]
    stats_away = aggregate_stats(stats_away)
    stats_away = stats_away[stats_cols]
    stats_away = stats_away.mean()
    
    # Make a stats diff Series
    stats_diff = stats_home - stats_away
    
    # Rename columns and put it all together
    stats_home.index = "home_" + stats_home.index
    stats_away.index = "away_" + stats_away.index
    stats_diff.index = "diff_" + stats_diff.index
    
    stats = stats_home.append([stats_away, stats_diff])
    model_input = stats.values.reshape(1,-1)
    
    return model_input


In [33]:
def get_team_code_map(df=False):
    # Make balldontlie api request and convert the json response to pandas dataframe
    team_code_df = make_request("teams", record_path="data")
    team_code_df = team_code_df[["id", "city", "abbreviation", "full_name", "name"]]
    team_code_df = team_code_df.set_index("id")
    # using said dataframe, map team names to team id
    team_code_map = {}
    for row in team_code_df.iterrows():
        team_code_map.update(dict.fromkeys(row[1].str.lower().values, row[0]))
        # Make sure "1" maps to 1. i.e. string maps to integer. This is so people can enter the team code
        # in the text box for convenience and everything still works fine.
        team_code_map.update({str(row[0]): row[0]})                   
    if df:
        return team_code_df
    else:
        return team_code_map  # returning a dictionary

In [34]:
team_codes = get_team_code_map(df=True)
team_codes

<Response [200]>


Unnamed: 0_level_0,city,abbreviation,full_name,name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Atlanta,ATL,Atlanta Hawks,Hawks
2,Boston,BOS,Boston Celtics,Celtics
3,Brooklyn,BKN,Brooklyn Nets,Nets
4,Charlotte,CHA,Charlotte Hornets,Hornets
5,Chicago,CHI,Chicago Bulls,Bulls
6,Cleveland,CLE,Cleveland Cavaliers,Cavaliers
7,Dallas,DAL,Dallas Mavericks,Mavericks
8,Denver,DEN,Denver Nuggets,Nuggets
9,Detroit,DET,Detroit Pistons,Pistons
10,Golden State,GSW,Golden State Warriors,Warriors


In [35]:
def getIDFromTeamName(name) :
  for index, row in team_codes.iterrows():
   city = row['city']
   abbreviation = row['abbreviation']
   fullnames = row['full_name']
   part_name = row['name']
   if name == city or name == abbreviation or name == fullnames or name == part_name :
     return index

getIDFromTeamName("WAS")
  

30




# This is formatted as code`


##Testing for WAS and ATL

In [36]:
recent_game_home, recent_game_away = get_recent_games(getIDFromTeamName("DAL"),getIDFromTeamName("NYK"))

<Response [200]>
<Response [200]>


In [37]:
recent_game_home

Unnamed: 0,id,date,home_team_score,period,postseason,season,status,time,visitor_team_score,home_team.id,home_team.abbreviation,home_team.city,home_team.conference,home_team.division,home_team.full_name,home_team.name,visitor_team.id,visitor_team.abbreviation,visitor_team.city,visitor_team.conference,visitor_team.division,visitor_team.full_name,visitor_team.name
84,857664,2022-11-29T00:00:00.000Z,116,4,False,2022,Final,Final,113,7,DAL,Dallas,West,Southwest,Dallas Mavericks,Mavericks,10,GSW,Golden State,West,Pacific,Golden State Warriors,Warriors
83,857596,2022-11-20T00:00:00.000Z,97,4,False,2022,Final,Final,98,7,DAL,Dallas,West,Southwest,Dallas Mavericks,Mavericks,8,DEN,Denver,West,Northwest,Denver Nuggets,Nuggets
91,857582,2022-11-18T00:00:00.000Z,127,4,False,2022,Final,Final,99,7,DAL,Dallas,West,Southwest,Dallas Mavericks,Mavericks,8,DEN,Denver,West,Northwest,Denver Nuggets,Nuggets
86,857573,2022-11-16T00:00:00.000Z,92,4,False,2022,Final,Final,101,7,DAL,Dallas,West,Southwest,Dallas Mavericks,Mavericks,11,HOU,Houston,West,Southwest,Houston Rockets,Rockets
99,857564,2022-11-15T00:00:00.000Z,103,4,False,2022,Final,Final,101,7,DAL,Dallas,West,Southwest,Dallas Mavericks,Mavericks,13,LAC,LA,West,Pacific,LA Clippers,Clippers
93,857545,2022-11-12T00:00:00.000Z,117,4,False,2022,Final,Final,112,7,DAL,Dallas,West,Southwest,Dallas Mavericks,Mavericks,25,POR,Portland,West,Northwest,Portland Trail Blazers,Trail Blazers
89,857509,2022-11-07T00:00:00.000Z,96,4,False,2022,Final,Final,94,7,DAL,Dallas,West,Southwest,Dallas Mavericks,Mavericks,3,BKN,Brooklyn,East,Atlantic,Brooklyn Nets,Nets
81,857477,2022-11-04T00:00:00.000Z,111,4,False,2022,Final,Final,110,7,DAL,Dallas,West,Southwest,Dallas Mavericks,Mavericks,28,TOR,Toronto,East,Atlantic,Toronto Raptors,Raptors
92,857466,2022-11-02T00:00:00.000Z,103,4,False,2022,Final,Final,100,7,DAL,Dallas,West,Southwest,Dallas Mavericks,Mavericks,29,UTA,Utah,West,Northwest,Utah Jazz,Jazz
85,857446,2022-10-30T00:00:00.000Z,114,4,False,2022,Final,Final,105,7,DAL,Dallas,West,Southwest,Dallas Mavericks,Mavericks,22,ORL,Orlando,East,Southeast,Orlando Magic,Magic


In [38]:
game_ids_home = list(recent_game_home["id"].values)

In [39]:
recent_game_away

Unnamed: 0,id,date,home_team_score,period,postseason,season,status,time,visitor_team_score,home_team.id,home_team.abbreviation,home_team.city,home_team.conference,home_team.division,home_team.full_name,home_team.name,visitor_team.id,visitor_team.abbreviation,visitor_team.city,visitor_team.conference,visitor_team.division,visitor_team.full_name,visitor_team.name
46,473560,2021-11-08T00:00:00.000Z,96,4,False,2021,Final,,103,23,PHI,Philadelphia,East,Atlantic,Philadelphia 76ers,76ers,20,NYK,New York,East,Atlantic,New York Knicks,Knicks
36,473532,2021-11-05T00:00:00.000Z,98,4,False,2021,Final,,113,17,MIL,Milwaukee,East,Central,Milwaukee Bucks,Bucks,20,NYK,New York,East,Atlantic,New York Knicks,Knicks
45,473517,2021-11-03T00:00:00.000Z,111,4,False,2021,Final,,98,12,IND,Indiana,East,Central,Indiana Pacers,Pacers,20,NYK,New York,East,Atlantic,New York Knicks,Knicks
42,473491,2021-10-30T00:00:00.000Z,117,4,False,2021,Final,,123,19,NOP,New Orleans,West,Southwest,New Orleans Pelicans,Pelicans,20,NYK,New York,East,Atlantic,New York Knicks,Knicks
40,473474,2021-10-28T00:00:00.000Z,103,4,False,2021,Final,,104,5,CHI,Chicago,East,Central,Chicago Bulls,Bulls,20,NYK,New York,East,Atlantic,New York Knicks,Knicks
38,473431,2021-10-22T00:00:00.000Z,96,4,False,2021,Final,,121,22,ORL,Orlando,East,Southeast,Orlando Magic,Magic,20,NYK,New York,East,Atlantic,New York Knicks,Knicks
44,461298,2021-08-13T00:00:00.000Z,93,4,False,2021,Final,,87,9,DET,Detroit,East,Central,Detroit Pistons,Pistons,20,NYK,New York,East,Atlantic,New York Knicks,Knicks
28,457265,2021-08-11T00:00:00.000Z,82,4,False,2021,Final,,91,14,LAL,Los Angeles,West,Pacific,Los Angeles Lakers,Lakers,20,NYK,New York,East,Atlantic,New York Knicks,Knicks
51,430992,2021-05-30T00:00:00.000Z,113,4,True,2020,Final,,96,1,ATL,Atlanta,East,Southeast,Atlanta Hawks,Hawks,20,NYK,New York,East,Atlantic,New York Knicks,Knicks
48,429183,2021-05-28T00:00:00.000Z,105,4,True,2020,Final,,94,1,ATL,Atlanta,East,Southeast,Atlanta Hawks,Hawks,20,NYK,New York,East,Atlantic,New York Knicks,Knicks


In [40]:
game_ids_away = list(recent_game_away["id"].values)

##This part will take 15-20 sec on the web page getting stats of last 20 games

In [41]:
stats = get_stats(game_ids_home, game_ids_away)

<Response [200]>


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


<Response [200]>


In [42]:
stats

array([[ 20.95      ,   3.05      ,  30.35      ,  10.18809864,
         39.75      ,  13.9       ,  14.68440085,  81.75      ,
         37.15      ,  15.14377004,  24.7       ,  18.85      ,
          7.95      ,  20.1       , 107.05      ,  38.3       ,
          7.85      ,  10.15      ,  21.35      ,   4.6       ,
         34.35      ,  27.19624106,  32.95      ,  13.15      ,
         43.59898035,  86.15      ,  38.4       ,  46.45060256,
         20.3       ,  16.1       ,   8.9       ,  20.95      ,
        106.05      ,  43.25      ,   6.75      ,  11.15      ,
         -0.4       ,  -1.55      ,  -4.        , -17.00814243,
          6.8       ,   0.75      , -28.9145795 ,  -4.4       ,
         -1.25      , -31.30683252,   4.4       ,   2.75      ,
         -0.95      ,  -0.85      ,   1.        ,  -4.95      ,
          1.1       ,  -1.        ]])

##Testing the Model

In [43]:
import pickle

In [44]:
model = pickle.load(open("/content/drive/MyDrive/Machine Learning NBA /NBA-Games-Data-Analysis-and-Match-Prediction/Models/model.sav", "rb"))

In [45]:
prediction = model.predict(stats)
prediction



array([1])

In [46]:
# probability that away team will win
model.predict_proba(stats)[0][0]



0.27512932

In [47]:
# probability that home team will win
model.predict_proba(stats)[0][1]



0.7248707