In [None]:
import requests
import pandas as pd
import time

from datetime import date
from datetime import timedelta

pd.set_option("display.max_columns", None)

## Functions

In [None]:
def make_request(endpoint, params=None, record_path=None, verbose=False):
    """
    Send a GET request to the balldontlie v1 API and return the result as a DataFrame.

    When the response metadata reports more than one page, the remaining pages are
    fetched one by one and concatenated onto the first page.

    ---Params---
    endpoint: str, path appended to the API root (e.g. "games", "stats", "teams")
    params: dict, list of (key, value) tuples, or None. The object passed in is
            never modified.
    record_path: forwarded to pd.json_normalize
    verbose: bool, print "Success!" after a successful first request
    ---Returns---
    A pandas DataFrame on success. If any request comes back with a status code
    other than 200, the code is printed and the raw requests.Response is returned
    instead, so callers that need a DataFrame should check the return type.
    """
    root = "https://www.balldontlie.io/api/v1/"

    def params_for_page(page_num):
        # Build a fresh parameter object per page so the caller's dict/list is left
        # untouched and the "page" key is never duplicated.
        if params is None:
            return {"page": page_num}
        if isinstance(params, dict):
            return {**params, "page": page_num}
        return [pair for pair in params if pair[0] != "page"] + [("page", page_num)]

    response = requests.get(root + endpoint, params=params)
    if response.status_code != 200:
        print(response.status_code)
        return response
    if verbose:
        print("Success!")

    payload = response.json()  # parse the body once and reuse it
    pages = [pd.json_normalize(payload, record_path=record_path)]

    # If the request ends up being a multi page request, get all the pages
    # and then compile the results into one dataframe
    meta = payload.get("meta") if isinstance(payload, dict) else None
    n_pages = (meta or {}).get("total_pages") or 1

    for page_num in range(2, n_pages + 1):
        # Pause between requests so we stay under the API rate limit
        # (one request per second is at most 60 requests per minute)
        time.sleep(1)
        response = requests.get(root + endpoint, params=params_for_page(page_num))
        if response.status_code != 200:
            # Same contract as a failed first request
            print(response.status_code)
            return response
        pages.append(pd.json_normalize(response.json(), record_path=record_path))

    if len(pages) == 1:
        return pages[0]
    # pd.concat keeps each page's own row labels, as DataFrame.append used to
    return pd.concat(pages)



In [None]:
def get_recent_games(home_team_id, away_team_id):
    """
    Get the game ids of the 20 most recent home games of the home team and the
    20 most recent away games of the away team, looking back one year from today.
    ---Params---
    home_team_id: int
    away_team_id: int
    ---Returns---
     a tuple of 2 lists. ---> ([home team game ids], [away team game ids])
     Each list is ordered from most recent to oldest and may hold fewer than
     20 ids if the team played fewer such games in the window.
    """

    # Ensure that the ids are integers
    home_team_id = int(home_team_id)
    away_team_id = int(away_team_id)

    # Both teams use the same one-year window ending today.
    # isoformat() gives zero-padded yyyy-mm-dd strings.
    end_day = date.today()
    end_date = end_day.isoformat()
    start_date = (end_day - timedelta(days=365)).isoformat()

    def recent_ids(team_id, side_column):
        # side_column is "home_team.id" or "visitor_team.id": it keeps only the
        # games in which the team played on that side.
        games = make_request("games", record_path="data", params={"end_date": end_date,
                                                                  "start_date": start_date,
                                                                  "team_ids[]": [team_id],
                                                                  "page": 1,
                                                                  "per_page": "100"})
        games = games.sort_values("date", ascending=False)
        games = games[games[side_column].eq(team_id)]
        return list(games.head(20)["id"].values)

    game_ids_home = recent_ids(home_team_id, "home_team.id")
    game_ids_away = recent_ids(away_team_id, "visitor_team.id")

    return game_ids_home, game_ids_away




In [None]:
def _normalise_minutes(raw):
    """
    Bring one playing-time string into "m:ss" / "mm:ss" form.

    Examples: "27" -> "27:00", "27.0" -> "27:00", "27:0" -> "27:00", "8:60" -> "9:00".
    """
    text = raw.replace(".", ":")   # "27.0" -> "27:0"
    if ":" not in text:            # "27"   -> "27:00"
        text += ":00"
    parts = text.split(":")
    minutes, seconds = parts[0], parts[1]
    if len(seconds) == 1:          # "27:0" -> "27:00"
        seconds += "0"
    if seconds == "60":            # "8:60" -> "9:00"
        minutes, seconds = str(int(minutes) + 1), "00"
    return f"{minutes}:{seconds}"


def clean_stats(df):
    """
    Clean a player-stats DataFrame so that it can be aggregated.

    Steps: drop descriptive columns that are not needed, drop rows containing any
    null value, drop rows of players without playing time, and rewrite the "min"
    column into a consistent "m:ss" / "mm:ss" string format.

    ---Params---
    df: DataFrame that contains every column listed in the drop list below plus a
        "min" column. It is modified in place.
    ---Returns---
    The same DataFrame object, cleaned.
    ---Raises---
    KeyError if one of the columns in the drop list is missing.
    """
    # drop columns with superfluous information
    df.drop(["id", "game.period", "game.postseason", "game.status", "game.time", "player.height_feet", "player.height_inches",
            "player.weight_pounds", "team.abbreviation", "team.city", "team.conference", "team.division", "team.name",
            "player.first_name", "player.last_name", "player.position", "team.full_name", "player.team_id"],
          axis=1, inplace=True)

    # Some responses have a mysterious "player" column with all null values
    # It's important to remove this column if it exists, otherwise the dropna
    # below would drop every single row
    if "player" in df.columns:
        df.drop("player", axis=1, inplace=True)

    # drop rows with any null values
    # a null value generally indicates that the player did not play in that game
    df.dropna(axis=0, how="any", inplace=True)

    ### Dealing with time
    df["min"] = df["min"].astype(str)

    # Row labels must be unique for the label-based drop below
    df.reset_index(drop=True, inplace=True)

    # drop the row if the player didn't play in the game
    # (every value starting with "0" is treated as "did not play", e.g. "0:00" or "0")
    did_not_play = df["min"].eq("") | df["min"].str.startswith("0")
    df.drop(df[did_not_play].index, axis=0, inplace=True)

    # Assign the whole column at once instead of writing through df["min"].loc[...]:
    # that chained assignment raised SettingWithCopyWarning and is not guaranteed
    # to reach the DataFrame.
    df["min"] = df["min"].map(_normalise_minutes)

    return df




def aggregate_stats(df):
    """
    Aggregate per-player stat rows into one row per game.

    Counting stats and playing time are summed, the *_pct columns are averaged
    over the player rows, and the game/team/player descriptor columns take the
    first value found in each game.

    ---Params---
    df: DataFrame with one row per player per game, as returned by clean_stats().
        "min" must hold "m:ss" / "mm:ss" strings and "game.date" must be parseable
        by pd.to_datetime. The input frame is not modified.
    ---Returns---
    DataFrame indexed by "game.id" with the aggregated columns.
    """
    # Convert "mm:ss" strings to timedeltas so that playing time can be summed
    playing_time = [pd.Timedelta(minutes=int(parts[0]), seconds=int(parts[1]))
                    for parts in df["min"].str.split(":").values]

    # assign() returns a new frame, so the caller's data keeps its original
    # string columns and the function can safely be called again on the same input.
    prepared = df.assign(**{
        "game.date": pd.to_datetime(df["game.date"]).dt.tz_localize(None),  # drop timezone info
        "min": playing_time,
    })

    # The key order here defines the column order of the result
    agg_map = {"ast": "sum",
               "blk": "sum",
               "dreb": "sum",
               "fg3_pct": "mean",
               "fg3a": "sum",
               "fg3m": "sum",
               "fg_pct": "mean",
               "fga": "sum",
               "fgm": "sum",
               "ft_pct": "mean",
               "fta": "sum",
               "ftm": "sum",
               "min": "sum",
               "oreb": "sum",
               "pf": "sum",
               "pts": "sum",
               "reb": "sum",
               "stl": "sum",
               "turnover": "sum",
               "game.id": "first",
               "game.date": "first",
               "game.season": "first",
               "game.home_team_id": "first",
               "game.home_team_score": "first",
               "game.visitor_team_id": "first",
               "game.visitor_team_score": "first",
               "player.id": "first",
               "team.id": "first"}

    return prepared.groupby("game.id").agg(agg_map)




def get_stats(game_ids_home, game_ids_away):
    """
    This function makes a request to balldontlie API for stats from specific games.
    The arguments for this function should be:
    1. a list of the 20 most recent game ids for the home team
    2. a list of the 20 most recent game ids for the away team

    The order matters. Putting the away team as the first argument and home team as the
    second will produce inaccurate results.

    The function returns a Numpy array of shape (1, n_features) that the model is
    expecting as input. The features are, in order: the home team's average stats,
    the away team's average stats, and the home-minus-away difference of each stat.
    """

    stats_cols = ["ast","blk","dreb","fg3_pct","fg3a","fg3m","fg_pct","fga","fgm","ft_pct","fta","ftm","oreb",
              "pf","pts","reb","stl","turnover"]

    def format_params(game_ids):
        """
        Format query parameters in a format the balldontlie API accepts
        e.g. ?game_ids[]=345686&game_ids[]=234356&game_ids[]=3456356...
        """
        params = [("game_ids[]", game_id) for game_id in game_ids]
        params.append(("per_page", 100))
        return params

    def team_averages(game_ids, side_column):
        """
        Return a Series with the per-game team averages of stats_cols.
        side_column ("game.home_team_id" or "game.visitor_team_id") selects the
        players whose team played on that side of the game.
        """
        stats = make_request("stats", record_path="data", params=format_params(game_ids))
        stats = clean_stats(stats)                                # clean the data
        # copy() so later column writes never touch a slice of the cleaned frame
        stats = stats[stats["team.id"].eq(stats[side_column])].copy()
        stats = aggregate_stats(stats)                            # player stats -> team stats per game
        return stats[stats_cols].mean()                           # average over the games

    stats_home = team_averages(game_ids_home, "game.home_team_id")
    stats_away = team_averages(game_ids_away, "game.visitor_team_id")

    # Make a stats diff Series
    stats_diff = stats_home - stats_away

    # Rename columns and put it all together
    stats_home.index = "home_" + stats_home.index
    stats_away.index = "away_" + stats_away.index
    stats_diff.index = "diff_" + stats_diff.index

    # pd.concat replaces Series.append, which was removed in pandas 2.0
    stats = pd.concat([stats_home, stats_away, stats_diff])
    model_input = stats.values.reshape(1,-1)

    return model_input


In [None]:
def get_team_code_map(df=False):
    """
    Fetch the teams from the balldontlie API.

    ---Params---
    df: bool. If truthy, return the team table; otherwise return the lookup dict.
    ---Returns---
    df truthy -> DataFrame indexed by team id with the columns city, abbreviation,
                 full_name and name.
    df falsy  -> dict that maps each lower-cased city / abbreviation / full name /
                 name, and the team id written as a string, to the team id.
    """
    # Make balldontlie api request and keep only the naming columns
    wanted_columns = ["id", "city", "abbreviation", "full_name", "name"]
    team_code_df = make_request("teams", record_path="data")[wanted_columns].set_index("id")

    team_code_map = {}
    for team_id, team in team_code_df.iterrows():
        # every lower-cased spelling of the team points at its id
        for alias in team.str.lower().values:
            team_code_map[alias] = team_id
        # Make sure "1" maps to 1. i.e. string maps to integer. This is so people can
        # enter the team code in the text box for convenience.
        team_code_map[str(team_id)] = team_id

    return team_code_df if df else team_code_map

In [None]:
# Fetch the team table once and keep it at module level:
# getIDFromTeamName() below reads this `team_codes` frame as a global.
team_codes = get_team_code_map(df=True)
team_codes

Unnamed: 0_level_0,city,abbreviation,full_name,name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Atlanta,ATL,Atlanta Hawks,Hawks
2,Boston,BOS,Boston Celtics,Celtics
3,Brooklyn,BKN,Brooklyn Nets,Nets
4,Charlotte,CHA,Charlotte Hornets,Hornets
5,Chicago,CHI,Chicago Bulls,Bulls
6,Cleveland,CLE,Cleveland Cavaliers,Cavaliers
7,Dallas,DAL,Dallas Mavericks,Mavericks
8,Denver,DEN,Denver Nuggets,Nuggets
9,Detroit,DET,Detroit Pistons,Pistons
10,Golden State,GSW,Golden State Warriors,Warriors


In [None]:
def getIDFromTeamName(name, team_table=None):
    """
    Look up a team id by city, abbreviation, full name or short name.

    ---Params---
    name: str, e.g. "WAS", "Washington", "Washington Wizards" or "Wizards".
          An exact match is tried first; if there is none, the comparison is
          repeated ignoring case and surrounding whitespace.
    team_table: optional DataFrame indexed by team id with the columns city,
                abbreviation, full_name and name. Defaults to the module-level
                `team_codes` frame.
    ---Returns---
    The index label (team id) of the first matching row, or None if no team matches.
    """
    if team_table is None:
        team_table = team_codes

    fields = ("city", "abbreviation", "full_name", "name")

    # Pass 1: exact comparison, so every input that matched before still
    # resolves to the same team.
    for team_id, row in team_table.iterrows():
        if any(name == row[field] for field in fields):
            return team_id

    # Pass 2: forgiving comparison, in line with the lower-cased keys that
    # get_team_code_map() builds.
    if isinstance(name, str):
        wanted = name.strip().lower()
        for team_id, row in team_table.iterrows():
            if any(str(row[field]).lower() == wanted for field in fields):
                return team_id

    return None

# Quick check: resolve the abbreviation "WAS" to its team id.
getIDFromTeamName("WAS")
  

30

In [None]:
# Get a pandas dataframe of the 20 most recent home games for the following team
# NOTE: this code is not used. it's a template for what is now get_recent_games() in functions.py

# The request parameter and the "home_team.id" column both work with the integer
# team id, so the abbreviation is resolved to its id first.
home_team_id = getIDFromTeamName("WAS")

# isoformat() gives zero-padded yyyy-mm-dd strings
today = date.today().isoformat()                                  # today
one_year_ago = (date.today() - timedelta(days=365)).isoformat()   # last-year-today

res = make_request("games", record_path="data", params={"end_date": today,
                                                        "start_date": one_year_ago,
                                                        "team_ids[]": [home_team_id],
                                                        "page": "1",
                                                        "per_page": "100"})

res = res.sort_values("date", ascending=False)
res = res[res["home_team.id"].eq(home_team_id)]

# TODO: drop the row if "time" is not an empty string (meaning the game is still in progress)
recent_games = res.head(20)

In [None]:
# Display the home games selected in the previous cell (most recent first).
recent_games

Unnamed: 0,id,date,home_team_score,period,postseason,season,status,time,visitor_team_score,home_team.id,home_team.abbreviation,home_team.city,home_team.conference,home_team.division,home_team.full_name,home_team.name,visitor_team.id,visitor_team.abbreviation,visitor_team.city,visitor_team.conference,visitor_team.division,visitor_team.full_name,visitor_team.name
79,857644,2022-11-27T00:00:00.000Z,98,4,False,2022,Final,Final,106,1,ATL,Atlanta,East,Southeast,Atlanta Hawks,Hawks,16,MIA,Miami,East,Southeast,Miami Heat,Heat
81,857619,2022-11-23T00:00:00.000Z,115,4,False,2022,Final,Final,106,1,ATL,Atlanta,East,Southeast,Atlanta Hawks,Hawks,26,SAC,Sacramento,West,Pacific,Sacramento Kings,Kings
78,857589,2022-11-19T00:00:00.000Z,124,5,False,2022,Final,Final,122,1,ATL,Atlanta,East,Southeast,Atlanta Hawks,Hawks,28,TOR,Toronto,East,Atlantic,Toronto Raptors,Raptors
80,857570,2022-11-16T00:00:00.000Z,101,4,False,2022,Final,Final,126,1,ATL,Atlanta,East,Southeast,Atlanta Hawks,Hawks,2,BOS,Boston,East,Atlantic,Boston Celtics,Celtics
83,857528,2022-11-10T00:00:00.000Z,104,4,False,2022,Final,Final,95,1,ATL,Atlanta,East,Southeast,Atlanta Hawks,Hawks,23,PHI,Philadelphia,East,Atlantic,Philadelphia 76ers,76ers
82,857516,2022-11-09T00:00:00.000Z,119,4,False,2022,Final,Final,125,1,ATL,Atlanta,East,Southeast,Atlanta Hawks,Hawks,29,UTA,Utah,West,Northwest,Utah Jazz,Jazz
77,857503,2022-11-07T00:00:00.000Z,117,4,False,2022,Final,Final,98,1,ATL,Atlanta,East,Southeast,Atlanta Hawks,Hawks,17,MIL,Milwaukee,East,Central,Milwaukee Bucks,Bucks
73,857492,2022-11-05T00:00:00.000Z,124,5,False,2022,Final,Final,121,1,ATL,Atlanta,East,Southeast,Atlanta Hawks,Hawks,19,NOP,New Orleans,West,Southwest,New Orleans Pelicans,Pelicans
69,857392,2022-10-23T00:00:00.000Z,109,4,False,2022,Final,Final,126,1,ATL,Atlanta,East,Southeast,Atlanta Hawks,Hawks,4,CHA,Charlotte,East,Southeast,Charlotte Hornets,Hornets
68,857380,2022-10-21T00:00:00.000Z,108,4,False,2022,Final,Final,98,1,ATL,Atlanta,East,Southeast,Atlanta Hawks,Hawks,22,ORL,Orlando,East,Southeast,Orlando Magic,Magic


## Testing for BOS (home) and GSW (away)

In [None]:
# First argument is the home team (BOS), second argument is the away team (GSW).
game_ids_home, game_ids_away = get_recent_games(getIDFromTeamName("BOS"),getIDFromTeamName("GSW"))

In [None]:
# Inspect the away team's game ids (ordered from most recent to oldest).
game_ids_away

[473543,
 473465,
 473460,
 470200,
 461294,
 451511,
 449206,
 448625,
 432960,
 426790,
 424777,
 264797,
 264786,
 264728,
 264715,
 264646,
 264631,
 264614,
 264606,
 264528]

## This part takes about 15-20 seconds on the web page while it fetches the stats of the last 20 games

In [None]:
# Argument order matters: home team ids first, away team ids second (see the get_stats docstring).
stats = get_stats(game_ids_home, game_ids_away)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [None]:
# Model input: a single row holding the home_*, away_* and diff_* team averages, in that order.
stats

array([[ 2.67368421e+01,  7.47368421e+00,  3.32105263e+01,
         1.35758839e+01,  3.15263158e+01,  1.20526316e+01,
         2.37136748e+01,  8.23684211e+01,  4.17894737e+01,
         2.10509072e+01,  2.23684211e+01,  1.77894737e+01,
         8.84210526e+00,  2.07894737e+01,  1.13421053e+02,
         4.20526316e+01,  7.21052632e+00,  1.41052632e+01,
         2.38500000e+01,  4.50000000e+00,  3.16000000e+01,
         2.26033961e+01,  3.42000000e+01,  1.11500000e+01,
         4.07361038e+01,  8.77000000e+01,  3.82500000e+01,
         4.53272622e+01,  2.09500000e+01,  1.59500000e+01,
         1.04500000e+01,  1.76000000e+01,  1.03600000e+02,
         4.20500000e+01,  8.75000000e+00,  1.48000000e+01,
         2.88684211e+00,  2.97368421e+00,  1.61052632e+00,
        -9.02751217e+00, -2.67368421e+00,  9.02631579e-01,
        -1.70224290e+01, -5.33157895e+00,  3.53947368e+00,
        -2.42763550e+01,  1.41842105e+00,  1.83947368e+00,
        -1.60789474e+00,  3.18947368e+00,  9.82105263e+0

## Testing the Model

In [None]:
import pickle

In [None]:
# NOTE: pickle.load can execute arbitrary code, so only load model files from a trusted source.
# NOTE(review): the path looks like a Colab Google Drive mount and will not exist elsewhere.
# The `with` block closes the file handle as soon as the model is loaded.
with open("/content/drive/MyDrive/Machine Learning NBA /Models/model.sav", "rb") as model_file:
    model = pickle.load(model_file)

In [None]:
# Predict the outcome class for the single feature row built by get_stats().
# NOTE(review): `stats` is a bare NumPy array without column names, which is presumably
# why the output shows an "X does not have valid feature names" warning. Confirm against
# how the model was fitted.
prediction = model.predict(stats)
prediction

  "X does not have valid feature names, but"


array([1])

In [None]:
# probability that away team will win
# NOTE(review): assumes column 0 of predict_proba is the "away team wins" class;
# confirm against model.classes_
model.predict_proba(stats)[0][0]

  "X does not have valid feature names, but"


0.32425016

In [None]:
# probability that home team will win
# NOTE(review): assumes column 1 of predict_proba is the "home team wins" class;
# confirm against model.classes_
model.predict_proba(stats)[0][1]

  "X does not have valid feature names, but"


0.67574984