In [4]:
import sqlite3
import pandas as pd
import time
import requests

conexion = sqlite3.connect("NBA_DATA.db")

cursor = conexion.cursor()

# Crear la tabla Teams
cursor.execute("""
create table If Not Exists Teams (
  team_id int primary key,
  team_nickname text not null,
  abbreviation text not null,
  city text not null,
  state text not null,
  full_name text not null
)
""")

# Crear la tabla Games
cursor.execute("""
create table If Not Exists Games (
  game_id bigint primary key,
  game_date date not null,
  h_team_nickname text not null,
  a_team_nickname text not null,
  h_team_id int references teams (team_id),
  a_team_id int references teams (team_id),
  season int not null
)
""")

# Crear la tabla TEAM_INFO_COMMON
cursor.execute("""
create table If Not Exists game_stats (
  game_id bigint references games (game_id),
  h_flag boolean not null,
  city text,
  nickname text,
  team_id int,
  w int,
  l int,
  w_home int,
  l_home int,
  w_road int,
  l_road int,
  team_turnovers int,
  team_rebounds int,
  gp int,
  gs int,
  actual_minutes int,
  actual_seconds int,
  fg int,
  fga int,
  fg_pct double precision,
  fg3 int,
  fg3a int,
  fg3_pct double precision,
  ft int,
  fta int,
  ft_pct double precision,
  off_reb int,
  def_reb int,
  tot_reb int,
  ast int,
  pf int,
  stl int,
  total_turnovers int,
  blk int,
  pts int,
  avg_reb double precision,
  avg_pts double precision,
  dq int,
  of_efficiency double precision,
  scoring_margin double precision,
  primary key (game_id, h_flag)
  
)
""")

conexion.close()

In [5]:
from nba_api.stats.static import teams

teamLookup = pd.DataFrame(teams.get_teams())

teamLookup = teamLookup.rename(columns={"id": "team_id", "nickname": "team_nickname"}).drop(columns='year_founded')

conexion = sqlite3.connect("NBA_DATA.db")

teamLookup.to_sql("Teams", conexion, if_exists="replace", index=False)

30

In [6]:
def retry(func, retries=3):
    def retry_wrapper(*args, **kwargs):
        attempts = 0
        while attempts < retries:
            try:
                return func(*args, **kwargs)
            except requests.exceptions.RequestException as e:
                print(e)
                time.sleep(30)
                attempts += 1

    return retry_wrapper

In [7]:
from nba_api.stats.static import teams, players
from nba_api.stats.endpoints import cumestatsteamgames, cumestatsteam, gamerotation
import pandas as pd
import numpy as np
import json
import difflib
import time
import requests
import sqlite3

def getSeasonScheduleFrame(seasons, seasonType):
    def getGameDate(matchup):
        return matchup.partition(' at')[0][:10]

    def getHomeTeam(matchup):
        return matchup.partition(' at')[2]

    def getAwayTeam(matchup):
        return matchup.partition(' at')[0][10:]

    def getTeamIDFromNickname(nickname):
        return teamLookup.loc[teamLookup['team_nickname'] == difflib.get_close_matches(nickname, teamLookup['team_nickname'], 1)[0]].values[0][0]

    @retry
    def getRegularSeasonSchedule(season, teamID, seasonType):
        season = str(season) + "-" + str(season + 1)[-2:]
        teamGames = cumestatsteamgames.CumeStatsTeamGames(league_id='00', season=season,
                                                          season_type_all_star=seasonType,
                                                          team_id=teamID).get_normalized_json()

        teamGames = pd.DataFrame(json.loads(teamGames)['CumeStatsTeamGames'])
        teamGames['SEASON'] = season
        return teamGames

    teamLookup = pd.DataFrame(teams.get_teams())
    teamLookup = teamLookup.rename(columns={"id": "team_id", "nickname": "team_nickname"}).drop(columns='year_founded')

    conexion = sqlite3.connect("NBA_DATA.db")
    teamLookup.to_sql("Teams", conexion, if_exists="replace", index=False)

    scheduleFrame = pd.DataFrame()

    for season in seasons:
        for id in teamLookup['team_id']:
            time.sleep(3)
            teamGames = getRegularSeasonSchedule(season, id, seasonType)
            scheduleFrame = pd.concat([scheduleFrame, teamGames], ignore_index=True)

    scheduleFrame['GAME_DATE'] = pd.to_datetime(scheduleFrame['MATCHUP'].map(getGameDate))
    scheduleFrame['HOME_TEAM_NICKNAME'] = scheduleFrame['MATCHUP'].map(getHomeTeam)
    scheduleFrame['HOME_TEAM_ID'] = scheduleFrame['HOME_TEAM_NICKNAME'].map(getTeamIDFromNickname)
    scheduleFrame['AWAY_TEAM_NICKNAME'] = scheduleFrame['MATCHUP'].map(getAwayTeam)
    scheduleFrame['AWAY_TEAM_ID'] = scheduleFrame['AWAY_TEAM_NICKNAME'].map(getTeamIDFromNickname)
    scheduleFrame = scheduleFrame.drop_duplicates()  # There's a row for both teams, only need 1
    scheduleFrame = scheduleFrame.reset_index(drop=True)

    return scheduleFrame

In [8]:
# Get Single Game aggregation columns

def getSingleGameMetrics(gameID,homeTeamID,awayTeamID,awayTeamNickname,seasonYear,gameDate):

    @retry
    def getGameStats(teamID,gameID,seasonYear):
        #season = str(seasonYear) + "-" + str(seasonYear+1)[-2:]
        gameStats = cumestatsteam.CumeStatsTeam(game_ids=gameID,league_id ="00",
                                               season=seasonYear,season_type_all_star="Regular Season",
                                               team_id = teamID).get_normalized_json()

        gameStats = pd.DataFrame(json.loads(gameStats)['TotalTeamStats'])

        return gameStats

    data = getGameStats(homeTeamID,gameID,seasonYear)
    data.at[1,'NICKNAME'] = awayTeamNickname
    data.at[1,'TEAM_ID'] = awayTeamID
    data.at[1,'OFFENSIVE_EFFICIENCY'] = (data.at[1,'FG'] + data.at[1,'AST'])/(data.at[1,'FGA'] - data.at[1,'OFF_REB'] + data.at[1,'AST'] + data.at[1,'TOTAL_TURNOVERS'])
    data.at[1,'SCORING_MARGIN'] = data.at[1,'PTS'] - data.at[0,'PTS']

    data.at[0,'OFFENSIVE_EFFICIENCY'] = (data.at[0,'FG'] + data.at[0,'AST'])/(data.at[0,'FGA'] - data.at[0,'OFF_REB'] + data.at[0,'AST'] + data.at[0,'TOTAL_TURNOVERS'])
    data.at[0,'SCORING_MARGIN'] = data.at[0,'PTS'] - data.at[1,'PTS']

    data['SEASON'] = seasonYear
    data['GAME_DATE'] = gameDate
    data['GAME_ID'] = gameID

    return data

In [15]:
def getGameLogs(gameLogs,scheduleFrame):
    
    # Functions to prepare additional columns after gameLogs table loads
    def getHomeAwayFlag(gameDF):
        gameDF['HOME_FLAG'] = np.where((gameDF['W_HOME']==1) | (gameDF['L_HOME']==1),1,0)
        gameDF['AWAY_FLAG'] = np.where((gameDF['W_ROAD']==1) | (gameDF['L_ROAD']==1),1,0)
        #return gameDF 

    def getTotalWinPctg(gameDF):
        gameDF['TOTAL_GAMES_PLAYED'] = gameDF.groupby(['TEAM_ID','SEASON'])['GAME_DATE'].rank(ascending=True)
        gameDF['TOTAL_WINS'] = gameDF.sort_values(by='GAME_DATE').groupby(['TEAM_ID','SEASON'])['W'].cumsum()
        gameDF['TOTAL_WIN_PCTG'] = gameDF['TOTAL_WINS']/gameDF['TOTAL_GAMES_PLAYED']
        return gameDF.drop(['TOTAL_GAMES_PLAYED','TOTAL_WINS'],axis=1)

    def getHomeWinPctg(gameDF):
        gameDF['HOME_GAMES_PLAYED'] = gameDF.sort_values(by='GAME_DATE').groupby(['TEAM_ID','SEASON'])['HOME_FLAG'].cumsum()
        gameDF['HOME_WINS'] = gameDF.sort_values(by='GAME_DATE').groupby(['TEAM_ID','SEASON'])['W_HOME'].cumsum()
        gameDF['HOME_WIN_PCTG'] = gameDF['HOME_WINS']/gameDF['HOME_GAMES_PLAYED']
        return gameDF.drop(['HOME_GAMES_PLAYED','HOME_WINS'],axis=1)

    def getAwayWinPctg(gameDF):
        gameDF['AWAY_GAMES_PLAYED'] = gameDF.sort_values(by='GAME_DATE').groupby(['TEAM_ID','SEASON'])['AWAY_FLAG'].cumsum()
        gameDF['AWAY_WINS'] = gameDF.sort_values(by='GAME_DATE').groupby(['TEAM_ID','SEASON'])['W_ROAD'].cumsum()
        gameDF['AWAY_WIN_PCTG'] = gameDF['AWAY_WINS']/gameDF['AWAY_GAMES_PLAYED']
        return gameDF.drop(['AWAY_GAMES_PLAYED','AWAY_WINS'],axis=1)

    def getRollingOE(gameDF):
        gameDF['ROLLING_OE'] = gameDF.sort_values(by='GAME_DATE').groupby(['TEAM_ID','SEASON'])['OFFENSIVE_EFFICIENCY'].transform(lambda x: x.rolling(3, 1).mean())

    def getRollingScoringMargin(gameDF):
        gameDF['ROLLING_SCORING_MARGIN'] = gameDF.sort_values(by='GAME_DATE').groupby(['TEAM_ID','SEASON'])['SCORING_MARGIN'].transform(lambda x: x.rolling(3, 1).mean())

    def getRestDays(gameDF):
        gameDF['LAST_GAME_DATE'] = gameDF.sort_values(by='GAME_DATE').groupby(['TEAM_ID','SEASON'])['GAME_DATE'].shift(1)
        gameDF['NUM_REST_DAYS'] = (gameDF['GAME_DATE'] - gameDF['LAST_GAME_DATE'])/np.timedelta64(1,'D') 
        return gameDF.drop('LAST_GAME_DATE',axis=1)
    
    start = time.perf_counter_ns()

    i = int(len(gameLogs)/2) #Can use a previously completed gameLog datasetn

    while i<len(scheduleFrame):


        time.sleep(10)
        gameLogs = pd.concat([gameLogs, getSingleGameMetrics(scheduleFrame.at[i,'GAME_ID'],scheduleFrame.at[i,'HOME_TEAM_ID'],
                         scheduleFrame.at[i,'AWAY_TEAM_ID'],scheduleFrame.at[i,'AWAY_TEAM_NICKNAME'],
                         scheduleFrame.at[i,'SEASON'],scheduleFrame.at[i,'GAME_DATE'])])
        
        gameLogs = gameLogs.reset_index(drop=True)

        end = time.perf_counter_ns()

        if i%100 == 0:
            mins = ((end-start)/1e9)/60
            print(i,mins)

        i+=1
        
    # Get Table Level Aggregation Columns
    getHomeAwayFlag(gameLogs)
    gameLogs = getHomeWinPctg(gameLogs)
    gameLogs = getAwayWinPctg(gameLogs)
    gameLogs = getTotalWinPctg(gameLogs)
    getRollingScoringMargin(gameLogs)
    getRollingOE(gameLogs)
    gameLogs = getRestDays(gameLogs)

    return gameLogs.reset_index(drop=True)

In [19]:
#Get ScheduleFrame

seasons = [2021]
seasonType = 'Regular Season'

start = time.perf_counter_ns()
#scheduleFrame = getSeasonScheduleFrame(seasons,seasonType)
#scheduleFrame.to_sql("Games", conexion, if_exists="replace", index=False)
end = time.perf_counter_ns()

secs = (end-start)/1e9
mins = secs/60
print(f"scheduleFrame takes: {int(mins)}:{int(secs)%60}")

start = time.perf_counter_ns()
gameLogs = pd.DataFrame()
gameLogs = getGameLogs(gameLogs,scheduleFrame)
gameLogs.to_csv('gameLogs.csv')

end = time.perf_counter_ns()

secs = (end-start)/1e9
mins = secs/60
print(f"GameLogs takes: {int(mins)}:{int(secs)%60}")

scheduleFrame takes: 0:0
0 0.16743692666666668
100 17.170295296666666
HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)


AttributeError: 'NoneType' object has no attribute 'at'

In [14]:
len(scheduleFrame.GAME_ID.unique())

1230