In [1]:
import pandas as pd
import datetime as datetime
from basketball_reference_web_scraper import client
import joblib
import unicodedata

In [2]:
#Get all dates for the nba seasons 2016-2020
nba_season_start_end_dates = {'2016-2017':[datetime.datetime(2016,10,25),datetime.datetime(2017,4,12)],
                             '2017-2018':[datetime.datetime(2017,10,17),datetime.datetime(2018,4,11)],
                             '2018-2019':[datetime.datetime(2018,10,16),datetime.datetime(2019,4,10)],
                             '2019-2020':[datetime.datetime(2019,10,22),datetime.datetime(2020,9,8)]}

nba_2016_to_2017_dates = [nba_season_start_end_dates['2016-2017'][0] + datetime.timedelta(days=x) for x in range((nba_season_start_end_dates['2016-2017'][1]-nba_season_start_end_dates['2016-2017'][0]).days+1)]
nba_2017_to_2018_dates = [nba_season_start_end_dates['2017-2018'][0] + datetime.timedelta(days=x) for x in range((nba_season_start_end_dates['2017-2018'][1]-nba_season_start_end_dates['2017-2018'][0]).days+1)]
nba_2018_to_2019_dates = [nba_season_start_end_dates['2018-2019'][0] + datetime.timedelta(days=x) for x in range((nba_season_start_end_dates['2018-2019'][1]-nba_season_start_end_dates['2018-2019'][0]).days+1)]
nba_2019_to_2020_dates = [nba_season_start_end_dates['2019-2020'][0] + datetime.timedelta(days=x) for x in range((nba_season_start_end_dates['2019-2020'][1]-nba_season_start_end_dates['2019-2020'][0]).days+1)]

In [3]:
#Get all player season totals for each year - Using this information to make a list of each player that has played during 2016-2020
nba_2016_player_averages = client.players_season_totals(season_end_year=2017)
nba_2017_player_averages = client.players_season_totals(season_end_year=2018)
nba_2018_player_averages = client.players_season_totals(season_end_year=2019)
nba_2019_player_averages = client.players_season_totals(season_end_year=2020)

In [4]:
#Get all player slug ids and their positions and dump as joblib object
player_positions = {}

def get_positions(player_averages):
    for player in player_averages:
        if player['slug'] not in player_positions:
            player_positions[player['slug']] = player['positions']
        else:
            pass
        
get_positions(nba_2016_player_averages)
get_positions(nba_2017_player_averages)
get_positions(nba_2018_player_averages)
get_positions(nba_2019_player_averages)

joblib.dump(player_positions,'../joblib_objects/player_positions')

['../joblib_objects/player_positions']

In [5]:
#Function to remove accents from player names
def strip_accents(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                  if unicodedata.category(c) != 'Mn')

#Get all player names and their slug ids then save as joblib object
player_slugs_names = {}
def get_slugs_names(player_averages):
    for player in player_averages:
        
        try:
        
            name_split = player['name'].split(' ')
            name = f'{name_split[0]} {name_split[1]}'
            name = strip_accents(name)

            if name not in player_slugs_names.keys():
                player_slugs_names[name] = player['slug']
            else:
                pass
        except Exception as e:
            print(player['name'])

get_slugs_names(nba_2016_player_averages)
get_slugs_names(nba_2017_player_averages)
get_slugs_names(nba_2018_player_averages)
get_slugs_names(nba_2019_player_averages)

joblib.dump(player_slugs_names,'joblib_objects/player_slugs_names')

Nenê
Nenê
Nenê


['joblib_objects/player_slugs_names']

In [5]:
#Create empty dataframe that will store each players per game data as a dictionary key(slug id)
player_dataframes = {}

#Get all team names and format
team_names = []

def get_player_names_and_teams(nba_player_averages):
    columns_names = ['date','slug','name','player_id','positions','FD_pts_scored','location','opponent','opponent_id','points_scored','seconds_played','made_field_goals','attempted_field_goals','made_three_point_field_goals',
                    'attempted_three_point_field_goals','made_free_throws','attempted_free_throws','offensive_rebounds','defensive_rebounds',
                    'assists','steals','blocks','turnovers','game_score']
    for player in nba_player_averages:
        if player['slug'] not in player_dataframes:
            player_dataframes[player['slug']] = pd.DataFrame(columns=columns_names)
            player_dataframes[player['slug']].name = player['name']
        else:
            pass
        
        if player['team'] not in team_names:
            team_names.append(str(player['team']))
        else:
            pass

get_player_names_and_teams(nba_2016_player_averages) 
get_player_names_and_teams(nba_2017_player_averages) 
get_player_names_and_teams(nba_2018_player_averages) 
get_player_names_and_teams(nba_2019_player_averages) 

team_names_formatted = [x.split('.')[1] for x in team_names]
team_names_formatted

['OKLAHOMA_CITY_THUNDER',
 'DALLAS_MAVERICKS',
 'BROOKLYN_NETS',
 'OKLAHOMA_CITY_THUNDER',
 'SACRAMENTO_KINGS',
 'NEW_ORLEANS_PELICANS',
 'MINNESOTA_TIMBERWOLVES',
 'SAN_ANTONIO_SPURS',
 'INDIANA_PACERS',
 'MEMPHIS_GRIZZLIES',
 'PORTLAND_TRAIL_BLAZERS',
 'CLEVELAND_CAVALIERS',
 'LOS_ANGELES_CLIPPERS',
 'DALLAS_MAVERICKS',
 'PHILADELPHIA_76ERS',
 'SAN_ANTONIO_SPURS',
 'HOUSTON_ROCKETS',
 'MILWAUKEE_BUCKS',
 'NEW_YORK_KNICKS',
 'SAN_ANTONIO_SPURS',
 'HOUSTON_ROCKETS',
 'DENVER_NUGGETS',
 'NEW_ORLEANS_PELICANS',
 'ORLANDO_MAGIC',
 'MIAMI_HEAT',
 'NEW_YORK_KNICKS',
 'MEMPHIS_GRIZZLIES',
 'PHOENIX_SUNS',
 'DALLAS_MAVERICKS',
 'DALLAS_MAVERICKS',
 'SACRAMENTO_KINGS',
 'GOLDEN_STATE_WARRIORS',
 'DENVER_NUGGETS',
 'LOS_ANGELES_CLIPPERS',
 'CHARLOTTE_HORNETS',
 'PHILADELPHIA_76ERS',
 'DETROIT_PISTONS',
 'ATLANTA_HAWKS',
 'WASHINGTON_WIZARDS',
 'DENVER_NUGGETS',
 'MILWAUKEE_BUCKS',
 'CHARLOTTE_HORNETS',
 'ATLANTA_HAWKS',
 'PHOENIX_SUNS',
 'BROOKLYN_NETS',
 'DALLAS_MAVERICKS',
 'SAN_ANTONIO_SPURS

In [6]:
#Encode all formatted team names and save as joblib object
from sklearn.preprocessing import LabelEncoder

team_label_encoder = LabelEncoder()
team_label_encoder.fit(team_names_formatted)
encoded_team_names = team_label_encoder.transform(team_names_formatted)

joblib.dump(team_label_encoder,'../joblib_objects/team_label_encoder')

['../joblib_objects/team_label_encoder']

In [7]:
#Encode all player slugs and save as joblib object
player_slugs = player_positions.keys()
player_label_encoder = LabelEncoder()
player_label_encoder.fit(list(player_slugs))
encoded_player_names = player_label_encoder.transform(list(player_slugs))

joblib.dump(player_label_encoder,'../joblib_objects/player_label_encoder')

['../joblib_objects/player_label_encoder']

In [8]:
# Format advanced analytics csvs
import os

analytics_file_paths = os.listdir('../advanced_stats_basketball_reference')
analytics_full_file_paths = [f'../advanced_stats_basketball_reference/{file}' for file in analytics_file_paths]

for file in analytics_full_file_paths:
    analytics_df = pd.read_csv(file)
    analytics_df = analytics_df.iloc[0:-1,:]
    analytics_df = analytics_df[['Team','MOV','SRS','ORtg','DRtg','NRtg','Pace','FTr','3PAr','TOV%','ORB%','eFG%.1','TOV%.1','DRB%']]
    analytics_df['Team'] = analytics_df['Team'].apply(lambda x: x.split('*')[0])
    analytics_df['Team'] = analytics_df['Team'].apply(lambda x: x.upper().replace(' ','_'))
    analytics_df.columns = ['Team','Margin_Of_Victory','Simple_Rating_System','Offensive_Rating','Defensive_Rating','Net_Rating','Pace','Free_Throw_Rate','3_Pt_Rate','Turnover_Percentage','Offensive_Rebound_Percentage','Opponent_EFG','Opponent_Turnover_Percentage','Opponent_Defensive_Rebound_Percentage']
    analytics_df['Team_ID'] = analytics_df['Team'].apply(lambda x: team_label_encoder.transform([x])[0])
    
    file_name_split = file.split('_')
    analytics_df.to_csv(f'../cleaned_data/advanced_analytics/advanced_analytics_{file_name_split[5]}_{file_name_split[6]}')

In [10]:
#Function to get boxscores for each nba day, then loop through each player and add their stats to their respective dataframe
def add_boxscores(dates):
    columns_names = ['slug','name','location','opponent','seconds_played','made_field_goals','attempted_field_goals','made_three_point_field_goals',
                    'attempted_three_point_field_goals','made_free_throws','attempted_free_throws','offensive_rebounds','defensive_rebounds',
                    'assists','steals','blocks','turnovers','game_score']
    for date in dates:
        try:
            box_scores = client.player_box_scores(day=date.day, month=date.month, year=date.year)
            for player in box_scores:
                if date not in player_dataframes[player['slug']]['date'].to_list():

                    #Create dataframe from player boxscore, filter for only needed columns, then convert back to dictionary format
                    player_box_score = pd.DataFrame([player])
                    player_box_score = player_box_score[columns_names].to_dict('index')[0]

                    #Add date to row
                    player_box_score['date'] = date
                    
                    #Format Team Name
                    player_box_score['opponent'] = str(player_box_score['opponent']).split('.')[1]
                    
                    #Add player ID
                    player_id = player_label_encoder.transform([player['slug']])
                    player_box_score['player_id'] = player_id[0]
                    
                    #Add opponent ID
                    opponent_id = team_label_encoder.transform([str(player_box_score['opponent'])])
                    player_box_score['opponent_id'] = opponent_id[0]
                    
                    #Add points scored
                    points_scored = (player['made_field_goals'] * 2) + player['made_three_point_field_goals'] + player['made_free_throws']
                    player_box_score['points_scored'] = points_scored
                    
                    #Add fanduel points scored
                    fd_pts_scored = (points_scored + ( (player['offensive_rebounds']+player['defensive_rebounds']) * 1.2 ) + (player['assists']*1.5) + 
                                    (player['steals']*3) + (player['blocks']*3) - player['turnovers']) 
                    player_box_score['FD_pts_scored'] = fd_pts_scored
                    
                    #Format Location
                    player_box_score['location'] = str(player_box_score['location'])
                
                    #Add boxscore to player_dataframes based on dictionary key ('slug')
                    player_dataframes[player['slug']] = player_dataframes[player['slug']].append(player_box_score, ignore_index=True)
                else:
                    pass
        
        except Exception as e:
            print(Exception, e)

In [11]:
#Use above function to add all player data from 2016-2020
add_boxscores(nba_2016_to_2017_dates)
add_boxscores(nba_2017_to_2018_dates)
add_boxscores(nba_2018_to_2019_dates)
add_boxscores(nba_2019_to_2020_dates)

In [19]:
#Connect to AWS RDS database
from sqlalchemy import *
from sqlalchemy.orm import Session
from sqlalchemy import Table, Column, String, MetaData, Integer, Float
import os

user = os.environ['RDS_NBA_DATABASE_USER']
password = os.environ['RDS_NBA_DATABASE_PASSWORD']

db = create_engine(f'postgresql://{user}:{password}@fanduel-lineup-prediction-scraped-data.cvzkizpca2fx.us-east-1.rds.amazonaws.com')
meta = MetaData(db)

In [13]:
#Loop through each player dataframe and create table in RDS AWS database which contains their game data
database_tables = db.table_names()

for key in player_dataframes.keys():
    df = player_dataframes[key]
    
    if key not in database_tables:
        df.to_sql(key, db)
        
    else:
        try:
            database_df = pd.read_sql_table(key, db, index_col='index')
            new_entries_df = df['date'].loc[df['date'] != database_df['date'],:]
            new_entries_df.to_sql(key, db, if_exists='append')
        except:
            pass

In [None]:
#Get all abbreviated team names and their full team name
url = 'https://en.wikipedia.org/wiki/Wikipedia:WikiProject_National_Basketball_Association/National_Basketball_Association_team_abbreviations'
team_abbreviations_full_name_df = pd.read_html(url)[0].iloc[1:,:]

In [None]:
#Make list of team abbreviations and full team names formatted
team_abbreviations = team_abbreviations_full_name_df[0].to_list()
team_full_names = team_abbreviations_full_name_df[1].to_list()
team_full_names_formatted = [team.replace(' ','_') for team in team_full_names]
team_full_names_formatted = [team.upper() for team in team_full_names_formatted]

In [None]:
#Make dictionary of full team names formatted and their abbreviations then save as joblib object
team_abbreviations_full_name_dict = {abbr:full_name for abbr,full_name in zip(team_abbreviations,team_full_names_formatted)}

joblib.dump(team_abbreviations_full_name_dict,'../joblib_objects/team_abbreviations_full_name_dict')