# Data Scraping and Cleaning for NBA Player Data

    Goal: Get a list of players who have played in the NBA,
          look at their performance through their first two seasons
          and see how old they were
    
    1. Get NBA advanced metric data from 2000-2019 for every player within that period
    2. Clean data so that those who played before 2000 season are removed
    3. Clean data so only players whose careers have finished are still present
    

In [1]:
from bs4 import BeautifulSoup
import requests 
import pandas as pd
import pickle
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
"""
Goes through data tables when web scraping and pulls all player data 
rows is defind in get_data which is where the scraping of our website occurs

"""


def table_read(rows):
    data = []
    intermediate_list = []

    for tr in rows:
        for td in tr.findAll("td"):
            text = td(string=True)
            intermediate_list.append(''.join(text) if text else '0.0')
        data.append(intermediate_list)
        intermediate_list = []
    return(data)

In [3]:
"""
Merges data pulled from table_read and puts it into a dictionary with headers
"""

def dict_creation(data):
    headers = ['Player','Position','Age','Team','Games_played',
               'Min_per_game','PER', 'TS%',
               '3PAr', 'FTr','ORB%','DRB%',
               'TRB%','AST%','STL%','BLK%',
               'TOV%','USG%','', 'OWS',
               'DWS','WS','WS/48','', 
               'OBPM','DBPM','BPM','VORP']

    data_with_headers = []
    for i in range(len(data)):
        data_dict = dict(zip(headers, data[i]))
        data_with_headers.append(data_dict)
    return(data_with_headers)

In [4]:
"""
This is were the web scraping done

NBA_year is a list of the years to pull get data from
url is the url of the website to scrape
table_name and table_id are the html markers for the table that is being scraped 

"""


def get_data(NBA_years, url, table_name, table_id):
    #url = 'https://www.basketball-reference.com/leagues/NBA_{}_per_game.html'
    df_NBA = []
    for year in NBA_years:
        # year is insert into url so that we target the year we want in our website
        file_url = url.format(year) 
        
        response = requests.get(file_url)
        page = response.text
        NBA_soup = BeautifulSoup(page, 'lxml')
        table = NBA_soup.find(class_= table_name, id = table_id)
        
        
        rows = table.findAll('tr')
        
        # Here we call on our functions created above 
        data = table_read(rows)
        table_creation = dict_creation(data)
    
        dfs = pd.DataFrame(table_creation)
        dfs['Year'] = year # adds what year the data is pulled from to our table
        dfs = dfs[1:] # first row in all the tables for Baketball refernce are empty, this sorts them out
        
        # Removes duplicate entries 
        dfs = dfs.drop_duplicates(subset='Player', keep='first')
        dfs = dfs.reset_index()
        df_NBA.append(dfs)
        
    # This combines all dataframes
    return pd.concat(df_NBA).reset_index(drop=True)

In [5]:
NBA_years = [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
            2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]

url ='https://www.basketball-reference.com/leagues/NBA_{}_advanced.html'
table_name = 'overthrow table_container' 
table_id = 'div_advanced_stats'
df_advanced_stats = get_data(NBA_years, url, table_name, table_id)

In [6]:
"""
This Data is pulled to remove any players who were active before 2000 
and are still active players today from the dataframe, to get an accurate count 
of a players career length and first 2 year performance 
"""
NBA_years = [1999, 2019]
url = 'https://www.basketball-reference.com/leagues/NBA_{}_per_game.html'
table_name = 'stats_table'
table_id = 'per_game_stats'
sorted_players_df = get_data(NBA_years, url, table_name, table_id)

In [7]:
# Changing Age and Win shares to floats so we can work with them later
df_advanced_stats[['Age','WS']] = df_advanced_stats[['Age','WS']].astype(float)

In [8]:
#removes * from some player names
df_advanced_stats['Player'] = df_advanced_stats["Player"].str.replace("*", "")
df_advanced_stats.dropna(axis = 'rows', inplace =True)
#sorts dataframe by age and Player name
df_advanced_stats = df_advanced_stats.sort_values(['Player','Age'])

In [9]:
"""
Creates 4 lists that will be added to dataframe:
    Win Share in first 2 years
    Career length
    Age entering the NBA
    Age of retirement 
"""

ws_12 = []
career_len = []
max_age = []
enter_age = []

for count, player in enumerate(df_advanced_stats.Player):
    df1 = df_advanced_stats[df_advanced_stats['Player']==player]
    df1.reset_index(inplace = True)
    clen = df1.shape[0]
    if clen >= 2:
        ws = (df1.loc[0][23] + df1.loc[1][23])/2
        old_age = df1.loc[clen-1][4]
        young_age = df1.loc[0][4]
    else:
        ws = 0
        age = 0
        age1= 0
    max_age.append(old_age)
    career_len.append(clen)
    ws_12.append(float(ws))
    enter_age.append(young_age)


In [26]:
# Adds list to dataframe
df_advanced_stats['Career_length'] = career_len
df_advanced_stats['Retirement_age'] = max_age
df_advanced_stats['WS_first_2yr'] = ws_12
df_advanced_stats['Age_yr1'] = enter_age
df_advanced_stats['Career_length'].dtype

dtype('int64')

In [23]:
test_set = ['LeBron James','Dwyane Wade','Darko Miličić','Maciej Lampe','Brandon Hunter']

player_stats = []
for player in test_set:
    df_holder = df_advanced_stats[df_advanced_stats['Player']==player]
    player_stats.append(df_holder)

df_example_players = pd.concat(player_stats)

In [42]:
"""
Removes NBA players who were active before 2000s from dataframe
and NBA Players who are currently active
and removes players who only spent 1 year in the NBA
"""

sorted_players = sorted_players_df.Player.unique()

rows_to_remove = []
for player in sorted_players:
    #filter1 = dfr['Player'].isin(player)
    df_holder = df_advanced_stats[df_advanced_stats['Player']==player]
    rows_to_remove.append(df_holder)
    
df_of_removed_players = pd.concat(rows_to_remove)
df_of_1yr_players = df_advanced_stats[df_advanced_stats['Career_length'] == 1]

df_advanced_stats_sorted = pd.concat([df_advanced_stats, df_of_removed_players, 
                                      df_of_1yr_players]).drop_duplicates(keep=False)
df_advanced_stats_sorted.shape


(4414, 33)

In [43]:
#Removes outliers early code doesn't catch 
outlier_remove = df_advanced_stats_sorted[df_advanced_stats_sorted['Age_yr1'] > 29]
df_advanced_stats_sorted = pd.concat([df_advanced_stats_sorted, outlier_remove, a ]).drop_duplicates(keep=False)


In [44]:
# saving dataframe with pickle

with open('df_advanced_stats.pickle', 'wb') as handle:
    pickle.dump(df_advanced_stats_sorted, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [15]:
with open('example_players', 'wb') as handle:
    pickle.dump(df_example_players, handle, protocol=pickle.HIGHEST_PROTOCOL)