# Luther Project: NBA STATS scraper

In [2]:
#imports
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import OrderedDict
import pickle
import random
import time


%matplotlib inline
sns.set()

In [115]:
# #test urls
# url = 'https://www.basketball-reference.com/players/n/nowitdi01.html' #drafted, many years experience
# url2 = 'https://www.basketball-reference.com/players/b/bareajo01.html' #undrafted, multiyear contract, multiple teams
# url3 = 'https://www.basketball-reference.com/players/m/motlejo01.html' #undrafted, few years of exp
# url4 = 'https://www.basketball-reference.com/players/b/bryanko01.html' #retired 
# url5 = 'https://www.basketball-reference.com/players/a/abrinal01.html' 
# url6 = 'https://www.basketball-reference.com/players/a/asikom01.html' #traded 
# url7 = 'https://www.basketball-reference.com/players/b/bookede01.html'
# url8 = 'https://www.basketball-reference.com/players/b/brookma01.html'
# url9 = 'https://www.basketball-reference.com/players/v/vanvlfr01.html'

# urls = [url, url2, url3, url4, url5, url6, url7, url8, url9]


In [116]:
def clean_html(url):
    """consolidate getting and cleaning html"""
    response = requests.get(url)
    #print(response.status_code)
    page = response.text.replace('<!--', '').replace('-->', '') # tables were hidden in comment tags
    soup = BeautifulSoup(page, 'lxml')
    return soup


In [117]:
# #generate test soups
# soups = [clean_html(url) for url in urls]

# Parsing for bio / static information

In [188]:
def find_name(soup):
    """player name"""
    return soup.find('div', id='meta').find('h1').text

def find_height(soup):
    """in inches"""
    ft_in = soup.find('div', id='meta').find(attrs={'itemprop': 'height'}).text.split('-')
    return 12*int(ft_in[0]) + int(ft_in[1])

def find_weight(soup):
    """in pounds"""
    return int(soup.find('div', id='meta').find(attrs={'itemprop': 'weight'}).text[:-2])
        
def find_draft_position(soup):
    """returns 0 if undrafted"""
    overall = soup.find('div', id='meta').find(text=re.compile('overall'))
    if overall:
        return int(overall.split()[-2][:-2])
    else:
        return 0 # for undrafted # would this make ranks out of order? 
    
def career_earnings(soup):
    if soup.find("table", id='all_salaries'):
        return int(soup.find("table", id='all_salaries').find("tfoot").find_all('td')[-1].text.replace('$','').replace(',',''))
    else:
        return 0 #no career earnings 
    
def find_bio(soup):
    """consolidated bio function"""
    return [find_name(soup), find_height(soup), find_weight(soup), find_draft_position(soup), career_earnings(soup)]

def find_career_stats(soup):
    """includes stats for future years """
    basics_raw = soup.find("table", id="per_game").find("tfoot").find_all('tr')[0].find_all('td')
    basics = [col.text for col in basics_raw]

    advanced = [col.text for col in (soup.find("table", id="advanced").find('tfoot')
                                           .find_all('tr')[0].find_all('td'))]

    return basics + advanced 

# Parse season specific information

In [184]:
def find_current_salary(soup, y):
    """0 if no known contract"""
    is_active = soup.find("table", id=re.compile('contracts')) #might need better way of checking
    if is_active:
        if not is_active.td.text: #check for no contract
            return '0'
        elif y == 1: #looks at contract table
            return is_active.find_all('td')[y].text
        elif soup.find("table", id='all_salaries'): #looks at salary table if it exists
        #    return soup.find("table", id="all_salaries").find_all('tr')[-y].find_all('td')[-1].text
            try: # catch errors with missing salary info
                return soup.find("table", id="all_salaries").find_all('tr')[-y].find_all('td')[-1].text
            except IndexError:
                return '0'
        else:
            return '0'
    elif soup.find("table", id="all_salaries"):
        try:
            return soup.find("table", id="all_salaries").find('tbody').find_all('tr')[-y].find_all('td')[-1].text
        except IndexError:
            return '0'
    else:
        return '0'

def find_season_stats(soup, y=1):
    """where y describes how recent the season is(1 being most recent)
    returns as a single list of:
        current season basic stats, advanced stats, and salary"""
    
    basic_rows = soup.find("table", id="per_game").find('tbody').find_all('tr', attrs={'class':'full_table'}) #basic
 
    career_length = len(basic_rows)
    years_exp =  career_length - (y-1) 

    if years_exp <= 0: #if wasn't in league
        return [np.NaN] * 60 #season stats= list of nulls
    
    current_year = soup.find("table", id="per_game").find("tbody").find_all('tr', attrs={'class':'full_table'})[-y].find('th').text
    basic_stats = [current_year, years_exp]
    for col in basic_rows[-y].find_all('td'): #looking at latest season
        basic_stats.append(col.text)

    advanced_stats = [col.text for col in (soup.find("table", id="advanced").find('tbody')
                                           .find_all('tr',attrs={'class':'full_table'})[-y].find_all('td'))]

    salary = [int(find_current_salary(soup, y).replace('$','').replace(',',''))] #numeric salary
    return basic_stats + advanced_stats + salary


In [185]:
#soup = clean_html('https://www.basketball-reference.com/players/t/thomala01.html')
# y =  3
#soup.find("table", id="all_salaries").find('tbody').find_all('tr')[-y].find_all('td')[-1].text

In [120]:
# soups[4].find("table", id="per_game").find("tbody").find_all('th',{'scope':'row', 'data-stat':'season'})[-2].text

In [121]:
#soups[5].find("table", id="per_game").find("tbody").find_all('tr', attrs={'class':'full_table'})[-3].find('th').text

# Create dictionary of col names to data

In [150]:
def label(stats):
    col_names = ['Name', 'Height', 'Weight', 'Draft Position', 'Career Earnings', 'Current Season(C)', 'C_Experience', 'C_Age', 'C_Tm', 'C_Lg', 'C_Pos', 'C_G', 'C_GS', 'C_MP', 'C_FG', 'C_FGA', 'C_FG%', 'C_3P', 'C_3PA', 'C_3P%', 'C_2P', 'C_2PA', 'C_2P%', 'C_eFG%', 'C_FT', 'C_FTA', 'C_FT%', 'C_ORB', 'C_DRB', 'C_TRB', 'C_AST', 'C_STL', 'C_BLK', 'C_TOV', 'C_PF', 'C_PTS', 'C_Age', 'C_Tm', 'C_Lg', 'C_Pos', 'C_G', 'C_MP', 'C_PER', 'C_TS%', 'C_3PAr', 'C_FTr', 'C_ORB%', 'C_DRB%', 'C_TRB%', 'C_AST%', 'C_STL%', 'C_BLK%', 'C_TOV%', 'C_USG%', 'C_\xa0', 'C_OWS', 'C_DWS', 'C_WS', 'C_WS/48', 'C_\xa0', 'C_OBPM', 'C_DBPM', 'C_BPM', 'C_VORP', 'C_Salary', 'Prev_Season(P)', 'P_Experience', 'P_Age', 'P_Tm', 'P_Lg', 'P_Pos', 'P_G', 'P_GS', 'P_MP', 'P_FG', 'P_FGA', 'P_FG%', 'P_3P', 'P_3PA', 'P_3P%', 'P_2P', 'P_2PA', 'P_2P%', 'P_eFG%', 'P_FT', 'P_FTA', 'P_FT%', 'P_ORB', 'P_DRB', 'P_TRB', 'P_AST', 'P_STL', 'P_BLK', 'P_TOV', 'P_PF', 'P_PTS', 'P_Age', 'P_Tm', 'P_Lg', 'P_Pos', 'P_G', 'P_MP', 'P_PER', 'P_TS%', 'P_3PAr', 'P_FTr', 'P_ORB%', 'P_DRB%', 'P_TRB%', 'P_AST%', 'P_STL%', 'P_BLK%', 'P_TOV%', 'P_USG%', 'P_\xa0', 'P_OWS', 'P_DWS', 'P_WS', 'P_WS/48', 'P_\xa0', 'P_OBPM', 'P_DBPM', 'P_BPM', 'P_VORP', 'P_Salary', 'Ca_Age', 'Ca_Tm', 'Ca_Lg', 'Ca_Pos', 'Ca_G', 'Ca_GS', 'Ca_MP', 'Ca_FG', 'Ca_FGA', 'Ca_FG%', 'Ca_3P', 'Ca_3PA', 'Ca_3P%', 'Ca_2P', 'Ca_2PA', 'Ca_2P%', 'Ca_eFG%', 'Ca_FT', 'Ca_FTA', 'Ca_FT%', 'Ca_ORB', 'Ca_DRB', 'Ca_TRB', 'Ca_AST', 'Ca_STL', 'Ca_BLK', 'Ca_TOV', 'Ca_PF', 'Ca_PTS', 'Ca_Age', 'Ca_Tm', 'Ca_Lg', 'Ca_Pos', 'Ca_G', 'Ca_MP', 'Ca_PER', 'Ca_TS%', 'Ca_3PAr', 'Ca_FTr', 'Ca_ORB%', 'Ca_DRB%', 'Ca_TRB%', 'Ca_AST%', 'Ca_STL%', 'Ca_BLK%', 'Ca_TOV%', 'Ca_USG%', 'Ca_\xa0', 'Ca_OWS', 'Ca_DWS', 'Ca_WS', 'Ca_WS/48', 'Ca_\xa0', 'Ca_OBPM', 'Ca_DBPM', 'Ca_BPM', 'Ca_VORP']
    if len(col_names) != len(stats):
        print(stats[0]) # print name for player pages that didn't parse properly
    return OrderedDict(zip(col_names, stats))

# Create list of dicts to become rows in dataframe

In [151]:
def make_rows(soup):
    """input soup page,
    output: up to 3 rows per player
    r#s means reverse"""
    print(find_name(soup))
    basic_rows = soup.find("table", id="per_game").find('tbody').find_all('tr', attrs={'class':'full_table'}) #basic
    years_exp = len(basic_rows)
 
    #generate static data - will be repeated for same player
    bio = find_bio(soup) 
    career = find_career_stats(soup)

    #decide how many rows to make
    if years_exp == 1:
        r1_season = bio + find_season_stats(soup, 1) + find_season_stats(soup, 2) + career
        return [label(r1_season)]
    
    if years_exp == 2:
        b = find_season_stats(soup, 2)
        r1_season = bio + find_season_stats(soup, 1) + b + career
        r2_season = bio + b + find_season_stats(soup, 3) + career
        return [label(r1_season), label(r2_season)] #list of dicts
    
    if years_exp >= 3:
        b = find_season_stats(soup, 2)
        c = find_season_stats(soup, 3)
        r1_season = bio + find_season_stats(soup, 1) + b + career
        r2_season = bio + b + c + career
        r3_season = bio + c + find_season_stats(soup, 4) + career
        return [label(r1_season), label(r2_season), label(r3_season)]

In [124]:
# dictlist = []
# for soup in soups:
#     dictlist.extend(make_rows(soup))
# pd.DataFrame(dictlist).tail()

# Generate list of urls to scrape

In [125]:
player_list_url = 'https://www.basketball-reference.com/leagues/NBA_2018_per_game.html'
player_list_url2 = 'https://www.basketball-reference.com/leagues/NBA_2017_per_game.html'
player_list_url3 = 'https://www.basketball-reference.com/leagues/NBA_2016_per_game.html'
prev_lists = [clean_html(url) for url in [player_list_url2, player_list_url3] ]          
player_list = clean_html(player_list_url)            

In [126]:
player_links_2018 = {row.a['href'] for row in (player_list.find("table", id='per_game_stats').find_all('tr', attrs={'class':'full_table'}))}
len(player_links_2018)

540

In [127]:
all_links = player_links_2018 | other_links
len(all_links)

689

In [128]:
urls = ['https://www.basketball-reference.com' + ref for ref in all_links]

In [129]:
urls[:5]

['https://www.basketball-reference.com/players/m/millemi01.html',
 'https://www.basketball-reference.com/players/v/vanvlfr01.html',
 'https://www.basketball-reference.com/players/c/cartevi01.html',
 'https://www.basketball-reference.com/players/b/bradlto01.html',
 'https://www.basketball-reference.com/players/g/greenje02.html']

# Scrape pages

In [130]:
def scrape(url):
    """return list of dicts from url"""
    return make_rows(clean_html(url))

In [193]:
data_list = []
for i, url in enumerate(urls):
    data = scrape(url)
    data_list.extend(data)
    time.sleep(0.5+2*random.random())
    
    #dump data for every 200 players
    if (i+1) % 200 == 0:
        df = pd.DataFrame(data_list)
        with open('players %s.pkl' % (i+1), 'wb') as picklefile:
            pickle.dump(df, picklefile)
        data_list = [] # delete once saved

#dump final set of players
df = pd.DataFrame(data_list)
with open('players_last.pkl', 'wb') as picklefile:
    pickle.dump(df, picklefile)        
    
#642 is a broken page. use urls2 if want to scrape again

Mike Miller
Fred VanVleet
Vince Carter
Tony Bradley
Jeff Green
Quincy Pondexter
Ersan Ilyasova
Khris Middleton
Cameron Bairstow
Arron Afflalo
Luke Kennard
Cameron Payne
Paul Pierce
Ivan Rabb
Jrue Holiday
Brandon Paul
Andre Miller
DeMarcus Cousins
London Perrantes
Jake Layman
Julius Randle
Timofey Mozgov
Jarell Martin
Dwight Buycks
Gary Payton II
Nikola Jokic
Tyus Jones
Tim Duncan
Mario Chalmers
Erick Green
Dario Saric
Aron Baynes
Gian Clavell
PJ Dozier
Tim Hardaway
Dakari Johnson
Alan Williams
Jerami Grant
Austin Rivers
C.J. Watson
Demetrius Jackson
Salah Mejri
Tyler Dorsey
Tyreke Evans
Omri Casspi
Miles Plumlee
Dorian Finney-Smith
Sasha Kaun
Michael Beasley
Ian Clark
Mike Dunleavy
Lucas Nogueira
Dion Waiters
Alec Burks
Coty Clarke
Anthony Brown
Elijah Millsap
Brook Lopez
Boban Marjanovic
Rodney Hood
Jarrett Jack
Kosta Koufos
Andrew Nicholson
James Webb
Alex Len
Kirk Hinrich
Tim Frazier
J.R. Smith
Darrell Arthur
Channing Frye
Jason Terry
Jimmy Butler
Brandon Jennings
Kadeem Allen
Tibor

Nick Collison
Taj Gibson
Ben Moore
DeAndre' Bembry
Dewayne Dedmon
Garrett Temple
O.J. Mayo
Bojan Bogdanovic
Jose Calderon
Zhou Qi
Noah Vonleh
Milos Teodosic
Sterling Brown
Jimmer Fredette
Avery Bradley
Maurice Harkless
Andrew Harrison
T.J. Leaf
Donald Sloan
Mike Conley
Randy Foye
Meyers Leonard
MarShon Brooks
Jacob Pullen
Travis Wear
Jeremy Lin
Harrison Barnes
Markieff Morris
Tyler Johnson
Christian Wood
Kendall Marshall
Jarrett Allen
Ray McCallum
Abdel Nader
Antonio Blakeney
Ekpe Udoh
Jeff Ayres
Ryan Hollins
Gary Neal
Buddy Hield
Luke Babbitt
LaMarcus Aldridge
Draymond Green
Kelly Olynyk
Glenn Robinson III
Greg Monroe
Wilson Chandler
Nazr Mohammed
James Young
Scotty Hopson
Marcin Gortat
Dwyane Wade
Jared Sullinger
Al Horford
Dahntay Jones


AttributeError: 'NoneType' object has no attribute 'find'

In [302]:
urls[642]

'https://www.basketball-reference.com/players/b/bryanth01.html'

In [207]:
urls2 = urls[:642]+urls[643:] #642 is a broken page
len(urls2)

688

In [213]:
#pickle last set of players
for url in urls2[600:]:
    data = scrape(url)
    data_list.extend(data)
    time.sleep(0.5+2*random.random())
    
last = pd.DataFrame(data_list)
with open('players 688.pkl', 'wb') as picklefile:
    pickle.dump(df, picklefile)        
    

Jimmer Fredette
Avery Bradley
Maurice Harkless
Andrew Harrison
T.J. Leaf
Donald Sloan
Mike Conley
Randy Foye
Meyers Leonard
MarShon Brooks
Jacob Pullen
Travis Wear
Jeremy Lin
Harrison Barnes
Markieff Morris
Tyler Johnson
Christian Wood
Kendall Marshall
Jarrett Allen
Ray McCallum
Abdel Nader
Antonio Blakeney
Ekpe Udoh
Jeff Ayres
Ryan Hollins
Gary Neal
Buddy Hield
Luke Babbitt
LaMarcus Aldridge
Draymond Green
Kelly Olynyk
Glenn Robinson III
Greg Monroe
Wilson Chandler
Nazr Mohammed
James Young
Scotty Hopson
Marcin Gortat
Dwyane Wade
Jared Sullinger
Al Horford
Dahntay Jones
Royce O'Neale
John Collins
DeAndre Jordan
Chris Paul
David West
Khem Birch
Ronnie Price
Amir Johnson
Montrezl Harrell
Mangok Mathiang
CJ McCollum
Derrick White
Spencer Hawes
Tyler Hansbrough
Omer Asik
Chris McCullough
Kyle Anderson
Marvin Williams
Dwight Powell
D.J. Augustin
Rondae Hollis-Jefferson
Festus Ezeli
Tarik Black
Jason Thompson
Jordan McRae
Patrick Beverley
Josh McRoberts
Alex Stepheson
James Anderson
Terrenc

# Combine scraped data into dataframe

In [3]:
with open('players 200.pkl', 'rb') as picklefile:
    data1 = pickle.load(picklefile)

In [4]:
with open('players 400.pkl', 'rb') as picklefile:
    data2 = pickle.load(picklefile)

In [5]:
with open('players 600.pkl', 'rb') as picklefile:
    data3 = pickle.load(picklefile)

In [6]:
with open('players 688.pkl', 'rb') as picklefile:
    data4 = pickle.load(picklefile)

In [7]:
active_since_2016 = pd.concat([data1, data2, data3, data4])

In [8]:
active_since_2016.columns

Index(['Name', 'Height', 'Weight', 'Draft Position', 'Career Earnings',
       'Current Season(C)', 'C_Experience', 'C_Age', 'C_Tm', 'C_Lg',
       ...
       'Ca_USG%', 'Ca_ ', 'Ca_OWS', 'Ca_DWS', 'Ca_WS', 'Ca_WS/48', 'Ca_OBPM',
       'Ca_DBPM', 'Ca_BPM', 'Ca_VORP'],
      dtype='object', length=161)

In [9]:
active_since_2016.shape

(1950, 161)

In [10]:
active_since_2016.describe()

Unnamed: 0,Height,Weight,Draft Position,Career Earnings,C_Experience,C_Salary,P_Experience,P_Salary
count,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0,1579.0,1579.0
mean,78.936923,218.988718,17.401026,29819170.0,5.581026,5054074.0,5.657378,5096517.0
std,3.403831,26.296624,16.094711,46491820.0,4.387742,6256305.0,4.205407,5685953.0
min,69.0,150.0,0.0,0.0,1.0,0.0,1.0,0.0
25%,77.0,200.0,2.0,2052342.0,2.0,981312.0,2.0,1128300.0
50%,79.0,220.0,14.0,11032160.0,4.0,2365560.0,5.0,2978250.0
75%,81.0,240.0,28.0,39227040.0,8.0,6500000.0,8.0,6583500.0
max,87.0,307.0,60.0,343872400.0,21.0,34682550.0,20.0,30963450.0


In [11]:
#dataframe check
#active_since_2016.info()
#active_since_2016.dtypes

### Cleaning dataframe

#### Convert numeric columns from string

In [12]:
col_names = ['C_Experience', 'C_Age', 'C_Tm', 'C_Lg', 'C_Pos', 'C_G', 'C_GS', 'C_MP', 'C_FG', 'C_FGA', 'C_FG%', 'C_3P', 'C_3PA', 'C_3P%', 'C_2P', 'C_2PA', 'C_2P%', 'C_eFG%', 'C_FT', 'C_FTA', 'C_FT%', 'C_ORB', 'C_DRB', 'C_TRB', 'C_AST', 'C_STL', 'C_BLK', 'C_TOV', 'C_PF', 'C_PTS', 'C_Age', 'C_Tm', 'C_Lg', 'C_Pos', 'C_G', 'C_MP', 'C_PER', 'C_TS%', 'C_3PAr', 'C_FTr', 'C_ORB%', 'C_DRB%', 'C_TRB%', 'C_AST%', 'C_STL%', 'C_BLK%', 'C_TOV%', 'C_USG%', 'C_\xa0', 'C_OWS', 'C_DWS', 'C_WS', 'C_WS/48', 'C_\xa0', 'C_OBPM', 'C_DBPM', 'C_BPM', 'C_VORP', 'C_Salary', 'Prev_Season(P)', 'P_Experience', 'P_Age', 'P_Tm', 'P_Lg', 'P_Pos', 'P_G', 'P_GS', 'P_MP', 'P_FG', 'P_FGA', 'P_FG%', 'P_3P', 'P_3PA', 'P_3P%', 'P_2P', 'P_2PA', 'P_2P%', 'P_eFG%', 'P_FT', 'P_FTA', 'P_FT%', 'P_ORB', 'P_DRB', 'P_TRB', 'P_AST', 'P_STL', 'P_BLK', 'P_TOV', 'P_PF', 'P_PTS', 'P_Age', 'P_Tm', 'P_Lg', 'P_Pos', 'P_G', 'P_MP', 'P_PER', 'P_TS%', 'P_3PAr', 'P_FTr', 'P_ORB%', 'P_DRB%', 'P_TRB%', 'P_AST%', 'P_STL%', 'P_BLK%', 'P_TOV%', 'P_USG%', 'P_\xa0', 'P_OWS', 'P_DWS', 'P_WS', 'P_WS/48', 'P_\xa0', 'P_OBPM', 'P_DBPM', 'P_BPM', 'P_VORP', 'P_Salary', 'Ca_Age', 'Ca_Tm', 'Ca_Lg', 'Ca_Pos', 'Ca_G', 'Ca_GS', 'Ca_MP', 'Ca_FG', 'Ca_FGA', 'Ca_FG%', 'Ca_3P', 'Ca_3PA', 'Ca_3P%', 'Ca_2P', 'Ca_2PA', 'Ca_2P%', 'Ca_eFG%', 'Ca_FT', 'Ca_FTA', 'Ca_FT%', 'Ca_ORB', 'Ca_DRB', 'Ca_TRB', 'Ca_AST', 'Ca_STL', 'Ca_BLK', 'Ca_TOV', 'Ca_PF', 'Ca_PTS', 'Ca_Age', 'Ca_Tm', 'Ca_Lg', 'Ca_Pos', 'Ca_G', 'Ca_MP', 'Ca_PER', 'Ca_TS%', 'Ca_3PAr', 'Ca_FTr', 'Ca_ORB%', 'Ca_DRB%', 'Ca_TRB%', 'Ca_AST%', 'Ca_STL%', 'Ca_BLK%', 'Ca_TOV%', 'Ca_USG%', 'Ca_\xa0', 'Ca_OWS', 'Ca_DWS', 'Ca_WS', 'Ca_WS/48', 'Ca_\xa0', 'Ca_OBPM', 'Ca_DBPM', 'Ca_BPM', 'Ca_VORP']

In [13]:
for col in col_names:
    active_since_2016[col] = pd.to_numeric(active_since_2016[col], errors='ignore')
        

#### Remove empty cols

In [14]:
all_cols = active_since_2016.columns
len(all_cols)

161

In [15]:
all_cols[150:161]

Index(['Ca_TOV%', 'Ca_USG%', 'Ca_ ', 'Ca_OWS', 'Ca_DWS', 'Ca_WS', 'Ca_WS/48',
       'Ca_OBPM', 'Ca_DBPM', 'Ca_BPM', 'Ca_VORP'],
      dtype='object')

In [16]:
empty_cols = [col for col in all_cols if '\xa0' in col]
empty_cols

['C_\xa0', 'P_\xa0', 'Ca_\xa0']

In [17]:
clean_active = active_since_2016.drop(empty_cols, axis=1)

In [18]:
clean_active.shape

(1950, 158)

In [19]:
clean_active.head()

Unnamed: 0,Name,Height,Weight,Draft Position,Career Earnings,Current Season(C),C_Experience,C_Age,C_Tm,C_Lg,...,Ca_TOV%,Ca_USG%,Ca_OWS,Ca_DWS,Ca_WS,Ca_WS/48,Ca_OBPM,Ca_DBPM,Ca_BPM,Ca_VORP
0,Mike Miller,80,218,5,93176913,2016-17,17,36,DEN,NBA,...,14.1,18.2,39.2,21.6,60.7,0.105,1.5,-0.6,0.9,20.3
1,Mike Miller,80,218,5,93176913,2015-16,16,35,DEN,NBA,...,14.1,18.2,39.2,21.6,60.7,0.105,1.5,-0.6,0.9,20.3
2,Mike Miller,80,218,5,93176913,2014-15,15,34,CLE,NBA,...,14.1,18.2,39.2,21.6,60.7,0.105,1.5,-0.6,0.9,20.3
3,Fred VanVleet,72,195,0,543471,2017-18,2,23,TOR,NBA,...,11.1,19.4,2.6,2.4,5.0,0.131,1.0,0.1,1.1,1.4
4,Fred VanVleet,72,195,0,543471,2016-17,1,22,TOR,NBA,...,11.1,19.4,2.6,2.4,5.0,0.131,1.0,0.1,1.1,1.4


#### Filter for past three seasons only

In [20]:
three_seasons = ['2015-16', '2016-17', '2017-18']

In [21]:
last_three = clean_active[clean_active['Current Season(C)'].isin(three_seasons)]

In [22]:
last_three.shape

(1711, 158)

# Pickle 'clean' dataframe for analysis

In [23]:
with open('last_three_seasons.pkl', 'wb') as picklefile:
    pickle.dump(last_three, picklefile)    