In [69]:
import pandas as pd
import basketball_reference_scraper
from basketball_reference_scraper.players import get_stats, get_game_logs

from urllib.request import urlopen
from bs4 import BeautifulSoup
import math
import multiprocessing
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# URL page we will scraping (see image above)
url = "https://www.basketball-reference.com/awards/all_defense_by_player.html"
# this is the HTML from the given URL
html = urlopen(url)
soup = BeautifulSoup(html)

In [3]:
rows = soup.findAll('tr')[1:]
all_defensive_names = [rows[i].findAll('td')[1].getText() for i in range(1, len(rows))]

In [4]:
all_defensive_names

['Tim Duncan',
 'Kobe Bryant',
 'Kevin Garnett',
 'Kareem Abdul-Jabbar',
 'Bobby Jones',
 'Scottie Pippen',
 'Dennis Johnson',
 'Michael Jordan',
 'Jason Kidd',
 'Hakeem Olajuwon',
 'Chris Paul',
 'Gary Payton',
 'Bruce Bowen',
 'Michael Cooper',
 'John Havlicek',
 'David Robinson',
 'Dennis Rodman',
 'Norm Van Lier',
 'Walt Frazier',
 'Tony Allen',
 'Mookie Blaylock',
 'Don Buse',
 'Dave DeBusschere',
 'LeBron James',
 'Kevin McHale',
 'Dikembe Mutombo',
 'Alvin Robertson',
 'Jerry Sloan',
 'Ben Wallace',
 'Don Chaney',
 'Maurice Cheeks',
 'Joe Dumars',
 'Mark Eaton',
 'Artis Gilmore',
 'Draymond Green',
 'Dwight Howard',
 'Kawhi Leonard',
 'Sidney Moncrief',
 'Dan Roundfield',
 'Paul Silas',
 'John Stockton',
 'Nate Thurmond',
 'Jerry West',
 'Quinn Buckner',
 'Jimmy Butler',
 'Marcus Camby',
 'Doug Christie',
 'Paul George',
 'Horace Grant',
 'Karl Malone',
 'Tayshaun Prince',
 'Rajon Rondo',
 'Buck Williams',
 'Metta World Peace',
 'Larry Bird',
 'P.J. Brown',
 'Tyson Chandler',
 '

In [5]:
players = pd.read_csv('nba_rookies.csv')
players = players.dropna()
rookie_names = players['Name'].values

In [35]:
def rookie_frame(range_list, return_dict):
    rookie_dataframe = get_stats(rookie_names[range_list[0]], stat_type='ADVANCED', playoffs=False, career=False)[0:1]
    rookie_dataframe['Name'] = rookie_names[range_list[0]]
    
    if rookie_names[range_list[0]] in all_defensive_names:
        rookie_dataframe['all_defensive'] = 1
    else:
        rookie_dataframe['all_defensive'] = 0
    
    rookie_dataframe['all_defensive'] = 0
    for rookie in range (range_list[0]+1, range_list[1]):
        try:
            rookie_stats = get_stats(rookie_names[rookie], stat_type='ADVANCED', playoffs=False, career=False).loc[0]
            rookie_stats['Name'] = rookie_names[rookie]
            if rookie_names[rookie] in all_defensive_names:
                rookie_stats['all_defensive'] = 1
            else:
                rookie_stats['all_defensive'] = 0
            rookie_dataframe = rookie_dataframe.append(rookie_stats, ignore_index=True)
        #Due to name formatting some players can't be found
        except:
            pass
    return_dict[rookie_names[range_list[0]]] = rookie_dataframe
    return rookie_dataframe

In [None]:
split = int(len(rookie_names)/5)
names_split = [(0,split), (split, split*2), (split*2, split*3), (split*3, split*4), (split*4, 1538)]
processes = []
for name_range in names_split:
    manager = multiprocessing.Manager()
    return_dict = manager.dict()
    p = multiprocessing.Process(target=rookie_frame, args=(name_range, return_dict))
    processes.append(p)
    p.start()
for process in processes:
    process.join()
for x in range(len((return_dict.values()))):
    return_dict.values()[x].to_csv(f'{x}'.csv)

In [40]:
csv_list = ['307.csv', '614.csv', '921.csv', '1228.csv']
train_df = pd.read_csv('0.csv')
for x in csv_list:
    train_df = train_df.append(pd.read_csv(x), ignore_index=True)
    

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [41]:
#Decided to leave teams as that may have an impact
train_df = train_df.drop(['Unnamed: 0','TEAM', 'AGE', 'Name', 'SEASON', 'LEAGUE', 'POS'], axis=1)

In [49]:
train_df.to_csv('train.csv',index=False)

In [102]:
example = get_stats('Ben Simmons', stat_type='ADVANCED', playoffs=False, career=False)[:1].drop([ 'AGE', 'SEASON', 'LEAGUE', 'POS', 'TEAM'], axis=1)

In [103]:
example

Unnamed: 0,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
0,81,2732,20.0,0.557,0.011,0.342,5.9,19.5,13.0,37.4,...,19.5,22.3,4.2,5.0,9.2,0.162,2.0,2.5,4.5,4.5


In [79]:
rf = RandomForestClassifier(n_estimators=300, max_depth=3)
X = train_df.copy().dropna()
Y = X['all_defensive']
X.drop(['all_defensive'], axis=1, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9495192307692307


In [58]:
feature_importances = pd.DataFrame(rf.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)

In [101]:
metrics.confusion_matrix(y_test, y_pred)

array([[394,   0],
       [ 21,   1]], dtype=int64)

In [100]:
train_df['all_defensive'].value_counts()

0    1331
1      91
Name: all_defensive, dtype: int64