In [63]:
import pandas as pd
import os

# WNBA Playoff Prediction

## Data Understanding

Importing datasets

In [64]:
# Load every CSV file found in the 'data' directory into a dict of DataFrames,
# keyed by the file name up to its first dot (e.g. 'teams.csv' -> 'teams').
dataframes = {}
for dataset_name in os.listdir('data'):
    if not dataset_name.endswith('.csv'):
        continue  # skip anything that is not a CSV file
    table_key = dataset_name.split(".")[0]
    csv_path = 'data/' + dataset_name
    dataframes[table_key] = pd.read_csv(csv_path)

## Data Preparation

### Select Data

Eliminating columns that contain only a single unique value (constant columns carry no information)

In [65]:
# Remove constant columns (exactly one distinct value across all rows) from
# every table. Each entry is replaced by a trimmed copy; the frame that was
# originally loaded is not modified in place.
for table_name, table in dataframes.items():
    constant_cols = [c for c in table.columns if len(pd.unique(table[c])) == 1]
    dataframes[table_name] = table.drop(columns=constant_cols)


### Integrate Data

Merging all dataframes into a single one

In [66]:
# Build a single wide table by chaining merges, mostly on the (tmID, year) key.
# Right join: every row of `teams` is kept; `teams_post` columns are attached where a match exists.
data = pd.merge(dataframes['teams_post'], dataframes['teams'], on=['tmID', 'year'], how='right')
# Inner join (the pandas default): only team-seasons with a matching `coaches` row remain.
data = pd.merge(data, dataframes['coaches'], on=['tmID', 'year'], suffixes=('_team', '_coach'))
# Left joins against `series_post`: first matching the team as the series winner, then as the loser.
# NOTE(review): both merges use the same suffixes, and a team matching several series in one
# year would duplicate rows — confirm the resulting column names and row counts are intended.
data = pd.merge(data, dataframes['series_post'], left_on=['tmID', 'year'], right_on=['tmIDWinner', 'year'], suffixes=('_team_post', '_series'), how='left')
data = pd.merge(data, dataframes['series_post'], left_on=['tmID', 'year'], right_on=['tmIDLoser', 'year'], suffixes=('_team_post', '_series'), how='left')
# Inner join with `players_teams` on the same (tmID, year) key.
data = pd.merge(data, dataframes['players_teams'], on=['tmID', 'year'], suffixes=('_team', '_player'))
# Inner join with `players`: `playerID` in the merged table is matched to `bioID`.
data = pd.merge(data, dataframes['players'], left_on='playerID', right_on='bioID')
# Left join with `awards_players` on playerID only.
# NOTE(review): the key does not include year — confirm awards should attach to every season row of a player.
data = pd.merge(data, dataframes['awards_players'], on=['playerID'], how='left')

In [67]:
# Dimensions (rows, columns) of the merged table.
data.shape

(2943, 122)

In [68]:
# Name of the column used as the prediction target.
label = "playoff"

In [69]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation


In [70]:
# Work with the `teams` table as stored in `dataframes`; preview its first three rows.
df = dataframes["teams"]
df.head(3)

Unnamed: 0,year,tmID,franchID,confID,rank,playoff,firstRound,semis,finals,name,...,GP,homeW,homeL,awayW,awayL,confW,confL,min,attend,arena
0,9,ATL,ATL,EA,7,N,,,,Atlanta Dream,...,34,1,16,3,14,2,18,6825,141379,Philips Arena
1,10,ATL,ATL,EA,2,Y,L,,,Atlanta Dream,...,34,12,5,6,11,10,12,6950,120737,Philips Arena
2,1,CHA,CHA,EA,8,N,,,,Charlotte Sting,...,32,5,11,3,13,5,16,6475,90963,Charlotte Coliseum


In [71]:
# Columns excluded from the feature set: the target itself plus "rank" and the
# three playoff-round columns.
toDrop = [label, "rank", "semis", "finals", "firstRound"]

# Drop them all in one call; a KeyError is raised if any listed column is absent.
features = df.columns.drop(toDrop)
assert len(features) < len(df.columns)

X = df[features]  # feature matrix
X

Unnamed: 0,year,tmID,franchID,confID,name,o_fgm,o_fga,o_ftm,o_fta,o_3pm,...,GP,homeW,homeL,awayW,awayL,confW,confL,min,attend,arena
0,9,ATL,ATL,EA,Atlanta Dream,895,2258,542,725,202,...,34,1,16,3,14,2,18,6825,141379,Philips Arena
1,10,ATL,ATL,EA,Atlanta Dream,1089,2428,569,755,114,...,34,12,5,6,11,10,12,6950,120737,Philips Arena
2,1,CHA,CHA,EA,Charlotte Sting,812,1903,431,577,131,...,32,5,11,3,13,5,16,6475,90963,Charlotte Coliseum
3,2,CHA,CHA,EA,Charlotte Sting,746,1780,410,528,153,...,32,11,5,7,9,15,6,6500,105525,Charlotte Coliseum
4,3,CHA,CHA,EA,Charlotte Sting,770,1790,490,663,211,...,32,11,5,7,9,12,9,6450,106670,Charlotte Coliseum
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137,6,WAS,WAS,EA,Washington Mystics,847,1968,388,546,181,...,34,10,7,6,11,9,11,6900,171501,Verizon Center
138,7,WAS,WAS,EA,Washington Mystics,1016,2199,528,715,187,...,34,13,4,5,12,12,8,6850,133255,Verizon Center
139,8,WAS,WAS,EA,Washington Mystics,877,2170,668,839,163,...,34,8,9,8,9,8,12,6900,133255,Verizon Center
140,9,WAS,WAS,EA,Washington Mystics,885,2131,435,659,163,...,34,6,11,4,13,6,14,6825,154637,Verizon Center


In [72]:
# Replace the values of every feature column with integer codes.
# The same encoder is refit on each column in turn, so after this cell it only
# holds the classes of the last column it processed.
le = LabelEncoder()
X = X.apply(le.fit_transform)
X

Unnamed: 0,year,tmID,franchID,confID,name,o_fgm,o_fga,o_ftm,o_fta,o_3pm,...,GP,homeW,homeL,awayW,awayL,confW,confL,min,attend,arena
0,8,0,0,0,0,72,113,75,85,74,...,1,0,14,2,11,0,15,9,84,11
1,9,0,0,0,0,114,125,88,96,17,...,1,10,5,5,8,8,9,15,34,11
2,0,1,1,0,1,39,30,24,25,30,...,0,3,11,2,10,3,13,3,4,4
3,1,1,1,0,1,9,6,15,6,46,...,0,9,5,6,6,13,3,4,11,4
4,2,1,1,0,1,21,7,52,62,79,...,0,9,5,6,6,10,6,2,12,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137,5,19,17,0,19,53,51,9,14,63,...,1,8,7,5,8,7,8,12,121,21
138,6,19,17,0,19,110,101,68,80,67,...,1,11,4,4,9,10,5,10,58,21
139,7,19,17,0,19,66,92,103,107,50,...,1,6,9,7,6,6,9,12,58,21
140,8,19,17,0,19,68,84,26,60,50,...,1,4,11,3,10,4,11,9,108,21


In [73]:
y  =df[label]  # Target column (the `label` column of `df`)
y.head(3)

0    N
1    Y
2    N
Name: playoff, dtype: object

In [74]:
# Encode the target as integers: "N" becomes 0 and "Y" becomes 1.
# This refits the `le` encoder created earlier, so `le` now holds the target classes.
y = le.fit_transform(y)
y

array([0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 1])

In [75]:
# Hold out 30% of the rows as the test set and train on the remaining 70%;
# the fixed random_state keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [76]:
# Create the Decision Tree classifier object.
# random_state is fixed so the fitted tree, and therefore the reported accuracy,
# is reproducible: without it, equally good candidate splits are chosen at
# random and results can differ from run to run.
clf = DecisionTreeClassifier(random_state=1)

# Train the Decision Tree classifier on the training split
clf = clf.fit(X_train, y_train)

# Predict the response for the held-out test split
y_pred = clf.predict(X_test)


In [77]:
# Model accuracy: the fraction of test samples whose predicted class matches the true class.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.7906976744186046
