In [3]:
# print autosklearn version
import autosklearn
print('auto-sklearn: %s' % autosklearn.__version__)

auto-sklearn: 0.12.7


In [4]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from autosklearn.classification import AutoSklearnClassifier

import pandas as pd

In [5]:
data = 'blue/Pitch TRAINING Data V3 - Final.csv'

df = pd.read_csv(data)

In [6]:
df.dtypes

player_number          int64
player_name           object
pitch_type            object
release_speed        float64
release_spin_rate    float64
zone                   int64
event                 object
description           object
dtype: object

In [7]:
# check for missing values

df.isnull().sum()

player_number         0
player_name           0
pitch_type            0
release_speed         0
release_spin_rate    49
zone                  0
event                 0
description           0
dtype: int64

In [8]:
categorical = [var for var in df.columns if df[var].dtype=='O']

print('There are {} categorical variables\n'.format(len(categorical)))

print('The categorical variables are :\n\n', categorical)

There are 4 categorical variables

The categorical variables are :

 ['player_name', 'pitch_type', 'event', 'description']


In [9]:
numerical = [var for var in df.columns if df[var].dtype!='O']

print('There are {} numerical variables\n'.format(len(numerical)))

print('The numerical variables are :\n\n', numerical)

There are 4 numerical variables

The numerical variables are :

 ['player_number', 'release_speed', 'release_spin_rate', 'zone']


In [10]:
# Dropping 'player_number' as we have created the dummies for it
df.drop(['player_number'], axis = 1, inplace = True)

df

Unnamed: 0,player_name,pitch_type,release_speed,release_spin_rate,zone,event,description
0,"Wainwright, Adam",CU,75.1,2717.0,12,walk,ball
1,"Wainwright, Adam",FC,83.3,2243.0,11,walk,ball
2,"Wainwright, Adam",FF,88.7,2204.0,12,walk,ball
3,"Wainwright, Adam",FF,89.5,2182.0,11,walk,ball
4,"Wainwright, Adam",FF,88.1,2086.0,11,walk,ball
...,...,...,...,...,...,...,...
39966,"Yamaguchi, Shun",FF,88.8,2076.0,2,strikeout,swinging_strike
39967,"Yamaguchi, Shun",FF,92.6,2150.0,2,strikeout,swinging_strike
39968,"Yamaguchi, Shun",FF,93.5,2215.0,3,strikeout,swinging_strike
39969,"Yamaguchi, Shun",FF,93.0,2169.0,5,strikeout,swinging_strike


In [11]:
# Creating dummy variable
zones = pd.get_dummies(df['zone'])

#Display zones
zones

Unnamed: 0,1,2,3,4,5,6,7,8,9,11,12,13,14
0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
39966,0,1,0,0,0,0,0,0,0,0,0,0,0
39967,0,1,0,0,0,0,0,0,0,0,0,0,0
39968,0,0,1,0,0,0,0,0,0,0,0,0,0
39969,0,0,0,0,1,0,0,0,0,0,0,0,0


In [12]:
# One-hot encode the data using pandas get_dummies
df = pd.get_dummies(df)

# Dropping 'zone' as we have created the dummies for it
df.drop(['zone'], axis = 1, inplace = True)

df

Unnamed: 0,release_speed,release_spin_rate,"player_name_Abreu, Albert","player_name_Abreu, Bryan","player_name_Adam, Jason","player_name_Adams, Austin","player_name_Adams, Chance","player_name_Adrianza, Ehire","player_name_Akin, Keegan","player_name_Alcala, Jorge",...,event_home_run,event_sac_fly,event_single,event_strikeout,event_triple,event_walk,description_ball,description_called_strike,description_hit_into_play,description_swinging_strike
0,75.1,2717.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
1,83.3,2243.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
2,88.7,2204.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
3,89.5,2182.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
4,88.1,2086.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39966,88.8,2076.0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
39967,92.6,2150.0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
39968,93.5,2215.0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
39969,93.0,2169.0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


In [13]:
X = df.drop(['event_home_run'], axis=1)

y = df['event_home_run']

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [15]:
# define search
model = AutoSklearnClassifier(time_left_for_this_task=5*60, per_run_time_limit=30)
# perform the search
model.fit(X_train, y_train)

AutoSklearnClassifier(per_run_time_limit=30, time_left_for_this_task=300)

In [16]:
# summarize
print(model.sprint_statistics())
# evaluate best model
y_hat = model.predict(X_test)
acc = accuracy_score(y_test, y_hat)
print("Accuracy: %.3f" % acc)

auto-sklearn results:
  Dataset name: d7c838dd-222d-11ec-b323-a0423f3cf3b2
  Metric: accuracy
  Best validation score: 1.000000
  Number of target algorithm runs: 12
  Number of successful target algorithm runs: 5
  Number of crashed target algorithm runs: 0
  Number of target algorithms that exceeded the time limit: 6
  Number of target algorithms that exceeded the memory limit: 1

Accuracy: 0.999


In [17]:
y_hat

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

In [18]:
y_test

13644    0
22727    0
33987    0
11232    0
4441     0
        ..
37052    0
20008    0
35615    0
12983    0
3549     0
Name: event_home_run, Length: 11992, dtype: uint8

In [19]:
from sklearn import metrics
metrics.precision_score(y_test, y_hat)

1.0

In [20]:
metrics.recall_score(y_test, y_hat)

0.9889705882352942

In [21]:
metrics.f1_score(y_test, y_hat)

0.9944547134935305

In [22]:
metrics.roc_auc_score(y_test, y_hat)

0.9944852941176471

In [23]:
metrics.fbeta_score(y_test, y_hat, beta=1)

0.9944547134935305