# MLBNN

## Data preprocessing 

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Loading data

In [2]:
# Location of the raw CSV, relative to the notebook's working directory.
data_path = "data/dataSet.csv"

# Read the whole file into a DataFrame using pandas' default parsing options.
data = pd.read_csv(filepath_or_buffer=data_path)

In [3]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
data.describe()

Unnamed: 0,inningNo,ana,lan,bos,tor,cha,sdn,cle,tex,col,...,SacFly,Walk,Forceout,SacBunt,PopOut,FieldError,RunnerOut,IntentWalk,DoublePlay,FieldersChoiceOut
count,23474.0,23474.0,23474.0,23474.0,23474.0,23474.0,23474.0,23474.0,23474.0,23474.0,...,23474.0,23474.0,23474.0,23474.0,23474.0,23474.0,23474.0,23474.0,23474.0,23474.0
mean,4.581537,0.034634,0.033526,0.038724,0.035017,0.036849,0.027903,0.029778,0.036253,0.035699,...,0.006433,0.082304,0.020746,0.005708,0.044816,0.009585,0.002556,0.00426,0.002684,0.001491
std,2.397353,0.182855,0.180011,0.19294,0.183828,0.188396,0.164699,0.169977,0.186923,0.185543,...,0.079947,0.274833,0.142537,0.07534,0.206903,0.097435,0.050494,0.065131,0.051737,0.038586
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,15.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [4]:
# Preview the rows labelled 0 through 5 (a ``.loc`` label slice includes its
# end label) for every column from 'isHome' through the last column.
data.loc[:5,'isHome':]

Unnamed: 0,isHome,batterHeight,batterHand,pitcherHand,homeTeamRuns,awayTeamRuns,runnersOn,avgFTspeed,avgZone,Lineout,...,SacFly,Walk,Forceout,SacBunt,PopOut,FieldError,RunnerOut,IntentWalk,DoublePlay,FieldersChoiceOut
0,False,0.72,1,1,0,0,0,96.0,12,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0.71,1,1,0,0,0,96.43,2,0,...,0,0,0,0,0,0,0,0,0,0
2,False,0.74,0,1,0,0,1,96.0,2,0,...,0,0,0,0,0,0,0,0,0,0
3,False,0.72,1,1,0,0,1,96.07,12,0,...,0,0,1,0,0,0,0,0,0,0
4,True,0.74,0,1,0,0,0,95.44,0,0,...,0,0,0,0,0,0,0,0,0,0
5,True,0.7,1,1,0,0,0,95.0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Scale the inning number down by a factor of 10 (inning 1 -> 0.1).
# The column is overwritten, so running this cell again divides by 10 again;
# re-run the loading cell first if this cell has to be repeated.
data['inningNo'] = data['inningNo'] / 10
data['inningNo'].head()

0    0.1
1    0.1
2    0.1
3    0.1
4    0.1
Name: inningNo, dtype: float64

In [6]:
# Encode the boolean isHome flag as an integer: False -> 0, True -> 1.
#
# A vectorised cast replaces the earlier per-element `x is False` identity
# test. That test is only true for the Python singleton False, so:
#   * re-running the cell mapped the already-converted 0 to 1 (`0 is False`
#     is False), turning every row into 1;
#   * elements delivered as numpy.bool_ would also all have become 1.
# The cast is idempotent: running it on an already-integer column is a no-op.
# NOTE(review): unlike the lambda, a missing value here raises rather than
# being silently encoded as 1 -- confirm the column has no NaNs.
data['isHome'] = data['isHome'].astype('int64')
data['isHome'].head()

0    0
1    0
2    0
3    0
4    1
Name: isHome, dtype: int64

In [7]:
# Scale avgFTspeed down by a factor of 100 (96.0 -> 0.96).
# The column is overwritten, so running this cell again divides by 100 again;
# re-run the loading cell first if this cell has to be repeated.
data['avgFTspeed'] = data['avgFTspeed'] / 100
data['avgFTspeed'].head()

0    0.9600
1    0.9643
2    0.9600
3    0.9607
4    0.9544
Name: avgFTspeed, dtype: float64

In [8]:
# One-hot encode avgZone: one indicator column per distinct value, named
# zone_<value>. drop_first=False keeps an indicator for every level.
# dtype=int pins the indicators to 0/1 integers; without it pandas >= 2.0
# returns booleans, which would end up as True/False in the saved CSV.
dummies = pd.get_dummies(data['avgZone'], prefix="zone", drop_first=False, dtype=int)
data = pd.concat([data, dummies], axis=1)

# The original avgZone column is now represented by the indicator columns.
# Dropping it means this cell raises a KeyError if run a second time without
# reloading the data.
fields_to_drop = ['avgZone']
data = data.drop(columns=fields_to_drop)
data.head()

Unnamed: 0,inningNo,ana,lan,bos,tor,cha,sdn,cle,tex,col,...,zone_4,zone_5,zone_6,zone_7,zone_8,zone_9,zone_11,zone_12,zone_13,zone_14
0,0.1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0.1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0.1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Saving processed data

In [9]:
# Persist the preprocessed frame. The row index is written as the first,
# unnamed column (the to_csv default, spelled out here).
# NOTE(review): a plain pd.read_csv of this file will surface that column as
# 'Unnamed: 0' -- confirm whether downstream readers rely on it.
data.to_csv('data/processedDataset.csv', index=True)