# Technical Notebook
This notebook highlights the technical elements of this experiment to include six major steps:
- Acquisition, cleaning, and feature engineering for MLB pitchers
- Acquisition, cleaning, and feature engineering for MLB batters
- Joining pitcher and batter data
- Preprocessing for modeling
- Model selection and hyperparameter tuning
- Model assessment

## Import packages

In [None]:
import pandas as pd
import numpy as np
import pickle
import time
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, OrdinalEncoder, StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from yellowbrick.cluster import KElbowVisualizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, train_test_split
from yellowbrick.classifier import ConfusionMatrix, confusion_matrix
from yellowbrick.model_selection import ValidationCurve
from yellowbrick.features import RadViz
from imblearn.under_sampling import RandomUnderSampler
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

## Step 1 and 2: Pitcher and batter data acquisition
The processes for scraping pitcher and batter data are similar.  In the interest of brevity, only the code for scraping batter data is included below.  The process for pitchers is identical aside from the urls referenced.

In [None]:
def url_per_team(front_url, team, back_url):
    """Build the Baseball Savant url for a single team.

    The three pieces are joined in order (front, team, back) and every space
    character is then removed from the joined text.

    Parameters
    ----------
    front_url : str
        Portion of the url that precedes the team abbreviation.
    team : str
        Team abbreviation placed between the two url pieces.
    back_url : str
        Portion of the url that follows the team abbreviation.

    Returns
    -------
    str
        The joined url with all spaces stripped out.
    """
    return ''.join((front_url, team, back_url)).replace(' ', '')

In [None]:
# Create url components required to reference baseballsavant's data
# front_url is everything before the team abbreviation; it ends with 'team=' so the
# abbreviation can be appended directly.
# The trailing backslashes sit inside the string literal, so each continuation line's leading
# spaces become part of the string; url_per_team removes all spaces when it builds the url.
front_url = 'https://baseballsavant.mlb.com/statcast_search/csv?all=true&\
                hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7C&hfC=&hfSea=2019%7C&\
                hfSit=&player_type=batter&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&\
                game_date_gt=&game_date_lt=&hfInfield=&team='

# back_url is everything after the team abbreviation (grouping, sorting and the chk_stats_*
# query parameters). It is continued across lines the same way as front_url.
back_url = '&position=&hfOutfield=&hfRO=&\
                home_road=&hfFlag=&hfPull=&metric_1=&hfInn=&min_pitches=0&min_results=0&\
                group_by=name-date&sort_col=ba&player_event_sort=h_launch_speed&sort_order=desc&\
                min_pas=0&chk_stats_pa=on&chk_stats_abs=on&chk_stats_hits=on&chk_stats_singles=on&\
                chk_stats_dbls=on&chk_stats_triples=on&chk_stats_hrs=on&chk_stats_so=on&\
                chk_stats_k_percent=on&chk_stats_bb=on&chk_stats_bb_percent=on&chk_stats_babip=on&\
                chk_stats_iso=on&chk_stats_ba=on&chk_stats_xba=on&chk_stats_xbadiff=on&chk_stats_slg=on&\
                chk_stats_xslg=on&chk_stats_xslgdiff=on&chk_stats_woba=on&chk_stats_xwoba=on&chk_stats_wobadiff=on&'

# The 30 team abbreviations inserted between front_url and back_url, one request per team
teams = ['LAA', 'HOU', 'OAK', 'TOR', 'ATL', 'MIL', 'STL', 'CHC', 'ARI', 'LAD', 'SF', 'CLE', 'SEA', \
            'MIA', 'NYM', 'WSH', 'BAL', 'SD', 'PHI', 'PIT', 'TEX', 'TB', 'BOS', 'CIN', 'COL', 'KC', \
            'DET', 'MIN', 'CWS', 'NYY']

In [None]:
# Map each lowercase team abbreviation to that team's Baseball Savant url
team_urls_dict = {}

for team in teams:
    # The url itself is built from the abbreviation as listed in `teams`; only the key is lowercased
    team_urls_dict[team.lower()] = url_per_team(front_url, team, back_url)

In [None]:
def get_batters_data(team, url):
    """Read one team's batter csv and tag every row with the team.

    Parameters
    ----------
    team : str
        Team identifier written into a 'team' column on every row (the calling
        loop in this notebook passes the lowercase abbreviation).
    url : str or file-like
        Location of the csv; handed to ``pd.read_csv`` unchanged.

    Returns
    -------
    tuple
        ``(team, batter_df)`` where ``batter_df`` is the csv contents plus the
        'team' column.
    """
    team_batters = pd.read_csv(url).assign(team=team)
    return team, team_batters

In [None]:
# Create dictionary with which to save all batter dataframes acquired from baseball savant
batter_df_dict = {}
# Teams whose request failed, so they can be retried without rerunning the whole scrape
failed_teams = []
total_teams = len(team_urls_dict)

for position, (team_id, team_url) in enumerate(team_urls_dict.items(), start=1):
    try:
        team_id, batter_df = get_batters_data(team_id, team_url)
    except OSError as err:
        # urllib's HTTPError/URLError (e.g. a 502 response) derive from OSError.
        # Record the failure and keep going instead of losing the rest of the league.
        failed_teams.append(team_id)
        print('Could not get batter data for {}: {}'.format(team_id, err))
    else:
        batter_df_dict[team_id] = batter_df
        print('Just snagged batter data for {}, {} of {} teams remaining...'.format(
            team_id, total_teams - position, total_teams))

    # Pause between requests, but not after the final one
    if position < total_teams:
        time.sleep(30)

# The original run hit a 502 error after snagging data for half the league (15 teams).
# NOTE: this cell performs live web requests every time it is executed; skip it when the
# pickled data is already available.

In [None]:
# Concatenate every team's DataFrame in batter_df_dict into one master DataFrame.
# A single concat over all values does not rely on 'laa' being the first key and avoids
# re-copying the accumulated frame once per team.
all_batters_df = pd.concat(list(batter_df_dict.values()))
        
# After exporting pickled data, we move on to step 3

## Step 3: Create Batter KMeans Clusters and Join pitcher/batter data
Here we create KMeans clusters for batters and join all our data together

In [None]:
# With the batter data imported, project the features onto two principal components and
# assign each hitter to one of four KMeans clusters
# NOTE(review): X_ss is not created in the cells shown here; presumably the standard-scaled
# batter features - confirm against the source notebook
pca = PCA(n_components=2, random_state=31)
X_redux = pca.fit_transform(X_ss)

km = KMeans(n_clusters=4, random_state=31)
clusters = km.fit(X_redux)
labels = km.labels_

# Scatter the two components, coloured by cluster label
sns.set(font_scale=1.5)
plt.figure(figsize=(16, 8))
cluster_ax = sns.scatterplot(x=X_redux[:, 0],
                             y=X_redux[:, 1],
                             hue=labels,
                             palette='colorblind')
fig = cluster_ax.set_title('K-Means Clustering for MLB Hitters');

In [None]:
# Join pitcher and batter data: inner join matching each pitch's game date and batter
# to the batter table's shift date and player id
pitcher_keys = ['game_date', 'batter']
batter_keys = ['shift_date', 'player_id']

pb = pd.merge(pitchers, batters, how='inner', left_on=pitcher_keys, right_on=batter_keys)

## Step 4: Model Preprocessing and Train, Test, Split

In [None]:
# Group the feature columns ahead of one hot encoding and minmax scaling

# Categorical variables
cats = ['stand', 'if_fielding_alignment', 'of_fielding_alignment', 'balls_strikes', 'all_runners']

# Columns already standardized
formatted = ['nats_home1_away0', 'ba', 'slg', 'iso', 'babip']

# Every remaining column of X is a minmax variable
non_nums = cats + formatted
minmax = [col for col in X.columns if col not in non_nums]

In [None]:
# After fitting all transformations, the target data is recoded into three pitch classes:
# 0 for FF, 1 for FC/SL/CU and 2 for CH. Labels missing from the mapping are left as they are.
pitch_3_types = {'FF': 0, 'FC': 1, 'SL': 1, 'CU': 1, 'CH':2}
y_ord_3c = y

for pitch, pitch_class in pitch_3_types.items():
    # Always compare against the original labels in y, carrying forward earlier replacements
    y_ord_3c = np.where(y == pitch, pitch_class, y_ord_3c)

In [None]:
# Train, test, split data.  In the 4.01 notebook you will notice three train test splits.  These include different
# binning of the target data as different train sets were assessed prior to final selection of three classes
# 70/30 split with a fixed seed. Note the split is stratified on the original labels in y,
# not on the three-class target y_ord_3c.
X_train_3c, X_test_3c, y_train_3c, y_test_3c = train_test_split(X_trans,
                                                                y_ord_3c,
                                                                test_size = .3,
                                                                random_state = 31,
                                                                shuffle = True,
                                                                stratify = y)

## Step 5: Model Selection

In [None]:
# The cells from here on refer to the three-class split without the `_3c` suffix,
# so expose it under those names before it is used
X_train, X_test, y_train, y_test = X_train_3c, X_test_3c, y_train_3c, y_test_3c

# Due to the imbalance of pitch types in the target data, the training set is undersampled
# ('auto' resamples every class except the minority class).
# The sampler is seeded like the rest of the notebook so the draw is reproducible.
under = RandomUnderSampler(sampling_strategy='auto', random_state=31)
X_under, y_under = under.fit_resample(X_train, y_train)

In [None]:
# Create dictionary to track model performance
# Filled by the model cells as {model name: [score, mean cross validation score]}
results = {}

In [None]:
# Baseline comparison of candidate models.
# Each model is fit on the undersampled training data and stored in `results` as
# [accuracy on the data noted below, mean 5-fold cross validation accuracy].
# NOTE(review): cross_val_score refits a fresh copy of each model on folds of the full
# (imbalanced) X_train, so the CV column does not measure the undersampled fit.

# Dummy model
# NOTE(review): unlike the other models, the stored score is measured on the full training
# set rather than on the undersampled set; no CV score is recorded (0 is a placeholder)
dc = DummyClassifier(strategy = 'most_frequent')
dc.fit(X_under, y_under)
results.update({'Dummy': [dc.score(X_train, y_train), 0]})

# Logistic Regression (no CV score recorded; 0 is a placeholder)
lr = LogisticRegression(random_state = 31, max_iter = 1000)
lr.fit(X_under, y_under)
results.update({'LogReg': [lr.score(X_under, y_under), 0]})

# Decision Tree
dtree = DecisionTreeClassifier(random_state = 31)
dtree.fit(X_under, y_under)
dtree_cv = cross_val_score(dtree, X_train, y_train, cv=5, scoring= 'accuracy')
results.update({'DecisionTree': [dtree.score(X_under, y_under), np.average(dtree_cv)]})

# Random Forest
rf = RandomForestClassifier(n_jobs=-1, bootstrap=True, random_state=31)
rf.fit(X_under, y_under)
rf_cv = cross_val_score(rf, X_train, y_train, cv=5, scoring= 'accuracy', n_jobs = 8, verbose = 1)
results.update({'RandomForest': [rf.score(X_under, y_under), np.average(rf_cv)]})

# Gradient Boosting Classifier (seeded like the other models so the run is reproducible)
gbc = GradientBoostingClassifier(random_state = 31)
gbc.fit(X_under, y_under)
gbc_cv = cross_val_score(gbc, X_train, y_train, cv=5, scoring= 'accuracy', n_jobs = 8, verbose = 1)
results.update({'GradientBoosting': [gbc.score(X_under, y_under), np.average(gbc_cv)]})

# Ada Boost
# The base tree has its own name so `dc` keeps referring to the dummy model.
# It is passed positionally because its keyword is `base_estimator` in older scikit-learn
# releases and `estimator` in newer ones.
ada_base = DecisionTreeClassifier(class_weight='balanced')
ada = AdaBoostClassifier(ada_base, random_state=31)
ada.fit(X_under, y_under)
ada_cv = cross_val_score(ada, X_train, y_train, cv=5, scoring= 'accuracy', n_jobs = 8, verbose = 1)
results.update({'Ada-DT': [ada.score(X_under, y_under), np.average(ada_cv)]})

# SVC (cross_val_score falls back to the estimator's own score method, i.e. accuracy)
svc = SVC(C=.1, kernel = 'sigmoid', max_iter=1000)
svc.fit(X_under, y_under)
svc_cv = cross_val_score(svc, X_train, y_train, cv=5, n_jobs = 8, verbose = 1)
results.update({'SVC': [svc.score(X_under, y_under), np.average(svc_cv)]})

# MLP Classifier
mlp = MLPClassifier(random_state=31, max_iter=10000, hidden_layer_sizes=(500,))
mlp.fit(X_under, y_under)
mlp_cv = cross_val_score(mlp, X_train, y_train, cv=5, n_jobs = 8, verbose = 1)
results.update({'MLP': [mlp.score(X_under, y_under), np.average(mlp_cv)]})

In [None]:
# Review results: one row per model, column 0 = stored score, column 1 = mean CV score
pd.DataFrame.from_dict(results, orient='index')

### After reviewing the results, Random Forest is selected and then tuned

In [None]:
# Create model and calculate cross validation score
rf = RandomForestClassifier(n_jobs=-1, bootstrap=True, random_state=31)
rf.fit(X_under, y_under)

# Only a cell's last expression is displayed, so the scores are printed explicitly
print('Accuracy on full training set: {}'.format(rf.score(X_train, y_train)))

rf_cv = cross_val_score(rf, X_train, y_train, cv=5, scoring='accuracy', n_jobs = 8, verbose = 1)
print('Mean 5-fold CV accuracy: {}'.format(np.average(rf_cv)))

# Review confusion matrix
# Labels are listed in the order of the encoded target: 0 = FF, 1 = FC/SL/CU, 2 = CH
# NOTE(review): 'MM' is taken to be the label for the FC/SL/CU group - confirm
rf_cm = ConfusionMatrix(rf, classes=['FF', 'MM', 'CH'])

rf_cm.fit(X_under, y_under)
# NOTE(review): X_train contains the undersampled rows the model was fit on, so this matrix
# is not an out-of-sample view
rf_cm.score(X_train, y_train)

rf_cm.show(outpath = '../viz/rf__under_matrix');

In [None]:
# Create cross validation plots
# Capture cv params in dictionary: {hyperparameter: [param_name, param_range]}
# Each range is strictly increasing with no repeated values, so a validation curve
# neither fits the same setting twice nor doubles back along the x axis.

params = {'n_estimators': ['n_estimators', [1, 3, 5, 7, 9, 11, 13, 15, 20, 25, 30]],
          'max_depth': ['max_depth', [2, 5, 10, 15, 20, 25]],
          'min_samples_split': ['min_samples_split', [5, 10, 50, 100, 250, 500]],
          'min_samples_leaf': ['min_samples_leaf', [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]],
          'max_leaf_nodes': ['max_leaf_nodes', [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000,
                                               1025, 1750, 2000, 3000, 4000, 5000, 10000, 20000]]}

In [None]:
# n_estimators
fig, ax = plt.subplots(figsize=(15, 8))

# The estimator is passed positionally because the first parameter's keyword name is not
# the same in every yellowbrick release; the axes created above are handed over explicitly.
# The data is supplied once, to fit().
viz = ValidationCurve(rf,
                      param_name = params['n_estimators'][0],
                      param_range = params['n_estimators'][1],
                      ax = ax,
                      cv = 5,
                      n_jobs = 4,
                      logx = False)

# Fit and show visualization
viz.fit(X_under, y_under)
viz.show(outpath = '../viz/n_estimators');

In [None]:
# max_depth
fig, ax = plt.subplots(figsize=(15, 8))

# The estimator is passed positionally because the first parameter's keyword name is not
# the same in every yellowbrick release; the axes created above are handed over explicitly.
# The data is supplied once, to fit().
viz = ValidationCurve(rf,
                      param_name = params['max_depth'][0],
                      param_range = params['max_depth'][1],
                      ax = ax,
                      cv = 5,
                      n_jobs = 8,
                      logx = False)

# Fit and show visualization
viz.fit(X_under, y_under)
viz.show(outpath = '../viz/max_depth');

In [None]:
# min_samples_split
fig, ax = plt.subplots(figsize=(15, 8))

# The estimator is passed positionally because the first parameter's keyword name is not
# the same in every yellowbrick release; the axes created above are handed over explicitly.
# The data is supplied once, to fit().
viz = ValidationCurve(rf,
                      param_name = params['min_samples_split'][0],
                      param_range = params['min_samples_split'][1],
                      ax = ax,
                      cv = 5,
                      n_jobs = 8,
                      logx = False)

# Fit and show visualization
viz.fit(X_under, y_under)
viz.show(outpath = '../viz/min_samp_split');

In [None]:
# min_samples_leaf
fig, ax = plt.subplots(figsize=(15, 8))

# The estimator is passed positionally because the first parameter's keyword name is not
# the same in every yellowbrick release; the axes created above are handed over explicitly.
# The data is supplied once, to fit().
viz = ValidationCurve(rf,
                      param_name = params['min_samples_leaf'][0],
                      param_range = params['min_samples_leaf'][1],
                      ax = ax,
                      cv = 5,
                      n_jobs = 8,
                      logx = False)

# Fit and show visualization
viz.fit(X_under, y_under)
viz.show(outpath = '../viz/min_samp_leaf');

In [None]:
# max_leaf_nodes
fig, ax = plt.subplots(figsize=(15, 8))

# The estimator is passed positionally because the first parameter's keyword name is not
# the same in every yellowbrick release; the axes created above are handed over explicitly.
# The data is supplied once, to fit().
viz = ValidationCurve(rf,
                      param_name = params['max_leaf_nodes'][0],
                      param_range = params['max_leaf_nodes'][1],
                      ax = ax,
                      cv = 5,
                      n_jobs = 8,
                      logx = False)

# Fit and show visualization
viz.fit(X_under, y_under)
viz.show(outpath = '../viz/max_leaf_nodes');

In [None]:
# Create tuned model
rft = RandomForestClassifier(n_estimators=15,
                             max_depth=20,
                             min_samples_split=100,
                             min_samples_leaf=40,
                             max_leaf_nodes=12500,
                             random_state=31,
                             verbose=1,
                             n_jobs=8)

rft.fit(X_under, y_under)

# Only a cell's last expression is displayed, so the scores are printed explicitly
print('Tuned accuracy on full training set: {}'.format(rft.score(X_train, y_train)))

# CV for tuned model
rft_cv = cross_val_score(rft, X_train, y_train, cv=10, scoring='accuracy', n_jobs = 8, verbose = 1)
print('Tuned mean 10-fold CV accuracy: {}'.format(np.average(rft_cv)))

# Confusion matrix for tuned model
# Labels are listed in the order of the encoded target: 0 = FF, 1 = FC/SL/CU, 2 = CH
# NOTE(review): 'MM' is taken to be the label for the FC/SL/CU group - confirm
rft_cm = ConfusionMatrix(rft, classes=['FF', 'MM', 'CH'])

# Fit the visualizer on the same undersampled data the model was trained on. Depending on the
# yellowbrick release, fit() may refit the wrapped estimator, and `rft` is reused afterwards.
rft_cm.fit(X_under, y_under)
rft_cm.score(X_test, y_test)

rft_cm.show(outpath = '../viz/rf_tuned_matrix');

## Step 6: Model Assessment vs Naive Model

In [None]:
# Predict each pitcher's pitches with the tuned model and save the outcome to `models` as
# {pitcher id: [accuracy, actual pitch types, predicted pitch types]}
# NOTE(review): `models`, `unique_pitchers` and `pitches` are not created in the cells shown
# here; they are expected to exist already
total = len(unique_pitchers)

for count, pitcher in enumerate(unique_pitchers, start=1):
    pitcher_df = pitches.loc[pitches.pitcher_id == pitcher]
    y_pitcher = list(pitcher_df.pitch_type)
    # Drop the id and the target so only the model features remain
    pitcher_df = pitcher_df.drop(columns=['pitcher_id', 'pitch_type'])
    score = rft.score(pitcher_df, y_pitcher)
    preds = rft.predict(pitcher_df)
    models.update({pitcher: [score, y_pitcher, preds]})

    # Report how many pitchers are still to be processed, overwriting the same output line
    print('Just processed {}.  {} of {} pitchers remaining'.format(pitcher, total - count, total), end='\r')

# Please note, the naive model calculations are long and not included here.
# See 6.02 notebook for details

In [None]:
# Per pitch type, plot the distribution of model performance against the naive model
# This cell covers fastballs
sns.set(rc={'figure.figsize':(20, 10)}, font_scale=2)

# Reference line at zero difference
plt.axvline(0)

ff_dist = sns.distplot(final_results_ff['ff_model_less_n'],
                       bins = 18,
                       kde=False,
                       color='darkblue')
ff_dist.set_title('Model Performance vs. Naive Model with Fastballs');

ff_dist.set_xlabel('Model Minus Naive Prediction (% difference)')
ff_dist.set_ylabel('Total Pitchers')