# Import Libraries

In [1]:
# Data Manipulation Libraries: Standard dataframes and array libraries
import pandas as pd
import numpy as np
import sqlite3 #for querying data 

# Data Visualization Libraries:
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import seaborn as sns

# Data Analysis: Statistics and Machine Learning Libraries
from scipy import stats
import scikit_posthocs as sp
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report

# Display plots inline in the Jupyter notebook
%matplotlib inline

# Update Jupyter Notebook Display Settings

In [2]:
# Render every column when a DataFrame is displayed
# (pandas collapses the middle columns of wide frames unless this is set)
pd.set_option("display.max_columns", None)

# Load Data and Prepare for Analysis

###  Step a: Import data using .read_csv()

In [None]:
# Load one CSV export per player with pd.read_csv.
# Individual files are used instead of the whole database because the
# database is a very large file; each file is a per-player subset.
bregman, rendon, soto = map(
    pd.read_csv,
    ('static/documents/bregman', 'static/documents/rendon', 'static/documents/soto'),
)

### Step b: Use .concat() method to combine player dataframes

In [None]:
# Gather the per-player dataframes in a list ...
team = [bregman, rendon, soto]
# ... and stack them row-wise into a single dataframe
result = pd.concat(objs=team)

### Step c: Replace Batter ID's with Player Last Name

In [None]:
# Map each numeric batter id to the player's last name.
# The mapped column is assigned back to the frame instead of calling
# .replace(..., inplace=True) on result['batter']: the in-place form mutates an
# intermediate column object (chained assignment), which pandas deprecates and
# which leaves `result` unchanged when Copy-on-Write is enabled.
result['batter'] = result['batter'].replace({608324: 'Bregman', 543685: 'Rendon', 665742: 'Soto'})

### Step d: Delete rows in the 'events' table with missing values
 - we are only interested in batter events and must omit rows that lack this key feature.

In [None]:
# Keep only the rows that describe a batter event (drop rows whose 'events' is NaN)
results = result.dropna(subset=['events'], how='any')

### Step e: Inspect data by using the .pivot_table() method

In [None]:
# Summary table: one row per batter, one column per event type,
# each cell holding the number of rows (missing combinations shown as 0)
pivot = results.pivot_table(
    index="batter",
    columns='events',
    aggfunc='size',
    fill_value=0,
)
pivot

### Plot Entire Season and World Series Home Run Total by player
 - Use individual dataframes
 - Use matplotlib .bar() as a data vis option

In [None]:
# Use the ggplot style for this figure
plt.style.use('ggplot')

# Player names come from the pivot index; home run totals from its 'home_run' column
x = pivot['home_run'].index.values
home_runs = pivot['home_run'].values

# One integer bar position per player
x_pos = list(range(len(x)))

# Draw the bars, then title the plot and label the axes and ticks
plt.bar(x_pos, home_runs, color='blue', alpha=0.7)
plt.title("Total Home Runs (Including World Series)")
plt.xlabel("Players")
plt.ylabel("Home Runs")
plt.xticks(x_pos, x)

# Write the figure to disk before displaying it
plt.savefig("static/images/total_home_runs.png", bbox_inches='tight')
plt.show()


Soto hit more home runs than Rendon when the playoffs and World Series are counted.

# Objective 1: Evaluate Player Batting Events

In [None]:
# Order the event categories from most to least frequent
order = result['events'].value_counts().index

# Large canvas and larger fonts so every event label stays readable
sns.set(rc={'figure.figsize':(25,15)})
sns.set(font_scale=1.5)

# One group of bars per event type, split by batter
result_events = sns.countplot(data=result, x='events', hue="batter", order=order)

# Tilt the event names so they do not overlap
tick_labels = result_events.get_xticklabels()
result_events.set_xticklabels(tick_labels, rotation=45)

fig = result_events.get_figure()
fig.savefig("static/images/batter_event_bar_plots.png", bbox_inches='tight')

In [None]:
# Number of batted events (non-null 'events' values) recorded for each player
results.groupby('batter')['events'].count()

In [None]:
# Home run rate: home runs as a percentage of each player's batted events.
# Both counts are derived from `results` rather than typed in by hand, so the
# printed rates stay correct if the underlying CSV files change.
event_counts = results.groupby('batter')['events'].count()
hr_counts = results[results['events'] == 'home_run'].groupby('batter')['events'].count()
hr_rate = hr_counts / event_counts * 100
print(f"Bregman HR rate {hr_rate['Bregman']}, Rendon HR Rate {hr_rate['Rendon']}, Soto HR Rate {hr_rate['Soto']}")

# Objective 2: Determine which player had the highest hit distance

## Obj 2 Step 1: Clean Data

In [None]:
# Clean up the data: keep only rows with a recorded hit distance
# (drop rows whose 'hit_distance_sc' is NaN)
results_hd = results.dropna(subset=['hit_distance_sc'], how='any')

In [None]:
# Restrict to home run events once, then split that subset by player
home_runs_hd = results_hd[results_hd["events"] == "home_run"]
bregman_hd = home_runs_hd[home_runs_hd["batter"] == "Bregman"]
rendon_hd = home_runs_hd[home_runs_hd["batter"] == "Rendon"]
soto_hd = home_runs_hd[home_runs_hd["batter"] == "Soto"]

In [None]:
# Stack the three per-player home run frames back into one frame
frames = [bregman_hd, rendon_hd, soto_hd]
filterd_hd = pd.concat(objs=frames)

## Obj 2 Step 2: Aggregate Descriptive Statistics

In [None]:
# Describe each player's home run hit distance and give the column a readable name
bregman_hd_metrics = (
    bregman_hd[["hit_distance_sc"]]
    .describe()
    .rename(columns={"hit_distance_sc" : "Bregman Hit Distance"})
)
rendon_hd_metrics = (
    rendon_hd[["hit_distance_sc"]]
    .describe()
    .rename(columns={"hit_distance_sc" : "Rendon Hit Distance"})
)
soto_hd_metrics = (
    soto_hd[["hit_distance_sc"]]
    .describe()
    .rename(columns={"hit_distance_sc" : "Soto Hit Distance"})
)

# Place the three summaries side by side in one table
frames = [bregman_hd_metrics, rendon_hd_metrics, soto_hd_metrics]
hd_stats = pd.concat(frames, axis=1)
hd_stats

## Obj 2 Step 3: Visualize Hit Distance 

In [None]:
# Box plot of home run hit distance, one box per player
sns.set(font_scale=2)
ax = sns.boxplot(data=filterd_hd, x="batter", y="hit_distance_sc")
ax.set_xlabel('Player')
ax.set_ylabel('Hit Distance (ft)')
plt.savefig("static/images/hit_distance_boxplot.png", bbox_inches='tight')
plt.show()

 Soto seems to have the longest hit distances, but we need to confirm whether the difference is statistically significant.

## Obj 2 Step 4: Determine if there is a statistical difference in the hit distance between players

### Obj2 Step 4a: Determine if the data is normally distributed using graphical approaches

In [None]:
# One distribution panel per player on shared axes.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot(..., kde=True) once the seaborn version in use is confirmed.
fig1, axs1 = plt.subplots(ncols=3, sharex=True, sharey=True, figsize=(20, 10))
panels = (
    (bregman_hd, 'Alex Bregman \n Hit Distance, 2019'),
    (rendon_hd, 'Anthony Rendon \n Hit Distance, 2019'),
    (soto_hd, 'Juan Soto \n Hit Distance, 2019'),
)
for axis, (player_hd, title) in zip(axs1, panels):
    sns.distplot(player_hd["hit_distance_sc"], color='tab:blue', ax=axis).set_title(title)
fig1.savefig("static/images/hit_distance_distplot.png", bbox_inches='tight')

The curves appear normally distributed, but need to follow up with a shapiro-wilk test.

### Obj2 Step 4b:  Determine if the data is normally distributed using the Shapiro-Wilk Test

In [None]:
# Shapiro-Wilk normality test (scipy.stats.shapiro) on each player's hit distance.
# Null hypothesis: the data was drawn from a normal distribution.
# Each result is unpacked into [W statistic, p-value].
breg_sw_test = list(stats.shapiro(bregman_hd["hit_distance_sc"]))
rendon_sw_test = list(stats.shapiro(rendon_hd["hit_distance_sc"]))
soto_sw_test = list(stats.shapiro(soto_hd["hit_distance_sc"]))

# One column per player, rows labelled with the statistic names
Shapiro_Wilk_df = pd.DataFrame(
    {"bregman": breg_sw_test, "rendon": rendon_sw_test, "soto": soto_sw_test},
    index=["W", "p-value"],
)

In [None]:
# The p-value for the test does not reject the Ho. Data is normally distributed
Shapiro_Wilk_df

The p-values for all the players Shapiro-Wilk test are above 0.05, which suggest that we cannot reject the null hypothesis that the samples came from a normal distribution.

### Obj2 Step 4c: Test for equal variance using Bartlett's test

In [None]:
# Bartlett's test (scipy.stats.bartlett) for equal variances across the three
# players' hit distances; chosen because the samples are treated as normally distributed
barlette_result = stats.bartlett(
    bregman_hd["hit_distance_sc"],
    rendon_hd["hit_distance_sc"],
    soto_hd["hit_distance_sc"],
)
# Element 1 of the result is the p-value
variances_look_equal = barlette_result[1] >= 0.05
print(
    "Do not reject the null hypothesis, the all input samples are from populations with equal variances."
    if variances_look_equal
    else "Reject the null hypothesis, the samples do not have equal variance"
)

The home run distance data is parametric and we can use an ANOVA to determine if there is a difference in the hit distance between players.

### Obj2 Step 4d: Conduct an ANOVA Test for Variance

In [None]:
# One-way ANOVA on hit distance across the three players
# (the normality and equal-variance checks were run in the preceding steps)
anova_hd = stats.f_oneway(
    bregman_hd["hit_distance_sc"],
    rendon_hd["hit_distance_sc"],
    soto_hd["hit_distance_sc"],
)
# Element 1 of the result is the p-value
no_difference = anova_hd[1] >= 0.05
print(
    f"The p-value is: {anova_hd[1]}. Do not reject the null hypothesis, no difference in hit distance amongst batters."
    if no_difference
    else f"The p-value is: {anova_hd[1]}. Reject the null hypothesis, there is a difference in the hit distance amongs batters."
)

### Obj2 Step 4e: Conduct a Post hoc test using Tukey's to determine which variables differ significantly

In [None]:
# Tukey post hoc test on hit distance, grouped by batter; returns the pairwise
# p-value table. `sp` (scikit_posthocs) is already imported in the notebook's
# first cell, so the redundant mid-notebook re-import has been removed.
sp.posthoc_tukey(filterd_hd, val_col='hit_distance_sc', group_col='batter')

There is a difference in the hit distance between 
 - Bregman and Rendon
 - Bregman and Soto
 - Rendon and Bregman
 - Soto and Bregman<br>
This suggests that both Rendon and Soto have a higher hit distance than Bregman, with no significant difference between Soto and Rendon.

# Objective 3: Determine the impact of launch speed and launch angle on Home Runs

## Obj 3 Step 1: Use Logistic Regression to determine if launch speed and launch angle can be used to predict home runs

### Obj 3 Step 1a: Prepare Data for Analysis

In [3]:
# Pull the three columns needed for the classifiers out of the Statcast
# SQLite database and save them as a CSV snapshot.
# NOTE(review): both paths are absolute and machine-specific; they must be
# edited (ideally made relative to the project directory) before this cell can
# run anywhere else.
conn = sqlite3.connect('/Users/trinitycisneros/Documents/Coding/bitterscientist.com/statcast.db')
try:
    season2019_df = pd.read_sql_query("SELECT events, launch_angle, launch_speed FROM statcast;", conn)
finally:
    # Always release the database handle, even if the query fails
    conn.close()
season2019_df.to_csv('/Users/trinitycisneros/Documents/Coding/bitterscientist.com/season2019_logreg')

#### Clean data (remove nan, and change datatypes)

In [4]:
# Rows with any missing value cannot be used in the analysis, so discard them
season2019_df = season2019_df.dropna(axis=0, how='any')

In [5]:
# Columns whose dtypes are changed in the next two cells:
# float_cols are cast to float, cat_col is cast to category
float_cols = ['launch_angle','launch_speed']
cat_col = ['events']

In [6]:
# Cast every column listed in float_cols to float in a single astype call
season2019_df = season2019_df.astype({name: 'float' for name in float_cols})

In [7]:
# Cast every column listed in cat_col (the events column) to the category dtype
season2019_df = season2019_df.astype({name: 'category' for name in cat_col})

In [8]:
# One-hot encode the frame with pd.get_dummies: each event category becomes
# its own indicator column named 'events_<event>'
dummy_df = pd.get_dummies(data=season2019_df)

In [9]:
# Keep the two predictors and the home run indicator as an independent copy
model_columns = ['launch_angle', 'launch_speed', 'events_home_run']
dummy_df = dummy_df[model_columns].copy()

In [None]:
# Scatter every batted event by launch speed and angle, coloured by whether it was a home run
ax = sns.scatterplot(data=dummy_df, x="launch_speed", y="launch_angle", hue="events_home_run")
# Place the legend outside the plotting area, to the right
ax.legend(loc='center left', bbox_to_anchor=(1, 0.90), ncol=1)
plt.savefig("static/images/events_home_run.png", bbox_inches='tight')

In [10]:
# Predictors (X): launch angle and launch speed; target (y): home run indicator
predictor_columns = ['launch_angle', 'launch_speed']
X = dummy_df[predictor_columns]
y = dummy_df['events_home_run']

In [17]:
# Hold out 30% of the rows for testing; stratify on y so both splits keep the
# same home run proportion, and fix the seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [18]:
# RBF-kernel support vector classifier (gamma=1), fitted on the training split
svc = svm.SVC(kernel='rbf', gamma=1)
svc.fit(X_train, y_train)

In [19]:
# Mean accuracy of the SVC on each split
train_accuracy = svc.score(X_train, y_train)
test_accuracy = svc.score(X_test, y_test)
print("Accuracy on trainig set: {:.2f}".format(train_accuracy))
print("Accuracy on test set: {:.2f}".format(test_accuracy))

Accuracy on trainig set: 0.97
Accuracy on test set: 0.97


In [29]:
# Predicted class labels from the fitted SVC for the held-out test rows
y_pred = svc.predict(X_test)

In [21]:
# Confusion matrix for the SVC predictions, flattened to a 1-D array with .ravel()
cm = confusion_matrix(y_test, y_pred).ravel()

In [22]:
cm

array([35970,   427,   658,  1403])

In [23]:
# Unpack the flattened matrix as true negatives, false positives,
# false negatives and true positives, then print the four counts
tn, fp, fn, tp = cm
print(tn, fp, fn, tp)

35970 427 658 1403


In [30]:
# Per-class precision, recall and F1 for the current y_pred
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99     36397
           1       0.77      0.68      0.72      2061

    accuracy                           0.97     38458
   macro avg       0.87      0.83      0.85     38458
weighted avg       0.97      0.97      0.97     38458



Logistic Regression

In [24]:
# Same stratified 70/30 split as used for the SVC (same seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Fit a logistic regression with the lbfgs solver and report accuracy on both splits
logreg = LogisticRegression(solver='lbfgs')
logreg.fit(X_train, y_train)
train_score = logreg.score(X_train, y_train)
test_score = logreg.score(X_test, y_test)
print("Training set score: {:.3f}".format(train_score))
print("Test set score: {:.3f}".format(test_score))

Training set score: 0.949
Test set score: 0.952


In [25]:
# Predict labels for the test rows with the fitted logistic regression,
# then report its test accuracy
y_pred = logreg.predict(X_test)
test_accuracy = logreg.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))

Accuracy of logistic regression classifier on test set: 0.95


In [26]:
# Confusion matrix for the logistic regression predictions,
# flattened to a 1-D array with .ravel()
log_cm = confusion_matrix(y_test, y_pred).ravel()

In [27]:
# Unpack the flattened matrix as true negatives, false positives,
# false negatives and true positives, then print the four counts
tn, fp, fn, tp = log_cm
print(tn, fp, fn, tp)

36082 315 1538 523


In [None]:
# Confusion matrix for the current y_pred, kept as a 2-D array for plotting
cmm = confusion_matrix(y_test, y_pred)

In [None]:
# Heatmap of the confusion matrix.
# fmt='d' prints the cell counts as plain integers; the default annotation
# format ('.2g') renders large counts in scientific notation (e.g. 3.6e+04).
ax = sns.heatmap(cmm, annot=True, fmt='d', cmap="YlGnBu")
# scikit-learn's confusion_matrix puts true labels on rows and predictions on columns
ax.set(xlabel='Predicted label', ylabel='True label')

In [None]:
# Per-class precision, recall and F1 for the current y_pred
print(classification_report(y_test, y_pred))

In [None]:
from sklearn import svm
# clf = svm.SVC(kernel='linear', C = 1.0)

In [None]:
# Fit a degree-6 polynomial-kernel SVC on the full X and y.
# The kernel name is now in plain ASCII quotes; the typographic quotes
# previously used around it made this cell a SyntaxError.
svc = svm.SVC(kernel='poly', degree=6).fit(X, y)

In [None]:
# Take the event label and the two launch measurements for the three players,
# then one-hot encode so every event type gets its own indicator column
logreg_columns = ["events", 'launch_angle', 'launch_speed']
logreg_data = results[logreg_columns]
dum_logreg_data = pd.get_dummies(data=logreg_data)

In [None]:
# Keep the predictors plus the home run indicator, then discard any row with a
# missing value because it cannot be used in the analysis
model_columns = ['launch_angle','launch_speed','events_home_run']
dum_logreg_data = dum_logreg_data[model_columns].dropna(how='any')

In [None]:
# Predictors (X): launch angle and launch speed; target (y): home run indicator.
# This rebinds X and y from the full-season data to the three-player data.
predictor_columns = ['launch_angle', 'launch_speed']
X = dum_logreg_data[predictor_columns]
y = dum_logreg_data['events_home_run']

### Obj 3 Step 1b: Conduct the logistic regression analysis and use the .train_test_split() method to evaluate the performance of the model

In [None]:
# 70/30 train/test split with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Instantiate the LogisticRegression model with the liblinear solver selected
# explicitly. NOTE(review): liblinear is not the default solver in current
# scikit-learn releases (the season-wide model in this notebook uses lbfgs).
logreg = LogisticRegression(solver='liblinear')

# Fit the model on the training split
logreg.fit(X_train, y_train)

In [None]:
# Predict labels for the test rows with the fitted model, then report test accuracy
y_pred = logreg.predict(X_test)
test_accuracy = logreg.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))

In [None]:
# Confusion matrix for the three-player model's predictions,
# flattened to a 1-D array with .ravel()
test_cm = confusion_matrix(y_test, y_pred).ravel()

In [None]:
# Unpack the flattened matrix as true negatives, false positives,
# false negatives and true positives, then print the four counts
tn, fp, fn, tp = test_cm
print(tn, fp, fn, tp)

In [None]:
y_pred

we have 421 correct predictions and 35 incorrect

In [None]:
# Per-class precision, recall and F1 from classification_report,
# with the two classes shown under readable names
target_names = ['class 0', 'class 1']
report_text = classification_report(y_test, y_pred, target_names=target_names)
print(report_text)

### Obj 3 Step 1c:  Plot the roc_curve
 - a graphical plot that illustrates the diagnostic ability of a binary classifier system as its discrimination threshold is varied, the farther left from the red dotted line, the better the model

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Probability of the positive (home run) class for every test row
test_scores = logreg.predict_proba(X_test)[:, 1]

# The AUC is computed from the same continuous scores that draw the curve.
# Feeding hard 0/1 predictions to roc_auc_score measures a single threshold
# only, so the reported area would not match the plotted ROC curve.
logit_roc_auc = roc_auc_score(y_test, test_scores)
fpr, tpr, thresholds = roc_curve(y_test, test_scores)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Red dashed diagonal: the chance-level reference line
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('static/images/Log_ROC')
plt.show()

### Obj 3 Step 1d:  Test Model using data from a separate player not used in building the logistic regression

Read in CSV file containing the data for Springer (Houston Astros)

In [None]:
# Load Springer's data; it is used below as a validation set for the fitted model
springer = pd.read_csv('static/documents/springer')

In [None]:
# Minor data cleaning: map Springer's numeric batter id to his last name.
# The mapped column is assigned back to the frame; calling
# .replace(..., inplace=True) on springer['batter'] is chained assignment and
# leaves `springer` unchanged when pandas Copy-on-Write is enabled.
springer['batter'] = springer['batter'].replace({543807: 'Springer'})

In [None]:
# Take the event label and the two launch measurements, shuffle the rows
# (sample(frac=1) returns every row in random order), then one-hot encode the events
springer_columns = ["events", 'launch_angle', 'launch_speed']
springer_data = springer[springer_columns].sample(frac=1)
springer_test_data = pd.get_dummies(data=springer_data)

In [None]:
springer_data

In [None]:
# Keep the predictors plus the home run indicator, then discard any row with a
# missing value because it cannot be used in the analysis
validation_columns = ['launch_angle','launch_speed','events_home_run']
springer_test_data = springer_test_data[validation_columns].dropna(how='any')

In [None]:
# Total number of usable rows for Springer (rows left after dropping missing
# values), and how many of those rows are flagged as home runs
print(f"Springer had a total number batted events: {len(springer_test_data['events_home_run'])}")
print(f"Springer's total number of home runs: {len(springer_test_data[springer_test_data['events_home_run'] == 1])}")

In [None]:
# Validation predictors (launch angle and speed) and target (home run indicator)
predictor_columns = ['launch_angle', 'launch_speed']
X_validation = springer_test_data[predictor_columns]
y_validation = springer_test_data['events_home_run']

In [None]:
# Apply the already-fitted model to Springer's data and report its accuracy there
y_pred_validation = logreg.predict(X_validation)
validation_accuracy = logreg.score(X_validation, y_validation)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(validation_accuracy))

In [None]:
# Confusion matrix for the validation predictions, flattened with .ravel() and
# unpacked as true negatives, false positives, false negatives, true positives
tn, fp, fn, tp = confusion_matrix(y_validation, y_pred_validation).ravel()
print(tn, fp, fn, tp)

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Probability of the positive (home run) class for every validation row
validation_scores = logreg.predict_proba(X_validation)[:, 1]

# The AUC is computed from the same continuous scores that draw the curve.
# Feeding hard 0/1 predictions to roc_auc_score measures a single threshold
# only, so the reported area would not match the plotted ROC curve.
logit_roc_auc = roc_auc_score(y_validation, validation_scores)
fpr, tpr, thresholds = roc_curve(y_validation, validation_scores)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Red dashed diagonal: the chance-level reference line
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('static/images/Log_ROC_validation')
plt.show()

## Obj 3 Step 2: Assess the Launch Speed and Launch Angle for each player
 - These two variables have been reported to play an important role in home run events
 - This data uses the results dataframe that combines all players and drops any missing event

### Alex Bregman batted events by launch speed and launch angle

In [None]:
# Alex Bregman: every batted event by launch speed and angle, coloured by outcome
fig1, axs1 = plt.subplots(ncols=1, sharex=True, sharey=True, figsize=(10, 10))
sns.set(font_scale=1.25)
bregman_events = results[results["batter"] == "Bregman"]
bregman_scatter = sns.scatterplot(data=bregman_events, x="launch_speed", y="launch_angle", hue="events")
bregman_scatter.set_title('Alex Bregman\nBatter Events, 2019')
# Anchor the legend outside the axes, to the right of the plot
plt.legend(loc='upper right', bbox_to_anchor=(1.45, 1), ncol=1)
fig1.savefig("static/images/bregman_events_angle_speed.png", bbox_inches='tight')
plt.show()

The home runs appear clustered between 20-40 angles and above 90 mph 

### Anthony Rendon batted events by launch speed and launch angle

In [None]:
# Anthony Rendon: every batted event by launch speed and angle, coloured by outcome
fig1, axs1 = plt.subplots(ncols=1, sharex=True, sharey=True, figsize=(10, 10))
sns.set(font_scale=1.25)
rendon_events = results[results["batter"] == "Rendon"]
rendon_scatter = sns.scatterplot(data=rendon_events, x="launch_speed", y="launch_angle", hue="events")
rendon_scatter.set_title('Anthony Rendon\nBatter Events, 2019')
# Anchor the legend outside the axes, to the right of the plot
plt.legend(loc='upper right', bbox_to_anchor=(1.45, 1), ncol=1)
fig1.savefig("static/images/rendon_events_angle_speed.png", bbox_inches='tight')
plt.show()

The home runs appear clustered between 20-40 angles and above 95 mph 

### Juan Soto batted events by launch speed and launch angle

In [None]:
# Juan Soto: every batted event by launch speed and angle, coloured by outcome
fig1, axs1 = plt.subplots(ncols=1, sharex=True, sharey=True, figsize=(10, 10))
sns.set(font_scale=1.25)
soto_events = results[results["batter"] == "Soto"]
soto_scatter = sns.scatterplot(data=soto_events, x="launch_speed", y="launch_angle", hue="events")
soto_scatter.set_title('Juan Soto\nBatter Events, 2019')
# Anchor the legend outside the axes, to the right of the plot
plt.legend(loc='upper right', bbox_to_anchor=(1.45, 1), ncol=1)
fig1.savefig("static/images/soto_events_angle_speed.png", bbox_inches='tight')
plt.show()

The home runs appear clustered between 20-40 angles and above 100 mph 

## Obj 3 Step 3: Evaluate Home Run Data
 - To focus on the launch angle and speed in home run activity
 - Data used here is the original dataframe that was imported from CSV
 - this data does not have any event filtered, but thats because a home run is an event

### Obj 3 Step 3a: Filter data by player and by home runs
 - use this method because we don't have to go through a series of filters using the combined results dataframe

In [None]:
# From each player's original frame, keep only the rows whose event is a home run
bregman_hr = bregman.loc[bregman["events"].eq("home_run")]
rendon_hr = rendon.loc[rendon["events"].eq("home_run")]
soto_hr = soto.loc[soto["events"].eq("home_run")]

### Obj 3 Step 3b: Compare the descriptive statistics for speed and launch angle
 -  Combine descriptive statistics by player into a single dataframe

In [None]:
# Describe launch speed and angle of each player's home runs, with readable column names
bregman_hr_metrics = (
    bregman_hr[["launch_speed", "launch_angle"]]
    .describe()
    .rename(columns={"launch_speed" : "Bregman Speed", "launch_angle": "Bregman Angle"})
)
rendon_hr_metrics = (
    rendon_hr[["launch_speed", "launch_angle"]]
    .describe()
    .rename(columns={"launch_speed" : "Rendon Speed", "launch_angle": "Rendon Angle"})
)
soto_hr_metrics = (
    soto_hr[["launch_speed", "launch_angle"]]
    .describe()
    .rename(columns={"launch_speed" : "Soto Speed", "launch_angle": "Soto Angle"})
)

# Place the three summaries side by side in one table
frames = [bregman_hr_metrics, rendon_hr_metrics, soto_hr_metrics]
hr_stats = pd.concat(frames, axis=1)
hr_stats

### Obj 3 Step 3c:  Visualize the launch angle and launch speed for home runs by player

In [None]:
# Three side-by-side scatter panels (launch speed vs. launch angle), one per
# player's home runs; fit_reg=False turns off the regression line
fig1, axs1 = plt.subplots(ncols=3, sharex=True, sharey=True, figsize=(10, 5))
sns.set(font_scale=1)
panels = (
    (bregman_hr, 'Alex Bregman\nHome Runs, 2019'),
    (rendon_hr, 'Anthony Rendon\nHome Runs, 2019'),
    (soto_hr, 'Juan Soto\nHome Runs, 2019'),
)
for axis, (player_hr, title) in zip(axs1, panels):
    sns.regplot(x=player_hr["launch_speed"], y=player_hr["launch_angle"], fit_reg=False, color='tab:blue', data=player_hr, ax=axis).set_title(title)
fig1.savefig("static/images/angle_speed.png", bbox_inches='tight')

## Obj 3 Step 4: Conduct statistical analysis to determine if launch speed differs among players
 - does speed explain the difference in home runs?

### Obj 3 Step 4a: test for normality using Shapiro-Wilk Test

In [None]:
# Shapiro-Wilk normality test (scipy.stats.shapiro) on each player's home run
# launch speed. Null hypothesis: the data was drawn from a normal distribution.
# Each test runs once and is unpacked into [W statistic, p-value].
breg_sw_speed_test = list(stats.shapiro(bregman_hd["launch_speed"]))
rendon_sw_speed_test = list(stats.shapiro(rendon_hd["launch_speed"]))
soto_sw_speed_test = list(stats.shapiro(soto_hd["launch_speed"]))
Shapiro_Wilk_speed_df = pd.DataFrame(
    {"bregman": breg_sw_speed_test, "rendon": rendon_sw_speed_test, "soto": soto_sw_speed_test},
    index=["W", "p-value"],
)
# Display the table: the interpretation that follows this cell refers to these
# p-values, and the table was otherwise never shown.
Shapiro_Wilk_speed_df

The launch speed for Anthony Rendon does not seem to follow a normal distribution, as the p-value < 0.05

### Obj 3 Step 4b: test the launch angle for normality using Shapiro-Wilk Test

In [None]:
# Shapiro-Wilk normality test (scipy.stats.shapiro) on each player's home run
# launch angle. Null hypothesis: the data was drawn from a normal distribution.
# Each result is unpacked into [W statistic, p-value].
breg_sw_angle_test = list(stats.shapiro(bregman_hd["launch_angle"]))
rendon_sw_angle_test = list(stats.shapiro(rendon_hd["launch_angle"]))
soto_sw_angle_test = list(stats.shapiro(soto_hd["launch_angle"]))

# One column per player, rows labelled with the statistic names
Shapiro_Wilk_angle_df = pd.DataFrame(
    {"bregman": breg_sw_angle_test, "rendon": rendon_sw_angle_test, "soto": soto_sw_angle_test},
    index=["W", "p-value"],
)

In [None]:
Shapiro_Wilk_angle_df

The Shapiro-Wilk test confirms that the launch angle is normally distributed

### Obj 3 Step 4c: determine if samples have equal variance using Bartlett's test

In [None]:
# Bartlett's test (scipy.stats.bartlett) for equal variances across the three
# players' launch angles; chosen because the samples are treated as normally distributed
barlette_angle = stats.bartlett(
    bregman_hd["launch_angle"],
    rendon_hd["launch_angle"],
    soto_hd["launch_angle"],
)
# Element 1 of the result is the p-value
variances_look_equal = barlette_angle[1] >= 0.05
print(
    "Do not reject the null hypothesis, all the input samples are from a populations with equal variances."
    if variances_look_equal
    else "Reject the null hypothesis, the samples do not have equal variance"
)

### Obj 3 Step 4d: Conduct an ANOVA to determine if there is a significant difference in the angles between players

In [None]:
# One-way ANOVA on launch angle across the three players
# (the normality and equal-variance checks were run in the preceding steps)
anova_angle = stats.f_oneway(
    bregman_hd["launch_angle"],
    rendon_hd["launch_angle"],
    soto_hd["launch_angle"],
)
# Element 1 of the result is the p-value
no_difference = anova_angle[1] >= 0.05
print(
    f"The p-value is: {anova_angle[1]}. Do not reject the null hypothesis, no difference in launch angle amongst batters."
    if no_difference
    else f"The p-value is: {anova_angle[1]}. Reject the null hypothesis, there is a difference in the launch angle amongs batters."
)

### Obj 3 Step 4e: Calculate the Pearson Correlation for those conditions that are normally distributed

In [None]:
# Pearson correlation (coefficient and p-value) between launch speed and
# launch angle for Bregman's home runs
bregman_pearson = stats.pearsonr(
    bregman_hr["launch_speed"],
    bregman_hr["launch_angle"],
)
bregman_pearson

In [None]:
# Pearson correlation (coefficient and p-value) between launch speed and
# launch angle for Soto's home runs
soto_pearson = stats.pearsonr(
    soto_hr["launch_speed"],
    soto_hr["launch_angle"],
)
soto_pearson

### Obj 3 Step 4f: Calculate the Spearman R for those conditions that are not normally distributed

In [None]:
# Spearman rank correlation between launch speed and launch angle for each
# player's home runs
bregman_spearmanr = stats.spearmanr(bregman_hr["launch_speed"], bregman_hr["launch_angle"])
rendon_spearmanr = stats.spearmanr(rendon_hr["launch_speed"], rendon_hr["launch_angle"])
soto_spearmanr = stats.spearmanr(soto_hr["launch_speed"], soto_hr["launch_angle"])

# One column per player; the two rows are the correlation and its p-value
spearman_columns = {
    "bregman": bregman_spearmanr,
    "rendon": rendon_spearmanr,
    "soto": soto_spearmanr,
}
player_spearmanr = pd.DataFrame(spearman_columns, index=["rho", "p-value"])

In [None]:
player_spearmanr

The p-value for Bregman suggests that the correlation is statistically significant

## Obj 3 Step 5: Visualize Launch Speed vs Launch Angle in Home Run Events

### Obj 3 Step 5a: Alex Bregman plot

In [None]:
# Bregman: joint plot of launch speed vs. launch angle for his home runs,
# drawn with kind='reg'
sns.jointplot(data=bregman_hr, x='launch_speed', y='launch_angle', kind='reg')
plt.savefig("static/images/bregman_join_plot_angle_speed.png", bbox_inches='tight')
plt.show()

### Obj 3 Step 5b: Anthony Rendon plot

In [None]:
# Rendon: joint plot of launch speed vs. launch angle for his home runs,
# drawn with kind='reg'
sns.jointplot(data=rendon_hr, x='launch_speed', y='launch_angle', kind='reg')
plt.savefig("static/images/rendon_join_plot_angle_speed.png", bbox_inches='tight')
plt.show()

### Obj 3 Step 5c: Juan Soto plot

In [None]:
# Soto: joint plot of launch speed vs. launch angle for his home runs,
# drawn with kind='reg'
sns.jointplot(data=soto_hr, x='launch_speed', y='launch_angle', kind='reg')
plt.savefig("static/images/soto_join_plot_angle_speed.png", bbox_inches='tight')
plt.show()

## Obj3 Step 6: Compute the Kruskal-Wallis H-test for independent samples

In [None]:
# Kruskal-Wallis H-test on home run launch speed across the three players
kruskal_wallis_h = stats.kruskal(bregman_hd["launch_speed"], rendon_hd["launch_speed"], soto_hd["launch_speed"])

In [None]:
# Report the Kruskal-Wallis outcome. Element 1 of the result is the p-value;
# the messages now interpolate that value rather than the whole result object,
# which was previously printed under the label "p value".
if kruskal_wallis_h[1] >= 0.05:
    print(f"The p value is : {kruskal_wallis_h[1]}. Do not reject the null hypothesis, no difference in the launch speed amongst batters.")
else:
    print(f"The p value is : {kruskal_wallis_h[1]}. Reject the null hypothesis, there is a difference in the launch speed amongs batters.")

## Obj3 Step 7 Post hoc test using scikit-posthocs 0.6.1

In [None]:
# Dunn's post hoc test on launch speed, grouped by batter, with Holm-adjusted p-values
dunn = sp.posthoc_dunn(
    filterd_hd,
    val_col="launch_speed",
    group_col='batter',
    p_adjust='holm',
)
dunn

There is no difference in the launch angle between players<br>
The launch speed between 
 - Soto and Bregman is significantly different
 - Rendon and Soto is significantly different<br>
<br>

# Objective 4: Determine the impact of pitch velocity on Home Runs

### Obj 4 Step 4a: Summary Statistics for the pitch velocity "release_speed" for each event that results in a home run

In [None]:
# The *_hr frames are used because they are already filtered to home run events

# Bregman: summary of pitch velocity on his home runs
bregman_hr_rs = (
    bregman_hr[["release_speed"]]
    .describe()
    .rename(columns={"release_speed" : "Bregman rs"})
)

# Rendon: summary of pitch velocity on his home runs
rendon_hr_rs = (
    rendon_hr[["release_speed"]]
    .describe()
    .rename(columns={"release_speed" : "Rendon rs"})
)

# Soto: summary of pitch velocity on his home runs
soto_hr_rs = (
    soto_hr[["release_speed"]]
    .describe()
    .rename(columns={"release_speed" : "Soto rs"})
)

# Place the three summaries side by side in one table
frames = [bregman_hr_rs, rendon_hr_rs, soto_hr_rs]
rs_stats = pd.concat(frames, axis=1)
rs_stats

It appears as if Soto bats against higher pitch velocities.

### Obj 4 Step 4b: Visualize distribution of pitch velocity for each player

In [None]:
# One horizontal box plot per player of the pitch velocity on their home runs.
# The Series is passed as x= explicitly: recent seaborn releases accept only
# `data` positionally, so a positional Series is no longer read as the x variable.
fig1, axs1 = plt.subplots(ncols=3, sharex=True, sharey=True, figsize=(15, 7.5))
sns.boxplot(x=bregman_hr["release_speed"], color='tab:blue', ax=axs1[0]).set_title('Alex Bregman \n Home Runs, 2019')
sns.boxplot(x=rendon_hr["release_speed"], color='tab:blue', ax=axs1[1]).set_title('Anthony Rendon \n Home Runs, 2019')
sns.boxplot(x=soto_hr["release_speed"], color='tab:blue', ax=axs1[2]).set_title('Juan Soto \n Home Runs, 2019')
fig1.savefig("static/images/release_speed.png", bbox_inches='tight')

### Obj 4 Step 4c: Determine if the release_speed differs for all batted events between players (in other words, did any player have to hit the ball against faster, or slower pitches)

In [None]:
# Start from the full combined dataset (`result`) and drop only the rows
# that have no release_speed recorded
results_rs = result.dropna(subset=['release_speed'], how='any')

In [None]:
# Pull out the release_speed column once, then select each player's values from it
release_speed = results_rs["release_speed"]
bregman_rs = release_speed[results_rs["batter"] == "Bregman"]
rendon_rs = release_speed[results_rs["batter"] == "Rendon"]
soto_rs = release_speed[results_rs["batter"] == "Soto"]

In [None]:
# One-way ANOVA on pitch velocity (release_speed) across the three players
release_speed_samples = (bregman_rs, rendon_rs, soto_rs)
anova_rs = stats.f_oneway(*release_speed_samples)

In [None]:
# Element 1 of the ANOVA result is the p-value; report the outcome at the 0.05 level
no_difference = anova_rs[1] >= 0.05
print(
    "Do not reject the null hypothesis, no difference in the pitch velocity amongst batters."
    if no_difference
    else "Reject the null hypothesis, there is a difference in the pitch velocity amongs batters."
)

### Obj 4 Step 4d: Conduct a post hoc test to determine which players were significantly different

In [None]:
# Dunn's post hoc test on release speed, grouped by batter, with Holm-adjusted p-values
dunn = sp.posthoc_dunn(
    results_rs,
    val_col="release_speed",
    group_col='batter',
    p_adjust='holm',
)
dunn

There is a difference in the pitch velocity between Soto and Rendon<br>
No difference in the pitch velocity between:
 - Bregman and Rendon
 - Bregman and Soto<br>

# Obj 5: Determine the Player's Pitch Locations

### Obj 5 Step 1: Functions to assign the x and y coordinate for the pitch location

In [None]:
def assign_x_coord(row):
    """
    Map a Statcast strike-zone number to a horizontal grid position.

    Reads ``row.zone`` and returns 1 for the left third (zones 1, 4, 7),
    2 for the middle third (zones 2, 5, 8) and 3 for the right third
    (zones 3, 6, 9). Any other zone (e.g. 11, 12, 13, 14) yields None;
    those zones are ignored for plotting simplicity.
    """
    columns = (
        (1, (1, 4, 7)),  # left third of strike zone
        (2, (2, 5, 8)),  # middle third of strike zone
        (3, (3, 6, 9)),  # right third of strike zone
    )
    for x_coord, zones in columns:
        if row.zone in zones:
            return x_coord
    return None

In [None]:
def assign_y_coord(row):
    """
    Map a Statcast strike-zone number to a vertical grid position.

    Reads ``row.zone`` and returns 3 for the upper third (zones 1, 2, 3),
    2 for the middle third (zones 4, 5, 6) and 1 for the lower third
    (zones 7, 8, 9). Any other zone (e.g. 11, 12, 13, 14) yields None;
    those zones are ignored for plotting simplicity.
    """
    rows = (
        (3, (1, 2, 3)),  # upper third of strike zone
        (2, (4, 5, 6)),  # middle third of strike zone
        (1, (7, 8, 9)),  # lower third of strike zone
    )
    for y_coord, zones in rows:
        if row.zone in zones:
            return y_coord
    return None

### Obj 5 Step 2a:  Alex Bregman's home run zone

In [None]:
# Keep only home runs hit on pitches in zones 1-9;
# zones 11, 12, 13, and 14 are ignored for plotting simplicity
bregman_strikes_hr = bregman_hr.loc[bregman_hr.zone <= 9].copy()

# Assign grid coordinates to the strike-zone pitches behind Bregman's home runs
bregman_strikes_hr['zone_x'] = bregman_strikes_hr.apply(assign_x_coord, axis=1)
bregman_strikes_hr['zone_y'] = bregman_strikes_hr.apply(assign_y_coord, axis=1)

# Plot Bregman's home run zone as a 3x3 2D histogram with a colorbar
plt.hist2d(bregman_strikes_hr['zone_x'], bregman_strikes_hr['zone_y'], bins=3, cmap='Blues')
plt.title('Alex Bregman Home Runs on\n Pitches in the Strike Zone, 2019')
# Hide both axes: the grid positions are not meaningful tick values
current_axes = plt.gca()
current_axes.get_xaxis().set_visible(False)
current_axes.get_yaxis().set_visible(False)
cb = plt.colorbar()
cb.set_label('Counts in Bin')

# Save file
plt.savefig("static/images/bregman_hr_zone.png", bbox_inches='tight')

### Obj 5 Step 2b:  Anthony Rendons home run zone

In [None]:
# Keep only home runs hit on pitches in zones 1-9;
# zones 11, 12, 13, and 14 are ignored for plotting simplicity
rendon_strike_hr = rendon_hr.loc[rendon_hr.zone <= 9].copy()

# Assign grid coordinates to the strike-zone pitches behind Rendon's home runs
rendon_strike_hr['zone_x'] = rendon_strike_hr.apply(assign_x_coord, axis=1)
rendon_strike_hr['zone_y'] = rendon_strike_hr.apply(assign_y_coord, axis=1)

# Plot Rendon's home run zone as a 3x3 2D histogram with a colorbar
plt.hist2d(rendon_strike_hr['zone_x'], rendon_strike_hr['zone_y'], bins=3, cmap='Blues')
plt.title('Anthony Rendon Home Runs on\n Pitches in the Strike Zone, 2019')
# Hide both axes: the grid positions are not meaningful tick values
current_axes = plt.gca()
current_axes.get_xaxis().set_visible(False)
current_axes.get_yaxis().set_visible(False)
cb = plt.colorbar()
cb.set_label('Counts in Bin')
# Save file
plt.savefig("static/images/rendon_hr_zone.png", bbox_inches='tight')

### Obj 5 Step 2c:  Juan Soto's home run zone

In [None]:
# Keep only home runs hit on pitches in zones 1-9;
# zones 11, 12, 13, and 14 are ignored for plotting simplicity
soto_strike_hr = soto_hr.copy().loc[soto_hr.zone <= 9]

# Assign grid coordinates to the strike-zone pitches behind Soto's home runs
soto_strike_hr['zone_x'] = soto_strike_hr.apply(assign_x_coord, axis=1)
soto_strike_hr['zone_y'] = soto_strike_hr.apply(assign_y_coord, axis=1)

# Plot Soto's home run zone as a 3x3 2D histogram with a colorbar
plt.hist2d(soto_strike_hr['zone_x'], soto_strike_hr['zone_y'], bins = 3, cmap='Blues')
# Title corrected to the player's actual first name (Juan, not Jose),
# matching every other Soto figure in this notebook
plt.title('Juan Soto Home Runs on\n Pitches in the Strike Zone, 2019')
plt.gca().get_xaxis().set_visible(False)
plt.gca().get_yaxis().set_visible(False)
cb = plt.colorbar()
cb.set_label('Counts in Bin')
# Save file
plt.savefig("static/images/soto_hr_zone.png", bbox_inches='tight')