In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import math
from operator import itemgetter

In [2]:
batting_data = pd.read_csv('mlb_batting.csv')

In [3]:
batting_data.info

<bound method DataFrame.info of        name_last_first  player_id  year  player_age   ab   pa  hit  single  \
0        Hunter, Torii     116338  2015          39  521  567  125      81   
1         Ortiz, David     120074  2015          39  528  614  144      70   
2      Rodriguez, Alex     121347  2015          39  523  620  131      75   
3      Ramirez, Aramis     133380  2015          37  475  516  117      68   
4       Beltré, Adrian     134181  2015          36  567  619  163     109   
...                ...        ...   ...         ...  ...  ...  ...     ...   
1370  Chourio, Jackson     694192  2024          20  528  573  145      91   
1371   Schanuel, Nolan     694384  2024          22  519  607  130      98   
1372   Langford, Wyatt     694671  2024          22  499  557  126      81   
1373      Young, Jacob     696285  2024          24  468  521  120      92   
1374  Merrill, Jackson     701538  2024          21  554  593  162     101   

      double  triple  ...  oppo

In [4]:
batting_data[['woba', 'on_base_plus_slg', 'hard_hit_percent']].describe()

Unnamed: 0,woba,on_base_plus_slg,hard_hit_percent
count,1375.0,1375.0,1375.0
mean,0.337727,0.788594,39.171927
std,0.034218,0.090758,7.745796
min,0.239,0.539,8.5
25%,0.315,0.727,34.65
50%,0.335,0.782,39.7
75%,0.358,0.839,44.05
max,0.478,1.185,62.2


In [5]:
batting_data.shape

(1375, 88)

In [6]:
# Z normalize list of desired columns
def normalize(data, col_list):
    new_data = {}
    for column in data.columns:
        if column in col_list:
            col_data = []
            for index_r in range(data.shape[0]):    
                z_normalize = ((data.loc[index_r, column] - data[column].mean())
                    / data[column].std())
                col_data.append(z_normalize)
                col_name = f'Znorm_{column}'
            new_data[col_name] = col_data
    return(new_data)

In [7]:
norm_cols = ['woba', 'on_base_plus_slg', 'hard_hit_percent']
df_norms = pd.DataFrame(normalize(batting_data, norm_cols))
df = pd.concat([batting_data, df_norms], axis=1)

In [8]:
# assign success labels
def assign_labels_quad(data):
    woba_weight = 0.4
    ops_weight = 0.4
    hh_weight = 0.2
    labeled_data = []
    for index in range(data.shape[0]):
        weighted_score = ((data.loc[index, 'Znorm_woba'] * woba_weight) +
            (data.loc[index, 'Znorm_on_base_plus_slg'] * ops_weight) +
            (data.loc[index, 'Znorm_hard_hit_percent'] * hh_weight))
        if weighted_score > 1:
            labeled_data.append(4)
        elif weighted_score > 0:
            labeled_data.append(3)
        elif weighted_score > -1:
            labeled_data.append(2)
        else:
            labeled_data.append(1)

    success = {'success_4': labeled_data}
    return success

In [9]:
# assign success labels
def assign_labels_bin(data):
    woba_weight = 0.4
    ops_weight = 0.4
    hh_weight = 0.2
    labeled_data = []
    for index in range(data.shape[0]):
        weighted_score = ((data.loc[index, 'Znorm_woba'] * woba_weight) +
            (data.loc[index, 'Znorm_on_base_plus_slg'] * ops_weight) +
            (data.loc[index, 'Znorm_hard_hit_percent'] * hh_weight))
        if weighted_score > 0:
            labeled_data.append(1)
        else:
            labeled_data.append(0)
    success = {'success_2': labeled_data}
    return success

In [10]:
success_binary = pd.DataFrame(assign_labels_bin(df))
success_quarters = pd.DataFrame(assign_labels_quad(df))
knn_data = pd.concat([df, success_binary, success_quarters], axis=1)

In [11]:
knn_data

Unnamed: 0,name_last_first,player_id,year,player_age,ab,pa,hit,single,double,triple,...,flyballs,linedrives_percent,linedrives,popups_percent,popups,Znorm_on_base_plus_slg,Znorm_woba,Znorm_hard_hit_percent,success_2,success_4
0,"Hunter, Torii",116338,2015,39,521,567,125,81,22,0,...,95,20.9,88,7.1,30,-0.954120,-0.985661,-0.551516,0,2
1,"Ortiz, David",120074,2015,39,528,614,144,70,37,0,...,113,28.7,127,8.1,36,1.370740,1.206173,1.281737,1,4
2,"Rodriguez, Alex",121347,2015,39,523,620,131,75,22,1,...,96,24.9,96,6.5,25,0.588441,0.680133,0.610405,1,3
3,"Ramirez, Aramis",133380,2015,37,475,516,117,68,31,1,...,101,29.1,120,8.7,36,-0.755791,-0.839539,-0.603156,0,2
4,"Beltré, Adrian",134181,2015,36,567,619,163,109,32,4,...,94,31.6,161,7.6,39,-0.017565,-0.021254,0.158547,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1370,"Chourio, Jackson",694192,2024,20,528,573,145,91,29,4,...,100,23.9,98,3.7,15,0.026508,0.037195,0.739507,1,3
1371,"Schanuel, Nolan",694384,2024,22,519,607,130,98,19,0,...,96,27.7,118,3.5,15,-0.921065,-0.664192,-1.777987,0,2
1372,"Langford, Wyatt",694671,2024,22,499,557,126,81,25,4,...,112,22.0,85,9.0,35,-0.535425,-0.488845,0.545854,0,2
1373,"Young, Jacob",696285,2024,24,468,521,120,92,24,1,...,58,23.1,87,5.0,19,-1.560126,-1.394803,-1.326129,0,1


In [12]:
def get_player_stats(df, player_id, year):
    player_stats = df[(df['player_id'] == player_id) & (df['year'] == year)]
    if not player_stats.empty:
        return player_stats.index
    else:
        return None

In [13]:
def manhattan_distance(data, player1_id, player1_year, player2):
    player1 = data.loc[get_player_stats(data, player1_id, player1_year)].iloc[0]
    woba = abs(player1['Znorm_woba'] - data.loc[player2, 'Znorm_woba'])
    ops = abs(player1['Znorm_on_base_plus_slg'] - data.loc[player2, 'Znorm_on_base_plus_slg'])
    hhp = abs(player1['Znorm_hard_hit_percent'] - data.loc[player2, 'Znorm_hard_hit_percent'])
    return (woba + ops + hhp)

In [14]:
def knn_predict_quad(data, k, player_id, player_year):
    distances = []
    for i in range(data.shape[0]):
        dist = manhattan_distance(data, 701538, 2024, i)
        success_level = data.iloc[i]['success_4']
        distances.append([dist, success_level])

    dist_sorted = sorted(distances, key=lambda x: x[0], reverse = True)
    k_nearest_labels = [label for _, label in dist_sorted[:k]]
    print(k_nearest_labels)
    prediction = max(set(k_nearest_labels), key=k_nearest_labels.count)
    return prediction

In [15]:
def knn_predict_bin(data, k, player_id, player_year):
    distances = []
    for i in range(data.shape[0]):
        dist = manhattan_distance(data, 701538, 2024, i)
        success_level = data.iloc[i]['success_2']
        distances.append([dist, success_level])

    dist_sorted = sorted(distances, key=lambda x: x[0], reverse = True)
    k_nearest_labels = [label for _, label in dist_sorted[:k]]
    print(k_nearest_labels)
    prediction = max(set(k_nearest_labels), key=k_nearest_labels.count)
    return prediction

In [16]:
k = int(math.sqrt(knn_data.shape[0]))

In [17]:
# knn for Jackson Merrill, 2024 using Manhattan quarters
# Expected Success Level: 4
prediction = knn_predict_quad(knn_data, k, 701538, 2024)
print(f'Success level: {prediction}')

[4, 1, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1]
Success level: 1


In [18]:
# knn for Jackson Merrill, 2024 using Manhattan quarters
# Expected Success Level: 4
prediction = knn_predict_quad(knn_data, 50, 701538, 2024)
print(f'Success level: {prediction}')

[4, 1, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 4, 4, 1, 1, 4, 1, 4, 4, 4, 1, 1, 1, 1]
Success level: 1


In [19]:
# knn for Jackson Merrill, 2024 using Manhattan binary
# Expected Success Level: 1
prediction = knn_predict_bin(knn_data, k, 701538, 2024)
print(f'Success level: {prediction}')

[1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]
Success level: 0


In [20]:
# knn for Jackson Merrill, 2024 using Manhattan binary
# Expected Success Level: 1
prediction = knn_predict_bin(knn_data, 50, 701538, 2024)
print(f'Success level: {prediction}')

[1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0]
Success level: 0


In [21]:
# knn for Jackson Chourio, 2024 using Manhattan quarters
# Expected Success Level: 3
prediction = knn_predict_quad(knn_data, k, 701538, 2024)
print(f'Success level: {prediction}')

[4, 1, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1]
Success level: 1


In [22]:
# knn for Jackson Chourio, 2024 using Manhattan quarters
# Expected Success Level: 3
prediction = knn_predict_quad(knn_data, 50, 701538, 2024)
print(f'Success level: {prediction}')

[4, 1, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 4, 4, 1, 1, 4, 1, 4, 4, 4, 1, 1, 1, 1]
Success level: 1


In [23]:
# knn for Jackson Chourio, 2024 using Manhattan binary
# Expected Success Level: 1
prediction = knn_predict_bin(knn_data, k, 701538, 2024)
print(f'Success level: {prediction}')

[1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]
Success level: 0


In [24]:
# knn for Jackson Chourio, 2024 using Manhattan binary
# Expected Success Level: 1
prediction = knn_predict_bin(knn_data, 50, 701538, 2024)
print(f'Success level: {prediction}')

[1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0]
Success level: 0


In [25]:
# knn for Robbie Grossman, 2021 using Manhattan quarters
# Expected Success Level: 2
prediction = knn_predict_quad(knn_data, k, 543257, 2021)
print(f'Success level: {prediction}')

[4, 1, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1]
Success level: 1


In [26]:
# knn for Robbie Grossman, 2021 using Manhattan quarters
# Expected Success Level: 2
prediction = knn_predict_quad(knn_data, 50, 543257, 2021)
print(f'Success level: {prediction}')

[4, 1, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 4, 4, 1, 1, 4, 1, 4, 4, 4, 1, 1, 1, 1]
Success level: 1


In [27]:
# knn for Robbie Grossman, 2021 using Manhattan binary
# Expected Success Level: 0
prediction = knn_predict_bin(knn_data, k, 543257, 2021)
print(f'Success level: {prediction}')

[1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]
Success level: 0


In [28]:
# knn for Robbie Grossman, 2021 using Manhattan binary
# Expected Success Level: 0
prediction = knn_predict_bin(knn_data, 50, 543257, 2021)
print(f'Success level: {prediction}')

[1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0]
Success level: 0


In [29]:
knn_data.loc[(knn_data['player_id'] == 543257) & (knn_data['year'] == 2021)]

Unnamed: 0,name_last_first,player_id,year,player_age,ab,pa,hit,single,double,triple,...,flyballs,linedrives_percent,linedrives,popups_percent,popups,Znorm_on_base_plus_slg,Znorm_woba,Znorm_hard_hit_percent,success_2,success_4
882,"Grossman, Robbie",543257,2021,31,557,671,133,84,23,3,...,147,26.8,110,7.3,30,-0.18284,-0.021254,-0.654797,0,2


In [37]:
counts = {1: 0, 2: 0, 3: 0, 4: 0}
for i in range(knn_data.shape[0]):
    s_level = knn_data.iloc[i]['success_4']
    if s_level in counts.keys():
        counts[s_level] += 1

print(counts)

{1: 156, 2: 557, 3: 480, 4: 182}


In [38]:
import numpy as np

# Extract normalized features
features = knn_data[['Znorm_woba', 'Znorm_on_base_plus_slg', 'Znorm_hard_hit_percent']]
covariance_matrix = np.cov(features.T)  # Covariance matrix
inverse_covariance_matrix = np.linalg.inv(covariance_matrix)  # Inverse covariance matrix

In [39]:
def mahalanobis_distance(data, player1_id, player1_year, player2, inv_cov_matrix):
    player1_stats = data.loc[get_player_stats(data, player1_id, player1_year)].iloc[0]
    player2_stats = data.loc[player2]
    diff = np.array([
        player1_stats['Znorm_woba'] - player2_stats['Znorm_woba'],
        player1_stats['Znorm_on_base_plus_slg'] - player2_stats['Znorm_on_base_plus_slg'],
        player1_stats['Znorm_hard_hit_percent'] - player2_stats['Znorm_hard_hit_percent']
    ])
    mahalanobis_dist = np.sqrt(np.dot(np.dot(diff.T, inv_cov_matrix), diff))
    return mahalanobis_dist

In [40]:
def knn_predict2_quad(data, k, player_id, player_year, inv_cov_matrix):
    distances = []
    for i in range(data.shape[0]):
        dist = mahalanobis_distance(data, player_id, player_year, i, inv_cov_matrix)
        success_level = data.iloc[i]['success_4']
        distances.append([dist, success_level])

    dist_sorted = sorted(distances, key=lambda x: x[0], reverse = True)
    k_nearest_labels = [label for _, label in dist_sorted[:k]]
    print(k_nearest_labels)
    prediction = max(set(k_nearest_labels), key=k_nearest_labels.count)
    return prediction

In [41]:
def knn_predict2_bin(data, k, player_id, player_year, inv_cov_matrix):
    distances = []
    for i in range(data.shape[0]):
        dist = mahalanobis_distance(data, player_id, player_year, i, inv_cov_matrix)
        success_level = data.iloc[i]['success_2']
        distances.append([dist, success_level])

    dist_sorted = sorted(distances, key=lambda x: x[0], reverse = True)
    k_nearest_labels = [label for _, label in dist_sorted[:k]]
    print(k_nearest_labels)
    prediction = max(set(k_nearest_labels), key=k_nearest_labels.count)
    return prediction

In [42]:
# knn for Jackson Merrill, 2024 using Mahalanobis quarter
# Expected Success Level: 4
prediction = knn_predict2_quad(knn_data, k, 701538, 2024, inverse_covariance_matrix)
print(f'Success level: {prediction}')

[1, 4, 1, 2, 4, 1, 1, 4, 4, 2, 1, 1, 1, 1, 1, 2, 4, 4, 1, 4, 1, 2, 2, 1, 1, 1, 1, 4, 1, 4, 1, 1, 2, 1, 1, 1, 2]
Success level: 1


In [43]:
# knn for Jackson Merrill, 2024 using Mahalanobis quarter
# Expected Success Level: 4
prediction = knn_predict2_quad(knn_data, 50, 701538, 2024, inverse_covariance_matrix)
print(f'Success level: {prediction}')

[1, 4, 1, 2, 4, 1, 1, 4, 4, 2, 1, 1, 1, 1, 1, 2, 4, 4, 1, 4, 1, 2, 2, 1, 1, 1, 1, 4, 1, 4, 1, 1, 2, 1, 1, 1, 2, 4, 2, 4, 1, 2, 1, 3, 4, 4, 1, 1, 1, 1]
Success level: 1


In [44]:
# knn for Jackson Merrill, 2024 using Mahalanobis binary
# Expected Success Level: 1
prediction = knn_predict2_bin(knn_data, k, 701538, 2024, inverse_covariance_matrix)
print(f'Success level: {prediction}')

[0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0]
Success level: 0


In [45]:
# knn for Jackson Merrill, 2024 using Mahalanobis quarter
# Expected Success Level: 1
prediction = knn_predict2_bin(knn_data, 50, 701538, 2024, inverse_covariance_matrix)
print(f'Success level: {prediction}')

[0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0]
Success level: 0


In [55]:
# knn for Jackson Chourio, 2024 using Mahalanobis quarters
# Expected Success Level: 3
prediction = knn_predict2_quad(knn_data, k, 701538, 2024, inverse_covariance_matrix)
print(f'Success level: {prediction}')

[1, 4, 1, 2, 4, 1, 1, 4, 4, 2, 1, 1, 1, 1, 1, 2, 4, 4, 1, 4, 1, 2, 2, 1, 1, 1, 1, 4, 1, 4, 1, 1, 2, 1, 1, 1, 2]
Success level: 1


In [56]:
# knn for Jackson Chourio, 2024 using Mahalanobis quarters
# Expected Success Level: 3
prediction = knn_predict2_quad(knn_data, 50, 701538, 2024, inverse_covariance_matrix)
print(f'Success level: {prediction}')

[1, 4, 1, 2, 4, 1, 1, 4, 4, 2, 1, 1, 1, 1, 1, 2, 4, 4, 1, 4, 1, 2, 2, 1, 1, 1, 1, 4, 1, 4, 1, 1, 2, 1, 1, 1, 2, 4, 2, 4, 1, 2, 1, 3, 4, 4, 1, 1, 1, 1]
Success level: 1


In [57]:
# knn for Jackson Chourio, 2024 using Mahalanobis binary
# Expected Success Level: 1
prediction = knn_predict2_bin(knn_data, k, 701538, 2024, inverse_covariance_matrix)
print(f'Success level: {prediction}')

[0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0]
Success level: 0


In [58]:
# knn for Jackson Chourio, 2024 using Mahalanobis binary
# Expected Success Level: 1
prediction = knn_predict2_bin(knn_data, 50, 701538, 2024, inverse_covariance_matrix)
print(f'Success level: {prediction}')

[0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0]
Success level: 0


In [59]:
# knn for Robbie Grossman, 2021 using Mahalanobis quarters
# Expected Success Level: 2
prediction = knn_predict2_quad(knn_data, k, 543257, 2021, inverse_covariance_matrix)
print(f'Success level: {prediction}')

[4, 4, 4, 4, 1, 4, 4, 4, 3, 1, 4, 4, 1, 1, 3, 2, 4, 4, 1, 1, 1, 1, 3, 2, 4, 4, 4, 2, 4, 3, 3, 4, 3, 2, 4, 4, 4]
Success level: 4


In [60]:
# knn for Robbie Grossman, 2021 using Mahalanobis quarters
# Expected Success Level: 2
prediction = knn_predict2_quad(knn_data, 50, 543257, 2021, inverse_covariance_matrix)
print(f'Success level: {prediction}')

[4, 4, 4, 4, 1, 4, 4, 4, 3, 1, 4, 4, 1, 1, 3, 2, 4, 4, 1, 1, 1, 1, 3, 2, 4, 4, 4, 2, 4, 3, 3, 4, 3, 2, 4, 4, 4, 4, 4, 4, 3, 4, 4, 3, 4, 1, 2, 1, 1, 4]
Success level: 4


In [61]:
# knn for Robbie Grossman, 2021 using Mahalanobis binary
# Expected Success Level: 0
prediction = knn_predict2_bin(knn_data, k, 543257, 2021, inverse_covariance_matrix)
print(f'Success level: {prediction}')

[1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1]
Success level: 1


In [62]:
# knn for Robbie Grossman, 2021 using Mahalanobis binary
# Expected Success Level: 0
prediction = knn_predict2_bin(knn_data, 50, 543257, 2021, inverse_covariance_matrix)
print(f'Success level: {prediction}')

[1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1]
Success level: 1
