In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import math
from operator import itemgetter

In [None]:
# Load the batting table from mlb_batting.csv (path is relative to the working directory).
batting_data = pd.read_csv('mlb_batting.csv')

In [None]:
# Print column names, dtypes and non-null counts. `info` is a method: without
# the call parentheses the cell only shows the bound-method repr.
batting_data.info()

In [None]:
# Summary statistics for the three batting metrics used as features in this notebook.
batting_data[['woba', 'on_base_plus_slg', 'hard_hit_percent']].describe()

In [None]:
# (number of rows, number of columns) of the raw table.
batting_data.shape

In [None]:
# Z normalize list of desired columns
def normalize(data, col_list):
    """Z-score the requested columns of ``data``.

    Every column of ``data`` that appears in ``col_list`` (visited in
    ``data.columns`` order) is transformed to
    ``(value - column mean) / column std`` using pandas' default
    ``Series.mean()`` and ``Series.std()``.

    Args:
        data: DataFrame holding the columns to normalize.
        col_list: Collection of column names to normalize. Names that are
            not columns of ``data`` are ignored.

    Returns:
        dict mapping ``'Znorm_<column>'`` to a list of normalized values,
        one per row of ``data`` in row order. A frame with zero rows yields
        empty lists.
    """
    new_data = {}
    for column in data.columns:
        if column not in col_list:
            continue
        values = data[column]
        # Mean and std are computed once per column rather than once per row,
        # and the arithmetic is element-wise, so the frame's index labels do
        # not matter.
        z_scores = (values - values.mean()) / values.std()
        new_data[f'Znorm_{column}'] = z_scores.tolist()
    return new_data

In [None]:
# Z-score the three batting metrics and attach the resulting Znorm_* columns
# to the raw table.
norm_cols = ['woba', 'on_base_plus_slg', 'hard_hit_percent']
znorm_columns = normalize(batting_data, norm_cols)
df_norms = pd.DataFrame(znorm_columns)
df = pd.concat([batting_data, df_norms], axis='columns')

In [None]:
# assign success labels
def assign_labels(data):
    """Bucket every row into a success level from 1 (lowest) to 4 (highest).

    The score of a row is
    ``0.4 * Znorm_woba + 0.4 * Znorm_on_base_plus_slg + 0.2 * Znorm_hard_hit_percent``.
    Levels: score > 1 -> 4, score > 0 -> 3, score > -1 -> 2, otherwise 1
    (a NaN score fails every comparison and therefore gets 1).

    Args:
        data: DataFrame containing the three ``Znorm_*`` columns.

    Returns:
        ``{'success': [level, ...]}`` with one int per row, in row order.
    """
    woba_weight = 0.4
    ops_weight = 0.4
    hh_weight = 0.2
    # Work on positional arrays so the result does not depend on the frame's
    # index labels (a label lookup with 0..n-1 fails on any other index).
    weighted_score = (
        (np.asarray(data['Znorm_woba'], dtype=float) * woba_weight) +
        (np.asarray(data['Znorm_on_base_plus_slg'], dtype=float) * ops_weight) +
        (np.asarray(data['Znorm_hard_hit_percent'], dtype=float) * hh_weight))
    # np.select takes the first matching condition, mirroring an if/elif chain.
    labeled_data = np.select(
        [weighted_score > 1, weighted_score > 0, weighted_score > -1],
        [4, 3, 2],
        default=1,
    ).tolist()

    success = {'success': labeled_data}
    return success

In [None]:
# Label every row, then append the labels to the table as a 'success' column.
success_by_row = assign_labels(df)
success_labels = pd.DataFrame(success_by_row)
knn_data = pd.concat([df, success_labels], axis='columns')

In [None]:
# Display the labeled table to eyeball the Znorm_* and success columns.
knn_data

In [None]:
def get_player_stats(df, player_id, year):
    """Locate the rows of ``df`` that belong to one player-season.

    Args:
        df: DataFrame with 'player_id' and 'year' columns.
        player_id: Value matched against the 'player_id' column.
        year: Value matched against the 'year' column.

    Returns:
        The index of every matching row, or None when no row matches.
    """
    same_player = df['player_id'] == player_id
    same_year = df['year'] == year
    matches = df[same_player & same_year]
    return None if matches.empty else matches.index

In [None]:
def manhattan_distance(data, player1_id, player1_year, player2):
    """Manhattan (L1) distance between a player-season and another row.

    Args:
        data: DataFrame with 'player_id', 'year' and the three Znorm_* columns.
        player1_id: 'player_id' of the reference player.
        player1_year: 'year' of the reference player.
        player2: Index label of the row to compare against.

    Returns:
        Sum of the absolute differences over the three Znorm_* features.
    """
    feature_cols = ('Znorm_woba', 'Znorm_on_base_plus_slg', 'Znorm_hard_hit_percent')
    # First row matching the (player1_id, player1_year) pair.
    reference = data.loc[get_player_stats(data, player1_id, player1_year)].iloc[0]
    return sum(abs(reference[col] - data.loc[player2, col]) for col in feature_cols)

In [None]:
def knn_predict(data, k, player_id, player_year):
    """Predict a success level by majority vote of the k nearest rows.

    Every row of ``data`` is ranked by Manhattan distance (via
    ``manhattan_distance``) from the row identified by
    ``player_id``/``player_year``. The labels of the ``k`` closest rows are
    printed and the most frequent one is returned.

    Args:
        data: DataFrame with 'player_id', 'year', 'success' and the three
            Znorm_* columns. Rows are addressed as 0..n-1 both by position
            (here) and by label (inside ``manhattan_distance``), so a default
            integer index is assumed.
        k: Number of neighbours that vote.
        player_id: 'player_id' of the query player.
        player_year: 'year' of the query player.

    Returns:
        The 'success' label that occurs most often among the k nearest rows.
    """
    # NOTE(review): the query row is itself part of ``data`` (distance 0), so
    # it always votes for its own label; confirm whether it should be excluded.
    distances = []
    for i in range(data.shape[0]):
        # Measure from the player-season passed in, not from a fixed player.
        dist = manhattan_distance(data, player_id, player_year, i)
        success_level = data.iloc[i]['success']
        distances.append([dist, success_level])

    # Ascending order: the first k entries are the k *nearest* rows.
    dist_sorted = sorted(distances, key=lambda x: x[0])
    k_nearest_labels = [label for _, label in dist_sorted[:k]]
    print(k_nearest_labels)
    prediction = max(set(k_nearest_labels), key=k_nearest_labels.count)
    return prediction

In [None]:
# Neighbourhood size: square root of the number of rows, truncated to an int.
k = int(math.sqrt(len(knn_data)))

In [None]:
# knn (Manhattan distance) for Jackson Merrill, 2024, using the sqrt-based k
# Expected Success Level: 4
prediction = knn_predict(data=knn_data, k=k, player_id=701538, player_year=2024)
print(f'Success level: {prediction}')

In [None]:
# knn (Manhattan distance) for Jackson Merrill, 2024, using k = 5
# Expected Success Level: 4
prediction = knn_predict(data=knn_data, k=5, player_id=701538, player_year=2024)
print(f'Success level: {prediction}')

In [None]:
# knn (Manhattan distance) for Robbie Grossman, 2021, using the sqrt-based k
# Expected Success Level: 2
prediction = knn_predict(data=knn_data, k=k, player_id=543257, player_year=2021)
print(f'Success level: {prediction}')

In [None]:
# knn (Manhattan distance) for Robbie Grossman, 2021, using k = 11
# Expected Success Level: 2
prediction = knn_predict(data=knn_data, k=11, player_id=543257, player_year=2021)
print(f'Success level: {prediction}')

In [None]:
# Show the labeled row(s) for player_id 543257 in 2021.
is_grossman_2021 = (knn_data['player_id'] == 543257) & (knn_data['year'] == 2021)
knn_data.loc[is_grossman_2021]

In [None]:
# Summary statistics of the assigned success labels.
knn_data['success'].describe()

In [None]:
# Tally how many rows received each success level (1-4).
counts = {level: 0 for level in (1, 2, 3, 4)}
for s_level in knn_data['success'].tolist():
    if s_level in counts:
        counts[s_level] += 1

print(counts)

In [None]:
import numpy as np

# Extract normalized features
feature_names = ['Znorm_woba', 'Znorm_on_base_plus_slg', 'Znorm_hard_hit_percent']
features = knn_data[feature_names]
# np.cov treats each row of its input as one variable, hence the transpose.
covariance_matrix = np.cov(features.T)
# Inverse covariance matrix used by the Mahalanobis distance below;
# np.linalg.inv raises LinAlgError if the covariance matrix is singular.
inverse_covariance_matrix = np.linalg.inv(covariance_matrix)

In [None]:
def mahalanobis_distance(data, player1_id, player1_year, player2, inv_cov_matrix):
    """Mahalanobis distance between a player-season and another row.

    Args:
        data: DataFrame with 'player_id', 'year' and the three Znorm_* columns.
        player1_id: 'player_id' of the reference player.
        player1_year: 'year' of the reference player.
        player2: Index label of the row to compare against.
        inv_cov_matrix: Inverse covariance matrix of the three Znorm_*
            features, in the order woba, on_base_plus_slg, hard_hit_percent.

    Returns:
        ``sqrt(d . inv_cov_matrix . d)`` where ``d`` is the feature difference
        between the reference row and row ``player2``.
    """
    feature_cols = ('Znorm_woba', 'Znorm_on_base_plus_slg', 'Znorm_hard_hit_percent')
    # First row matching the (player1_id, player1_year) pair.
    reference = data.loc[get_player_stats(data, player1_id, player1_year)].iloc[0]
    other = data.loc[player2]
    delta = np.array([reference[col] - other[col] for col in feature_cols])
    # delta is 1-D, so the chained dot products reduce to a scalar quadratic form.
    quadratic_form = delta.dot(inv_cov_matrix).dot(delta)
    return np.sqrt(quadratic_form)

In [None]:
def knn_predict2(data, k, player_id, player_year, inv_cov_matrix):
    """Predict a success level by majority vote of the k nearest rows (Mahalanobis).

    Every row of ``data`` is ranked by Mahalanobis distance (via
    ``mahalanobis_distance``) from the row identified by
    ``player_id``/``player_year``. The labels of the ``k`` closest rows are
    printed and the most frequent one is returned.

    Args:
        data: DataFrame with 'player_id', 'year', 'success' and the three
            Znorm_* columns. Rows are addressed as 0..n-1 both by position
            (here) and by label (inside ``mahalanobis_distance``), so a
            default integer index is assumed.
        k: Number of neighbours that vote.
        player_id: 'player_id' of the query player.
        player_year: 'year' of the query player.
        inv_cov_matrix: Inverse covariance matrix of the three Znorm_* features.

    Returns:
        The 'success' label that occurs most often among the k nearest rows.
    """
    # NOTE(review): the query row is itself part of ``data`` (distance 0), so
    # it always votes for its own label; confirm whether it should be excluded.
    distances = []
    for i in range(data.shape[0]):
        dist = mahalanobis_distance(data, player_id, player_year, i, inv_cov_matrix)
        success_level = data.iloc[i]['success']
        distances.append([dist, success_level])

    # Ascending order: the first k entries are the k *nearest* rows.
    dist_sorted = sorted(distances, key=lambda x: x[0])
    k_nearest_labels = [label for _, label in dist_sorted[:k]]
    print(k_nearest_labels)
    prediction = max(set(k_nearest_labels), key=k_nearest_labels.count)
    return prediction

In [None]:
# knn (Mahalanobis distance) for Jackson Merrill, 2024, using the sqrt-based k
# Expected Success Level: 4
prediction = knn_predict2(
    data=knn_data,
    k=k,
    player_id=701538,
    player_year=2024,
    inv_cov_matrix=inverse_covariance_matrix,
)
print(f'Success level: {prediction}')

In [None]:
# knn (Mahalanobis distance) for Jackson Merrill, 2024, using k = 7
# Expected Success Level: 4
prediction = knn_predict2(
    data=knn_data,
    k=7,
    player_id=701538,
    player_year=2024,
    inv_cov_matrix=inverse_covariance_matrix,
)
print(f'Success level: {prediction}')

In [None]:
# knn (Mahalanobis distance) for Robbie Grossman, 2021, using the sqrt-based k
# Expected Success Level: 2
prediction = knn_predict2(
    data=knn_data,
    k=k,
    player_id=543257,
    player_year=2021,
    inv_cov_matrix=inverse_covariance_matrix,
)
print(f'Success level: {prediction}')

In [None]:
# knn (Mahalanobis distance) for Robbie Grossman, 2021, using k = 11
# Expected Success Level: 2
prediction = knn_predict2(
    data=knn_data,
    k=11,
    player_id=543257,
    player_year=2021,
    inv_cov_matrix=inverse_covariance_matrix,
)
print(f'Success level: {prediction}')