## Cosine Similarity between Z-Scores

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
import sys

# reference main directory in existing folder
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)

import pandas as pd 
import sys
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from src.paths import RAW_DATA_DIR, CLEANED_DATA_DIR, QUADRANT_DATA_DIR, MISMATCHES
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import pandas as pd
from scipy.spatial.distance import cosine

# Lift every pandas display limit so frames and long cell values print in full
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.width = None
pd.options.display.max_colwidth = None

# Load fangraphs_zscores_merged.csv from the cleaned-data directory
df_clean = pd.read_csv(CLEANED_DATA_DIR / 'fangraphs_zscores_merged.csv')

In [17]:
# Drop sparse data: keep only columns with at least 2800 non-null values,
# then remove every row that still contains a missing value
df_clean = df_clean.dropna(axis=1, thresh=2800).dropna(axis=0)

# Split the column labels into text (object dtype) columns and everything else
string_columns = df_clean.select_dtypes(include=['object']).columns
other_columns = df_clean.columns[~df_clean.columns.isin(string_columns)].tolist()

# Rebuild the frame with the text columns first, followed by the remaining columns
df_clean = df_clean[string_columns.tolist() + other_columns]

### Use 2023 Season 

In [18]:
# Keep only the rows whose Season value is 2023
df_2023 = df_clean.loc[df_clean['Season'].eq(2023)]
df_2023.shape

(838, 179)

#### Separate by Elite and Sub-optimal Players

In [19]:
from src.paths import PLAYER_CATEGORIES_DIR, MEAN_CATEGORIES_DIR
from src.const import (
    clutch_factor,
    pitching_metrics,
    pitcher_performance_metric,
    advanced_pitching_metrics,
    weighted_pitch_type_runs_statcast,
    pitch_type_speed_statcast,
    hitter_performance_metrics,
    hitter_performance_metrics_statcast,
    leverage_index,
    wins_above_replacement,
    exit_velocity_metrics,
    contact_metrics,
)

# Collect the key names of every metric dictionary imported from src.const
all_metrics = [
    list(metric.keys())
    for metric in (
        clutch_factor, pitching_metrics, pitcher_performance_metric, advanced_pitching_metrics,
        weighted_pitch_type_runs_statcast, pitch_type_speed_statcast,
        hitter_performance_metrics, hitter_performance_metrics_statcast,
        leverage_index, wins_above_replacement, exit_velocity_metrics, contact_metrics,
    )
]
# NOTE(review): this flattened key list is replaced a few lines below by a positional
# column slice, so it has no effect on the frames built in this cell -- confirm which
# of the two definitions is intended.
numeric_columns = [item for sublist in all_metrics for item in sublist]

# Identifier columns kept next to the numeric features
string_columns = ['Name', 'Team', 'PlayerId', 'classified_role', 'Classification']
# Every column from position 13 onward is treated as a numeric feature
numeric_columns = df_2023.columns[13:].tolist()

# Split the 2023 rows by Classification label, keeping only the selected columns
suboptimal_players = df_2023.loc[
    df_2023['Classification'].isin(['average_rp', 'suboptimal_rp', 'average_sp', 'suboptimal_sp']),
    string_columns + numeric_columns,
]
elite_and_strong_players = df_2023.loc[
    df_2023['Classification'].isin(['elite_rp', 'strong_rp', 'elite_sp', 'strong_sp']),
    string_columns + numeric_columns,
]

# Apply the same column selection to the full 2023 frame
df_2023 = df_2023[string_columns + numeric_columns]



In [20]:
# Write both player groups as CSV files (row index omitted) into the player-categories directory
elite_and_strong_players.to_csv(path_or_buf=PLAYER_CATEGORIES_DIR / 'elite_and_strong_players.csv', index=False)
suboptimal_players.to_csv(path_or_buf=PLAYER_CATEGORIES_DIR / 'suboptimal_players.csv', index=False)

In [21]:
# Boolean masks over the 2023 frame, one per performance tier
is_elite_strong = df_2023['Classification'].isin(['elite_rp', 'strong_rp', 'elite_sp', 'strong_sp'])
is_suboptimal_average = df_2023['Classification'].isin(['average_rp', 'suboptimal_rp', 'average_sp', 'suboptimal_sp'])

# Row subsets selected by those masks
elite_and_strong_players = df_2023.loc[is_elite_strong]
suboptimal_players = df_2023.loc[is_suboptimal_average]

# Mean of every numeric column within each classified_role (one row per role)
suboptimal_mean = suboptimal_players.groupby('classified_role')[numeric_columns].mean()
elite_and_strong_mean = elite_and_strong_players.groupby('classified_role')[numeric_columns].mean()


In [22]:
# Write the per-role mean tables (role labels kept as the index column) to the mean-categories directory
suboptimal_mean.to_csv(path_or_buf=MEAN_CATEGORIES_DIR / 'suboptimal_mean.csv')
elite_and_strong_mean.to_csv(path_or_buf=MEAN_CATEGORIES_DIR / 'elite_and_strong_mean.csv')

#### Separate by classified role

In [23]:
# The role list comes from the suboptimal group; both groups are split on that same list
roles = suboptimal_players['classified_role'].unique()

elite_segments = {
    role: elite_and_strong_players.loc[elite_and_strong_players['classified_role'].eq(role)]
    for role in roles
}
suboptimal_segments = {
    role: suboptimal_players.loc[suboptimal_players['classified_role'].eq(role)]
    for role in roles
}

# Column-wise mean of the numeric features inside each role segment
elite_means_by_role = {
    role: elite_segments[role][numeric_columns].mean() for role in elite_segments
}
suboptimal_means_by_role = {
    role: suboptimal_segments[role][numeric_columns].mean() for role in suboptimal_segments
}

# Assemble the per-role mean Series into frames with one column per role
elite_means_df = pd.DataFrame(elite_means_by_role)
suboptimal_means_df = pd.DataFrame(suboptimal_means_by_role)

In [24]:
# Write the role-by-role mean frames to the mean-categories directory
suboptimal_means_df.to_csv(path_or_buf=MEAN_CATEGORIES_DIR / 'suboptimal_means_by_role.csv')
elite_means_df.to_csv(path_or_buf=MEAN_CATEGORIES_DIR / 'elite_means_by_role.csv')

In [25]:
# Function to calculate similarity to all elite role means
def calculate_similarity_to_all_roles(player_stats, role_means_df):
    """
    Score one player against every role's mean-stat vector using cosine similarity.

    Each column of ``role_means_df`` is treated as one role. The result maps every
    column label to ``1 - cosine(player_stats, column)``, in column order.
    """
    return {
        role_label: 1 - cosine(player_stats, role_vector)
        for role_label, role_vector in role_means_df.items()
    }


In [26]:
def explain_suggested_role(player_stats, role_means, suggested_role):
    """
    Compare one player's stats with the mean stats of a suggested role, metric by metric.

    A StandardScaler is fitted on the transpose of ``role_means`` (one row per role, one
    column per metric), so each metric is scaled by its spread across the role means.
    The player's scaled stats minus the suggested role's scaled means are reported as
    "z-scores", and every metric is sorted into "within" or "beyond" one unit of that scale.

    Parameters:
    - player_stats (pd.Series): The player's metric values; its index supplies the metric names.
    - role_means (pd.DataFrame): Mean metric values with one column per role.
    - suggested_role: Column label of ``role_means`` to compare the player against.

    Returns:
    - tuple of five items, in this order:
        explanation (str): Sentence listing the close metrics and, if any, the far ones.
        close metrics (list[str]): Metrics with -1 < z < 1, formatted "<metric> (z-score: <z>)".
        far metrics (list[str]): Metrics with z <= -1 or z >= 1.
        positive close metrics (list[str]): Close metrics with z > 0.
        negative close metrics (list[str]): Close metrics with z < 0.
      When ``suggested_role`` is not a column of ``role_means``, the explanation is the
      "not found" message and the four lists are empty.
    """
    if suggested_role not in role_means.columns:
        # Return the same 5-item shape as the normal path so callers that unpack the
        # result do not fail on a bare string.
        return f"Role '{suggested_role}' not found in role means data.", [], [], [], []

    # Fit on raw values: both transform calls below pass plain arrays, and fitting on a
    # labelled frame would make the fitted scaler and its inputs disagree on feature names.
    scaler = StandardScaler()
    scaler.fit(role_means.T.values)
    standardized_player_stats = scaler.transform(player_stats.values.reshape(1, -1)).flatten()
    standardized_role_mean = scaler.transform(role_means[suggested_role].values.reshape(1, -1)).flatten()

    # Gap between player and role mean, in units of each metric's fitted scale
    z_scores = standardized_player_stats - standardized_role_mean

    # Format every metric once; the lists below only select from these labels
    labels = [
        f"{metric} (z-score: {z_score:.2f})"
        for metric, z_score in zip(player_stats.index, z_scores)
    ]

    # NaN z-scores satisfy neither mask and are therefore left out of every list
    is_close = (z_scores > -1) & (z_scores < 1)
    is_far = (z_scores <= -1) | (z_scores >= 1)

    close_metrics_explanations = [label for label, keep in zip(labels, is_close) if keep]
    far_metrics_explanations = [label for label, keep in zip(labels, is_far) if keep]
    # A z-score of exactly 0 counts as close but is neither positive nor negative
    positive_close_metrics_explanations = [
        label for label, keep, z_score in zip(labels, is_close, z_scores) if keep and z_score > 0
    ]
    negative_close_metrics_explanations = [
        label for label, keep, z_score in zip(labels, is_close, z_scores) if keep and z_score < 0
    ]

    # Construct the final explanation
    explanation = f"The player's statistics within one standard deviation for the {suggested_role} are: {', '.join(close_metrics_explanations)}."
    if far_metrics_explanations:
        explanation += f" Statistics beyond one standard deviation are: {', '.join(far_metrics_explanations)}."

    return explanation, close_metrics_explanations, far_metrics_explanations, positive_close_metrics_explanations, negative_close_metrics_explanations


In [27]:

def suggest_role_changes(suboptimal_players, elite_means_df, numeric_columns):
    """
    This function suggests a role for each player who is considered suboptimal in their current role.
    It compares each player's statistics to the mean statistics of elite players' performance in various roles.
    The "Elite mean" refers to the average statistics of players who are top-performing (elite) in a specific role.

    The function calculates the similarity of each suboptimal player's stats to the elite means of all possible roles
    and picks the role where the player's stats are most similar to the elite means.
    It also identifies which stats are within one standard deviation above (positive) or below (negative)
    the elite mean of the suggested role.

    One row is produced for every input player, including players whose suggested role equals
    their current role; filtering those out is left to the caller.

    Parameters:
    - suboptimal_players (pd.DataFrame): DataFrame containing statistics for players considered suboptimal.
      Must provide the 'Name', 'PlayerId', 'Team' and 'classified_role' columns.
    - elite_means_df (pd.DataFrame): DataFrame containing the mean statistics of elite players, one column per role.
    - numeric_columns (list): List of column names in suboptimal_players that contain numeric statistics to be considered.

    Returns:
    - suggested_changes (pd.DataFrame): DataFrame with the suggested role and similarity score per player,
      plus the positive and negative stats within one standard deviation of the elite means for the suggested role.
      An empty input yields an empty DataFrame with the same columns.
    """
    output_columns = [
        'Name', 'PlayerId', 'Team', 'Current Role',
        'Suggested Role', 'Similarity Score', 'Positive Stats Within One Std Dev of Elite Mean',
        'Negative Stats Within One Std Dev of Elite Mean'
    ]

    # Collect plain dicts and build the frame once at the end: concatenating inside the
    # loop copies the whole frame on every iteration and starts from an empty frame.
    rows = []

    for _, player in suboptimal_players.iterrows():
        player_stats = player[numeric_columns]  # Select numeric columns for the player

        # Calculate similarities to all elite roles
        similarities = calculate_similarity_to_all_roles(player_stats, elite_means_df)

        # The role with the highest similarity score; on ties max() keeps the first one seen
        suggested_role = max(similarities, key=similarities.get)

        # Only the positive/negative "close" metric lists are stored in the output
        _, _, _, positive_close_metrics_explanations, negative_close_metrics_explanations = explain_suggested_role(
            player_stats, elite_means_df, suggested_role
        )

        rows.append({
            'Name': player['Name'],
            'PlayerId': player['PlayerId'],
            'Team': player['Team'],
            'Current Role': player['classified_role'],
            'Suggested Role': suggested_role,
            'Similarity Score': similarities[suggested_role],
            'Positive Stats Within One Std Dev of Elite Mean': ', '.join(positive_close_metrics_explanations),
            'Negative Stats Within One Std Dev of Elite Mean': ', '.join(negative_close_metrics_explanations),
        })

    # Passing columns= fixes the column order and keeps the columns when there are no rows
    return pd.DataFrame(rows, columns=output_columns)
# Run the role-suggestion pass over the suboptimal players against the elite role means
suggested_changes = suggest_role_changes(
    suboptimal_players=suboptimal_players,
    elite_means_df=elite_means_df,
    numeric_columns=numeric_columns,
)

  suggested_changes = pd.concat([suggested_changes, pd.DataFrame([row_to_add])], ignore_index=True)


In [28]:
# Keep only the rows where the suggested role differs from the player's current role
suggested_changes = suggested_changes.loc[
    suggested_changes['Current Role'].ne(suggested_changes['Suggested Role'])
]

In [29]:
# NOTE(review): the sorted view on the next line is neither assigned nor the cell's last
# expression, so the cell only reports the shape -- assign or display it if the ranking is wanted.
suggested_changes[['Name', 'Team', 'PlayerId', 'Current Role', 'Suggested Role', 'Similarity Score']].sort_values(by='Similarity Score', ascending=False)
suggested_changes.shape 

(157, 8)

In [30]:
# Preview the first rows of the suggested role changes
suggested_changes.head()

Unnamed: 0,Name,PlayerId,Team,Current Role,Suggested Role,Similarity Score,Positive Stats Within One Std Dev of Elite Mean,Negative Stats Within One Std Dev of Elite Mean
0,Tommy Hunter,1157,NYM,Middle Reliever,Setup/Closer Pitcher,0.912471,"HR (z-score: 0.29), SIERA (z-score: 0.65), CStr_pct (z-score: 0.52), Barrels (z-score: 0.20), maxEV (z-score: 0.22), OSwing_pct_sc (z-score: 0.76), average_innings_pitched_per_appearance_SP (z-score: 0.38), average_innings_pitched_per_appearance_RP (z-score: 0.53)","GS (z-score: -0.43), TBF (z-score: -0.95), CG (z-score: -0.16), ShO (z-score: -0.16), MD (z-score: -0.50), R (z-score: -0.03), ER (z-score: -0.04), H (z-score: -0.54), HBP (z-score: -0.30), Events (z-score: -0.72), GB (z-score: -0.67), FB (z-score: -0.52), IFH (z-score: -0.28), RS (z-score: -0.46), HardHit (z-score: -0.50), Pitching_plus (z-score: -0.94)"
2,Corey Kluber,2429,BOS,Starting Pitcher,Setup/Closer Pitcher,0.736388,"HR (z-score: 0.29), FRM (z-score: 0.12), Barrels (z-score: 0.01)","GS (z-score: -0.43), CG (z-score: -0.16), ShO (z-score: -0.16), R (z-score: -0.65), ER (z-score: -0.47), H (z-score: -0.80), FB (z-score: -0.74), IFFB (z-score: -0.79), IFH (z-score: -0.91), RS (z-score: -0.96), maxEV (z-score: -0.83)"
4,JosA© Cisnero,6399,DET,Setup/Closer Pitcher,Middle Reliever,0.994785,"IP (z-score: 0.59), TBF (z-score: 0.79), SV (z-score: 0.80), Events (z-score: 0.54), GB (z-score: 0.58), LD (z-score: 0.14), FB (z-score: 0.65), IFH (z-score: 0.94), Balls (z-score: 0.59), Strikes (z-score: 0.65), Pitches (z-score: 0.63), RS (z-score: 0.73), RS_per_9 (z-score: 0.70), BB_pct (z-score: 0.60), Contact_pct (z-score: 0.62), Pull_pct (z-score: 0.86), BB_pct_plus (z-score: 0.71), FB_pct_plus (z-score: 0.41), Pull_pct_plus (z-score: 1.00), HardHit (z-score: 0.91), ZContact_pct_sc (z-score: 0.49), Contact_pct_sc (z-score: 0.59), botCmd_FA (z-score: 0.77)","GS (z-score: -0.97), W (z-score: -0.03), CG (z-score: -0.30), ShO (z-score: -0.15), tERA (z-score: -0.27), SwStr_pct (z-score: -0.42), CStr_pct (z-score: -0.90), CSW_pct (z-score: -0.66)"
6,Johnny Cueto,6893,MIA,Starting Pitcher,Setup/Closer Pitcher,0.594695,wFA_sc (z-score: 0.29),"GS (z-score: -0.43), CG (z-score: -0.16), ShO (z-score: -0.16), FRM (z-score: -0.06), Pitching_plus (z-score: -0.94)"
8,Daniel Bard,7115,COL,Setup/Closer Pitcher,Middle Reliever,0.97684,"IP (z-score: 0.11), TBF (z-score: 0.39), W (z-score: 0.75), SV (z-score: 0.10), R (z-score: 0.78), ER (z-score: 0.43), SO (z-score: 0.08), WP (z-score: 0.87), FB (z-score: 0.39), IFFB (z-score: 0.96), Balls (z-score: 0.93), Strikes (z-score: 0.25), Pitches (z-score: 0.48), WPA (z-score: 0.77), K_per_9_plus (z-score: 0.06), Barrels (z-score: 0.54), botStf_FA (z-score: 0.46)","GS (z-score: -0.97), L (z-score: -0.62), CG (z-score: -0.30), ShO (z-score: -0.15), H (z-score: -0.42), HR (z-score: -0.28), Events (z-score: -0.11), GB (z-score: -0.47), LD (z-score: -0.27), IFH (z-score: -0.97), RS (z-score: -0.68), K_per_9 (z-score: -0.36), inLI (z-score: -0.35), gmLI (z-score: -0.12), HardHit (z-score: -0.49), FA_X_sc (z-score: -0.80), average_batters_faced_per_appearance_RP (z-score: -0.93)"


In [31]:
# Write the suggested role changes as a CSV file (row index omitted) into the suggested-changes directory
from src.paths import SUGGESTED_CHANGES_DIR
suggested_changes.to_csv(path_or_buf=SUGGESTED_CHANGES_DIR / 'suggested_changes.csv', index=False)

In [None]:
# Disabled earlier draft (explanation based on the 3 closest metrics, built row by row with
# pd.concat), kept as a bare string literal so none of it executes.
# NOTE(review): the literal opens with four quote characters, so its text starts with a stray '"'.
""""def explain_suggested_role(player_stats, role_means, suggested_role):
    if suggested_role not in role_means.columns:
        return f"Role '{suggested_role}' not found in role means data."

    # Standardize the player's stats
    scaler = StandardScaler()
    scaler.fit(role_means.T)  # Fit the scaler on the transpose of role_means to standardize across features
    standardized_player_stats = scaler.transform(player_stats.values.reshape(1, -1)).flatten()

    # Get the standardized mean stats for the suggested role
    standardized_role_mean = scaler.transform(role_means[suggested_role].values.reshape(1, -1)).flatten()

    # Calculate the absolute differences
    differences = np.abs(standardized_player_stats - standardized_role_mean)

    # Find metrics with the smallest differences, indicating they are most similar
    closest_metrics = differences.argsort()[:3]  # Let's take the top 3 closest metrics
    close_metrics_names = player_stats.index[closest_metrics]

    # Create an explanation based on the close metrics
    explanations = [f"{player_stats.index[i]} (difference: {differences[i]:.2f})" for i in closest_metrics]
    explanation = f"The player's statistics most similar to the mean values for the {suggested_role} are: {', '.join(explanations)}."
    
    return explanation

# Initialize the suggested_changes DataFrame outside of the loop
suggested_changes = pd.DataFrame(columns=['Name', 'PlayerId', 'Team', 'Current Role', 'Suggested Role', 'Similarity Score', 'Explanation'])

player_id = suboptimal_players['PlayerId']
# Calculate similarities and suggest role changes for suboptimal players
for index, player in suboptimal_players.iterrows():
    player_stats = player[numeric_columns]  # Select numeric columns for the player
    actual_role = player['classified_role']
    
    # Calculate similarities to all elite roles
    similarities = calculate_similarity_to_all_roles(player_stats, elite_means_df)

    # Find the role with the highest similarity score
    suggested_role = max(similarities, key=similarities.get)
    max_similarity_score = similarities[suggested_role]

    explanation = explain_suggested_role(player_stats, elite_means_df, suggested_role)

    # If the suggested role is different from the current role, append the suggestion
    if suggested_role != actual_role:
        row_to_add = pd.DataFrame([{
            'Name': player['Name'],
            'PlayerId': player['PlayerId'],
            'Team': player['Team'],
            'Current Role': actual_role,
            'Suggested Role': suggested_role,
            'Similarity Score': max_similarity_score,
            'Explanation': explanation
        }])
        suggested_changes = pd.concat([suggested_changes, row_to_add], ignore_index=True)"""

In [None]:
# Disabled alternative draft of explain_suggested_role, kept as a bare string literal so it
# never executes. This variant raises ValueError for an unknown role and returns a dict with
# the metric names, the z-scores, and the within-one-unit metrics split into positive/negative.
"""def explain_suggested_role(player_stats, role_means, suggested_role):
    if suggested_role not in role_means.columns:
        raise ValueError(f"Role '{suggested_role}' not found in role means data.")

    # Standardize the player's stats
    scaler = StandardScaler()
    scaler.fit(role_means.T)  # Fit the scaler on the transpose of role_means to standardize across features
    standardized_player_stats = scaler.transform(player_stats.values.reshape(1, -1)).flatten()

    # Get the standardized mean stats for the suggested role
    standardized_role_mean = scaler.transform(role_means[suggested_role].values.reshape(1, -1)).flatten()

    # Calculate the z-scores
    z_scores = standardized_player_stats - standardized_role_mean

    # Create a structured dictionary for metrics and z-scores
    metrics_z_scores = {
        'metrics': player_stats.index,
        'z_scores': z_scores,
        'positive': [],
        'negative': []
    }

    # Filter to only include metrics within one standard deviation and categorize them
    for i, z_score in enumerate(z_scores):
        if -1 < z_score < 1:
            if z_score >= 0:
                metrics_z_scores['positive'].append((player_stats.index[i], z_score))
            else:
                metrics_z_scores['negative'].append((player_stats.index[i], z_score))

    return metrics_z_scores"""