In [1]:
import pandas as pd
import scipy

In [9]:
# Load the player ratings workbook; the path is relative to the notebook's working directory.
data = pd.read_excel('playerratingsNCAAB.xlsx')


In [10]:
# Preview the first rows to sanity-check the loaded columns.
data.head()

Unnamed: 0,PlayerId,PlayerName,PlayerTeam,Position,Height,PlayerClass,pocNJ2MakeOff,pocJ2MakeOff,pocJ3MakeOff,pocNJ2MakeDef,...,WinPercentage,Season2,PlayerId4,UsageRate,Column1,DraftPick,PositionDetail,BigBoardNumber,Athlete,Draft
0,23211,Khalif Battle,GONZ,G,77,SR,1113.3845,1114.9724,1069.7445,1126.1106,...,0.7429,2025.0,23211.0,23.532163,KhalifBattle,KhalifBattle,SG,66,7.0,2025.0
1,26236,Mason Jones,BALLST,F,79,SO,1093.3036,1096.2416,1092.5914,1096.4413,...,0.3704,2025.0,26236.0,11.504422,MasonJones,MasonJones,SG,50,7.0,2021.0
2,39814,Jaxson Robinson,UK,G,79,SR,1116.5103,1123.427,1110.4268,1052.8826,...,0.7083,2025.0,39814.0,21.520781,JaxsonRobinson,JaxsonRobinson,SG/SF,48,8.0,2025.0
3,39860,Dain Dainja,MEM,F,81,SR,1093.0825,1098.1799,1071.5513,1138.8735,...,0.8286,2025.0,39860.0,25.835496,DainDainja,DainDainja,PF,59,7.0,2025.0
4,39883,Will Richard,FLA,G,77,SR,1108.0576,1141.5627,1112.7549,1212.3185,...,0.9,2025.0,39883.0,18.488318,WillRichard,WillRichard,SG,89,7.0,2025.0


In [11]:
def _athleticism_points(athlete):
    """
    Map an athleticism rating to raw bonus/penalty points (before weighting).

    Ratings 6-10 use fixed anchor values; ratings that fall between two anchors
    are linearly interpolated. Ratings below 6 or above 10 get a flat
    penalty/boost, and a NaN rating contributes 0.
    """
    # (rating, points) anchors: 6 -> -5, 7 -> 0, 8 -> +2, 9 -> +5, 10 -> +7
    anchors = ((6, -5), (7, 0), (8, 2), (9, 5), (10, 7))

    if athlete < 6:
        return -10  # Flat penalty for very low athleticism
    if athlete > 10:
        return 12   # Flat boost for exceptional athleticism

    for (lo, lo_points), (hi, hi_points) in zip(anchors, anchors[1:]):
        if lo <= athlete <= hi:
            # Return the anchor value itself on exact hits so integer ratings
            # keep their exact (integer) points.
            if athlete == lo:
                return lo_points
            if athlete == hi:
                return hi_points
            return lo_points + (hi_points - lo_points) * (athlete - lo) / (hi - lo)

    # Only reached when every comparison is False, i.e. the rating is NaN
    return 0


def calculate_attractiveness_score(position, height, player_class, win_percentage, usage_rate, 
                               athlete, position_avg_heights, position_height_stds=None,
                               height_weight=5, class_weights=None, usage_weight=1.5, 
                               win_weight=0.5, interaction_weight=20, athleticism_weight=15):
    """
    Calculate draft attractiveness score with configurable weights, including athleticism.

    The score is the sum of a height, class, usage, win, usage-win interaction
    and athleticism component, plus a constant base of 50.

    Parameters:
    - position: Player's position (string); must be a key of position_avg_heights
    - height: Player's height (numeric)
    - player_class: Player's college class (FR, SO, JR, SR); any other value
      falls back to a class component of 10
    - win_percentage: Player's team win percentage. The interaction term divides
      it by 75, which suits a 0-100 scale.
      NOTE(review): the WinPercentage values previewed in this notebook look
      like 0-1 fractions - confirm which scale is intended.
    - usage_rate: Player's usage rate (percentage)
    - athlete: Player's athleticism rating (typically 1-10)
    - position_avg_heights: Dictionary of average heights by position
    - position_height_stds: Dictionary of height standard deviations by position.
      When it has an entry for the position, height is scored as a z-score;
      otherwise the raw difference from the position average is used.

    Tunable parameters:
    - height_weight: Weight for height component (default: 5)
    - class_weights: Dictionary of weights by class (default: FR=40, SO=35, JR=25, SR=10)
    - usage_weight: Weight for usage rate (default: 1.5)
    - win_weight: Weight for win percentage (default: 0.5)
    - interaction_weight: Weight for usage-win interaction (default: 20)
    - athleticism_weight: Weight for athleticism component (default: 15); the
      athleticism points are multiplied by athleticism_weight / 10

    Returns:
    - The numeric score, or 50 when position is not in position_avg_heights.
    """
    # Default class weights if not provided
    if class_weights is None:
        class_weights = {
            "FR": 40,    # Freshmen get highest weight
            "SO": 35,    # Sophomores 
            "JR": 25,    # Juniors
            "SR": 10     # Seniors
        }
    
    if position not in position_avg_heights:
        return 50  # Default score
    
    avg_height = position_avg_heights[position]
    
    # Height component
    if position_height_stds and position in position_height_stds:
        std_dev = position_height_stds[position]
        # A zero or NaN std (e.g. a position with a single player) fails the
        # `> 0` check, so the z-score falls back to 0.
        height_z_score = (height - avg_height) / std_dev if std_dev > 0 else 0
        height_component = height_z_score * height_weight
    else:
        height_diff = height - avg_height
        height_component = height_diff * height_weight
    
    # Class component
    class_component = class_weights.get(player_class, 10)
    
    # Usage component
    usage_component = usage_rate * usage_weight
    
    # Win component
    win_component = win_percentage * win_weight
    
    # Usage-win interaction
    normalized_usage = usage_rate / 25  # Normalize to typical range
    normalized_win = win_percentage / 75  # Normalize to typical range
    usage_win_interaction = normalized_usage * normalized_win * interaction_weight
    
    # Athleticism component: anchor points with linear interpolation in between,
    # then scaled by the athleticism weight
    athleticism_component = _athleticism_points(athlete) * athleticism_weight / 10
    
    # Final score - sum all components
    final_score = (height_component + class_component + usage_component + 
                  win_component + usage_win_interaction + athleticism_component)
    
    # Add a base value to ensure mostly positive scores
    final_score += 50
    
    return final_score

def find_optimal_weights(data, num_trials=10):
    """
    Randomly search weight combinations for the attractiveness score and keep
    the one whose scores have the most negative Spearman correlation with
    BigBoardNumber.

    Each of the seven tunable parameters gets `num_trials` evenly spaced
    candidate values. At most 1000 points of the resulting grid are sampled
    without replacement and evaluated. The input frame is not modified.

    Note: this resets NumPy's global random seed to 42 so the sample is
    reproducible.

    Parameters:
    - data: Pandas DataFrame with the columns PositionDetail, Height,
      PlayerClass, WinPercentage, UsageRate, Athlete and BigBoardNumber
    - num_trials: Number of different values to try for each parameter

    Returns a 6-tuple:
    - Dictionary with optimal parameters (keys match the keyword arguments of
      calculate_attractiveness_score)
    - Best (most negative) correlation achieved
    - DataFrame with all trial results
    - DataFrame with the 10 trials having the lowest correlation
    - Dictionary of average heights by position
    - Dictionary of height standard deviations by position
    """
    import numpy as np
    import pandas as pd

    # Upper bound on the number of combinations actually evaluated
    max_trials = 1000

    # Per-position height statistics used for the height component
    height_by_position = data.groupby('PositionDetail')['Height']
    position_height_stds = height_by_position.std().to_dict()
    position_avg_heights = height_by_position.mean().to_dict()

    # Candidate values for each parameter, in the order they are unpacked below
    param_names = ['height_weight', 'fr_weight', 'class_ratio', 'usage_weight',
                   'win_weight', 'interaction_weight', 'athleticism_weight']
    param_axes = [
        np.linspace(1, 10, num_trials),     # height_weight
        np.linspace(20, 60, num_trials),    # fr_weight
        np.linspace(0.6, 0.9, num_trials),  # class_ratio: ratio between consecutive classes
        np.linspace(0.5, 3.0, num_trials),  # usage_weight
        np.linspace(0.2, 1.0, num_trials),  # win_weight
        np.linspace(5, 40, num_trials),     # interaction_weight
        np.linspace(5, 25, num_trials),     # athleticism_weight
    ]
    grid_shape = (num_trials,) * len(param_axes)
    total_combinations = num_trials ** len(param_axes)

    # Sample flat grid indices instead of materialising every combination
    # (10**7 tuples at the default num_trials).
    np.random.seed(42)  # For reproducibility
    selected_indices = np.random.choice(
        total_combinations,
        size=min(max_trials, total_combinations),
        replace=False
    )
    # Row-major decoding makes the last parameter vary fastest, which is the
    # same ordering itertools.product would give for these axes.
    axis_positions = np.unravel_index(selected_indices, grid_shape)
    selected_params = list(zip(*(axis[pos] for axis, pos in zip(param_axes, axis_positions))))

    print(f"Testing {len(selected_params)} parameter combinations...")

    # The player inputs are the same for every trial, so extract them once
    input_columns = ['PositionDetail', 'Height', 'PlayerClass',
                     'WinPercentage', 'UsageRate', 'Athlete']
    player_rows = list(zip(*(data[col].tolist() for col in input_columns)))
    big_board = data['BigBoardNumber']

    results = []
    for i, params in enumerate(selected_params):
        if i % 100 == 0:
            print(f"Completed {i} trials...")

        (height_weight, fr_weight, class_ratio, usage_weight, win_weight,
         interaction_weight, athleticism_weight) = params

        # Create class weights dictionary with the ratio
        class_weights = {
            "FR": fr_weight,
            "SO": fr_weight * class_ratio,
            "JR": fr_weight * class_ratio**2,
            "SR": fr_weight * class_ratio**3
        }

        # Calculate scores with these weights; kept in a local Series so the
        # caller's DataFrame is never touched
        temp_scores = pd.Series(
            [
                calculate_attractiveness_score(
                    *row,
                    position_avg_heights,
                    position_height_stds,
                    height_weight=height_weight,
                    class_weights=class_weights,
                    usage_weight=usage_weight,
                    win_weight=win_weight,
                    interaction_weight=interaction_weight,
                    athleticism_weight=athleticism_weight
                )
                for row in player_rows
            ],
            index=data.index,
            dtype=float
        )

        # Normalize scores to 0-100
        min_score = temp_scores.min()
        max_score = temp_scores.max()
        norm_scores = 100 * (temp_scores - min_score) / (max_score - min_score)

        # Calculate correlation (negative is better for BigBoardNumber)
        correlation = norm_scores.corr(big_board, method='spearman')

        # Store result
        trial = dict(zip(param_names, params))
        trial['correlation'] = correlation
        results.append(trial)

    # Convert results to DataFrame for analysis
    results_df = pd.DataFrame(results)

    # Find best correlation (most negative)
    best_idx = results_df['correlation'].idxmin()
    best_params = results_df.loc[best_idx].to_dict()
    best_correlation = best_params['correlation']

    # Create full class weights for the best params
    best_class_weights = {
        "FR": best_params['fr_weight'],
        "SO": best_params['fr_weight'] * best_params['class_ratio'],
        "JR": best_params['fr_weight'] * best_params['class_ratio']**2,
        "SR": best_params['fr_weight'] * best_params['class_ratio']**3
    }

    # Final best parameters
    optimal_params = {
        'height_weight': best_params['height_weight'],
        'class_weights': best_class_weights,
        'usage_weight': best_params['usage_weight'],
        'win_weight': best_params['win_weight'],
        'interaction_weight': best_params['interaction_weight'],
        'athleticism_weight': best_params['athleticism_weight']
    }

    # Ten trials with the lowest (best) correlation; printing is left to the caller
    top_10 = results_df.sort_values('correlation').head(10)

    # Return the position data along with the optimal parameters
    return optimal_params, best_correlation, results_df, top_10, position_avg_heights, position_height_stds

# Driver: search for the best weights, report them, then score every player.

# Run the search; the per-position height statistics come back with the result
(optimal_params, best_correlation, all_results, top_10,
 position_avg_heights, position_height_stds) = find_optimal_weights(data, num_trials=10)

print(f"\nBest correlation: {best_correlation:.4f}")
print("\nOptimal parameters:")
for key, value in optimal_params.items():
    if key != 'class_weights':
        print(f"  {key}: {value:.2f}")
        continue
    # class_weights is a nested dict, so print one line per class
    print("  Class weights:")
    for cls, wt in value.items():
        print(f"    {cls}: {wt:.2f}")

# Report the ten best trials of the search
print("\nTop 10 Parameter Combinations:")
print(
    top_10[[
        'height_weight', 'fr_weight', 'class_ratio', 'usage_weight',
        'win_weight', 'interaction_weight', 'athleticism_weight', 'correlation',
    ]].to_string(index=False)
)

# Score every player with the optimal weights. The keys of optimal_params are
# exactly the tunable keyword arguments of calculate_attractiveness_score.
data['BestAttractivenessScore'] = data.apply(
    lambda row: calculate_attractiveness_score(
        row['PositionDetail'],
        row['Height'],
        row['PlayerClass'],
        row['WinPercentage'],
        row['UsageRate'],
        row['Athlete'],
        position_avg_heights,
        position_height_stds,
        **optimal_params
    ),
    axis=1
)

# Min-max normalize the final scores to the 0-100 range
min_score = data['BestAttractivenessScore'].min()
max_score = data['BestAttractivenessScore'].max()
data['BestAttractivenessScore'] = (
    100 * (data['BestAttractivenessScore'] - min_score) / (max_score - min_score)
)

# Show the ten highest-scoring prospects under the optimal formula
print("\nTop 10 Prospects with Optimal Formula:")
top_prospects = data.sort_values('BestAttractivenessScore', ascending=False).head(10)
print(
    top_prospects[[
        'PlayerName', 'PositionDetail', 'PlayerClass', 'Height',
        'UsageRate', 'WinPercentage', 'Athlete', 'BestAttractivenessScore', 'BigBoardNumber',
    ]].round(1)
)

Testing 1000 parameter combinations...
Completed 0 trials...
Completed 100 trials...
Completed 200 trials...
Completed 300 trials...
Completed 400 trials...
Completed 500 trials...
Completed 600 trials...
Completed 700 trials...
Completed 800 trials...
Completed 900 trials...

Top 10 Parameter Combinations:

Best correlation: -0.4371

Optimal parameters:
  height_weight: 1.00
  Class weights:
    FR: 55.56
    SO: 33.33
    JR: 20.00
    SR: 12.00
  usage_weight: 0.50
  win_weight: 0.56
  interaction_weight: 12.78
  athleticism_weight: 20.56

Top 10 Parameter Combinations:
 height_weight  fr_weight  class_ratio  usage_weight  win_weight  interaction_weight  athleticism_weight  correlation
           1.0  55.555556     0.600000           0.5    0.555556           12.777778           20.555556    -0.437137
           2.0  60.000000     0.633333           0.5    0.822222           28.333333           22.777778    -0.436943
           2.0  60.000000     0.733333           0.5    0.466667  

In [12]:
# Show the 20 players with the highest normalized attractiveness score.
data.sort_values('BestAttractivenessScore', ascending=False).head(20)

Unnamed: 0,PlayerId,PlayerName,PlayerTeam,Position,Height,PlayerClass,pocNJ2MakeOff,pocJ2MakeOff,pocJ3MakeOff,pocNJ2MakeDef,...,Season2,PlayerId4,UsageRate,Column1,DraftPick,PositionDetail,BigBoardNumber,Athlete,Draft,BestAttractivenessScore
96,39612,Zion Williamson,DUKE,F,79,FR,1124.1117,1097.168,1090.2527,1163.3496,...,2019.0,39612.0,28.376655,ZionWilliamson,ZionWilliamson,PF,101,10.0,2020.0,100.0
390,41535,Blake Wesley,N DAME,G,77,FR,1107.4688,1115.3591,1110.4377,1096.653,...,2022.0,41535.0,31.876229,BlakeWesley,BlakeWesley,PG/SG,18,9.0,2022.0,98.444038
55,46637,Cooper Flagg,DUKE,F,81,FR,1110.5912,1129.4302,1127.4376,1209.4802,...,2025.0,46637.0,30.87362,CooperFlagg,CooperFlagg,SF/PF,1,9.0,2025.0,98.331928
56,46638,Ace Bailey,RUTGER,G,82,FR,1111.3539,1128.3014,1104.3748,1063.8494,...,2025.0,46638.0,27.554336,AceBailey,AceBailey,SG/SF,4,9.0,2025.0,98.230344
202,16866,James Wiseman,MEM,C,85,FR,1113.4501,1099.7261,1107.8049,1134.6287,...,2020.0,16866.0,30.20141,JamesWiseman,JamesWiseman,C,7,9.0,2021.0,97.625417
208,2133,Anthony Edwards,UGA,G,77,FR,1105.9753,1104.4038,1090.5713,1103.2496,...,2020.0,2133.0,30.550972,AnthonyEdwards,AnthonyEdwards,SG,1,9.0,2021.0,96.425103
196,30939,Precious Achiuwa,MEM,F,81,FR,1112.2041,1076.6543,1101.1458,1244.1901,...,2020.0,30939.0,27.590053,PreciousAchiuwa,PreciousAchiuwa,SF/PF,15,9.0,2021.0,96.003433
335,39842,Jabari Smith,AUBURN,F,82,FR,1109.8962,1101.362,1099.2767,1195.4293,...,2022.0,39842.0,27.55972,JabariSmith,JabariSmith,PF,1,9.0,2022.0,95.871167
399,4277,Brandon Miller,ALA,F,81,FR,1110.6101,1115.2278,1105.6622,1228.455,...,2023.0,4277.0,26.387889,BrandonMiller,BrandonMiller,SF,3,9.0,2023.0,95.815968
312,42893,Greg Brown,TEXAS,F,81,FR,1101.9286,1120.3264,1104.5393,1137.2584,...,2021.0,42893.0,26.958237,GregBrown,GregBrown,SF/PF,38,9.0,2021.0,95.609891


In [15]:
import numpy as np
# Per-column weights for the offensive and defensive composites. The divisor of
# each composite is derived from its weights so the result is a true weighted
# mean (the offensive weights sum to 11.5, the defensive ones to 4.5).
offense_weights = {
    'pocNJ2MakeOff': 1.5,
    'pocJ2MakeOff': 1,
    'pocJ3MakeOff': 1.5,
    'pocORB': .5,
    'pFTMake': 3,
    'pJ3Make': 1,
    'pocNJ2AssistOff': 1,
    'pocJ2AssistOff': 1,
    'pocJ3AssistOff': 1,
}
defense_weights = {
    'pocNJ2MakeDef': 1,
    'pocJ2MakeDef': .5,
    'pocJ3MakeDef': 1,
    'pocDRB': 1,
    'pocNJ2BlockDef': 1,
}

# Calculate offensive score: weighted mean of the ratings, scaled by attractiveness
data['OffenseScore'] = (
    sum(weight * data[column] for column, weight in offense_weights.items())
    / sum(offense_weights.values())
) * data['BestAttractivenessScore']

# Calculate defensive score: weighted mean of the ratings, scaled by attractiveness
data['DefenseScore'] = (
    sum(weight * data[column] for column, weight in defense_weights.items())
    / sum(defense_weights.values())
) * data['BestAttractivenessScore']

# Calculate raw combined score
data['RawScore'] = (data['OffenseScore'] + data['DefenseScore']) / 2

# Normalize the scores to 0-100
min_o_score = data['OffenseScore'].min()
max_o_score = data['OffenseScore'].max()
data['Offense'] = 100 * (data['OffenseScore'] - min_o_score) / (max_o_score - min_o_score)

min_d_score = data['DefenseScore'].min()
max_d_score = data['DefenseScore'].max()
data['Defense'] = 100 * (data['DefenseScore'] - min_d_score) / (max_d_score - min_d_score)

# First convert normalized scores to ranks (1 = best; ties share the lowest rank)
data['OffenseRank'] = data['Offense'].rank(ascending=False, method='min')
data['DefenseRank'] = data['Defense'].rank(ascending=False, method='min')

# Calculate Overall as the average of the two rank columns
data['OverallRank'] = (data['OffenseRank'] + data['DefenseRank']) / 2

# Now rank the OverallRank to get the final ranking
data['Overall'] = data['OverallRank'].rank(ascending=True, method='min')

# Check correlation with BigBoardNumber. This has to happen here, while
# 'Offense' and 'Defense' still hold the normalized scores; they are
# overwritten with ranks further down.
attractiveness_corr = data['BestAttractivenessScore'].corr(data['BigBoardNumber'], method='spearman')
offense_corr = data['Offense'].corr(data['BigBoardNumber'], method='spearman')
defense_corr = data['Defense'].corr(data['BigBoardNumber'], method='spearman')
score_corr = data['Overall'].corr(data['BigBoardNumber'], method='spearman')
win_pct_corr = data['WinPercentage'].corr(data['BigBoardNumber'], method='spearman')

print("\nCorrelations with BigBoardNumber (negative is better):")
print(f"AttractivenessScore: {attractiveness_corr:.4f}")
print(f"OffenseScore: {offense_corr:.4f}")
print(f"DefenseScore: {defense_corr:.4f}")
print(f"Score: {score_corr:.4f}")
print(f"WinPercentage: {win_pct_corr:.4f}")

# Replace the normalized scores with the integer ranks; -1 temporarily stands
# in for missing values so the cast to int succeeds
data['Offense'] = data['OffenseRank'].fillna(-1).astype(int)
data['Defense'] = data['DefenseRank'].fillna(-1).astype(int)
data['Overall'] = data['Overall'].fillna(-1).astype(int)

# Turn the -1 placeholder back into NaN (ranks start at 1, so -1 is never a real rank)
data['Offense'] = data['Offense'].replace(-1, np.nan)
data['Defense'] = data['Defense'].replace(-1, np.nan)
data['Overall'] = data['Overall'].replace(-1, np.nan)

# Filter out multiple names
# NOTE(review): presumably names shared by more than one player - confirm.
data = data[~data['PlayerName'].isin(['Tyler Bey', 'Devin Carter','Isaac Jones','Tre Jones','Mason Jones','Isaiah Jackson',
                                     'Jalen Johnson','Marcus Garrett','Andre Jackson','David Jones','Jalen Smith'
                                     ,'Jalen Williams','Jaylin Williams','Jalen Harris','Cameron Johnson','Chris Smith',
                                     'Donovan Williams','Tyler Harris','Grant Williams'])]

# Filter out G Webb and AL A&M teams. The explicit copy makes the result an
# independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning.
data = data[~data['PlayerTeam'].isin(['G WEBB', 'AL A&M'])].copy()

KeyError: 'WinningPct'

In [14]:
# Add the final rank column. assign() builds a new frame rather than writing
# into the filtered `data`, which avoids SettingWithCopyWarning.
data = data.assign(Rank=data['Overall'].round(0))

# Export the prospects that have a value in every exported column.
# The DataFrame index is written as the first, unnamed CSV column.
export_columns = ['Draft', 'Rank', 'PlayerName', 'PositionDetail', 'Height',
                  'PlayerClass', 'PlayerTeam', 'Offense', 'Defense', 'Overall']
data[export_columns].dropna().to_csv('AllProspects.csv')