In [2]:
# Imports
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error, r2_score
import random

In [4]:
# Batting dataset without team stats
df = pd.read_csv('ncaa_battingQualifiedCSV/BattingDraftTable.csv', delimiter=',')

In [6]:
# list and drop columns that are less related to the target
cols_to_drop = ['name', 'playerid', 'mlbamid', 'team', 'Acronym', 'Full Team Name', 'Drafted By', 'Drafted From']
df = df.drop(columns=cols_to_drop)
print(df.columns.tolist())

# convert the target to numerical values
df['Drafted?'] = df['Drafted?'].astype(int)

['age', 'nameascii', 'year', 'g', 'ab', 'pa', 'h', '1b', '2b', '3b', 'hr', 'r', 'rbi', 'bb', 'so', 'hbp', 'sf', 'sh', 'gdp', 'sb', 'cs', 'avg', 'bb%', 'k%', 'bb/k', 'obp', 'slg', 'ops', 'iso', 'spd', 'babip', 'wsb', 'wrc', 'wraa', 'woba', 'wrc+', 'Round', 'Pick', 'Drafted?']


In [8]:
df.head()

Unnamed: 0,age,nameascii,year,g,ab,pa,h,1b,2b,3b,...,spd,babip,wsb,wrc,wraa,woba,wrc+,Round,Pick,Drafted?
0,21.0,Ryan Bliss,2021,50,211,237,77,47,14,1,...,4.974168,0.371257,-1.456794,58.746606,20.674834,0.46736,154.30489,2.0,42.0,1
1,21.0,Adrian Del Castillo,2021,54,200,237,55,38,13,1,...,4.409581,0.304094,0.022838,35.612792,-0.514145,0.360145,98.576837,2.0,67.0,1
2,22.0,Tim Tawa,2021,52,214,240,62,37,13,0,...,6.323953,0.304878,1.181782,41.118764,4.050893,0.390554,110.928313,11.0,318.0,1
3,22.0,Billy Cook,2021,33,131,153,39,18,3,1,...,7.549825,0.30137,0.522476,36.614354,16.183401,0.474676,179.210213,10.0,287.0,1
4,21.0,Trey Sweeney,2021,48,170,226,65,39,10,2,...,5.143712,0.377778,-0.352635,69.647429,36.182759,0.530699,208.122262,1.0,20.0,1


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11529 entries, 0 to 11528
Data columns (total 39 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        6560 non-null   float64
 1   nameascii  11529 non-null  object 
 2   year       11529 non-null  int64  
 3   g          11529 non-null  int64  
 4   ab         11529 non-null  int64  
 5   pa         11529 non-null  int64  
 6   h          11529 non-null  int64  
 7   1b         11529 non-null  int64  
 8   2b         11529 non-null  int64  
 9   3b         11529 non-null  int64  
 10  hr         11529 non-null  int64  
 11  r          11529 non-null  int64  
 12  rbi        11529 non-null  int64  
 13  bb         11529 non-null  int64  
 14  so         11529 non-null  int64  
 15  hbp        11529 non-null  int64  
 16  sf         11529 non-null  int64  
 17  sh         11529 non-null  int64  
 18  gdp        11529 non-null  int64  
 19  sb         11529 non-null  int64  
 20  cs    

In [12]:
features = list(df[['age', 'g', 'ab', 'pa', 'h', '1b', '2b', '3b', 'hr', 'r', 'rbi', 'bb', 'so', 
                    'hbp', 'sf', 'sh', 'gdp', 'sb', 'cs', 'avg', 'bb%', 'k%', 'bb/k', 'obp', 'slg', 
                    'ops', 'iso', 'spd', 'babip', 'wsb', 'wrc', 'wraa', 'woba', 'wrc+']].columns)

target_drafted = 'Drafted?'
target_round = 'Round'
target_pick = 'Pick'

df[target_drafted] = df[target_drafted].astype(int)

In [14]:
X = df[features]
y_drafted = df[target_drafted]

X_train, X_test, y_train, y_test = train_test_split(
    X, y_drafted, test_size=0.2, random_state=42, stratify=y_drafted
)

In [16]:
draft_model = xgb.XGBClassifier(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

draft_model.fit(X_train, y_train)

y_pred_drafted = draft_model.predict(X_test)
print("üéØ Drafted? Classifier Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_drafted):.3f}")

üéØ Drafted? Classifier Results:
Accuracy: 0.944


In [20]:
drafted_df = df[df[target_drafted] == 1].dropna(subset=[target_round, target_pick])

X_drafted = drafted_df[features]
y_round = drafted_df[target_round]
y_pick = drafted_df[target_pick]

# Split for regression tasks
X_train_r, X_test_r, y_train_round, y_test_round = train_test_split(
    X_drafted, y_round, test_size=0.2, random_state=42
)
_, _, y_train_pick, y_test_pick = train_test_split(
    X_drafted, y_pick, test_size=0.2, random_state=42
)

# Round model
round_model = xgb.XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
round_model.fit(X_train_r, y_train_round)

# Pick model
pick_model = xgb.XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
pick_model.fit(X_train_r, y_train_pick)

# --- Evaluate ---
round_preds = round_model.predict(X_test_r)
pick_preds = pick_model.predict(X_test_r)

print("\nüìä Round Prediction:")
print(f"MAE: {mean_absolute_error(y_test_round, round_preds):.3f}")
print(f"R¬≤: {r2_score(y_test_round, round_preds):.3f}")

print("\nüìä Pick Prediction:")
print(f"MAE: {mean_absolute_error(y_test_pick, pick_preds):.3f}")
print(f"R¬≤: {r2_score(y_test_pick, pick_preds):.3f}")


üìä Round Prediction:
MAE: 3.905
R¬≤: 0.256

üìä Pick Prediction:
MAE: 116.205
R¬≤: 0.272


In [30]:
def generate_fake_players(n=10):
    players = []
    for i in range(n):
        player = {
            'nameascii': f"Player_{i}",
            'age': np.random.randint(18, 25),
            'g': np.random.randint(20, 162),
            'ab': np.random.randint(50, 600),
            'avg': round(np.random.uniform(0.150, 0.350), 3),
            'pa': np.random.randint(60, 700),
            'h': np.random.randint(10, 200),
            '1b': np.random.randint(5, 120),
            '2b': np.random.randint(0, 50),
            '3b': np.random.randint(0, 10),
            'hr': np.random.randint(0, 50),
            'r': np.random.randint(0, 120),
            'rbi': np.random.randint(0, 120),
            'bb': np.random.randint(0, 100),
            'so': np.random.randint(0, 200),
            'hbp': np.random.randint(0, 10),
            'sf': np.random.randint(0, 10),
            'sh': np.random.randint(0, 10),
            'gdp': np.random.randint(0, 30),
            'sb': np.random.randint(0, 50),
            'cs': np.random.randint(0, 20),
            'bb%': round(np.random.uniform(0, 0.20), 3),
            'k%': round(np.random.uniform(0.1, 0.35), 3),
            'bb/k': round(np.random.uniform(0, 1.5), 3),
            'obp': round(np.random.uniform(0.200, 0.450), 3),
            'slg': round(np.random.uniform(0.200, 0.700), 3),
            'ops': round(np.random.uniform(0.400, 1.200), 3),
            'iso': round(np.random.uniform(0, 0.400), 3),
            'spd': round(np.random.uniform(0, 30), 1),
            'babip': round(np.random.uniform(0.200, 0.400), 3),
            'wsb': round(np.random.uniform(-5, 10), 2),
            'wrc': round(np.random.uniform(-50, 150), 2),
            'wraa': round(np.random.uniform(-5, 10), 2),
            'woba': round(np.random.uniform(0.200, 0.450), 3),
            'wrc+': round(np.random.uniform(50, 200), 1)
        }
        players.append(player)
    return pd.DataFrame(players)

fake_players = generate_fake_players(5)
print("\nüßç‚Äç‚ôÇÔ∏è Example Fake Players:")
print(fake_players.head())


üßç‚Äç‚ôÇÔ∏è Example Fake Players:
  nameascii  age   g   ab    avg   pa    h  1b  2b  3b  ...    slg    ops  \
0  Player_0   22  74  508  0.218  668   13  74  33   6  ...  0.581  0.498   
1  Player_1   19  57  310  0.167  300  141  15  10   4  ...  0.416  1.124   
2  Player_2   22  53  290  0.229  445   91  29  42   9  ...  0.405  0.840   
3  Player_3   24  35  563  0.154  225  110  13  11   7  ...  0.249  0.645   
4  Player_4   23  89  485  0.310  182   57  66  25   2  ...  0.339  0.478   

     iso   spd  babip   wsb     wrc  wraa   woba   wrc+  
0  0.248   9.8  0.286  4.34   50.87  3.10  0.228  136.0  
1  0.323  24.7  0.229 -2.60   27.62 -0.05  0.319   93.2  
2  0.127  19.5  0.364  4.39  -49.85  7.54  0.405  196.2  
3  0.120   7.8  0.397  6.44  136.01  7.40  0.419  167.9  
4  0.012   0.8  0.323  5.46   26.90  2.73  0.251   70.3  

[5 rows x 35 columns]


In [32]:
X_fake = fake_players[features]

# Step 1: predict drafted or not
fake_players['Predicted_Drafted'] = draft_model.predict(X_fake)

# Step 2: predict round and pick only if drafted
fake_players['Predicted_Round'] = np.nan
fake_players['Predicted_Pick'] = np.nan

drafted_mask = fake_players['Predicted_Drafted'] == 1
fake_players.loc[drafted_mask, 'Predicted_Round'] = np.round(
    round_model.predict(X_fake[drafted_mask])
).astype(int)
fake_players.loc[drafted_mask, 'Predicted_Pick'] = np.round(
    pick_model.predict(X_fake[drafted_mask])
).astype(int)

print("\nüéØ Draft Predictions for Fake Players:")
print(fake_players[['nameascii', 'Predicted_Drafted', 'Predicted_Round', 'Predicted_Pick']])


üéØ Draft Predictions for Fake Players:
  nameascii  Predicted_Drafted  Predicted_Round  Predicted_Pick
0  Player_0                  1             11.0           361.0
1  Player_1                  0              NaN             NaN
2  Player_2                  0              NaN             NaN
3  Player_3                  0              NaN             NaN
4  Player_4                  0              NaN             NaN
