In [1]:
# Imports
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, mean_absolute_error, r2_score
import random

In [5]:
# Pitching dataset without team stats
df = pd.read_csv('PitchingDraftTable.csv', delimiter=',')

In [7]:
# list and drop columns that are less related to the target
cols_to_drop = ['name', 'playerid', 'mlbamid', 'team', 'Acronym', 'Full Team Name', 'Drafted By', 'Drafted From']
df = df.drop(columns=cols_to_drop)
print(df.columns.tolist())

# convert the target to numerical values
df['Drafted?'] = df['Drafted?'].astype(int)

['age', 'nameascii', 'year', 'w', 'l', 'era', 'g', 'gs', 'cg', 'sho', 'sv', 'ip', 'tbf', 'h', 'r', 'er', 'hr', 'bb', 'hbp', 'wp', 'bk', 'so', 'k/9', 'bb/9', 'k/bb', 'hr/9', 'k%', 'bb%', 'k-bb%', 'avg', 'whip', 'babip', 'lob%', 'fip', 'e-f', 'Round', 'Pick', 'Drafted?']


In [9]:
df.head()

Unnamed: 0,age,nameascii,year,w,l,era,g,gs,cg,sho,...,k-bb%,avg,whip,babip,lob%,fip,e-f,Round,Pick,Drafted?
0,21.0,Gordon Graceffo,2021,7,2,1.536585,11,11,2,1,...,0.224615,0.214984,0.963415,0.298643,0.714286,2.242553,-0.705967,5.0,151.0,1
1,23.0,Patrick Monteverde,2021,7,4,3.752896,16,16,0,0,...,0.222841,0.235821,1.158301,0.308036,0.674157,3.687751,0.065144,,,0
2,23.0,Alek Jacob,2021,8,1,2.521401,17,11,2,2,...,0.276471,0.191083,0.910506,0.286432,0.745721,2.541372,-0.019972,16.0,490.0,1
3,22.0,Matt Svanson,2021,4,4,2.303317,13,12,0,0,...,0.131833,0.218182,1.194313,0.278846,0.708155,4.365784,-2.062467,13.0,392.0,1
4,21.0,Andrew Hoffmann,2021,3,0,2.87234,11,11,0,0,...,0.189922,0.204167,1.021277,0.265896,0.700637,3.252616,-0.380275,12.0,367.0,1


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5642 entries, 0 to 5641
Data columns (total 38 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        3138 non-null   float64
 1   nameascii  5642 non-null   object 
 2   year       5642 non-null   int64  
 3   w          5642 non-null   int64  
 4   l          5642 non-null   int64  
 5   era        5642 non-null   float64
 6   g          5642 non-null   int64  
 7   gs         5642 non-null   int64  
 8   cg         5642 non-null   int64  
 9   sho        5642 non-null   int64  
 10  sv         5642 non-null   int64  
 11  ip         5642 non-null   float64
 12  tbf        5642 non-null   int64  
 13  h          5642 non-null   int64  
 14  r          5642 non-null   int64  
 15  er         5642 non-null   int64  
 16  hr         5642 non-null   int64  
 17  bb         5642 non-null   int64  
 18  hbp        5642 non-null   int64  
 19  wp         5642 non-null   int64  
 20  bk      

In [13]:
features = list(df[['age', 'w', 'l', 'era', 'g', 'gs', 'cg', 'sho', 'sv', 'ip', 'tbf',
                    'h', 'r', 'er', 'hr', 'bb', 'hbp', 'wp', 'bk', 'so', 'k/9', 'bb/9', 'k/bb',
                    'hr/9', 'k%', 'bb%', 'k-bb%', 'avg', 'whip', 'babip', 'lob%', 'fip', 'e-f']].columns)

target_drafted = 'Drafted?'
target_round = 'Round'
target_pick = 'Pick'

df[target_drafted] = df[target_drafted].astype(int)

In [15]:
X = df[features]
y_drafted = df[target_drafted]
y_pick = df[target_pick]

X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X, y_drafted, test_size=0.2, random_state=42
)

In [21]:
draft_model = xgb.XGBClassifier(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

draft_model.fit(X_train_c, y_train_c)

y_pred_proba = draft_model.predict_proba(X_test_c)[:, 1]
y_pred_c = (y_pred_proba > 0.4).astype(int)  # default is 0.5
print("üéØ Drafted? Classifier Results:")
print(f"Accuracy: {accuracy_score(y_test_c, y_pred_c):.3f}")
print(classification_report(y_test_c, y_pred_c))

üéØ Drafted? Classifier Results:
Accuracy: 0.902
              precision    recall  f1-score   support

           0       0.94      0.95      0.94       992
           1       0.60      0.55      0.57       137

    accuracy                           0.90      1129
   macro avg       0.77      0.75      0.76      1129
weighted avg       0.90      0.90      0.90      1129



In [23]:
drafted_df = df[df[target_drafted] == 1]

X_drafted = drafted_df[features]
y_pick = drafted_df[target_pick]

X_train, X_test, y_train, y_test = train_test_split(
    X_drafted, y_pick, test_size=0.2, random_state=42
)

pick_model = xgb.XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
pick_model.fit(X_train, y_train)

pick_preds = pick_model.predict(X_test)
print("\nüìä Pick Prediction:")
print(f"MAE: {mean_absolute_error(y_test, pick_preds):.3f}")
print(f"R¬≤: {r2_score(y_test, pick_preds):.3f}")


üìä Pick Prediction:
MAE: 102.572
R¬≤: 0.386


In [25]:
def generate_fake_players(n=10):
    players = []
    for i in range(n):
        player = {
            'nameascii': f"Player_{i}",
            'age': np.random.randint(18, 25),
            'w': np.random.randint(0, 15),
            'l': np.random.randint(0, 15),
            'era': round(np.random.uniform(1.5, 6.0), 2),
            'g': np.random.randint(5, 40),
            'gs': np.random.randint(0, 30),
            'cg': np.random.randint(0, 5),
            'sho': np.random.randint(0, 3),
            'sv': np.random.randint(0, 10),
            'ip': round(np.random.uniform(10, 120), 1),
            'tbf': np.random.randint(50, 500),
            'h': np.random.randint(10, 150),
            'r': np.random.randint(0, 80),
            'er': np.random.randint(0, 80),
            'hr': np.random.randint(0, 20),
            'bb': np.random.randint(0, 60),
            'hbp': np.random.randint(0, 10),
            'wp': np.random.randint(0, 10),
            'bk': np.random.randint(0, 2),
            'so': np.random.randint(10, 150),
            'k/9': round(np.random.uniform(5, 15), 2),
            'bb/9': round(np.random.uniform(1, 6), 2),
            'k/bb': round(np.random.uniform(0.5, 5.0), 2),
            'hr/9': round(np.random.uniform(0.3, 2.5), 2),
            'k%': round(np.random.uniform(10, 35), 2),
            'bb%': round(np.random.uniform(3, 12), 2),
            'k-bb%': round(np.random.uniform(5, 25), 2),
            'avg': round(np.random.uniform(0.150, 0.350), 3),
            'whip': round(np.random.uniform(0.9, 1.6), 2),
            'babip': round(np.random.uniform(0.250, 0.350), 3),
            'lob%': round(np.random.uniform(60, 85), 2),
            'fip': round(np.random.uniform(2.0, 6.0), 2),
            'e-f': round(np.random.uniform(-1.0, 1.0), 2)
        }
        players.append(player)
    return pd.DataFrame(players)

fake_players = generate_fake_players(5)
print("\nüßç‚Äç‚ôÇÔ∏è Example Fake Players:")
print(fake_players.head())


üßç‚Äç‚ôÇÔ∏è Example Fake Players:
  nameascii  age   w   l   era   g  gs  cg  sho  sv  ...  hr/9     k%   bb%  \
0  Player_0   18  10  13  1.81  35  11   4    0   0  ...  2.11  28.19  6.94   
1  Player_1   20   5   7  4.76  39  19   3    0   1  ...  2.39  32.17  5.37   
2  Player_2   21   5   3  3.23  36  27   1    1   9  ...  1.96  21.38  7.69   
3  Player_3   20   3   9  5.39  12   5   3    1   1  ...  0.88  10.24  6.35   
4  Player_4   22   3   4  2.64  34  28   4    2   1  ...  1.60  14.49  4.38   

   k-bb%    avg  whip  babip   lob%   fip   e-f  
0  20.74  0.237  1.59  0.330  80.91  5.37  0.44  
1  16.33  0.316  1.24  0.314  65.88  2.31  0.38  
2  10.54  0.199  1.30  0.253  79.60  3.66  0.19  
3   7.61  0.214  1.39  0.328  80.94  2.32  0.51  
4  13.38  0.330  1.05  0.288  81.73  3.15  1.00  

[5 rows x 34 columns]


In [30]:
X_fake = fake_players[features]

# Step 1: predict drafted or not
fake_players['Predicted_Drafted'] = draft_model.predict(X_fake)

# Step 2: predict round and pick only if drafted
fake_players['Predicted_Round'] = np.nan
fake_players['Predicted_Pick'] = np.nan

drafted_mask = fake_players['Predicted_Drafted'] == 1
fake_players.loc[drafted_mask, 'Predicted_Pick'] = np.round(
    pick_model.predict(X_fake[drafted_mask])
).astype(int)

print("\nüéØ Draft Predictions for Fake Players:")
print(fake_players[['nameascii', 'Predicted_Drafted', 'Predicted_Pick']])


üéØ Draft Predictions for Fake Players:
  nameascii  Predicted_Drafted  Predicted_Pick
0  Player_0                  0             NaN
1  Player_1                  0             NaN
2  Player_2                  1           166.0
3  Player_3                  0             NaN
4  Player_4                  1           270.0
