In [34]:
##Pitching

In [28]:
import pandas as pd
import json
import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# === Track runtime ===
start_time = time.time()

# === Load standardized mappings ===
team_map = pd.read_csv("standardized/team_name_mapping.csv")
missing_map = pd.read_csv("standardized/missing_schools.csv")
unique_teams = pd.read_csv("standardized/unique_teams.csv")
missing_map = missing_map.dropna(subset=["Old name"])

# === Load datasets ===
pitching = pd.read_csv("ncaa_pitchingQualifiedCSV/pitching_combined_all.csv")
teams = pd.read_csv("ncaabb_dataset.csv")

with open("all_drafts.json", "r") as f:
    draft_data = json.load(f)
draft_df = pd.DataFrame(draft_data)

# === Standardize Team Names ===
teams = teams.merge(team_map, left_on="team", right_on="team_old", how="left")
teams["team"] = teams["team_new"].combine_first(teams["team"])
teams.drop(columns=["team_old", "team_new"], inplace=True, errors="ignore")

draft_df = draft_df.merge(team_map, left_on="Drafted From", right_on="team_old", how="left")
draft_df["Drafted From"] = draft_df["team_new"].combine_first(draft_df["Drafted From"])
draft_df.drop(columns=["team_old", "team_new"], inplace=True, errors="ignore")

draft_df = draft_df.merge(missing_map, left_on="Drafted From", right_on="Old name", how="left")
draft_df["Drafted From"] = draft_df["New name"].combine_first(draft_df["Drafted From"])
draft_df.drop(columns=["Old name", "New name"], inplace=True, errors="ignore")

pitching = pitching.merge(unique_teams, left_on="team", right_on="Acronym", how="left")
pitching["team"] = pitching["Full Name"].combine_first(pitching["team"])
pitching = pitching.drop(columns=["Acronym", "Full Name"], errors="ignore")

# === Filter Data for Years 2021–2025 ===
pitching = pitching[pitching["year"].between(2021, 2025)]
draft_df = draft_df[draft_df["Year"].between(2021, 2025)]
teams = teams[teams["year"].between(2021, 2025)] if "year" in teams.columns else teams

# === Process Draft Data ===
draft_df["Drafted"] = 1
draft_df["Round"] = draft_df["Round"].astype(int)
draft_df["Pick"] = draft_df["Pick"].astype(int)

# === Merge draft data into pitching ===
pitching = pitching.merge(
    draft_df[["Player Name", "Year", "Drafted", "Round"]],
    left_on=["name", "year"],
    right_on=["Player Name", "Year"],
    how="left"
)
pitching["Drafted"] = pitching["Drafted"].fillna(0)
pitching["Round"] = pitching["Round"].fillna(0).astype(int)

# === Merge standardized team info ===
pitching = pitching.merge(teams, on="team", how="left")

# === Filter team stats (optional: only teams >20 players) ===
team_counts = pitching["team"].value_counts()
valid_teams = team_counts[team_counts > 20].index
pitching = pitching[pitching["team"].isin(valid_teams)]

# === Features and Drafted Classifier ===
# Adjust features to pitcher-specific stats
features = ["era", "bb", "k/9", "HR", "WPCT", "PE", "WHIP"]
X = pitching[features].fillna(0)
y = pitching["Drafted"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
draft_model = RandomForestClassifier(n_estimators=200, random_state=42)
draft_model.fit(X_train, y_train)

# === Train Round Classifier (only drafted players) ===
drafted_pitching = pitching[pitching["Drafted"] == 1].copy()
X_round = drafted_pitching[features].fillna(0)
y_round = drafted_pitching["Round"].astype(int)

Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_round, y_round, test_size=0.2, random_state=42)
round_model = RandomForestClassifier(n_estimators=200, random_state=42)
round_model.fit(Xr_train, yr_train)

# === Ensure 'year' column exists ===
year_cols = [c for c in pitching.columns if "year" in c.lower()]
pitching["year"] = pitching[year_cols[0]] if len(year_cols) == 1 else pitching[year_cols[0]].combine_first(pitching[year_cols[1]])

# === Prediction Function ===
def predict_pitcher(player_name, year, team_name):
    start_time = time.time()

    row = pitching[(pitching["name"] == player_name) &
                   (pitching["year"] == year) &
                   (pitching["team"] == team_name)]
    if row.empty:
        return "Player not found."

    X_row = row[features].fillna(0)
    drafted_prob = draft_model.predict_proba(X_row)[0][1]

    if drafted_prob < 0.5:
        runtime = time.time() - start_time
        return f"Prediction: Not Drafted (prob={drafted_prob:.2f}) | Runtime: {runtime:.2f}s"

    predicted_round = int(round_model.predict(X_row)[0])

    runtime = time.time() - start_time
    return (f"For {player_name} | Prediction: Drafted (prob={drafted_prob:.2f}) | "
            f"Round {predicted_round} | Runtime: {runtime:.2f}s")

# === End Runtime ===
end_time = time.time()
print(f"Script completed in {end_time - start_time:.2f} seconds.\n")

# === Example Predictions ===
print(predict_pitcher("Jordan Gottesman", 2025, "Northeastern University"))

Script completed in 7.72 seconds.

For Jordan Gottesman | Prediction: Drafted (prob=0.79) | Round 6 | Runtime: 0.02s


  pitching["year"] = pitching[year_cols[0]] if len(year_cols) == 1 else pitching[year_cols[0]].combine_first(pitching[year_cols[1]])


In [32]:
## Batting

In [50]:
import pandas as pd
import json
import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# === Track runtime ===
start_time = time.time()

# === Load standardized mappings ===
team_map = pd.read_csv("standardized/team_name_mapping.csv")
missing_map = pd.read_csv("standardized/missing_schools.csv")
unique_teams = pd.read_csv("standardized/unique_teams.csv")
missing_map = missing_map.dropna(subset=["Old name"])

# === Load datasets ===
batting = pd.read_csv("ncaa_battingQualifiedCSV/batting_combined_all.csv")
teams = pd.read_csv("ncaabb_dataset.csv")

with open("all_drafts.json", "r") as f:
    draft_data = json.load(f)
draft_df = pd.DataFrame(draft_data)

# === Standardize Team Names ===
teams = teams.merge(team_map, left_on="team", right_on="team_old", how="left")
teams["team"] = teams["team_new"].combine_first(teams["team"])
teams.drop(columns=["team_old", "team_new"], inplace=True, errors="ignore")

draft_df = draft_df.merge(team_map, left_on="Drafted From", right_on="team_old", how="left")
draft_df["Drafted From"] = draft_df["team_new"].combine_first(draft_df["Drafted From"])
draft_df.drop(columns=["team_old", "team_new"], inplace=True, errors="ignore")

draft_df = draft_df.merge(missing_map, left_on="Drafted From", right_on="Old name", how="left")
draft_df["Drafted From"] = draft_df["New name"].combine_first(draft_df["Drafted From"])
draft_df.drop(columns=["Old name", "New name"], inplace=True, errors="ignore")

batting = batting.merge(unique_teams, left_on="team", right_on="Acronym", how="left")
batting["team"] = batting["Full Name"].combine_first(batting["team"])
batting = batting.drop(columns=["Acronym", "Full Name"], errors="ignore")

# === Filter Data for Years 2021–2025 ===
batting = batting[batting["year"].between(2021, 2025)]
draft_df = draft_df[draft_df["Year"].between(2021, 2025)]
teams = teams[teams["year"].between(2021, 2025)] if "year" in teams.columns else teams

# === Process Draft Data ===
draft_df["Drafted"] = 1
draft_df["Round"] = draft_df["Round"].astype(int)
draft_df["Pick"] = draft_df["Pick"].astype(int)

# === Merge draft data into batting ===
batting = batting.merge(
    draft_df[["Player Name", "Year", "Drafted", "Round", "Pick"]],
    left_on=["name", "year"],
    right_on=["Player Name", "Year"],
    how="left"
)
batting["Drafted"] = batting["Drafted"].fillna(0)
batting["Round"] = batting["Round"].fillna(0).astype(int)
batting["Pick"] = batting["Pick"].fillna(0).astype(int)

# === Merge standardized team info ===
batting = batting.merge(teams, on="team", how="left")

# === Filter team stats (optional: only teams > 20 players) ===
team_counts = batting["team"].value_counts()
valid_teams = team_counts[team_counts > 20].index
batting = batting[batting["team"].isin(valid_teams)]

# === Features and Drafted Classifier ===
# You can adjust these features to batter-specific stats
features = ["SLG", "avg", "HR", "BB (Batting)", "RPG", "DPPG", "OBP"]
X = batting[features].fillna(0)
y = batting["Drafted"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
draft_model = RandomForestClassifier(n_estimators=200, random_state=42)
draft_model.fit(X_train, y_train)

# === Train Round Classifier (only drafted players) ===
drafted_batting = batting[batting["Drafted"] == 1].copy()
X_round = drafted_batting[features].fillna(0)
y_round = drafted_batting["Round"].astype(int)

Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_round, y_round, test_size=0.2, random_state=42)
round_model = RandomForestClassifier(n_estimators=200, random_state=42)
round_model.fit(Xr_train, yr_train)

# === Ensure 'year' column exists ===
year_cols = [c for c in batting.columns if "year" in c.lower()]
batting["year"] = batting[year_cols[0]] if len(year_cols) == 1 else batting[year_cols[0]].combine_first(batting[year_cols[1]])

# === Prediction Function ===
def predict_batter(player_name, year, team_name):
    start_time = time.time()

    row = batting[(batting["name"] == player_name) &
                  (batting["year"] == year) &
                  (batting["team"] == team_name)]
    if row.empty:
        return "Player not found."

    X_row = row[features].fillna(0)
    drafted_prob = draft_model.predict_proba(X_row)[0][1]

    if drafted_prob < 0.5:
        runtime = time.time() - start_time
        return f"Prediction: Not Drafted (prob={drafted_prob:.2f}) | Runtime: {runtime:.2f}s"

    predicted_round = int(round_model.predict(X_row)[0])
    
    # Compute pick within round using historical draft_df
    year_rounds = draft_df[draft_df["Year"] == year].groupby("Round")["Pick"].max().reset_index()
    year_rounds = year_rounds.sort_values("Round").reset_index(drop=True)
    cumulative = year_rounds["Pick"].cumsum().shift(fill_value=0)
    round_info = year_rounds[year_rounds["Round"] == predicted_round]
    if not round_info.empty:
        picks_in_round = round_info["Pick"].iloc[0]
        prev_cumulative = cumulative.loc[round_info.index[0]]
        pick_in_round = min(picks_in_round, prev_cumulative + 1)  # fallback if needed
    else:
        picks_in_round = 30  # fallback
        pick_in_round = 1

    # === Return the prediction including pick ===
    runtime = time.time() - start_time
    return (f"For {player_name} | Prediction: Drafted (prob={drafted_prob:.2f}) | "
        f"Round {predicted_round}, Pick {pick_in_round} | Runtime: {runtime:.2f}s")

# === End Runtime ===
end_time = time.time()
print(f"Script completed in {end_time - start_time:.2f} seconds.\n")

# === Example Predictions ===
print(predict_batter("Aiva Arquette", 2025, "Oregon State University"))
print(predict_batter("Mason Neville", 2025, "University of Oregon"))
print(predict_batter("Mitch Voit", 2025, "University of Michigan"))

Script completed in 11.22 seconds.

For Aiva Arquette | Prediction: Drafted (prob=0.89) | Round 1, Pick 1 | Runtime: 0.02s
For Mason Neville | Prediction: Drafted (prob=0.71) | Round 4, Pick 143 | Runtime: 0.02s
For Mitch Voit | Prediction: Drafted (prob=0.57) | Round 1, Pick 1 | Runtime: 0.02s


  batting["year"] = batting[year_cols[0]] if len(year_cols) == 1 else batting[year_cols[0]].combine_first(batting[year_cols[1]])
