# Preprocess for training data (X, y)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [2]:
# to_drop = ['id', 'home_team_season', 'away_team_season', 'home_team_abbr', 'away_team_abbr']
to_drop = ['id', 'home_team_season', 'away_team_season', 'home_pitcher', 'away_pitcher']

In [4]:
df = pd.read_csv("./dataset/train_data.csv")

df = df.drop(columns=['date']+to_drop)

In [5]:
X = df.drop('home_team_win', axis=1)  # Features
y = df['home_team_win']  # Label
X.shape, y.shape

((11067, 160), (11067,))

In [6]:
# 哪些是類別型欄位
categorical_cols = X.select_dtypes(exclude="number").columns
print("Categorical columns:")
print(categorical_cols.tolist())
# 哪些是數值型欄位
numeric_cols = X.select_dtypes(include="number").columns
print("Numeric columns:")
print(numeric_cols.tolist())

Categorical columns:
['home_team_abbr', 'away_team_abbr', 'is_night_game']
Numeric columns:
['home_team_rest', 'away_team_rest', 'home_pitcher_rest', 'away_pitcher_rest', 'season', 'home_batting_batting_avg_10RA', 'home_batting_onbase_perc_10RA', 'home_batting_onbase_plus_slugging_10RA', 'home_batting_leverage_index_avg_10RA', 'home_batting_RBI_10RA', 'away_batting_batting_avg_10RA', 'away_batting_onbase_perc_10RA', 'away_batting_onbase_plus_slugging_10RA', 'away_batting_leverage_index_avg_10RA', 'away_batting_RBI_10RA', 'home_pitching_earned_run_avg_10RA', 'home_pitching_SO_batters_faced_10RA', 'home_pitching_H_batters_faced_10RA', 'home_pitching_BB_batters_faced_10RA', 'away_pitching_earned_run_avg_10RA', 'away_pitching_SO_batters_faced_10RA', 'away_pitching_H_batters_faced_10RA', 'away_pitching_BB_batters_faced_10RA', 'home_pitcher_earned_run_avg_10RA', 'home_pitcher_SO_batters_faced_10RA', 'home_pitcher_H_batters_faced_10RA', 'home_pitcher_BB_batters_faced_10RA', 'away_pitcher_earn

In [7]:
# 處理類別型欄位缺失值
# 處理 是否晚場賽事：眾數
X['is_night_game'] = X['is_night_game'].fillna(X['is_night_game'].mode()[0])

# 處理 主、客投手：Unknown
# X['home_pitcher'] = X['home_pitcher'].fillna("Unknown")
# X['away_pitcher'] = X['away_pitcher'].fillna("Unknown")

# -------------------------------------------
# 處理數值型欄位缺失值
# 處理 休息天數 和 賽季：中位數
rest_season_cols = ['home_team_rest', 'away_team_rest', 'home_pitcher_rest', 
                   'away_pitcher_rest', 'season']
for col in rest_season_cols:
    X[col] = X[col].fillna(X[col].median())

# 處理 _10RA：平均數
ra_cols = [col for col in numeric_cols if col.endswith('_10RA')]
for col in ra_cols:
    X[col] = X[col].fillna(X[col].mean())

# 處理 統計數據：平均數
stat_cols = [col for col in numeric_cols if col.endswith(('_mean', '_std', '_skew'))]
for col in stat_cols:
    X[col] = X[col].fillna(X[col].mean())

  X['is_night_game'] = X['is_night_game'].fillna(X['is_night_game'].mode()[0])


In [8]:
X.shape, y.shape

((11067, 160), (11067,))

In [9]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# 添加主客隊勝率欄位
home_win_rate = y.groupby([X['home_team_abbr'], X['season']]).mean().reset_index()
home_win_rate = home_win_rate.rename(columns={'home_team_win': 'home_win_rate'})
X = X.merge(home_win_rate, on=['home_team_abbr', 'season'], how='left')
away_win_rate = (1 - y).groupby([X['away_team_abbr'], X['season']]).mean().reset_index()
away_win_rate = away_win_rate.rename(columns={'home_team_win': 'away_win_rate'})
X = X.merge(away_win_rate, on=['away_team_abbr', 'season'], how='left')

# 添加主客隊勝率差異欄位
X['win_rate_diff'] = X['home_win_rate'] - X['away_win_rate']
# X = X.drop(['home_team_abbr', 'away_team_abbr'], axis=1)
# print(
#     X[X['home_team_abbr'] == 'HXK'][
#         ['season', 'home_team_abbr', 'away_team_abbr', 'home_win_rate', 'away_win_rate']
#     ].head(10)
# )

# 處理類別型欄位 -> Encoding
# 處理隊伍名稱 - One-Hot Encoding
team_features = ['home_team_abbr', 'away_team_abbr']
# onehot = OneHotEncoder(sparse_output=False)
# team_encoded = onehot.fit_transform(X[team_features])
# team_encoded_df = pd.DataFrame(
#     team_encoded, 
#     columns=onehot.get_feature_names_out(team_features),
#     index=X.index
# )
# X = pd.concat([X, team_encoded_df], axis=1)
X = X.drop(team_features, axis=1)

# 處理投手名字 - Label Encoding
# pitcher_features = ['home_pitcher', 'away_pitcher']
# label_encoders = {}

# for col in pitcher_features:
#     le = LabelEncoder()
#     X[col] = le.fit_transform(X[col])
#     label_encoders[col] = le
#     joblib.dump(le, f'label_encoder_{col}.joblib')


In [10]:
# X = X.assign(
#     pitcher_wpa_def_diff=X['home_pitcher_wpa_def_mean'] - X['away_pitcher_wpa_def_mean'],
#     batting_batting_avg_10RA_diff=X['home_batting_batting_avg_10RA'] - X['away_batting_batting_avg_10RA'],
#     team_wins_mean_diff=X['home_team_wins_mean'] - X['away_team_wins_mean']
# )
def calculate_home_away_differences(df, drop_original=False):
    suffixes = ["_mean", "_std", "_skew", "_10RA"]
    relevant_columns = [col for col in df.columns if any(col.endswith(suffix) for suffix in suffixes)]
    
    new_columns = {}
    columns_to_drop = []
    
    for col in relevant_columns:
        if col.startswith("home_"):
            away_col = col.replace("home_", "away_")
            if away_col in df.columns:
                base_col_name = col.split("_", 1)[1]
                new_col_name = base_col_name + "_diff"
                new_columns[new_col_name] = df[col] - df[away_col]
                
                if drop_original:
                    columns_to_drop.extend([col, away_col])
    
    df = pd.concat([df, pd.DataFrame(new_columns, index=df.index)], axis=1)
    
    if drop_original:
        df = df.drop(columns=columns_to_drop)
    
    return df

X_copy = X.copy()
X = calculate_home_away_differences(X, drop_original=False)

# Preprocess for testing data (test_df)

In [11]:
test_df = pd.read_csv("./dataset/same_season_test_data.csv")
test_ids = test_df['id'].copy()
test_df = test_df.drop(columns=to_drop)

In [12]:
# 類別型欄位缺失值
test_df['is_night_game'] = test_df['is_night_game'].fillna(X_copy['is_night_game'].mode()[0])
# test_df['home_pitcher'] = test_df['home_pitcher'].fillna("Unknown")
# test_df['away_pitcher'] = test_df['away_pitcher'].fillna("Unknown")
# 數值型欄位缺失值
rest_season_cols = ['home_team_rest', 'away_team_rest', 'home_pitcher_rest', 
                    'away_pitcher_rest', 'season']
for col in rest_season_cols:
    test_df[col] = test_df[col].fillna(X_copy[col].median())

ra_cols = [col for col in numeric_cols if col.endswith('_10RA')]
for col in ra_cols:
    test_df[col] = test_df[col].fillna(X_copy[col].mean())

stat_cols = [col for col in numeric_cols if col.endswith(('_mean', '_std', '_skew'))]
for col in stat_cols:
    test_df[col] = test_df[col].fillna(X_copy[col].mean())

test_df = test_df.merge(home_win_rate, on=['home_team_abbr', 'season'], how='left')
test_df = test_df.merge(away_win_rate, on=['away_team_abbr', 'season'], how='left')
test_df['win_rate_diff'] = test_df['home_win_rate'] - test_df['away_win_rate']
# print(
#     test_df[test_df['home_team_abbr'] == 'HXK'][
#         ['season', 'home_team_abbr', 'away_team_abbr', 'home_win_rate', 'away_win_rate']
#     ].head(10)
# )
# 編碼類別型欄位
# One-Hot Encoding 隊伍名稱
team_features = ['home_team_abbr', 'away_team_abbr']
# team_encoded = onehot.transform(test_df[team_features])
# team_encoded_df = pd.DataFrame(
#     team_encoded, 
#     columns=onehot.get_feature_names_out(team_features),
#     index=test_df.index
# )
# test_df = pd.concat([test_df, team_encoded_df], axis=1)
test_df = test_df.drop(team_features, axis=1)

# Label Encoding 投手名字
# pitcher_features = ['home_pitcher', 'away_pitcher']
# for col in pitcher_features:
#     le = joblib.load(f'label_encoder_{col}.joblib')
    
#     def safe_transform(val, encoder):
#         if val in encoder.classes_:
#             return encoder.transform([val])[0]
#         else:
#             return -1
    
#     test_df[col] = test_df[col].apply(lambda x: safe_transform(x, le))


test_df = test_df.assign(
    pitcher_wpa_def_diff = test_df['home_pitcher_wpa_def_mean'] - test_df['away_pitcher_wpa_def_mean'],
    batting_batting_avg_10RA_diff = test_df['home_batting_batting_avg_10RA'] - test_df['away_batting_batting_avg_10RA'],
    team_wins_mean_diff = test_df['home_team_wins_mean'] - test_df['away_team_wins_mean']
)
test_df = calculate_home_away_differences(test_df, drop_original=False)
print(test_df.shape)

(6185, 240)


  test_df['is_night_game'] = test_df['is_night_game'].fillna(X_copy['is_night_game'].mode()[0])


In [13]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import joblib

In [None]:
# Data
label_encoder = LabelEncoder()
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
# categorical_columns = ['home_team_abbr', 'away_team_abbr', 'home_pitcher', 'away_pitcher', 'home_team_season', 'away_team_season']

# Model
base_models = [
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)),
    ('catboost', CatBoostClassifier(iterations=100, learning_rate=0.01, depth=11, verbose=0, random_state=42)),
    ('decision_tree', DecisionTreeClassifier(random_state=42)),
    ('gradient_boosting', GradientBoostingClassifier(random_state=42)),
    #('mlp', MLPClassifier(random_state=42, max_iter=1000)),
    #('naive_bayes', GaussianNB()),
    #('knn', KNeighborsClassifier()),
    #('svm', SVC(probability=True, random_state=42))
]

final_estimator = LogisticRegression(max_iter=1000, random_state=42)

stacking_model = StackingClassifier(estimators=base_models, final_estimator=final_estimator, cv=5)
stacking_model.fit(X_train, y_train)

y_val_pred = stacking_model.predict(X_val)

validation_accuracy = accuracy_score(y_val, y_val_pred)
print(validation_accuracy)

model_path = 'stacking_model.pkl'
joblib.dump(stacking_model, model_path)

# # Prediction
# test_data_path = 'dataset/same_season_test_data.csv'
# test_data = pd.read_csv(test_data_path)

# for col in categorical_columns:
#     if col in test_data.columns:
#         test_data[col] = label_encoder.fit_transform(test_data[col].astype(str))

# for col in test_data.columns:
#     if test_data[col].dtype in ['int64', 'float64']:
#         test_data[col] = test_data[col].fillna(test_data[col].mean())
#     else:
#         test_data[col] = test_data[col].fillna(test_data[col].mode()[0])

# X_test_final = test_data.drop(columns=['id'])

# y_test_pred = stacking_model.predict(X_test_final)

# # Submissios
# submission_template_path = 'dataset/same_season_sample_submission.csv'
# submission = pd.read_csv(submission_template_path)
# submission['home_team_win'] = ['TRUE' if pred else 'FALSE' for pred in y_test_pred.astype(bool)]

# submission_output_path = 'submission.csv'
# submission.to_csv(submission_output_path, index=False)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

