# UFC 경기 승패 예측 (모든 특성 사용 최종본)

## 1. 라이브러리 임포트 및 데이터 로드

In [173]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

warnings.filterwarnings('ignore')

In [174]:
# Load the raw fight table; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer='UFC.csv')

## 2. 전처리 및 특성 공학

In [175]:
# [Preprocessing] Drop rows that lack the label or a core fighter attribute.
required_cols = ['winner', 'r_dob', 'b_dob', 'r_stance', 'b_stance', 'r_height', 'b_height']
df.dropna(subset=required_cols, inplace=True)

# A missing reach is imputed with the same fighter's height. The result is
# assigned back to the frame: `df[col].fillna(..., inplace=True)` operates on a
# temporary Series and no longer updates `df` once pandas Copy-on-Write is
# active, which would let the zero-fill below set missing reach to 0 instead.
for corner in ('r', 'b'):
    df[f'{corner}_reach'] = df[f'{corner}_reach'].fillna(df[f'{corner}_height'])

# Every remaining numeric gap becomes 0.
num_cols = df.select_dtypes(include=np.number).columns
df[num_cols] = df[num_cols].fillna(0)

# [Feature engineering] Target: 1 when the red-corner fighter is the winner.
df['winner_is_red'] = (df['winner'] == df['r_name']).astype(int)

# Ages in years. When the frame has a 'date' column it is used as the reference
# date so that age is measured at fight time rather than at run time.
# NOTE(review): assumes 'date' holds the event date - confirm against the CSV.
# Rows whose date cannot be parsed, or a frame without that column, fall back
# to the run date (the previous behaviour). The parsed dates are kept in a
# local Series so no datetime column is added to `df`.
run_date = pd.to_datetime('today')
if 'date' in df.columns:
    reference_date = pd.to_datetime(df['date'], errors='coerce').fillna(run_date)
else:
    reference_date = run_date
for corner in ('r', 'b'):
    birth_date = pd.to_datetime(df[f'{corner}_dob'])
    df[f'{corner}_age'] = (reference_date - birth_date).dt.days / 365.25
df.drop(['r_dob', 'b_dob'], axis=1, inplace=True)

# Difference features: red value minus blue value.
# Maps the new column name to the shared suffix of the r_/b_ source columns.
diff_sources = {
    'age_diff': 'age',
    'height_diff': 'height',
    'reach_diff': 'reach',
    'wins_diff': 'wins',
    'losses_diff': 'losses',
    'splm_diff': 'splm',
    'str_acc_diff': 'str_acc',
    'sapm_diff': 'sapm',
    'str_def_diff': 'str_def',
    'td_avg_diff': 'td_avg',
    'td_acc_diff': 'td_avg_acc',
    'td_def_diff': 'td_def',
    'sub_avg_diff': 'sub_avg',
}
for feature, stat in diff_sources.items():
    df[feature] = df[f'r_{stat}'] - df[f'b_{stat}']

# Small constant added to denominators to avoid division by zero.
EPS = 1e-6

# Ratio features: red's share of the red + blue total.
ratio_sources = {
    'sig_str_ratio': 'splm',
    'td_ratio': 'td_avg',
    'str_acc_ratio': 'str_acc',
    'td_acc_ratio': 'td_avg_acc',
}
for feature, stat in ratio_sources.items():
    red_stat, blue_stat = df[f'r_{stat}'], df[f'b_{stat}']
    df[feature] = red_stat / (red_stat + blue_stat + EPS)

# Win ratio per corner: wins / (wins + losses).
for corner in ('r', 'b'):
    wins, losses = df[f'{corner}_wins'], df[f'{corner}_losses']
    df[f'{corner}_win_ratio'] = wins / (wins + losses + EPS)
df['win_ratio_diff'] = df['r_win_ratio'] - df['b_win_ratio']

# ---------- Additional derived features ----------
# BMI = weight / (height / 100) ** 2.
# NOTE(review): the /100 presumes height in cm (and weight in kg) - confirm units.
for corner in ('r', 'b'):
    df[f'{corner}_bmi'] = df[f'{corner}_weight'] / ((df[f'{corner}_height'] / 100) ** 2 + EPS)
df['bmi_diff'] = df['r_bmi'] - df['b_bmi']

# Reach-to-height ratio.
for corner in ('r', 'b'):
    df[f'{corner}_reach_ht_ratio'] = df[f'{corner}_reach'] / (df[f'{corner}_height'] + EPS)
df['reach_ht_ratio_diff'] = df['r_reach_ht_ratio'] - df['b_reach_ht_ratio']

# Total number of fights (wins + losses).
for corner in ('r', 'b'):
    df[f'{corner}_total_fights'] = df[f'{corner}_wins'] + df[f'{corner}_losses']
df['total_fights_diff'] = df['r_total_fights'] - df['b_total_fights']

# ---------- Additional derived features v3 ----------
# 1) Offense score: striking efficiency + grappling efficiency. Precomputed
#    *_str_eff / *_grap_eff columns are used only when BOTH exist (the earlier
#    check looked at *_str_eff alone and raised KeyError if *_grap_eff was
#    absent); otherwise the score is derived from rate * accuracy.
for corner in ('r', 'b'):
    str_eff_col, grap_eff_col = f'{corner}_str_eff', f'{corner}_grap_eff'
    if str_eff_col in df.columns and grap_eff_col in df.columns:
        df[f'{corner}_offense_score'] = df[str_eff_col] + df[grap_eff_col]
    else:
        df[f'{corner}_offense_score'] = (
            (df[f'{corner}_splm'] * df[f'{corner}_str_acc'])
            + (df[f'{corner}_td_avg'] * df[f'{corner}_td_avg_acc'])
        )
df['offense_score_diff'] = df['r_offense_score'] - df['b_offense_score']

# 2) Defense score: mean of striking defense and takedown defense.
for corner in ('r', 'b'):
    df[f'{corner}_defense_score'] = (df[f'{corner}_str_def'] + df[f'{corner}_td_def']) / 2
df['defense_score_diff'] = df['r_defense_score'] - df['b_defense_score']

# 3) Net advantage = offense diff + defense diff.
df['net_advantage'] = df['offense_score_diff'] + df['defense_score_diff']

# 4) Interaction feature: red accuracy x blue defense minus blue accuracy x red defense.
df['str_vs_def_diff'] = (df['r_str_acc'] * df['b_str_def']) - (df['b_str_acc'] * df['r_str_def'])

# 5) Difference of the offense/defense score ratios.
df['off_def_ratio_diff'] = (
    (df['r_offense_score'] / (df['r_defense_score'] + EPS))
    - (df['b_offense_score'] / (df['b_defense_score'] + EPS))
)

# Stance combination (categorical), e.g. "<red stance>_<blue stance>".
df['stance_comb'] = df['r_stance'].astype(str) + '_' + df['b_stance'].astype(str)


## 3. 모델링 준비 (모든 특성 사용)

In [177]:
# Feature lists. Numeric features are chosen by an explicit numeric/bool dtype
# selection instead of "anything that is not object": string-dtype or datetime
# columns are not `object` either and would otherwise be handed to
# StandardScaler. Column order follows `df`, as before.
# NOTE(review): this keeps every numeric column in the frame; if the CSV holds
# statistics recorded during the bout being predicted, they leak the outcome -
# confirm the column list before trusting the scores.
categorical_features = ['r_stance', 'b_stance', 'stance_comb']
non_feature_cols = {'winner_is_red', 'winner', *categorical_features}
numerical_features = [
    col
    for col in df.select_dtypes(include=['number', 'bool']).columns
    if col not in non_feature_cols
]

# Final dataset and a stratified 80/20 train/test split.
X = df[numerical_features + categorical_features]
y = df['winner_is_red']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f'사용된 총 특성 개수: {X_train.shape[1]}')

사용된 총 특성 개수: 75


## 4. 스태킹 모델 하이퍼파라미터 튜닝

In [178]:
# Preprocessor: standardise the numeric columns, one-hot encode the categorical ones.
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = (
    'cat',
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
    categorical_features,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Class-imbalance weight: count of label 0 divided by count of label 1 in the
# training labels, passed to XGBoost below.
class_counts = y_train.value_counts()
scale_pos_weight = class_counts[0] / class_counts[1]

# Base learners of the stacking ensemble.
lr_base = LogisticRegression(random_state=42, class_weight='balanced')
rf_base = RandomForestClassifier(random_state=42, class_weight='balanced')
lgb_base = LGBMClassifier(random_state=42, class_weight='balanced', verbose=-1)
xgb_base = xgb.XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight,
)
estimators = [('lr', lr_base), ('rf', rf_base), ('lgb', lgb_base), ('xgb', xgb_base)]

# Full pipeline: preprocessing followed by the stacking classifier, whose
# meta-learner is a logistic regression fitted on 5-fold base predictions.
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=5,
)
stacking_pipeline = Pipeline(
    steps=[('preprocessor', preprocessor), ('classifier', stacking_clf)]
)

# Hyper-parameter candidates, grouped by the component they belong to and then
# flattened into the "classifier__<component>__<param>" names the pipeline expects.
component_grids = {
    'lr': {'C': [0.01, 0.1, 1, 10]},
    'rf': {'n_estimators': [100, 200, 300], 'max_depth': [5, 10, 20, None]},
    'lgb': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1],
        'num_leaves': [20, 31, 40],
    },
    'xgb': {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.05, 0.1],
    },
    'final_estimator': {'C': [0.1, 1, 10]},
}
param_distributions = {
    f'classifier__{component}__{param}': candidates
    for component, grid in component_grids.items()
    for param, candidates in grid.items()
}

# Randomised search: 50 sampled configurations, 5-fold CV, scored by ROC-AUC.
search = RandomizedSearchCV(
    stacking_pipeline,
    param_distributions,
    n_iter=50,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=42,
    verbose=2,
)
print("--- Stacking Model Hyperparameter Tuning ---")
search.fit(X_train, y_train)

print(f'Best ROC-AUC Score: {search.best_score_:.4f}')
print("Best Parameters:")
print(search.best_params_)

--- Stacking Model Hyperparameter Tuning ---
Fitting 5 folds for each of 50 candidates, totalling 250 fits


KeyboardInterrupt: 

## 5. 최종 모델 평가

In [None]:
# Evaluate the best pipeline found by the search on the held-out test split.
best_stacking_model = search.best_estimator_

# Predicted probability of label 1 (winner_is_red == 1), thresholded at 0.5
# to obtain hard class predictions.
probs_final = best_stacking_model.predict_proba(X_test)[:, 1]
y_pred_final = (probs_final >= 0.5).astype(int)

print('--- Final Tuned Stacking Ensemble Model ---')
print(f'Accuracy: {accuracy_score(y_test, y_pred_final):.4f}')
print(f'ROC-AUC: {roc_auc_score(y_test, probs_final):.4f}')
print(classification_report(y_test, y_pred_final))

# Confusion-matrix heat map. `confusion_matrix` is imported from
# sklearn.metrics with the other metrics at the top of the file (this cell
# previously called it without any import, raising NameError). The label order
# is pinned so rows/columns are always [0, 1].
cm = confusion_matrix(y_test, y_pred_final, labels=[0, 1])
sns.heatmap(cm, annot=True, fmt='d', cmap='YlGnBu')
plt.title('Final Stacking Model Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()