#### 1. 데이터 불러오기

In [1]:
import pandas as pd

# 데이터 로드
df = pd.read_csv("enhanced_dataset.csv")

# 유효한 result만 사용
df = df.dropna(subset=['result'])

# 이진 타겟: 홈팀 승리 여부
df['is_home_win'] = (df['result'] == 2).astype(int)

# 타겟 및 제거할 컬럼
drop_cols = ['result', 'MatchDate', 'HomeTeam', 'AwayTeam', 'is_home_win']
X = df.drop(columns=drop_cols)
y = df['is_home_win']

In [2]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [3]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Optuna 최적 파라미터
best_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'n_estimators': 500,
    'random_state': 42,
    'learning_rate': 0.012587582340310363,
    'max_depth': 4,
    'num_leaves': 172,
    'min_child_samples': 40,
    'subsample': 0.9700719452188175,
    'colsample_bytree': 0.8490552601335883,
    'reg_alpha': 1.952069295331694,
    'reg_lambda': 0.66681019000377
}

# 모델 학습
model = lgb.LGBMClassifier(**best_params)
model.fit(X_train, y_train)


In [None]:
y_pred = model.predict(X_test)

print("🎯 Accuracy:", accuracy_score(y_test, y_pred))
print("\n📋 Classification Report:\n", classification_report(y_test, y_pred, digits=3))
print("\n🧱 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

🎯 Accuracy: 0.6327774283793455

📋 Classification Report:
               precision    recall  f1-score   support

           0      0.639     0.776     0.701     25536
           1      0.621     0.454     0.525     20575

    accuracy                          0.633     46111
   macro avg      0.630     0.615     0.613     46111
weighted avg      0.631     0.633     0.622     46111


🧱 Confusion Matrix:
 [[19827  5709]
 [11224  9351]]


### 최근 3시즌 데이터 기반 LightGBM 이진 분류

In [8]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 1️⃣ 데이터 로드
df = pd.read_csv("enhanced_dataset.csv")

# 2️⃣ season 파생 및 필터링 (2021~2024 시즌만)
df['MatchDate'] = pd.to_datetime(df['MatchDate'], errors='coerce')
df['season'] = df['MatchDate'].dt.year
df = df[df['season'] >= 2021]  # 최근 3시즌

# 3️⃣ 타겟 처리 (홈팀 승리 여부 → 1, 그 외 0)
df = df.dropna(subset=['result'])
df['is_home_win'] = (df['result'] == 2).astype(int)

# 4️⃣ X, y 정의
drop_cols = [
    'result', 'MatchDate', 'HomeTeam', 'AwayTeam', 'is_home_win',  # 타겟 및 식별정보
]
X = df.drop(columns=drop_cols)
y = df['is_home_win']

# 5️⃣ 학습/검증 분할
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 6️⃣ 최적 하이퍼파라미터로 LightGBM 모델 학습
best_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'n_estimators': 500,
    'random_state': 42,
    'learning_rate': 0.012587582340310363,
    'max_depth': 4,
    'num_leaves': 172,
    'min_child_samples': 40,
    'subsample': 0.9700719452188175,
    'colsample_bytree': 0.8490552601335883,
    'reg_alpha': 1.952069295331694,
    'reg_lambda': 0.66681019000377
}

model = lgb.LGBMClassifier(**best_params)
model.fit(X_train, y_train)

# 7️⃣ 평가
y_pred = model.predict(X_test)

print("🎯 Accuracy:", accuracy_score(y_test, y_pred))
print("\n📋 Classification Report:\n", classification_report(y_test, y_pred, digits=3))
print("\n🧱 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


🎯 Accuracy: 0.6472378054578302

📋 Classification Report:
               precision    recall  f1-score   support

           0      0.652     0.809     0.722      5954
           1      0.637     0.436     0.517      4563

    accuracy                          0.647     10517
   macro avg      0.644     0.623     0.620     10517
weighted avg      0.645     0.647     0.633     10517


🧱 Confusion Matrix:
 [[4819 1135]
 [2575 1988]]


### 데이터 정리

In [17]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 1. 데이터 불러오기 및 정제
def clean_dataset(path: str, season_cutoff: int = 2015):
    df = pd.read_csv(path)
    df['season'] = pd.to_datetime(df['MatchDate']).dt.year
    df = df[df['season'] >= season_cutoff]

    important_cols = [
        'HomeElo', 'AwayElo', 'elo_diff',
        'Form3Home', 'Form5Home', 'Form3Away', 'Form5Away',
        'prob_home', 'prob_draw', 'prob_away',
        'h_xg', 'a_xg', 'xG_diff', 'xg_ratio',
        'rolling_xg_home_5', 'rolling_xg_away_5',
        'elo_change_home', 'elo_change_away',
        'month', 'weekday', 'result'
    ]
    df = df.dropna(subset=important_cols)
    df['result'] = df['result'].astype(int)
    df = df.drop(columns=['MatchDate', 'HomeTeam', 'AwayTeam'])

    return df

# 2. 데이터 로드 및 분할
df = clean_dataset('enhanced_dataset.csv')
X = df.drop(columns=['result'])
y = df['result']

# 3. 결과값 이진화: 1 = 홈팀 승, 0 = 무/원정승
y_binary = (y == 2).astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)

# 4. 모델 학습
model = lgb.LGBMClassifier(random_state=42)
model.fit(X_train, y_train)

# 5. 예측 및 평가
y_pred = model.predict(X_test)

print("🎯 Accuracy:", accuracy_score(y_test, y_pred))
print("\n📋 Classification Report:\n", classification_report(y_test, y_pred, digits=3))
print("\n🧱 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


🎯 Accuracy: 0.7307692307692307

📋 Classification Report:
               precision    recall  f1-score   support

           0      0.694     0.794     0.741        63
           1      0.776     0.672     0.720        67

    accuracy                          0.731       130
   macro avg      0.735     0.733     0.730       130
weighted avg      0.736     0.731     0.730       130


🧱 Confusion Matrix:
 [[50 13]
 [22 45]]


---

### 정은님 코드 수정

In [19]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# 1. 데이터 로드
df = pd.read_csv("EPL_with_Elo_Ratings.csv")

# 결측 제거 및 수식 정비
df = df.copy()
df['elo_diff'] = df['elo_home'] - df['elo_away']
df['xG_diff'] = df['h_xg'] - df['a_xg']
df = df.dropna(subset=['elo_home', 'elo_away', 'elo_diff', 'h_xg', 'a_xg', 'xG_diff'])

# 피처 생성
df['xg_ratio'] = df['h_xg'] / (df['h_xg'] + df['a_xg'])
df['home_advantage_score'] = df['elo_diff'] + df['xG_diff']

# 안전하게: 결과 기반 피처 제외
features = ['elo_diff', 'xG_diff', 'xg_ratio', 'home_advantage_score']
X = df[features]
y = df['result']

# 4. Stratified K-Fold 정확도 측정
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
accuracies = []

for fold, (train_idx, test_idx) in enumerate(kfold.split(X, y), 1):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    accuracies.append(acc)
    print(f"Fold {fold} 정확도: {acc:.4f}")

# 5. 평균과 표준편차 출력
mean_acc = np.mean(accuracies)
std_acc = np.std(accuracies)
print(f"\n✅ 평균 정확도: {mean_acc:.4f}")
print(f"📉 표준편차: {std_acc:.4f}")


Fold 1 정확도: 0.5542
Fold 2 정확도: 0.5301
Fold 3 정확도: 0.5976
Fold 4 정확도: 0.6220
Fold 5 정확도: 0.6098
Fold 6 정확도: 0.6098
Fold 7 정확도: 0.5366
Fold 8 정확도: 0.6707
Fold 9 정확도: 0.5488
Fold 10 정확도: 0.5854

✅ 평균 정확도: 0.5865
📉 표준편차: 0.0421


In [23]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# 1. 데이터 로드
df = pd.read_csv("EPL_with_Elo_Ratings.csv")

# 2. 파생 변수 생성
df = df.dropna(subset=['elo_home', 'elo_away', 'elo_diff', 'h_xg', 'a_xg'])

df = df[(df['h_xg'] + df['a_xg']) > 0]  # 0으로 나누는 경우 제거
df['xg_ratio'] = df['h_xg'] / (df['h_xg'] + df['a_xg'])
df['home_advantage_score'] = df['elo_diff'] + df['xG_diff']

def compute_is_upset(row):
    if row['elo_diff'] > 0 and row['result'] == 1:
        return 1
    elif row['elo_diff'] < 0 and row['result'] == 2:
        return 1
    else:
        return 0
df['is_upset'] = df.apply(compute_is_upset, axis=1)
df['away_win_against_elo'] = ((df['elo_home'] > df['elo_away']) & (df['result'] == 1)).astype(int)

# 3. 타겟 분포 확인
print("🎯 클래스 분포:")
print(df['result'].value_counts(normalize=True))

# 4. 학습용 피처 및 타겟 설정
features = ['elo_diff', 'xG_diff', 'xg_ratio', 'home_advantage_score', 'is_upset', 'away_win_against_elo']
X = df[features]
y = df['result']

# # (선택사항) 이진 분류로 단순화하고 싶다면 아래 라인 사용:
# y = (df['result'] == 2).astype(int)  # 홈팀 승리 여부 예측

# 5. Stratified K-Fold + 평가
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
accuracies = []
all_y_true, all_y_pred = [], []

for fold, (train_idx, test_idx) in enumerate(kfold.split(X, y), 1):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    model = RandomForestClassifier(random_state=42, class_weight='balanced')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    accuracies.append(acc)
    print(f"\n📂 Fold {fold} 정확도: {acc:.4f}")
    print(classification_report(y_test, y_pred, digits=3))

    all_y_true.extend(y_test)
    all_y_pred.extend(y_pred)

# 6. 평균과 표준편차 출력
mean_acc = np.mean(accuracies)
std_acc = np.std(accuracies)
print(f"\n✅ 평균 정확도: {mean_acc:.4f}")
print(f"📉 표준편차: {std_acc:.4f}")

# 7. 최종 confusion matrix 시각화
cm = confusion_matrix(all_y_true, all_y_pred)
print("\n🧱 Confusion Matrix:\n", cm)


🎯 클래스 분포:
result
2    0.489051
0    0.296837
1    0.214112
Name: proportion, dtype: float64

📂 Fold 1 정확도: 0.8434
              precision    recall  f1-score   support

           0      0.739     0.708     0.723        24
           1      0.929     0.722     0.812        18
           2      0.870     0.976     0.920        41

    accuracy                          0.843        83
   macro avg      0.846     0.802     0.818        83
weighted avg      0.845     0.843     0.840        83


📂 Fold 2 정확도: 0.8313
              precision    recall  f1-score   support

           0      0.708     0.708     0.708        24
           1      0.938     0.833     0.882        18
           2      0.860     0.902     0.881        41

    accuracy                          0.831        83
   macro avg      0.835     0.815     0.824        83
weighted avg      0.833     0.831     0.831        83


📂 Fold 3 정확도: 0.8293
              precision    recall  f1-score   support

           0      0.739  

---

### XGBoost 이진 분류 모델 사용

In [2]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 1. 데이터 로딩 및 전처리
df = pd.read_csv('enhanced_dataset.csv')
df['MatchDate'] = pd.to_datetime(df['MatchDate'])
df = df[df['MatchDate'].dt.year >= 2021]  # 최근 3시즌만 사용

# 사용할 피처 목록
features = [
    'HomeElo', 'AwayElo', 'elo_diff',
    'Form3Home', 'Form5Home', 'Form3Away', 'Form5Away',
    'prob_home', 'prob_draw', 'prob_away',
    'h_xg', 'a_xg', 'xG_diff', 'xg_margin', 'xg_ratio',
    'rolling_xg_home_5', 'rolling_xg_away_5',
    'elo_change_home', 'elo_change_away',
    'month', 'weekday'
]

df = df.dropna(subset=features + ['result'])

X = df[features]
y = (df['result'] == 2).astype(int)  # 홈승 여부

# 2. 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3. 모델 학습
model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)
model.fit(X_train, y_train)

# 4. 예측 및 평가
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print("🎯 Accuracy:", acc)
print("\n📋 Classification Report:\n", report)
print("\n🧱 Confusion Matrix:\n", cm)


🎯 Accuracy: 0.7384615384615385

📋 Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.78      0.74        63
           1       0.77      0.70      0.73        67

    accuracy                           0.74       130
   macro avg       0.74      0.74      0.74       130
weighted avg       0.74      0.74      0.74       130


🧱 Confusion Matrix:
 [[49 14]
 [20 47]]


Parameters: { "use_label_encoder" } are not used.



In [3]:
import joblib
joblib.dump(model, 'xgb_model.pkl')

['xgb_model.pkl']

In [4]:
import joblib
import pandas as pd

# 모델 불러오기
model = joblib.load('xgb_model.pkl')

# 테스트용 입력값 구성
test_input = {
    'HomeElo': 1750,
    'AwayElo': 1690,
    'elo_diff': 60,
    'Form3Home': 2.0,
    'Form5Home': 3.5,
    'Form3Away': 1.0,
    'Form5Away': 2.0,
    'prob_home': 0.58,
    'prob_draw': 0.25,
    'prob_away': 0.17,
    'h_xg': 1.8,
    'a_xg': 1.2,
    'xG_diff': 0.6,
    'xg_margin': 0.6,
    'xg_ratio': 0.6,
    'rolling_xg_home_5': 1.7,
    'rolling_xg_away_5': 1.3,
    'elo_change_home': 5.0,
    'elo_change_away': -5.0,
    'month': 9,
    'weekday': 6  # 일요일
}

# 예측
df_input = pd.DataFrame([test_input])
proba = model.predict_proba(df_input)[0]
pred = model.predict(df_input)[0]

print("예측 결과 (홈승 여부):", "홈 승리" if pred == 1 else "무 or 원정 승리")
print("확률 (홈/무/원정):", proba)


예측 결과 (홈승 여부): 홈 승리
확률 (홈/무/원정): [0.41163272 0.5883673 ]
