In [10]:
# Packages that must be installed before running this notebook
# pip install lightgbm
# pip install catboost
# pip install soccerdata
# pip install xgboost
# pip install imbalanced-learn

In [25]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    log_loss,
    roc_auc_score,
)
from sklearn.model_selection import (
    StratifiedKFold,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from soccerdata.fbref import FBref

# Models
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import ExtraTreesClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier


In [12]:
# 1) Load the match results and give every match a unique id
data = pd.read_csv('Matches.csv', parse_dates=['MatchDate'])
data = data.reset_index().rename(columns={'index': 'match_id'})

# 2) Reshape to long format: one row per (match, team), seen from that team's side
home = (
    data[['match_id', 'MatchDate', 'HomeTeam', 'FTHome', 'FTAway']]
    .rename(columns={'HomeTeam': 'team', 'FTHome': 'goals_for', 'FTAway': 'goals_against'})
    .assign(venue='Home')
)
away = (
    data[['match_id', 'MatchDate', 'AwayTeam', 'FTAway', 'FTHome']]
    .rename(columns={'AwayTeam': 'team', 'FTAway': 'goals_for', 'FTHome': 'goals_against'})
    .assign(venue='Away')
)
matches_long = pd.concat([home, away], ignore_index=True)

# 3) Sort chronologically within each team and reset the index; the rolling
#    windows below rely on this row order and on index alignment
matches_long = matches_long.sort_values(['team', 'MatchDate']).reset_index(drop=True)

# 4) Goals scored / conceded summed over each team's previous 3 and 5 matches.
#    The per-team shift() removes the current match from its own window; it does
#    not depend on the window size, so it is computed once outside the loop.
teams = matches_long['team']
prior_goals = matches_long.groupby('team')[['goals_for', 'goals_against']].shift()

for window in (3, 5):
    # rolling(window) yields NaN until `window` earlier matches are available
    rolled = prior_goals.groupby(teams).transform(lambda s: s.rolling(window).sum())
    matches_long[f'GF{window}'] = rolled['goals_for']
    matches_long[f'GA{window}'] = rolled['goals_against']

# 5) Split the rolling stats back into home-side and away-side tables keyed by
#    match_id, suffixing the column names with the venue (e.g. GF3 -> GF3Home)
rolling_cols = ['GF3', 'GA3', 'GF5', 'GA5']
home_stats = (
    matches_long.loc[matches_long['venue'] == 'Home']
    .set_index('match_id')[rolling_cols]
    .add_suffix('Home')
)
away_stats = (
    matches_long.loc[matches_long['venue'] == 'Away']
    .set_index('match_id')[rolling_cols]
    .add_suffix('Away')
)

# 6) Attach the stats to the wide match table by looking up match_id
for stats in (home_stats, away_stats):
    for col in stats.columns:
        data[col] = data['match_id'].map(stats[col])

# 7) match_id was only needed as a join key
data = data.drop(columns=['match_id'])

In [13]:
# Keep only Premier League rows (Division == 'E0'). The explicit .copy() makes
# `data` an independent frame, so the column assignment below does not write
# into a slice of the original table (SettingWithCopyWarning).
data = data[data['Division'] == 'E0'].copy()

# 1-1 Load the xG data
xg_data = pd.read_csv('xg_data.csv')

# 1-2 Align xg_data with the Matches table so both can be merged

# Map team names that are spelled differently in xg_data onto the Matches spelling.
# NOTE(review): the key 'Norttingham Forest' looks like a misspelling of
# 'Nottingham Forest'. If xg_data.csv uses the correct spelling this entry never
# matches and, with the inner merge below, those fixtures are silently dropped.
# Confirm against the CSV before changing it.
team_name_map = {
    'Manchester City'   : 'Man City',
    'Manchester United' : 'Man United',
    'Newcastle United' : 'Newcastle',
    'Norttingham Forest' : 'Nottm Forest',
    'Wolverhampton Wanderers' : 'Wolves',
    'West Bromwich Albion' : 'West Brom',
}

# Apply the mapping to both the home and the away side of xg_data
xg_data['HomeTeam'] = xg_data['HomeTeam'].replace(team_name_map)
xg_data['AwayTeam'] = xg_data['AwayTeam'].replace(team_name_map)

# Parse MatchDate as datetime and drop any time-of-day component so that both
# tables carry the same midnight timestamp for a given match day
xg_data['MatchDate'] = pd.to_datetime(xg_data['MatchDate']).dt.normalize()
data['MatchDate'] = pd.to_datetime(data['MatchDate']).dt.normalize()

# Only matches played after 2016-08-13 are kept
data = data[data['MatchDate'] > '2016-08-13']

# Merge the xG features onto the match table
data_final = data.merge(
    xg_data,
    on=['MatchDate', 'HomeTeam', 'AwayTeam'],
    how='inner',          # inner join (a left join was used previously)
    validate='1:1'        # raise if a (date, home, away) key is duplicated on either side
)

In [14]:
# 2. Preprocessing
# 2-2 Keep only the columns used for training ('MatchTime' is left out).
#     No season/year filter is applied in this cell.
columns = [
    'MatchDate', 'HomeTeam', 'AwayTeam',
    'HomeElo', 'AwayElo',
    'Form3Home', 'Form5Home', 'Form3Away', 'Form5Away',
    'FTResult',
    'GF3Home', 'GA3Home', 'GF5Home', 'GA5Home',
    'GF3Away', 'GA3Away', 'GF5Away', 'GA5Away',
    'home_goals_l3', 'home_goals_l5', 'away_goals_l3', 'away_goals_l5',
    'home_xg_l3', 'home_xg_l5', 'away_xg_l3', 'away_xg_l5',
]
# .copy() so that later in-place column assignments on data_final operate on an
# independent frame instead of a column slice of the merged table
data_final = data_final[columns].copy()

In [15]:
# 2-3 Preprocessing: missing values
# The rolling goal / xG features listed here are filled with 0 where missing;
# the rows themselves are kept, not dropped.
columns_NaN = ['home_goals_l3', 'home_goals_l5', 'away_goals_l3', 'away_goals_l5', 'home_xg_l3', 'home_xg_l5', 'away_xg_l3', 'away_xg_l5']

data_final[columns_NaN] = data_final[columns_NaN].fillna(0)

# Per-column null counts after filling. Kept as the last expression of the cell
# so the notebook actually displays it; columns outside columns_NaN are not
# filled, so any remaining nulls show up here.
data_final.isnull().sum()

In [16]:
# 2-4 Preprocessing: log transform, standardisation and one-hot encoding
log_columns = ['HomeElo', 'AwayElo']  # columns that receive a log1p transform
standarize_columns = [                # columns that are standardised
    'HomeElo', 'AwayElo',
    'Form3Home', 'Form5Home', 'Form3Away', 'Form5Away',
    'GF3Home', 'GA3Home', 'GF5Home', 'GA5Home',
    'GF3Away', 'GA3Away', 'GF5Away', 'GA5Away',
]
encoding_columns = ['HomeTeam', 'AwayTeam']  # columns that are one-hot encoded

# The Elo columns are log-scaled before standardisation. The columns are
# overwritten in place, so running this cell a second time without rebuilding
# data_final would apply log1p twice.
data_final[log_columns] = data_final[log_columns].apply(np.log1p)

# ColumnTransformer used as the first step of every model pipeline
numeric_scaler = StandardScaler()
team_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)

preprocessor = ColumnTransformer(
    transformers=[
        ('standardize', numeric_scaler, standarize_columns),
        ('onehot', team_encoder, encoding_columns),
    ],
    remainder='passthrough',  # every other column is passed through unchanged
)


In [17]:
# Derived columns describing the strength gap between the two sides
# (home value minus away value in every case)
data_final = data_final.assign(
    # xG difference over the l3 / l5 windows
    xg_l3_diff=data_final['home_xg_l3'] - data_final['away_xg_l3'],
    xg_l5_diff=data_final['home_xg_l5'] - data_final['away_xg_l5'],
    # Elo difference
    elo_diff=data_final['HomeElo'] - data_final['AwayElo'],
)



In [18]:
# 1) Date-based split: train on matches after 2016-08-13 and before 2025-01-01,
#    test on matches from 2025-01-01 onwards
match_dates = data_final['MatchDate']
train_mask = (match_dates > '2016-08-13') & (match_dates < '2025-01-01')
test_mask = match_dates >= '2025-01-01'

# 2) Draws are removed so the task is binary: home win -> 0, away win -> 1
target_map = {'H': 0, 'A': 1}

# 3) Build each split: select rows, drop draws, then drop the MatchDate column
data_train = (
    data_final.loc[train_mask]
    .loc[lambda frame: frame['FTResult'] != 'D']
    .drop(columns='MatchDate')
)
data_test = (
    data_final.loc[test_mask]
    .loc[lambda frame: frame['FTResult'] != 'D']
    .drop(columns='MatchDate')
)

# 4) Every remaining column except the label is a feature
features = data_train.columns.drop('FTResult').tolist()

X_train, y_train = data_train[features], data_train['FTResult'].map(target_map)
X_test, y_test = data_test[features], data_test['FTResult'].map(target_map)

In [19]:
def _with_preprocessing(classifier):
    """Return a Pipeline that runs the shared ``preprocessor`` ('pre') and then ``classifier`` ('clf')."""
    return Pipeline([('pre', preprocessor), ('clf', classifier)])


# LightGBM (binary)
# NOTE(review): LightGBM's documentation says row subsampling is only active when
# subsample_freq > 0; subsample=0.8 may therefore have no effect here - confirm.
pipe_lgb = _with_preprocessing(LGBMClassifier(
    objective='binary',
    n_estimators=1200,
    learning_rate=0.035,
    max_depth=-1,
    num_leaves=63,
    colsample_bytree=0.8,
    subsample=0.8,
    reg_alpha=0.1,
    reg_lambda=1.0,
    random_state=42,
    class_weight='balanced',  # weights derived automatically from class frequencies
))

# Extra-Trees
pipe_et = _with_preprocessing(ExtraTreesClassifier(
    n_estimators=800,
    max_depth=None,
    max_features='sqrt',
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=42,
    class_weight='balanced_subsample',  # class-imbalance correction
))

# XGBoost (binary)
pipe_xgb = _with_preprocessing(XGBClassifier(
    objective='binary:logistic',
    n_estimators=600,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1.2,
    eval_metric='logloss',
    random_state=42,
))

# CatBoost (binary)
pipe_cat = _with_preprocessing(CatBoostClassifier(
    loss_function='Logloss',
    iterations=1200,
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=6,
    random_seed=42,
    verbose=False,
))

# Logistic regression (binary)
pipe_logit = _with_preprocessing(LogisticRegression(
    max_iter=1000,
    solver='lbfgs',
    class_weight='balanced',  # weights derived automatically from class frequencies
))

In [20]:
# Base learners of the stacking ensemble.
# Note: the entry registered under the name 'rf' is the Extra-Trees pipeline.
base_learners = [
    ('lgb', pipe_lgb),
    ('xgb', pipe_xgb),
    ('cat', pipe_cat),
    ('rf', pipe_et),
    ('logit', pipe_logit),
]

# Meta-learner that combines the base learners' predicted probabilities
meta_learner = LogisticRegression(max_iter=1000, class_weight='balanced')

# Folds used internally to produce out-of-fold predictions for the meta-learner
stacking_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

stack_bin = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    stack_method='predict_proba',
    cv=stacking_folds,
    n_jobs=-1,
)

In [21]:
# Train the model: fit the stacking ensemble on the training split.
# The fitted estimator is the last expression, so the notebook displays its repr.
stack_bin.fit(X_train, y_train)

In [None]:
# Predict on the held-out test split:
#   y_pred - predicted class labels
#   y_prob - predicted class probabilities (the recorded output shows two columns per row)

y_pred = stack_bin.predict(X_test)
y_prob = stack_bin.predict_proba(X_test)


In [27]:
# Print the predicted class probabilities for the test set (one row per test sample).
# NOTE(review): columns presumably follow stack_bin.classes_, i.e. [0, 1] - confirm.
print(y_prob)

[[0.77244951 0.22755049]
 [0.4757907  0.5242093 ]
 [0.82322299 0.17677701]
 [0.71620832 0.28379168]
 [0.77233757 0.22766243]
 [0.81931877 0.18068123]
 [0.68442229 0.31557771]
 [0.78164005 0.21835995]
 [0.47579328 0.52420672]
 [0.46500291 0.53499709]
 [0.73116904 0.26883096]
 [0.70645476 0.29354524]
 [0.71178493 0.28821507]
 [0.29417062 0.70582938]
 [0.73453891 0.26546109]
 [0.57954375 0.42045625]
 [0.77023158 0.22976842]
 [0.59186246 0.40813754]
 [0.28383734 0.71616266]
 [0.68684945 0.31315055]
 [0.23007839 0.76992161]
 [0.20148456 0.79851544]
 [0.69105485 0.30894515]
 [0.51980785 0.48019215]
 [0.51274168 0.48725832]
 [0.50417834 0.49582166]
 [0.71717921 0.28282079]
 [0.58741642 0.41258358]
 [0.30637228 0.69362772]
 [0.58936004 0.41063996]
 [0.70331028 0.29668972]
 [0.59072157 0.40927843]
 [0.66047736 0.33952264]
 [0.26479248 0.73520752]
 [0.74557108 0.25442892]
 [0.77988201 0.22011799]
 [0.83197918 0.16802082]
 [0.82527111 0.17472889]
 [0.6979437  0.3020563 ]
 [0.7097614  0.2902386 ]


In [23]:
# Hold-out evaluation of the stacking ensemble

# Accuracy on the predicted labels
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy :", test_accuracy)

# Per-class precision / recall / F1
test_report = classification_report(y_test, y_pred)
print("\nClassification Report\n", test_report)

# (optional) Log-loss - scores the predicted probabilities
test_log_loss = log_loss(y_test, y_prob)
print("Log-loss :", test_log_loss)

# (optional) Confusion matrix
test_confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix\n", test_confusion)

Accuracy : 0.7205882352941176

Classification Report
               precision    recall  f1-score   support

           0       0.71      0.77      0.74        71
           1       0.73      0.66      0.69        65

    accuracy                           0.72       136
   macro avg       0.72      0.72      0.72       136
weighted avg       0.72      0.72      0.72       136

Log-loss : 0.5451358023302539
Confusion Matrix
 [[55 16]
 [22 43]]


In [26]:
joblib.dump(stack_bin, 'model.pkl')  # persist the stacking model to model.pkl

['model.pkl']

In [24]:
# 5-fold cross-validation of the stacking ensemble on the training split,
# comparing train-fold and validation-fold scores to gauge over-fitting.
# (StratifiedKFold, cross_validate and np come from the import cell at the top.)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Display name -> scikit-learn scorer name. 'neg_log_loss' is the negated
# log-loss, so the printed 'logloss' values are negative and closer to 0 is better.
scoring = {'auc': 'roc_auc', 'logloss': 'neg_log_loss', 'acc': 'accuracy'}

cv_res = cross_validate(
    stack_bin, X_train, y_train,
    cv=cv, scoring=scoring, return_train_score=True,
)

print("Train vs Val (mean ± std)")
for metric in scoring:
    train_scores = cv_res[f'train_{metric}']
    val_scores = cv_res[f'test_{metric}']
    print(f"{metric:8s}: {np.mean(train_scores):.3f} ±{np.std(train_scores):.3f}  |  "
          f"{np.mean(val_scores):.3f} ±{np.std(val_scores):.3f}")

Train vs Val (mean ± std)
auc     : 0.985 ±0.008  |  0.772 ±0.015
logloss : -0.334 ±0.024  |  -0.574 ±0.014
acc     : 0.929 ±0.025  |  0.701 ±0.017
