In [61]:
# pip install lightgbm

In [62]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss


In [63]:
# 1) CSV 불러오기 & match_id 생성
data = pd.read_csv('Matches.csv', parse_dates=['MatchDate'])
data = data.reset_index().rename(columns={'index':'match_id'})

# 2) 홈/원정 각각 long 포맷으로 전환
home = data[['match_id','MatchDate','HomeTeam','FTHome','FTAway']].copy()
home = home.assign(
    team           = home['HomeTeam'],
    goals_for      = home['FTHome'],
    goals_against  = home['FTAway'],
    venue          = 'Home'
)[['match_id','MatchDate','team','goals_for','goals_against','venue']]

away = data[['match_id','MatchDate','AwayTeam','FTAway','FTHome']].copy()
away = away.assign(
    team           = away['AwayTeam'],
    goals_for      = away['FTAway'],
    goals_against  = away['FTHome'],
    venue          = 'Away'
)[['match_id','MatchDate','team','goals_for','goals_against','venue']]

matches_long = pd.concat([home, away], ignore_index=True)

# 3) 정렬하고 인덱스 재설정 (꼭 필요)
matches_long = matches_long.sort_values(['team','MatchDate']).reset_index(drop=True)

# 4) 과거 3·5경기 득실 합계 계산 (transform 이용)
for N in (3, 5):
    # 먼저 “현재 경기” 제외를 위해 shift()
    shifted_gf = matches_long.groupby('team')['goals_for']     .shift()
    shifted_ga = matches_long.groupby('team')['goals_against'] .shift()

    # rolling 합계 계산
    matches_long[f'GF{N}'] = (shifted_gf
                              .groupby(matches_long['team'])
                              .transform(lambda x: x.rolling(N).sum()))
    matches_long[f'GA{N}'] = (shifted_ga
                              .groupby(matches_long['team'])
                              .transform(lambda x: x.rolling(N).sum()))

# 5) 홈/Away별로 다시 뽑아서 이름 바꾸기
home_stats = (
    matches_long[matches_long['venue']=='Home']
    .set_index('match_id')[['GF3','GA3','GF5','GA5']]
    .rename(columns={
        'GF3':'GF3Home','GA3':'GA3Home',
        'GF5':'GF5Home','GA5':'GA5Home'
    })
)
away_stats = (
    matches_long[matches_long['venue']=='Away']
    .set_index('match_id')[['GF3','GA3','GF5','GA5']]
    .rename(columns={
        'GF3':'GF3Away','GA3':'GA3Away',
        'GF5':'GF5Away','GA5':'GA5Away'
    })
)

# 6) map으로 원본 data에 컬럼 추가
for col in home_stats.columns:
    data[col] = data['match_id'].map(home_stats[col])
for col in away_stats.columns:
    data[col] = data['match_id'].map(away_stats[col])

# 7) 불필요해진 match_id 제거 (선택)
data = data.drop(columns=['match_id'])

# 8) 결과 확인
data.info()

  data = pd.read_csv('Matches.csv', parse_dates=['MatchDate'])


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230557 entries, 0 to 230556
Data columns (total 56 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Division     230557 non-null  object        
 1   MatchDate    230557 non-null  datetime64[ns]
 2   MatchTime    99072 non-null   object        
 3   HomeTeam     230557 non-null  object        
 4   AwayTeam     230557 non-null  object        
 5   HomeElo      141597 non-null  float64       
 6   AwayElo      141528 non-null  float64       
 7   Form3Home    229057 non-null  float64       
 8   Form5Home    229057 non-null  float64       
 9   Form3Away    229057 non-null  float64       
 10  Form5Away    229057 non-null  float64       
 11  FTHome       230554 non-null  float64       
 12  FTAway       230554 non-null  float64       
 13  FTResult     230554 non-null  object        
 14  HTHome       175977 non-null  float64       
 15  HTAway       175977 non-null  floa

In [64]:
# 2. 데이터 전처리
# 2-1 데이터 전처리 :데이터 연도 기준 필터링
# 프리미어 리그(epl)의 2022~2023연도 데이터를 사용
data = data[ data['Division'] == 'E0']   # 프리미어 리그(epl) 데이터 추출

# 2-2 데이터 전처리 : 학습에 사용할 column만 추출
# 'MatchTime' 임시 제거 
columns = ['MatchDate', 'HomeTeam', 'AwayTeam', 'HomeElo', 'AwayElo', 'Form3Home', 'Form5Home', 'Form3Away', 'Form5Away', 'OddHome', 'OddDraw', 'OddAway', 'FTResult', 'MaxHome', 'MaxDraw', 'MaxAway', 'Over25', 'Under25', 'MaxOver25', 'MaxUnder25', 'HandiSize', 'HandiHome', 'HandiAway', 'GF3Home', 'GA3Home', 'GF5Home', 'GA5Home', 'GF3Away', 'GA3Away', 'GF5Away', 'GA5Away'
]
data = data[columns]

In [65]:
# 2-3 데이터 전처리 : 결측치 확인
data.isnull().sum()

# 결측치 있는 행 제거
# data = data.dropna(subset=['HomeElo', 'AwayElo']).reset_index(drop=True)

data = data.dropna(subset=columns).copy()

In [66]:
# 2-4 데이터 전처리 : Standardize(표준화), OneHotEncoding(원-핫 인코딩)
log_columns = ['HomeElo', 'AwayElo', 'OddHome', 'OddDraw', 'OddAway', 'MaxHome', 'MaxDraw', 'MaxAway', 'Over25', 'Under25', 'MaxOver25', 'MaxUnder25', 'HandiHome', 'HandiAway'] # 로그 변환할 columns
standarize_columns = ['HomeElo', 'AwayElo', 'OddHome', 'OddDraw', 'OddAway', 'Form3Home', 'Form5Home', 'Form3Away', 'Form5Away', 'HandiSize', 'HandiHome', 'HandiAway', 'GF3Home', 'GA3Home', 'GF5Home', 'GA5Home', 'GF3Away', 'GA3Away', 'GF5Away', 'GA5Away']  # 표준화할 columns
encoding_columns = ['HomeTeam', 'AwayTeam']  # 원-핫 인코딩할 columns

# 분산이 큰 배당률 관련 columns와 Elo columns는 표준화 전에 log scale을 먼저 적용
data[log_columns] = np.log1p(data[log_columns])

# Pipeline에서 전처리 해줄 ColumnsTransformer 정의
preprocessor = ColumnTransformer(
    transformers=[
        ('standardize', StandardScaler(), standarize_columns),  # 표준화
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True), encoding_columns)  # 원-핫 인코딩
    ],
    remainder='passthrough'  # 나머지 컬럼은 그대로 유지
)


In [None]:
data['MatchDate'] = pd.to_datetime(data['MatchDate'])  # MatchDate를 datetime 형식으로 변환

data_train = data[(data['MatchDate'] < '2025-01-01')]   # 학습 데이터 : 24/25 시즌제외 모든 데이터 추출
data_test = data[(data['MatchDate'] >= '2025-01-01')]   # 테스트 데이터 : 24/25 시즌 데이터 추출

# # MatchDate 제거
data_train = data_train.drop(columns=['MatchDate']) 
data_test = data_test.drop(columns=['MatchDate'])  

#  featuer, target 분리

X_train = data_train.drop(columns=['FTResult'])  # Feature columns
X_test = data_test.drop(columns=['FTResult'])  # Feature columns
y_train = data_train['FTResult'].map({'H':0, 'D':1, 'A':2})  # Target column
y_test = data_test['FTResult'].map({'H':0, 'D':1, 'A':2})  # Target column

In [68]:
# 학습, 테스트 데이터 분리
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y,
#     test_size=0.2,           # 20 % 검증(또는 0.25 등)
#     random_state=42,         
#     stratify=y               # 클래스 비율 유지 
# )

In [69]:
model_lightgbm = Pipeline(
    steps=[
        ('preprocessor', preprocessor),  # 전처리 단계
        ('classifier', LGBMClassifier(
            objective='multiclass',  # 다중 클래스 분류
            num_class=3,  # 홈 승, 무승부, 원정 승
            n_estimators   = 1200,
            learning_rate  = 0.03,
            max_depth      = -1,             # 자동
            num_leaves     = 63,             # 2^(max_depth) -1 근사
            colsample_bytree = 0.8,
            subsample        = 0.8,
            reg_alpha        = 0.1,
            reg_lambda       = 1.0,
            random_state     = 42,
            class_weight     = 'balanced'
        ))  # LightGBM 
    ]
)

In [70]:
model_lightgbm.fit(X_train, y_train)

DTypePromotionError: The DType <class 'numpy.dtypes.DateTime64DType'> could not be promoted by <class 'numpy.dtypes.Float64DType'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtypes.DateTime64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>)

In [None]:
y_pred_lightgbm  = model_lightgbm.predict(X_test)
y_prob_lightgbm  = model_lightgbm.predict_proba(X_test)   # shape = (n_samples, 3)



In [None]:
print("Accuracy :", accuracy_score(y_test, y_pred_lightgbm))
print("\nClassification Report\n", classification_report(y_test, y_pred_lightgbm))

# (선택) 로그-로스 — 다중 클래스 확률 평가
print("Log-loss :", log_loss(y_test, y_prob_lightgbm))

# (선택) 혼동 행렬
print("Confusion Matrix\n", confusion_matrix(y_test, y_pred_lightgbm))

Accuracy : 0.5036255767963085

Classification Report
               precision    recall  f1-score   support

           0       0.58      0.68      0.62       694
           1       0.28      0.20      0.24       367
           2       0.49      0.48      0.49       456

    accuracy                           0.50      1517
   macro avg       0.45      0.45      0.45      1517
weighted avg       0.48      0.50      0.49      1517

Log-loss : 1.2184625661886839
Confusion Matrix
 [[471 104 119]
 [188  74 105]
 [155  82 219]]
