### 데이터 로드

In [24]:
from kcycle.loader import load_data

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, r2_score, hamming_loss
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Load the training data through the project loader and read the sample race CSV.
train = load_data()
sample = pd.read_csv('./data/20250420_광명01경주_sample.csv')

# Show the (rows, columns) shape of both frames.
print(train.shape, sample.shape)

(137207, 45) (7, 44)


In [2]:
def clean_race_data(df: pd.DataFrame) -> pd.DataFrame:
    """Clean raw race-entry rows into columns usable by the later steps.

    Works on a copy of ``df``: drops unused text columns, keeps only rows whose
    '경주지역' equals '광명', normalises several string-encoded fields, splits
    composite columns into separate numeric columns, reduces each per-day
    result string to a numeric rank, and collapses '경주종류' into fewer classes.

    Parameters
    ----------
    df : pd.DataFrame
        Raw race data. It must contain every column indexed below
        (for example '경주지역', '기어배수', '200m', '등급조정', '최근3득점').

    Returns
    -------
    pd.DataFrame
        The cleaned copy, re-indexed from 0 after the region filter.
    """
    df = df.copy()

    # Columns to drop (missing ones are ignored).
    drop_cols = [
        # '날짜',
        '경주시간', '이름', '기수', '훈련지',
        '훈련동참자', '훈련내용', '훈련일수',
        '최근3_장소일자', '최근2_장소일자', '최근1_장소일자'
    ]
    df = df.drop(columns=drop_cols, errors='ignore')

    # Keep only races held at 광명.
    df = df[df['경주지역'] == '광명'].reset_index(drop=True)

    # Gear ratio: keep the first four characters of the stripped string.
    df['기어배수'] = df['기어배수'].astype(str).str.strip().str[:4]

    # 200m: replace the double-quote character with '.'
    # (presumably a seconds mark used as decimal separator -- TODO confirm).
    df['200m'] = df['200m'].astype(str).str.replace('"', '.', regex=False)

    # Indicator columns that may carry a parenthesised part.
    cols = ['승률', '연대율', '삼연대율', '입상/출전', '선행', '젖히기', '추입', '마크']
    for col in cols:
        col_str = df[col].astype(str)

        # Alternative (disabled): use the value inside the parentheses.
        # inner = col_str.str.extract(r"\((.*?)\)")[0]
        # df[col] = col_str.str.extract(r"\((.*?)\)")[0]
        # df[col] = inner.fillna(col_str)

        # Use the value outside the parentheses: remove every "(...)" part.
        df[col] = col_str.str.replace(r"\(.*?\)", "", regex=True).str.strip()

    # Columns containing '/'.
    # Split '입상/출전' into the two integers around the slash.
    ratio_split = df['입상/출전'].astype(str).str.extract(r"(\d+)/(\d+)")
    df['입상'] = pd.to_numeric(ratio_split[0], errors='coerce')
    df['출전'] = pd.to_numeric(ratio_split[1], errors='coerce')
    df = df.drop(columns='입상/출전')

    # Turn '최근3순위' from an "a/b" string into the ratio a / b
    # (e.g. '279/554' -> 0.5036).
    ratio_recent = df['최근3순위'].astype(str).str.extract(r"(\d+)/(\d+)")
    a = pd.to_numeric(ratio_recent[0], errors='coerce')
    b = pd.to_numeric(ratio_recent[1], errors='coerce')
    df['최근3순위'] = a / b

    # Grade adjustment: split into current and previous grade by fixed
    # character positions (assumes a fixed-width source string -- TODO confirm).
    s = df['등급조정'].astype(str)
    df['현재_등급'] = s.str.slice(5, 7)
    df['이전_등급'] = s.str.slice(12, 14)
    df = df.drop(columns='등급조정')

    # Recent-3 score: extract the numbers that follow "(광명)" and "(종합)".
    s = df['최근3득점'].astype(str)
    gwang = s.str.extract(r"\(광명\)\s*([\d.]+)")[0]
    jonghap = s.str.extract(r"\(종합\)\s*([\d.]+)")[0]
    df['광명_3득점'] = pd.to_numeric(gwang, errors='coerce')
    df['종합_3득점'] = pd.to_numeric(jonghap, errors='coerce')
    df = df.drop(columns='최근3득점')

    # Per-day result strings; each is replaced by a '<col>_순위' column.
    cols = [
        '최근3_1일', '최근3_2일', '최근3_3일',
        '최근2_1일', '최근2_2일', '최근2_3일',
        '최근1_1일', '최근1_2일', '최근1_3일',
        '금회_1일', '금회_2일', '금회_3일',
    ]
    for col in cols:
        col_data = df[col].astype(str)
        # Groups: two non-space chars, number, '-', number, optional trailing char.
        pattern = r'^(\S{2})\s*(\d+)-(\d+)(\S?)'

        extracted = col_data.str.extract(pattern)
        # df[f'{col}_종류'] = extracted[0]
        # Third capture group (the number after '-') is kept as the rank, capped at 7.
        df[f'{col}_순위'] = pd.to_numeric(extracted[2], errors='coerce').clip(upper=7)
        # df[f'{col}_전법'] = extracted[3]
        df = df.drop(columns=col)

    # Simplify the race type; values absent from the mapping become NaN via .map().
    mapping = {
        '특선': '특선',
        '특별특선': '특선',
        '특선결승': '특선결승',
        '특선준결': '특선결승',
        '그랑프리결승': '특선결승',
        '우수': '우수',
        '우수결승': '우수결승',
        '우수준결': '우수결승',
        '선발': '선발',
        '선발결승': '선발결승',
        '선발준결': '선발결승',

        '준결': '특선', # author's note: '준결' races mostly have S-grade riders
        '특별': '우수', # author's note: '특별' races mostly have A-grade riders
        '특우': '특선', # author's note: '특우' races mostly have S-grade riders
        '특별우수': '우수',
    }
    df['경주종류'] = df['경주종류'].map(mapping)

    return df

# Apply the same cleaning to the training data and to the sample race.
train = clean_race_data(train)
sample = clean_race_data(sample)

### Train, Valid, Test split

In [3]:
def split_train_test_by_race(df, test_size=0.2) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Split rows into train/test so that a race never straddles both sets.

    A race is identified by the combination of '연도', '회차', '일차' and
    '경주번호'. Races keep their order of first appearance in ``df`` and are not
    shuffled, so the later races end up in the test set and future races are
    never used to predict past ones.

    Parameters
    ----------
    df : pd.DataFrame
        Input rows; must contain the four key columns above.
    test_size : float, default 0.2
        Fraction of distinct races assigned to the test set.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        ``(train_df, test_df)``, both re-indexed from 0 and without the
        temporary 'race_id' column. The input frame is not modified.
    """
    df = df.copy()

    # Cast every key part to str so numeric columns (including '경주번호')
    # can be concatenated without a TypeError.
    race_id = df['연도'].astype(str)
    for key in ('회차', '일차', '경주번호'):
        race_id = race_id + '_' + df[key].astype(str)
    df['race_id'] = race_id

    unique_races = df['race_id'].drop_duplicates().tolist()
    cutoff = int(len(unique_races) * (1 - test_size))

    train_race_ids = set(unique_races[:cutoff])
    test_race_ids = set(unique_races[cutoff:])

    train_df = df[df['race_id'].isin(train_race_ids)].drop(columns='race_id').reset_index(drop=True)
    test_df = df[df['race_id'].isin(test_race_ids)].drop(columns='race_id').reset_index(drop=True)

    return train_df, test_df

# First split: 20% of the races are held out; second split: that held-out part
# is divided half-and-half into validation and test.
train, val = split_train_test_by_race(train, test_size=0.2)
val, test = split_train_test_by_race(val, test_size=0.5)

### Data Preprocessing

In [4]:
def impute_missing_value(train: pd.DataFrame, valid: pd.DataFrame):
    """Fill missing values in both frames using statistics from ``train`` only.

    * '200m' is converted to numeric (unparseable values such as '-' become
      NaN) and missing entries are replaced by the training mean of the row's
      '현재_등급' group.
    * '종합_3득점' is imputed the same way.
    * Missing recent-rank columns are treated like a non-podium finish:
      ranks are filled and capped so that 1-3 stay as they are and everything
      else becomes 4.

    A row whose grade has no training mean (unseen or missing grade) keeps NaN.
    Both frames are modified in place and also returned.

    Parameters
    ----------
    train, valid : pd.DataFrame
        Frames containing '현재_등급', '200m', '종합_3득점' and the twelve
        '*_순위' columns.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        The same ``train`` and ``valid`` objects after imputation.
    """
    grade_col = '현재_등급'

    # errors='coerce' already maps '-' (and any other non-numeric text) to NaN.
    for frame in (train, valid):
        frame['200m'] = pd.to_numeric(frame['200m'], errors='coerce')

    for col in ('200m', '종합_3득점'):
        # Group means come from the training rows before they are imputed.
        group_means = train.groupby(grade_col)[col].mean()
        for frame in (train, valid):
            # Vectorised lookup; grades without a mean map to NaN instead of
            # raising KeyError or injecting pd.NA into an object column.
            frame[col] = frame[col].fillna(frame[grade_col].map(group_means))

    # Missing recent rank (reserve, absence, ...) counts as "not placed":
    # 1, 2, 3 keep their rank, everything else collapses to 4.
    cols = [
        '최근3_1일_순위', '최근3_2일_순위', '최근3_3일_순위',
        '최근2_1일_순위', '최근2_2일_순위', '최근2_3일_순위',
        '최근1_1일_순위', '최근1_2일_순위', '최근1_3일_순위',
        '금회_1일_순위', '금회_2일_순위', '금회_3일_순위'
    ]
    train[cols] = train[cols].fillna(7).clip(upper=4)
    valid[cols] = valid[cols].fillna(7).clip(upper=4)

    return train, valid

def drop_constant_columns(train: pd.DataFrame, valid: pd.DataFrame):
    """Remove columns that hold exactly one distinct non-null value in ``train``.

    The column list is derived from ``train`` only and the same columns are
    removed from ``valid``. New frames are returned; the inputs are untouched.
    """
    distinct_counts = train.nunique()
    constant_cols = [name for name, count in distinct_counts.items() if count == 1]

    return train.drop(columns=constant_cols), valid.drop(columns=constant_cols)

def drop_unused_columns(df):
    """Return ``df`` without the race-identifier columns that are not features.

    Columns that are already absent are skipped: in ``all_process`` this runs
    after ``drop_constant_columns``, which may have removed one of them (for
    example '연도' when the training data covers a single year).
    """
    cols_to_drop = [
        '날짜', '연도', '회차', '일차', '경주번호',
    ]

    return df.drop(columns=cols_to_drop, errors='ignore')

def encode_categorical(train: pd.DataFrame, valid: pd.DataFrame):
    """Label-encode the categorical columns of both frames.

    For every listed column a fresh ``LabelEncoder`` is fitted on ``train``
    and then applied to ``train`` and ``valid``. Both frames are modified in
    place and returned.
    """
    cat_cols = [
        '경주종류', '번호', '현재_등급', '이전_등급',
        '최근3_1일_순위', '최근3_2일_순위', '최근3_3일_순위',
        '최근2_1일_순위', '최근2_2일_순위', '최근2_3일_순위',
        '최근1_1일_순위', '최근1_2일_순위', '최근1_3일_순위',
        '금회_1일_순위', '금회_2일_순위', '금회_3일_순위'
    ]

    for name in cat_cols:
        # Learn the label set from the training column only.
        fitted = LabelEncoder().fit(train[name])
        train[name] = fitted.transform(train[name])
        valid[name] = fitted.transform(valid[name])

    return train, valid

def cast_features(df):
    """Convert every column that can be parsed as numbers to a numeric dtype.

    Columns that cannot be converted are left exactly as they are. This uses
    explicit exception handling instead of ``pd.to_numeric(errors='ignore')``,
    which pandas has deprecated. ``df`` is modified in place and returned.
    """
    def _to_numeric_or_keep(column):
        # Same outcome as the old errors='ignore': keep the column on failure.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    cols = list(df.columns)
    df[cols] = df[cols].apply(_to_numeric_or_keep)

    return df

def all_process(train, valid):
    """Run the whole preprocessing chain on a (train, valid) pair.

    Steps, in order: missing-value imputation, removal of columns constant in
    ``train``, removal of identifier columns, label encoding, numeric casting.
    Returns the processed ``(train, valid)`` pair.
    """
    # Steps that need both frames at once (statistics come from train).
    train, valid = impute_missing_value(train, valid)
    train, valid = drop_constant_columns(train, valid)

    # Per-frame step: drop identifier columns.
    train = drop_unused_columns(train)
    valid = drop_unused_columns(valid)

    train, valid = encode_categorical(train, valid)

    # Per-frame step: numeric casting.
    train = cast_features(train)
    valid = cast_features(valid)
    return train, valid

# Preprocess the training and validation splits together.
train, val = all_process(train, val)

  df[cols] = df[cols].apply(pd.to_numeric, errors='ignore')
  df[cols] = df[cols].apply(pd.to_numeric, errors='ignore')


In [5]:
def add_target(df, bet_type='복승'):
    """Return a copy of ``df`` with 'rank' replaced by a binary 'target' column.

    bet_type selects which ranks count as positive:
      - '삼복승': rank <= 3 -> target = 1
      - '복승':   rank <= 2 -> target = 1
      - '단승':   rank == 1 -> target = 1

    Raises ValueError for any other ``bet_type``.
    """
    out = df.copy()

    if bet_type == '단승':
        # Only the winner is positive.
        hit = out['rank'] == 1
    elif bet_type in ('복승', '삼복승'):
        # Positive when the rank is within the cut-off for the bet type.
        cutoff = 2 if bet_type == '복승' else 3
        hit = out['rank'] <= cutoff
    else:
        raise ValueError(f"알 수 없는 bet_type: {bet_type!r}. ('단승','복승','삼복승' 중 하나)")

    out['target'] = hit.astype(int)
    return out.drop(columns=['rank'])

In [6]:
# Rebuild everything from the raw data: load, clean, then split by race.
train = load_data()
train = clean_race_data(train)
train, val = split_train_test_by_race(train, test_size=0.2)
val, test = split_train_test_by_race(val, test_size=0.5)
# Keep an unprocessed copy of the training split for the second all_process call.
train_ = train.copy()

train, val = all_process(train, val)
# Process the test split against the unprocessed training copy; only the
# processed test frame is kept from this call.
_, test = all_process(train_, test)

print(train.shape, val.shape, test.shape)

  df[cols] = df[cols].apply(pd.to_numeric, errors='ignore')
  df[cols] = df[cols].apply(pd.to_numeric, errors='ignore')
  df[cols] = df[cols].apply(pd.to_numeric, errors='ignore')
  df[cols] = df[cols].apply(pd.to_numeric, errors='ignore')


(88368, 32) (11046, 32) (11046, 32)


In [7]:
# Build the binary target for the '복승' bet type (rank <= 2) on every split.
train = add_target(train, bet_type='복승')
val = add_target(val, bet_type='복승')
test = add_target(test, bet_type='복승')

### Model Training

In [11]:
# Separate features from the target for each split.
X_train = train.drop(columns=['target'])
X_val = val.drop(columns=['target'])
X_test = test.drop(columns=['target'])

y_train = train['target']
y_val = val['target']
y_test = test['target']

# Columns cast to pandas 'category' dtype (the model below is created with
# enable_categorical=True).
cat_cols = [
    '경주종류', '번호', '현재_등급', '이전_등급',
    '최근3_1일_순위', '최근3_2일_순위', '최근3_3일_순위',
    '최근2_1일_순위', '최근2_2일_순위', '최근2_3일_순위',
    '최근1_1일_순위', '최근1_2일_순위', '최근1_3일_순위',
    '금회_1일_순위', '금회_2일_순위', '금회_3일_순위'
]
X_train[cat_cols] = X_train[cat_cols].astype('category')
X_val[cat_cols] = X_val[cat_cols].astype('category')
X_test[cat_cols] = X_test[cat_cols].astype('category')


# Disabled alternative: LightGBM.
# model = LGBMClassifier(
#     n_estimators=5000,
#     random_state=42,
#     enable_categorical=True,
#     early_stopping_rounds=100,
#     verbose=-1,
# )
# model.fit(
#     X_train, y_train,
#     eval_set=(X_val, y_val),
#     categorical_feature=cat_cols
# )

# Active model: XGBoost with up to 5000 trees and early stopping (100 rounds),
# evaluated on the validation split passed as eval_set.
model = XGBClassifier(
    n_estimators=5000,
    random_state=42,
    enable_categorical=True,
    early_stopping_rounds=100,
)
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=0,
)

# Disabled alternative: random forest.
# model = RandomForestClassifier(
#     n_estimators=1000,
#     random_state=42,
#     n_jobs=-1,
# )
# model.fit(X_train, y_train)


# Keep the second probability column (class 1 for the 0/1 target) for each split.
y_train_pred = model.predict_proba(X_train)[:, 1]
y_val_pred = model.predict_proba(X_val)[:, 1]
y_test_pred = model.predict_proba(X_test)[:, 1]

### Evaluation

- 개별 지표

In [17]:
# Row-level metrics. Accuracy and F1 use predictions thresholded at 0.5;
# AUC and R2 are computed on the raw predicted probabilities.
# Train set metrics
train_accuracy = accuracy_score(y_train, (y_train_pred > 0.5).astype(int))
train_auc = roc_auc_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, (y_train_pred > 0.5).astype(int))
train_r2 = r2_score(y_train, y_train_pred)

# Validation set metrics
val_accuracy = accuracy_score(y_val, (y_val_pred > 0.5).astype(int))
val_auc = roc_auc_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, (y_val_pred > 0.5).astype(int))
val_r2 = r2_score(y_val, y_val_pred)

# Test set metrics
test_accuracy = accuracy_score(y_test, (y_test_pred > 0.5).astype(int))
test_auc = roc_auc_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, (y_test_pred > 0.5).astype(int))
test_r2 = r2_score(y_test, y_test_pred)

# One column per split, one row per metric.
metrics_df = pd.DataFrame({
    'Train': [train_accuracy, train_auc, train_f1, train_r2],
    'Validation': [val_accuracy, val_auc, val_f1, val_r2],
    'Test': [test_accuracy, test_auc, test_f1, test_r2]
}, index=['Accuracy', 'AUC', 'F1', 'R2'])

metrics_df.round(4)

Unnamed: 0,Train,Validation,Test
Accuracy,0.8104,0.7835,0.7799
AUC,0.8452,0.8142,0.8153
F1,0.6125,0.5799,0.5632
R2,0.3335,0.2557,0.262


- 경주별 지표

In [22]:
def reshape_by_race(arr: np.ndarray, per_race: int = 7) -> np.ndarray:
    """Group a flat array into rows of ``per_race`` consecutive entries.

    Returns an array of shape (len(arr) / per_race, per_race). Raises
    ValueError when the input is not one-dimensional or when its length is
    not a multiple of ``per_race``.
    """
    arr = np.asarray(arr)

    # Guard: only flat inputs can be grouped.
    if arr.ndim != 1:
        raise ValueError(f"입력 배열은 1차원이어야 합니다. (현재 ndim={arr.ndim})")

    # Guard: every race must be complete.
    total = arr.size
    leftover = total % per_race
    if leftover != 0:
        raise ValueError(f"배열 길이({total})가 per_race({per_race})의 배수가 아닙니다.")

    return np.reshape(arr, (-1, per_race))

# Reshape the targets and predictions of every split from (N,) to (N/7, 7).
y_train_race = reshape_by_race(y_train, per_race=7)
y_train_pred_race = reshape_by_race(y_train_pred, per_race=7)

y_val_race = reshape_by_race(y_val, per_race=7)
y_val_pred_race = reshape_by_race(y_val_pred, per_race=7)

y_test_race = reshape_by_race(y_test, per_race=7)
y_test_pred_race = reshape_by_race(y_test_pred, per_race=7)

# Targets and predictions of a split should have the same shape.
print(y_train_race.shape, y_train_pred_race.shape)
print(y_val_race.shape, y_val_pred_race.shape)
print(y_test_race.shape, y_test_pred_race.shape)

(12624, 7) (12624, 7)
(1578, 7) (1578, 7)
(1578, 7) (1578, 7)


In [27]:
def compute_race_metrics(y_true: np.ndarray,
                         y_score: np.ndarray,
                         threshold: float = 0.5,
                         per_race: int = 7) -> pd.Series:
    """
    Compute multi-label metrics with every race treated as one sample.

    Parameters
    ----------
    y_true : array-like, shape (n_samples,)
        Binary (0/1) targets. The length must be a multiple of ``per_race``.
    y_score : array-like, shape (n_samples,)
        Predicted probabilities (or any continuous score).
    threshold : float, default=0.5
        Scores strictly greater than ``threshold`` count as positive.
    per_race : int, default=7
        Number of entries per race.

    Returns
    -------
    pd.Series
        ``hamming_loss``, ``f1_samples`` and ``race_accuracy`` (the share of
        races in which every entry was classified correctly).
    """
    # One row per race.
    truth = reshape_by_race(y_true, per_race)
    predicted = (reshape_by_race(y_score, per_race) > threshold).astype(int)

    # A race is correct only when all of its entries match.
    fully_correct = (truth == predicted).all(axis=1)

    metrics = {
        "hamming_loss": hamming_loss(truth, predicted),
        "f1_samples": f1_score(truth, predicted, average="samples", zero_division=0),
        "race_accuracy": fully_correct.mean(),
    }
    return pd.Series(metrics)

# Example usage
# Assumes y_train / y_train_pred (and the val / test counterparts) already exist.
train_metrics = compute_race_metrics(y_train, y_train_pred, threshold=0.5, per_race=7)
val_metrics   = compute_race_metrics(y_val,   y_val_pred,   threshold=0.5, per_race=7)
test_metrics  = compute_race_metrics(y_test,  y_test_pred,  threshold=0.5, per_race=7)

# One column per split, one row per race-level metric.
metrics_df = pd.DataFrame({
    "Train": train_metrics,
    "Validation": val_metrics,
    "Test": test_metrics
})

metrics_df.round(4)

Unnamed: 0,Train,Validation,Test
hamming_loss,0.1896,0.2165,0.2201
f1_samples,0.5571,0.5388,0.5226
race_accuracy,0.1957,0.1305,0.1109


In [None]:
# # 특정 라인을 제외하고 섞기(선택 가능)
# def shuffle_races(data, year, ratio, exclude_back_no_list=None, random_seed=None):
#
#     if random_seed is not None:
#         np.random.seed(random_seed)
#
#     df_year = data[data['BASE_YEAR'] == year].copy()
#
#     # 각 경기를 구분하는 고유한 식별자 생성
#     df_year['RACE_ID'] = df_year.groupby(['TME_VALUE', 'DAY_ORD_VALUE', 'RACE_NO']).ngroup()
#
#     # 경기 ID의 리스트 생성
#     race_ids = df_year['RACE_ID'].unique()
#
#     # 경기 ID 리스트에서 주어진 비율만큼 무작위 추출 (random_state 추가)
#     selected_race_ids = np.random.choice(race_ids, int(len(race_ids) * ratio), replace=False)
#
#     # 추출된 경기만 포함하는 데이터 프레임 생성
#     df_sampled = df_year[df_year['RACE_ID'].isin(selected_race_ids)]
#
#     grouped = df_sampled.groupby(['TME_VALUE', 'DAY_ORD_VALUE', 'RACE_NO'])
#
#     shuffled_dfs = []
#
#     for name, group in grouped:
#         if exclude_back_no_list:
#             rows_exclude_list = []
#             for exclude_back_no in exclude_back_no_list:
#                 row_exclude = group[group['BACK_NO'] == exclude_back_no]
#                 rows_exclude_list.append(row_exclude)
#
#             # 샘플링 시 random_state 추가
#             shuffled_rows_except_excluded = group[~group['BACK_NO'].isin(exclude_back_no_list)].sample(frac=1.0, random_state=random_seed)
#
#             first_half_shuffled_rows_except_excluded = shuffled_rows_except_excluded[shuffled_rows_except_excluded.index < rows_exclude_list[0].index[0]]
#             second_half_shuffled_rows_except_excluded = shuffled_rows_except_excluded[shuffled_rows_except_excluded.index > rows_exclude_list[-1].index[0]]
#
#             shuffled_group = pd.concat([first_half_shuffled_rows_except_excluded] + rows_exclude_list + [second_half_shuffled_rows_except_excluded])
#         else:
#             # 샘플링 시 random_state 추가
#             shuffled_group = group.sample(frac=1.0, random_state=random_seed)
#
#        # 결과 리스트에 추가합니다.
#         shuffled_dfs.append(shuffled_group)
#
#     # 모든 셔플된 그룹을 합칩니다.
#     df_shuffled = pd.concat(shuffled_dfs)
#     df_shuffled = df_shuffled.reset_index(drop=True)
#     df_shuffled.drop(['RACE_ID'], axis=1, inplace=True)
#
#     return df_shuffled
#
#
# # 순서를 완전히 반대로
# def reverse_order(data, year, ratio, random_seed=None):
#     if random_seed is not None:
#         np.random.seed(random_seed)
#
#     df_year = data[data['BASE_YEAR'] == year].copy()
#
#     # 각 경기를 구분하는 고유한 식별자 생성
#     df_year['RACE_ID'] = df_year.groupby(['TME_VALUE', 'DAY_ORD_VALUE', 'RACE_NO']).ngroup()
#
#     # 경기 ID의 리스트 생성
#     race_ids = df_year['RACE_ID'].unique()
#
#     # 경기 ID 리스트에서 주어진 비율만큼 무작위 추출
#     selected_race_ids = np.random.choice(race_ids, int(len(race_ids) * ratio), replace=False)
#
#     # 추출된 경기만 포함하는 데이터 프레임 생성
#     df_sampled = df_year[df_year['RACE_ID'].isin(selected_race_ids)]
#
#      # 그룹 생성
#     grouped = df_sampled.groupby(['TME_VALUE', 'DAY_ORD_VALUE', 'RACE_NO'])
#
#     reversed_dfs = []
#
#     for name, group in grouped:
#          # 각 그룹의 행 순서를 반대로 하고 결과 리스트에 추가합니다.
#         reversed_group  = group.iloc[::-1]
#         reversed_dfs.append(reversed_group)
#
#      # 모든 뒤집힌 그룹을 합칩니다.
#     df_reversed= pd.concat(reversed_dfs)
#     df_reversed.drop(['RACE_ID'], axis=1,inplace=True)
#
#     return df_reversed.reset_index(drop=True)


In [27]:
# years = range(2016, 2024)
#
# df_shuffled1 = {}
# df_shuffled2 = {}
# df_shuffled3 = {}
#
# print('4번을 제외하고 섞기')
# for year in years:
#     ratio = 1.0 if year >= 2021 else 0.5
#     df_shuffled1[year] = shuffle_races(data=train_data,
#                                       year=year,
#                                       ratio=ratio,
#                                       exclude_back_no_list=[4],
#                                       random_seed=42)
#     print(year, df_shuffled1[year].shape)
#
# print('-'*15)
# print('\n무작위로 섞기')
#
# for year in years:
#     ratio = 0.4 if year >= 2021 else 0.2
#     df_shuffled2[year] = shuffle_races(data=train_data,
#                                       year=year,
#                                       ratio=ratio,
#                                       exclude_back_no_list=None,
#                                       random_seed=42)
#     print(year, df_shuffled2[year].shape)
#
# print('-'*15)
# print('\n순서를 뒤집기')
#
# for year in years:
#     ratio = 1.0 if year >= 2021 else 0.5
#     df_shuffled3[year] = reverse_order(data=train_data,
#                                       year=year,
#                                       ratio=ratio,
#                                       random_seed=42)
#     print(year, df_shuffled3[year].shape)

In [None]:
# Merge every 7 consecutive rows (one race) into a single sample (train data).
# NOTE(review): input_train / output_train are not defined in the cells shown
# here -- confirm where they come from before running.
n_samples1 = len(input_train) // 7
n_samples2 = len(output_train) // 7
n_rows = 7
n_cols1 = input_train.shape[1]
n_cols2 = 1

# First to (races, 7, columns) ...
reshaped_input_train = input_train.values.reshape((n_samples1, n_rows, n_cols1))
reshaped_output_train = output_train.values.reshape((n_samples2, n_rows, n_cols2))

# ... then flatten each race into one row of 7 * columns values.
reshaped_input_train = reshaped_input_train.reshape(-1, n_rows * n_cols1) 
reshaped_output_train = reshaped_output_train.reshape(-1, n_rows * n_cols2)

display(reshaped_input_train.shape, reshaped_output_train.shape)

In [None]:
# Merge every 7 consecutive rows (one race) into a single sample (test data).
# NOTE(review): input_test / output_test are not defined in the cells shown
# here -- confirm where they come from before running.
n_samples1 = len(input_test) // 7
n_samples2 = len(output_test) // 7
n_rows = 7
n_cols1 = input_test.shape[1]
n_cols2 = 1

# First to (races, 7, columns) ...
reshaped_input_test = input_test.values.reshape((n_samples1, n_rows, n_cols1))
reshaped_output_test = output_test.values.reshape((n_samples2, n_rows, n_cols2))

# ... then flatten each race into one row of 7 * columns values.
reshaped_input_test = reshaped_input_test.reshape(-1, n_rows * n_cols1) 
reshaped_output_test = reshaped_output_test.reshape(-1, n_rows * n_cols2)

display(reshaped_input_test.shape, reshaped_output_test.shape)

In [None]:
# Merge every 7 consecutive rows (one race) into a single sample (sample data;
# features only, there is no output array here).
# NOTE(review): input_sample is not defined in the cells shown here -- confirm.
n_samples1 = len(input_sample) // 7
n_rows = 7
n_cols1 = input_sample.shape[1]

# First to (races, 7, columns) ...
reshaped_input_sample = input_sample.values.reshape((n_samples1, n_rows, n_cols1))

# ... then flatten each race into one row of 7 * columns values.
reshaped_input_sample = reshaped_input_sample.reshape(-1, n_rows * n_cols1) 

display(reshaped_input_sample.shape)

In [None]:
import time
import optuna
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier
import xgboost as xgb
import catboost as cb


iteration = 0  # number of finished trials, used only for progress logging


def objective(trial):
    """Optuna objective: train one classifier and score its top-3 picks.

    The classifier is fitted on ``reshaped_input_train`` /
    ``reshaped_output_train`` (one row per race) with hyper-parameters drawn
    from ``trial``. For every test race the three highest-scored entries are
    selected, and the race counts as a hit when all three selected entries
    are true positives in ``reshaped_output_test``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyper-parameters.

    Returns
    -------
    float
        Fraction of test races that were hits (to be maximised).

    Raises
    ------
    ValueError
        If ``classifier_name`` is not one of the supported names.
    """
    # classifier_name = trial.suggest_categorical('classifier', ['DecisionTree', 'LGBM', 'XGBoost', 'CatBoost'])
    global iteration
    start_time = time.time()

    classifier_name = 'XGBoost'

    if classifier_name == 'LGBM':
        param = {
            'n_estimators': trial.suggest_int('n_estimators', 200, 700),
            'learning_rate': trial.suggest_float('learning_rate', 0.02, 0.3, log=True),
            'num_leaves': trial.suggest_int('num_leaves', 15, 128),
            'max_depth': trial.suggest_int('max_depth', 3, 30),
            'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 0.5),
            'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart', 'goss']),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 0.5),  # L2 regularisation
            'num_threads': 2,
            # 'device': 'gpu',
        }
        model_lgbm = LGBMClassifier(**param, verbose=0)
        model = MultiOutputClassifier(model_lgbm, n_jobs=4)

    elif classifier_name == 'XGBoost':
        param = {
            'n_estimators': trial.suggest_int('n_estimators', 150, 700),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 50),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'gamma': trial.suggest_float('gamma', 0, 5),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
            'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1),
            'colsample_bynode': trial.suggest_float('colsample_bynode', 0.5, 1),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
            'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
            'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 100),
            'max_delta_step': trial.suggest_int('max_delta_step', 0, 10),
            'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear']),
            'tree_method': trial.suggest_categorical('tree_method', ['auto', 'hist']),
            'n_jobs': 7,
            'random_state': 42,
        }
        model = xgb.XGBClassifier(**param)

    elif classifier_name == 'RandomForest':
        param = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 100),
            'max_depth': trial.suggest_int('max_depth', 3, 60),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
            'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
            'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
            'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
            'n_jobs': 7,
            'random_state': 42
        }
        model = RandomForestClassifier(**param)

    else:
        # Fail with a clear message instead of a NameError on `model` below.
        raise ValueError(f"Unsupported classifier_name: {classifier_name!r}")

    model.fit(reshaped_input_train, reshaped_output_train)   # adjust to your dataset names

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f'Trial {iteration} 완료, 경과 시간: {elapsed_time:.2f}초')
    iteration += 1

    y_pred_proba = model.predict_proba(reshaped_input_test)
    if isinstance(y_pred_proba, list):
        # Multi-output scikit-learn estimators (MultiOutputClassifier,
        # RandomForestClassifier) return one (n_samples, 2) array per output;
        # keep the positive-class column of each, as the other cells do.
        y_pred_proba = np.array([proba[:, 1] for proba in y_pred_proba]).T

    # Mark the three highest-scored entries of every race with 1.
    top_3_indices = np.argsort(y_pred_proba, axis=1)[:, -3:]
    y_pred_max = np.zeros_like(y_pred_proba)
    np.put_along_axis(y_pred_max, top_3_indices, 1, axis=1)

    # A race is a hit when every selected entry is a true positive.
    y_true = np.asarray(reshaped_output_test)
    perfect_match = np.all(np.logical_or(np.logical_not(y_pred_max), y_true), axis=1)

    accuracy = np.mean(perfect_match.astype(int))
    print("accuracy for perfect match:", accuracy)

    return accuracy

In [None]:
# Time the whole study and restart the trial counter used by objective().
start_time = time.time()
iteration = 0

# Create the Optuna study (seeded TPE sampler) and run the optimisation.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=sampler)
# study = optuna.create_study(direction='minimize', sampler=sampler)
study.optimize(objective, n_trials=10000)

# Print the optimisation result.
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

end_time = time.time()
elapsed_time = end_time - start_time
print(f'경과 시간: {elapsed_time:.2f}초')

In [None]:
import time
from sklearn.ensemble import RandomForestClassifier

# Random forest with fixed hyper-parameters, fitted on the race-level arrays.
model = RandomForestClassifier(n_estimators=100,
                               max_features='log2',
                               max_depth=10,
                               min_samples_split=19,
                               min_samples_leaf=11,
                               bootstrap=True,
                               criterion='gini',
                               n_jobs=-1,
                               random_state=42,
                              )

# Only the fit itself is timed.
start_time = time.time()
model_rf = model.fit(reshaped_input_train, reshaped_output_train)
end_time = time.time()


# Predict: predict_proba is iterated per output here; column 1 of each array
# is kept and the result transposed to (n_races, n_outputs).
y_pred_proba = model_rf.predict_proba(reshaped_input_test)
y_pred_proba = np.array([proba[:, 1] for proba in y_pred_proba]).T
top_3_indices = np.argsort(y_pred_proba, axis=1)[:, -3:]

# Initialise y_pred_max with zeros.
y_pred_max = np.zeros_like(y_pred_proba)

# Put 1 at the three highest-scored positions of every race.
for i, indices in enumerate(top_3_indices):
    y_pred_max[i, indices] = 1

# Compute perfect_match.
perfect_match = []
for true, pred in zip(reshaped_output_test, y_pred_max):
    # A race matches when every position marked 1 in y_pred_max is also
    # non-zero in reshaped_output_test.
    match = np.all(np.logical_or(np.logical_not(pred), true))
    perfect_match.append(match)

perfect_match = np.array(perfect_match, dtype=int)

# Share of races that matched, plus the fit time measured above.
accuracy = np.mean(perfect_match)
print("적중률 for perfect match:", accuracy)
print("Training Time: ", end_time - start_time)

In [None]:
# Score the sample race with the random forest; keep column 1 of each
# per-output probability array and transpose to (n_races, n_outputs).
predict_value_rf = model_rf.predict_proba(reshaped_input_sample)
predict_value_rf = np.array([proba[:, 1] for proba in predict_value_rf]).T

In [None]:
# Show the random-forest scores with columns labelled 1..7.
pd.DataFrame(predict_value_rf, columns=[1,2,3,4,5,6,7])

{'n_estimators': 345, 'learning_rate': 0.08787865227073664, 'max_depth': 34, 'min_child_weight': 5, 'gamma': 1.6907684810181365, 'colsample_bytree': 0.7615149755883616, 'colsample_bylevel': 0.7440588609947394, 'colsample_bynode': 0.9843402828590834, 'reg_alpha': 0.553971009858408, 'reg_lambda': 0.581507741336692, 'scale_pos_weight': 88.18441128128285, 'max_delta_step': 5, 'booster': 'gbtree', 'tree_method': 'auto'}

0.35294117647058826.



{'n_estimators': 302, 'learning_rate': 0.01900359581276545, 'max_depth': 28, 'min_child_weight': 3, 'gamma': 3.321420455877387, 'colsample_bytree': 0.5977693012361819, 'colsample_bylevel': 0.560548462242929, 'colsample_bynode': 0.5936493128007223, 'reg_alpha': 0.5371370983785317, 'reg_lambda': 0.37044885052723275, 'scale_pos_weight': 45.143758969825164, 'max_delta_step': 8, 'booster': 'gbtree', 'tree_method': 'auto'}.

Best is trial 133 with value: 0.36764705882352944.44

In [None]:
# XGBoost with fixed hyper-parameters, fitted on the race-level arrays.
model_xgb = xgb.XGBClassifier(
    random_state=42,
    n_estimators=100,
    learning_rate=0.08787865227073664,
    max_depth=34,
    min_child_weight=5,
    gamma=1.6907684810181365,
    colsample_bytree=0.7615149755883616,
    colsample_bylevel=0.7440588609947394,
    colsample_bynode=0.9843402828590834,
    reg_alpha=0.553971009858408,
    reg_lambda=0.581507741336692,
    scale_pos_weight=88.18441128128285,
    max_delta_step=5,
    booster='gbtree',
    tree_method='auto',
    n_jobs=-1,
)

model_xgb.fit(reshaped_input_train, reshaped_output_train)


# Predict: the result is used directly as a (n_races, n_outputs) score matrix.
y_pred_proba = model_xgb.predict_proba(reshaped_input_test)
# y_pred_proba = np.array([proba[:, 1] for proba in y_pred_proba]).T
top_3_indices = np.argsort(y_pred_proba, axis=1)[:, -3:]

# Initialise y_pred_max with zeros.
y_pred_max = np.zeros_like(y_pred_proba)

# Put 1 at the three highest-scored positions of every race.
for i, indices in enumerate(top_3_indices):
    y_pred_max[i, indices] = 1

# Compute perfect_match.
perfect_match = []
for true, pred in zip(reshaped_output_test, y_pred_max):
    # A race matches when every position marked 1 in y_pred_max is also
    # non-zero in reshaped_output_test.
    match = np.all(np.logical_or(np.logical_not(pred), true))
    perfect_match.append(match)

perfect_match = np.array(perfect_match, dtype=int)

# `accuracy` stays a 0-1 ratio; it is scaled to a percentage only for display
# so that the printed number agrees with the '%' sign.
accuracy = np.mean(perfect_match).round(4)
print("적중률: ", round(accuracy * 100, 2), '%')
# print("Training Time: ", end_time - start_time)

In [None]:
# Score the sample race with the XGBoost model.
predict_value_xg = model_xgb.predict_proba(reshaped_input_sample)

In [None]:
# Show the XGBoost scores with columns labelled 1..7.
pd.DataFrame(predict_value_xg, columns=[1,2,3,4,5,6,7])

In [None]:
from lightgbm import LGBMClassifier
from sklearn.multioutput import MultiOutputClassifier

import time

# Hyper-parameters chosen for LightGBM.
optimal_params = {
    'n_estimators': 400,
    'learning_rate': 0.052822297669108224,
    'num_leaves': 74,
    'max_depth': 8,
    'reg_alpha': 0.09980125949843345,
    'boosting_type': 'gbdt',
    'reg_lambda': 0.45277624619552903,
    'num_threads': 2,
}

# One LightGBM classifier per output column.
lgbm = LGBMClassifier(**optimal_params)
model_lgbm = MultiOutputClassifier(lgbm, n_jobs=4)

# Time this fit; previously the cell printed start/end times left over from
# an earlier cell.
start_time = time.time()
model_lgbm.fit(reshaped_input_train, reshaped_output_train)
end_time = time.time()

# Predict: one probability array per output; keep column 1 of each and
# transpose to (n_races, n_outputs).
y_pred_proba = model_lgbm.predict_proba(reshaped_input_test)
y_pred_proba = np.array([proba[:, 1] for proba in y_pred_proba]).T
top_3_indices = np.argsort(y_pred_proba, axis=1)[:, -3:]

# Initialise y_pred_max with zeros.
y_pred_max = np.zeros_like(y_pred_proba)

# Put 1 at the three highest-scored positions of every race.
for i, indices in enumerate(top_3_indices):
    y_pred_max[i, indices] = 1

# Compute perfect_match.
perfect_match = []
for true, pred in zip(reshaped_output_test, y_pred_max):
    # A race matches when every position marked 1 in y_pred_max is also
    # non-zero in reshaped_output_test.
    match = np.all(np.logical_or(np.logical_not(pred), true))
    perfect_match.append(match)

perfect_match = np.array(perfect_match, dtype=int)

accuracy = np.mean(perfect_match)
print("적중률 for perfect match:", accuracy)
print("Training Time: ", end_time - start_time)

In [None]:
# Score the sample race with the LightGBM model; keep column 1 of each
# per-output probability array and transpose to (n_races, n_outputs).
predict_value_lgbm = model_lgbm.predict_proba(reshaped_input_sample)
predict_value_lgbm = np.array([proba[:, 1] for proba in predict_value_lgbm]).T

In [None]:
# Show the LightGBM scores with columns labelled 1..7.
pd.DataFrame(predict_value_lgbm, columns=[1,2,3,4,5,6,7])

In [None]:
import numpy as np
from sklearn.model_selection import cross_val_predict, KFold
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

# Base (level-0) models: one classifier per output column.
base_models = [
    MultiOutputClassifier(XGBClassifier(n_estimators=100,
                                        learning_rate=0.08787865227073664,
                                        max_depth=34,
                                        min_child_weight=5,
                                        gamma=1.6907684810181365,
                                        colsample_bytree=0.7615149755883616,
                                        colsample_bylevel=0.7440588609947394,
                                        colsample_bynode=0.9843402828590834,
                                        reg_alpha=0.553971009858408,
                                        reg_lambda=0.581507741336692,
                                        scale_pos_weight=88.18441128128285,
                                        max_delta_step=5,
                                        booster='gbtree',
                                        tree_method='auto',
                                        n_jobs=-1,
                                        random_state=42)),
    MultiOutputClassifier(LGBMClassifier(n_estimators=400,
                                         learning_rate=0.052822297669108224,
                                         num_leaves=74,
                                         max_depth=8,
                                         reg_alpha=0.09980125949843345,
                                         boosting_type='gbdt',
                                         reg_lambda=0.45277624619552903,
                                         num_threads=2))
]

# Meta (level-1) model.
meta_model = RandomForestClassifier(n_estimators=100,
                                    max_features='log2',
                                    max_depth=10,
                                    min_samples_split=19,
                                    min_samples_leaf=11,
                                    bootstrap=True,
                                    criterion='gini',
                                    n_jobs=-1,
                                    random_state=42)

# K-fold split used to produce out-of-fold predictions for the meta features.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

num_samples = reshaped_input_train.shape[0]
num_base_models = len(base_models)
num_outputs = reshaped_output_train.shape[1]
num_meta_features = num_base_models * num_outputs


def _positive_class_probas(per_output_probas):
    """Stack column 1 of every per-output probability array into (n_samples, n_outputs)."""
    return np.column_stack([proba[:, 1] for proba in per_output_probas])


# Meta features: samples x (base models * outputs). Model i owns the column
# block [i * num_outputs, (i + 1) * num_outputs).
meta_features = np.zeros((num_samples, num_meta_features))

for i, model in enumerate(base_models):
    probas = cross_val_predict(model, reshaped_input_train, reshaped_output_train, cv=kf, method='predict_proba')
    meta_features[:, i * num_outputs:(i + 1) * num_outputs] = _positive_class_probas(probas)

# Train the meta model on the out-of-fold meta features.
meta_model.fit(meta_features, reshaped_output_train)

# Build test meta features with the SAME layout as the training ones. The
# previous version allocated one column per base model and indexed the list
# returned by predict_proba as if it were an array, so it could not feed the
# meta model fitted on num_meta_features columns.
test_meta_features = np.zeros((reshaped_input_test.shape[0], num_meta_features))
for i, model in enumerate(base_models):
    model.fit(reshaped_input_train, reshaped_output_train)  # refit on the full training data
    test_probas = model.predict_proba(reshaped_input_test)
    test_meta_features[:, i * num_outputs:(i + 1) * num_outputs] = _positive_class_probas(test_probas)

# Final prediction from the meta model.
final_predictions = meta_model.predict(test_meta_features)

In [None]:
# Show the XGBoost and random-forest sample scores one after the other,
# columns labelled 1..7.
display(pd.DataFrame(predict_value_xg, columns=[1,2,3,4,5,6,7]),
        pd.DataFrame(predict_value_rf, columns=[1,2,3,4,5,6,7]))