In [13]:
import pandas as pd
import numpy as np
import os
from encoding_function import low_frequency_to_others

import warnings
warnings.filterwarnings('ignore')

### 데이터 불러오기
- 학습: 2023년도 이전
- 검증: 2023년
- 테스트: 2024년

In [14]:
# Notebook-wide configuration: directory holding the CSV splits and the seed
# later passed to CatBoost as 'random_seed'.
ROOT_DIR = "data"
RANDOM_STATE = 999

# Load the three splits; per the section header they are split by year
# (train: before 2023, val: 2023, test: 2024).
train = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
val = pd.read_csv(os.path.join(ROOT_DIR, "val.csv"))
test = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

# Sanity check of the row/column counts.
print(train.shape, val.shape)

(48522, 43) (8952, 43)


## 사용하지 않는 feature drop

In [15]:
def drop_columns_from_datasets(df):
    """Return a copy of ``df`` without the columns that are not used as features.

    Args:
        df: DataFrame that contains every column listed below.

    Returns:
        A new DataFrame; the input frame is left untouched.

    Raises:
        KeyError: if any of the listed columns is absent from ``df``.
    """
    unused_columns = [
        '연도', '회차', '일차', '경주번호',
        '금일출주경주',
        '모터번호', '전탑승선수1', '전탑승선수2',
        '보트번호', '특이사항',
    ]
    return df.drop(columns=unused_columns)


# Remove the unused columns from both splits.
# NOTE: this rebinds `train`/`val`, so re-running the cell without reloading the
# CSVs raises KeyError (the columns are already gone).
train = drop_columns_from_datasets(train)
val = drop_columns_from_datasets(val)

print(train.shape, val.shape)

(48522, 33) (8952, 33)


## 일부 숫자형 변수 변환

- 코스별 성적/경기수 분리

In [16]:
def separation_course(df):
    """Split each '코스_N코스' column ("score/count" text) into two columns.

    For every course N in 1..6 the column '코스_N코스' is split on '/' into
    'N코스_성적' (part before the slash) and 'N코스_경기수' (part after it).
    The six source columns are then dropped. The frame is modified in place
    and also returned.

    Missing or empty parts are stored as NaN rather than '' so that a later
    ``astype(float)`` on the new columns does not fail, and a column in which
    no value contains '/' still yields both output columns.

    Args:
        df: DataFrame containing the six '코스_N코스' columns.

    Returns:
        The same DataFrame object, with the new columns added.

    Raises:
        ValueError: if a value contains more than one '/'.
    """
    col_list = [
        '코스_1코스', '코스_2코스', '코스_3코스', '코스_4코스', '코스_5코스', '코스_6코스'
    ]
    for col in col_list:
        parts = df[col].fillna('').str.split('/', expand=True)
        if parts.shape[1] > 2:
            raise ValueError(f"Column {col!r} has values with more than one '/'")

        # reindex guarantees both columns exist even when no value had a '/';
        # '' (from missing input) becomes NaN so numeric conversion works later.
        parts = parts.reindex(columns=[0, 1]).replace('', np.nan)

        # col[3:] strips the leading '코스_' prefix, e.g. '코스_1코스' -> '1코스'.
        df[f'{col[3:]}_성적'] = parts[0]
        df[f'{col[3:]}_경기수'] = parts[1]

    df.drop(col_list, axis=1, inplace=True)

    return df
        


# Split the six per-course "score/count" columns in both splits
# (6 columns removed, 12 added).
train = separation_course(train)
val = separation_course(val)

print(train.shape, val.shape)

(48522, 39) (8952, 39)


- 코스별 성적 스무딩

In [17]:
def apply_laplace_smoothing(df, col, global_mean, alpha):
    """Shrink '<col>_성적' toward ``global_mean``, weighted by '<col>_경기수'.

    The new value is
    ``(score * count + global_mean * alpha) / (1 + count + alpha)``
    and overwrites '<col>_성적' in place.

    Args:
        df: DataFrame with numeric '<col>_성적' and '<col>_경기수' columns.
        col: column prefix, e.g. '1코스'.
        global_mean: value the score is pulled toward.
        alpha: weight given to ``global_mean``.

    Returns:
        The same DataFrame object.
    """
    score_name = f'{col}_성적'
    count_name = f'{col}_경기수'

    numerator = df[score_name] * df[count_name] + global_mean * alpha
    # The extra constant 1 damps scores of rows with a count of 0, which
    # otherwise came out too high (original author's note: it may be removed).
    denominator = 1 + df[count_name] + alpha

    df[score_name] = numerator / denominator
    return df

def laplace_smoothing_to_course(train, val=None, alpha=1):
    """Smooth the six per-course score columns and drop the count columns.

    Both '<N>코스_성적' and '<N>코스_경기수' are cast to float, the per-course
    mean score is computed on ``train`` only, and that mean is used to smooth
    ``train`` and, when given, ``val``. The frames are modified in place.

    Args:
        train: training DataFrame with the twelve per-course columns.
        val: optional second DataFrame smoothed with the train means.
        alpha: weight of the train mean in the smoothing formula.

    Returns:
        ``train`` alone when ``val`` is None, otherwise ``(train, val)``.
    """
    courses = [f'{n}코스' for n in range(1, 7)]
    frames = [train] if val is None else [train, val]

    # Numeric conversion first, so the means below come from float columns.
    for course in courses:
        for frame in frames:
            for suffix in ('성적', '경기수'):
                name = f'{course}_{suffix}'
                frame[name] = frame[name].astype(float)

    # Means are taken from the (still unsmoothed) training scores only.
    global_means = {course: train[f'{course}_성적'].mean() for course in courses}

    for course in courses:
        train = apply_laplace_smoothing(train, course, global_means[course], alpha)
        train.drop(f'{course}_경기수', axis=1, inplace=True)

    if val is None:
        return train

    # The second frame reuses the statistics learned from train.
    for course in courses:
        val = apply_laplace_smoothing(val, course, global_means[course], alpha)
        val.drop(f'{course}_경기수', axis=1, inplace=True)

    return train, val


train, val = laplace_smoothing_to_course(train, val, alpha=1) # smaller alpha -> closer to the row's own score; larger alpha -> closer to the train-wide mean
print(train.shape, val.shape)

(48522, 33) (8952, 33)


- 최근 8경기 착순 분리

In [18]:
def split_last_eight_rank(df):
    """Expand '최근8경주_착순' into eight single-character columns.

    Characters at positions 0-3 become '최근1경기_착순'..'최근4경기_착순' and
    characters at positions 5-8 become '최근5경기_착순'..'최근8경기_착순'.
    Position 4 is skipped (presumably a separator between two groups of four;
    verify against the raw data). Positions beyond the end of a string give NaN.
    The source column is dropped; ``df`` is modified in place and returned.
    """
    source = '최근8경주_착순'
    char_positions = [0, 1, 2, 3, 5, 6, 7, 8]

    for race_no, position in enumerate(char_positions, start=1):
        df[f'최근{race_no}경기_착순'] = df[source].str[position]

    df.drop(columns=source, inplace=True)
    return df

def adjust_for_top3(df):
    """Collapse every recent-race rank that is not '1', '2' or '3' into '6'.

    Applies to the eight '최근N경기_착순' columns. Missing values are also
    mapped to '6'. Rationale from the original author: better generalisation,
    and only finishing in the top three matters. ``df`` is modified in place
    and returned.
    """
    top_three = ['1', '2', '3']

    def _collapse(rank):
        if rank in top_three:
            return rank
        return '6'

    for race_no in range(1, 9):
        name = f'최근{race_no}경기_착순'
        df[name] = df[name].map(_collapse)

    return df


# Expand the recent-8-races rank string into eight columns, then collapse
# every rank outside the top three into '6', for both splits.
train = split_last_eight_rank(train)
train = adjust_for_top3(train)

val = split_last_eight_rank(val)
val = adjust_for_top3(val)

print(train.shape, val.shape)

(48522, 40) (8952, 40)


## 범주형 변수 확인


In [19]:
def cal_cat_cols(train, val=None):
    """Classify columns as numeric or categorical and cast them in place.

    A column whose name contains one of the marker substrings is cast to
    ``str`` and treated as categorical. Every other column is cast to
    ``float`` when that works for ``train`` and (if given) ``val``; otherwise
    it is left untouched in both frames and treated as categorical.

    Args:
        train: DataFrame whose columns drive the classification (mutated).
        val: optional DataFrame with the same columns, cast the same way.

    Returns:
        ``(num_features, cat_features)`` as lists in ``train`` column order.
        'rank' and 'Race_ID' are never reported as categorical features.
    """
    # Columns whose name contains any of these substrings are categorical.
    word_list = ['번호', '기수', '경기_착순', 'Race_ID']
    non_feature_cols = {'rank', 'Race_ID'}

    num_features = []
    objective_cols = []

    for col in train.columns:
        if any(sub in col for sub in word_list):
            objective_cols.append(col)
            train[col] = train[col].astype('str')
            if val is not None:
                val[col] = val[col].astype('str')
            continue

        # Convert into temporaries first: if either frame cannot be parsed,
        # neither frame is modified, so train and val keep matching dtypes.
        try:
            train_as_float = train[col].astype('float')
            val_as_float = val[col].astype('float') if val is not None else None
        except (ValueError, TypeError):
            objective_cols.append(col)
            continue

        train[col] = train_as_float
        if val is not None:
            val[col] = val_as_float
        num_features.append(col)

    # Ordered list instead of a set difference, so the result is reproducible.
    cat_features = [col for col in objective_cols if col not in non_feature_cols]

    return num_features, cat_features


# Classify the columns (this also casts them in place in both frames) and
# display the categorical feature names as the cell output.
num_features, cat_features = cal_cat_cols(train, val)
cat_features

['최근4경기_착순',
 '번호',
 '최근7경기_착순',
 '성별',
 '최근8경기_착순',
 '최근3경기_착순',
 'FL',
 '최근5경기_착순',
 '최근2경기_착순',
 '최근1경기_착순',
 '최근6경기_착순',
 '전일성적',
 '선수명',
 '등급',
 '기수']

- 낮은 빈도 데이터 통합

In [20]:
def low_to_others(train, val, threshold=5, verbose=False):
    """Apply the ``low_frequency_to_others`` encoder to every categorical column.

    The categorical columns are those reported by ``cal_cat_cols``. For each
    one a fresh encoder is fitted on ``train`` and then applied to ``val``.
    Presumably the encoder replaces categories rarer than ``threshold`` with
    'others' (inferred from its name and log output; confirm in
    ``encoding_function``). Both frames are modified in place.

    Args:
        train: DataFrame the encoders are fitted on.
        val: DataFrame transformed with the fitted encoders.
        threshold: forwarded to ``low_frequency_to_others``.
        verbose: forwarded to ``low_frequency_to_others``.

    Returns:
        ``(train, val)``.
    """
    categorical_columns = cal_cat_cols(train, val)[1]

    for name in categorical_columns:
        encoder = low_frequency_to_others(threshold=threshold, verbose=verbose)
        train[name] = encoder.fit_transform(train[name])
        val[name] = encoder.transform(val[name])

    return train, val

# Encode the categorical columns; verbose=True makes the encoder log one line
# per column for the fit on train and one for the transform on val.
train, val = low_to_others(train, val, verbose=True)

Columns:(최근4경기_착순) 변환 X
Columns:(최근4경기_착순) 변환 X
Columns:(번호) 변환 X
Columns:(번호) 변환 X
Columns:(최근7경기_착순) 변환 X
Columns:(최근7경기_착순) 변환 X
Columns:(성별) 변환 X
Columns:(성별) 변환 X
Columns:(최근8경기_착순) 변환 X
Columns:(최근8경기_착순) 변환 X
Columns:(최근3경기_착순) 변환 X
Columns:(최근3경기_착순) 변환 X
Columns:(FL) 변환 X
Columns:(FL) 변환 X
Columns:(최근5경기_착순) 변환 X
Columns:(최근5경기_착순) 변환 X
Columns:(최근2경기_착순) 변환 X
Columns:(최근2경기_착순) 변환 X
Columns:(최근1경기_착순) 변환 X
Columns:(최근1경기_착순) 변환 X
Columns:(최근6경기_착순) 변환 X
Columns:(최근6경기_착순) 변환 X
Columns:(전일성적) 'others'로 33252개 변환
Columns:(전일성적) 'others'로 6184개 변환
Columns:(선수명) 'others'로 4개 변환
Columns:(선수명) 변환 X
Columns:(등급) 변환 X
Columns:(등급) 변환 X
Columns:(기수) 변환 X
Columns:(기수) 변환 X


## X, y 분리

In [21]:
def add_y(df, target='복승', is_train=True):
    """Add a binary 'target' column derived from 'rank'.

    Rank 0 is first recoded to 6, so it can never count as a top finish.
    When ``is_train`` is true, 'target' is 1 for rows whose rank is within the
    top k, where k depends on the bet type: '단승' -> 1, '복승' -> 2,
    '삼복승' -> 3. When ``is_train`` is false, 'target' is 0 for every row.
    ``df`` is modified in place and returned.

    Args:
        df: DataFrame with a numeric 'rank' column.
        target: bet type; one of '단승', '복승', '삼복승'.
        is_train: whether to compute real labels.

    Returns:
        The same DataFrame object with 'rank' recoded and 'target' added.

    Raises:
        ValueError: if ``is_train`` is true and ``target`` is not a known bet
            type. Raised before ``df`` is touched.
    """
    top_k_by_target = {'단승': 1, '복승': 2, '삼복승': 3}

    # Validate before mutating, so a typo cannot leave df half-processed.
    if is_train and target not in top_k_by_target:
        raise ValueError(
            f"Unknown target {target!r}; expected one of {list(top_k_by_target)}"
        )

    df['rank'] = df['rank'].replace(0, 6)
    df['target'] = 0

    if is_train:
        df.loc[df['rank'] <= top_k_by_target[target], 'target'] = 1

    return df



# Label both splits with the SAME bet type. Previously train used the default
# ('복승', top 2) while val used '단승' (top 1), so the two label sets were not
# comparable. '단승' matches the target used in the rest of the notebook.
train = add_y(train, target='단승')
val = add_y(val, target='단승')

# Columns removed from the feature matrices: the label source ('rank'), the
# label itself ('target') and '전일성적'.
drop_cols = [
    '전일성적', 'rank', 'target'
]

# Labels keep 'Race_ID' and '번호' so they can later be pivoted per race.
X_train = train.drop(drop_cols, axis=1)
y_train = train[['Race_ID', '번호', 'target']]
X_val = val.drop(drop_cols, axis=1)
y_val = val[['Race_ID', '번호', 'target']]

print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

(48522, 38) (48522, 3) (8952, 38) (8952, 3)


In [22]:
def create_player_df(df, player_number):
    """Return the rows of one player number with player-suffixed column names.

    Rows whose '번호' equals ``player_number`` (compared as float) are kept,
    '번호' is dropped, and every column except 'Race_ID' is renamed to
    '<col>_<player_number>번선수'. 'Race_ID' is placed first.

    Unlike the earlier version, the caller's ``df`` is not modified: the float
    cast of '번호' is applied to a temporary Series only.

    Args:
        df: long-format DataFrame with 'Race_ID' and '번호' columns.
        player_number: the '번호' value to select.

    Returns:
        A new DataFrame with 'Race_ID' plus the renamed columns.
    """
    # Compare on a temporary float view so that '1', 1 and 1.0 all match.
    is_player = df['번호'].astype('float') == player_number
    player_df = df.loc[is_player].drop(columns='번호')

    # Append the player number to every column name except the join key.
    new_columns = {
        col: f'{col}_{player_number}번선수'
        for col in player_df.columns
        if col != 'Race_ID'
    }
    player_df = player_df.rename(columns=new_columns)

    # Keep 'Race_ID' first, followed by the suffixed columns.
    return player_df[['Race_ID'] + list(new_columns.values())]

def merge_all_players(df):
    """Pivot a long per-player frame into one wide row per race.

    The per-player frames for player numbers 1..6 are inner-joined on
    'Race_ID', so a race missing any of the six numbers is absent from the
    result. 'Race_ID' is dropped from the returned frame.

    Args:
        df: long-format DataFrame accepted by ``create_player_df``.

    Returns:
        Wide DataFrame whose columns carry a '_<n>번선수' suffix.
    """
    per_player = [create_player_df(df, number) for number in range(1, 7)]

    wide = per_player[0]
    for frame in per_player[1:]:
        wide = pd.merge(wide, frame, on='Race_ID', how='inner')

    wide.drop('Race_ID', axis=1, inplace=True)
    return wide

# Pivot features and labels to one row per race. Labels become a plain
# array with one column per player number.
X_train_merged = merge_all_players(X_train)
y_train_merged = merge_all_players(y_train)
y_train_merged = np.array(y_train_merged)

X_val_merged = merge_all_players(X_val)
y_val_merged = merge_all_players(y_val)
y_val_merged = np.array(y_val_merged)

print(X_train_merged.shape, y_train_merged.shape, X_val_merged.shape, y_val_merged.shape)

(8087, 216) (8087, 6) (1492, 216) (1492, 6)


In [36]:
def all_precoess(train, val, target_value='단승', is_train=True):
    """Run the whole preprocessing pipeline on a (train, other) pair of raw frames.

    Steps, in order: drop unused columns, split the per-course columns,
    smooth the course scores with train statistics, expand and collapse the
    recent-rank columns, add the label, pivot to one row per race, and
    finally apply ``low_to_others`` to the pivoted feature frames.

    Args:
        train: raw training DataFrame as read from CSV.
        val: raw validation or test DataFrame with the same columns.
        target_value: bet type forwarded to ``add_y``.
        is_train: forwarded to ``add_y``; false makes every label 0.

    Returns:
        ``(X_train_merged, y_train_merged, X_val_merged, y_val_merged)``.
        The X values are DataFrames and the y values are arrays with one
        column per player number.
    """
    feature_drop_cols = ['전일성적', 'rank', 'target']
    label_cols = ['Race_ID', '번호', 'target']

    # Per-frame cleaning that needs no statistics from the other frame.
    train, val = [
        separation_course(drop_columns_from_datasets(part)) for part in (train, val)
    ]

    # Smoothing uses the train means for both frames.
    train, val = laplace_smoothing_to_course(train, val, alpha=1)

    train, val = [
        adjust_for_top3(split_last_eight_rank(part)) for part in (train, val)
    ]

    with_labels = bool(is_train)
    train, val = [
        add_y(part, target=target_value, is_train=with_labels) for part in (train, val)
    ]

    # Pivot features and labels to one row per race.
    X_train_merged = merge_all_players(train.drop(feature_drop_cols, axis=1))
    y_train_merged = np.array(merge_all_players(train[label_cols]))

    X_val_merged = merge_all_players(val.drop(feature_drop_cols, axis=1))
    y_val_merged = np.array(merge_all_players(val[label_cols]))

    # Low-frequency categories are handled after pivoting, i.e. per player slot.
    X_train_merged, X_val_merged = low_to_others(
        X_train_merged, X_val_merged, threshold=5, verbose=False
    )

    return X_train_merged, y_train_merged, X_val_merged, y_val_merged


# Reload the raw CSVs: the earlier cells modified `train`/`val`, and
# all_precoess expects the untouched raw columns.
train = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
val = pd.read_csv(os.path.join(ROOT_DIR, "val.csv"))

# Bet type used for the labels and, later, for how many slots are picked per race.
target_value='단승'
X_train, y_train, X_val, y_val = all_precoess(train, val, target_value=target_value, is_train=True)

print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

(8087, 216) (8087, 6) (1492, 216) (1492, 6)


In [37]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, f1_score

class custom_CatBoostClassifier():
    """A set of independent binary CatBoost models, one per label column.

    Model ``i`` is trained on column ``i`` of the 2-D label array and predicts
    column ``i`` of the output.
    """

    def __init__(self, params, n_labels=6):
        """Create the underlying models.

        Args:
            params: keyword arguments passed to every ``CatBoostClassifier``.
            n_labels: number of label columns, i.e. number of models. Defaults
                to 6, the value that used to be hard-coded.
        """
        self.models = [CatBoostClassifier(**params) for _ in range(n_labels)]

    def fit(self, X, y, eval_set=None, cat_features=None):
        """Fit each model on its own label column.

        Args:
            X: feature matrix shared by all models.
            y: 2-D array-like with at least one column per model.
            eval_set: optional ``(X_val, y_val)`` pair; ``y_val`` is 2-D like ``y``.
            cat_features: categorical feature names, forwarded to every model.
        """
        y = np.array(y)
        X_val = y_val = None
        if eval_set is not None:
            X_val, y_val = eval_set
            y_val = np.array(y_val)

        # Iterate the models themselves so the label count lives in one place.
        for i, model in enumerate(self.models):
            eval_set_i = None if eval_set is None else (X_val, y_val[:, i])

            model.fit(
                X, y[:, i],
                eval_set=eval_set_i,
                cat_features=cat_features
            )
            print(f'{i+1}번째 레이블 학습 완료')

    def predict(self, X):
        """Return hard predictions, shape (n_samples, n_models)."""
        preds = [model.predict(X) for model in self.models]
        return np.vstack(preds).T

    def predict_proba(self, X):
        """Return the positive-class probability of each model, shape (n_samples, n_models)."""
        probas = [model.predict_proba(X)[:, 1] for model in self.models]
        return np.vstack(probas).T
    
    
# CatBoost settings shared by all per-label models.
params = {
    'random_seed': RANDOM_STATE,
    'iterations': 1000,
    'learning_rate': 0.05,
    'early_stopping_rounds': 100,
    # 'auto_class_weights': 'Balanced',
    'eval_metric': 'Accuracy',
    'verbose': 200,
    'thread_count':-1
}

model = custom_CatBoostClassifier(params)
# Recompute the categorical columns on the pivoted frames (this also casts
# the columns of X_train / X_val in place); the numeric list is not needed.
_, cat_features = cal_cat_cols(X_train, X_val)
model.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    cat_features=cat_features,
)


0:	learn: 0.6616792	test: 0.6776139	best: 0.6776139 (0)	total: 164ms	remaining: 2m 43s
200:	learn: 0.7641894	test: 0.7017426	best: 0.7037534 (166)	total: 21.4s	remaining: 1m 25s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7057640751
bestIteration = 221

Shrink model to first 222 iterations.
1번째 레이블 학습 완료
0:	learn: 0.7833560	test: 0.7506702	best: 0.7506702 (0)	total: 99ms	remaining: 1m 38s
200:	learn: 0.8279955	test: 0.7647453	best: 0.7680965 (148)	total: 21.4s	remaining: 1m 25s
400:	learn: 0.8723878	test: 0.7687668	best: 0.7721180 (305)	total: 41.4s	remaining: 1m 1s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7721179625
bestIteration = 305

Shrink model to first 306 iterations.
2번째 레이블 학습 완료
0:	learn: 0.8413503	test: 0.8438338	best: 0.8438338 (0)	total: 101ms	remaining: 1m 41s
200:	learn: 0.8627427	test: 0.8491957	best: 0.8491957 (192)	total: 19s	remaining: 1m 15s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.8512

In [34]:
# 모델 예측
# Positive-class probability from each per-label model, one column per model.
y_pred = model.predict_proba(X_val)

# Number of slots to pick per race for each bet type. An unknown target_value
# raises KeyError instead of silently reusing stale indices.
top_k = {'단승': 1, '복승': 2, '삼복승': 3}[target_value]

# Flag the top_k highest-probability slots of every row with 1.
top_indices = np.argsort(y_pred, axis=1)[:, -top_k:]
y_pred_max = np.zeros_like(y_pred)
np.put_along_axis(y_pred_max, top_indices, 1, axis=1)


In [35]:
# accuracy_score takes (y_true, y_pred). With 2-D indicator arrays it reports
# subset accuracy: the share of races whose whole row of picks matches the labels.
accuracy = accuracy_score(y_val, y_pred_max)
print(f'Val Accuracy: {accuracy}')

Val Accuracy: 0.4343163538873995


# Test

In [30]:
# Reload raw train and test; all_precoess refits the train-derived statistics
# and applies them to the test split.
train = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
test = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

target_value='단승'
# is_train=True so that real labels are computed for the test split as well;
# they are needed to score the predictions below.
X_train, y_train, X_test, y_test = all_precoess(train, test, target_value=target_value, is_train=True)
# Report the shape of X_test (the earlier version printed X_val here).
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(8087, 216) (8087, 6) (976, 216) (976, 6)


In [31]:
# 모델 예측
# Positive-class probability from each per-label model, one column per model.
y_pred_test = model.predict_proba(X_test)

# Number of slots to pick per race for each bet type. An unknown target_value
# raises KeyError instead of silently reusing stale indices.
top_k = {'단승': 1, '복승': 2, '삼복승': 3}[target_value]

# Flag the top_k highest-probability slots of every row with 1.
top_indices = np.argsort(y_pred_test, axis=1)[:, -top_k:]
y_pred_max = np.zeros_like(y_pred_test)
np.put_along_axis(y_pred_max, top_indices, 1, axis=1)


In [32]:
# Subset accuracy on the test split: the share of races whose whole row of
# picks matches the labels. accuracy_score takes (y_true, y_pred).
accuracy = accuracy_score(y_test, y_pred_max)
print(f'Test Accuracy: {accuracy}')

Val Accuracy: 0.4047131147540984
