In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from catboost import CatBoostClassifier
from encoding_function import low_frequency_to_others
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings('ignore')

### 데이터 불러오기
- 학습: 2023년도 이전
- 검증: 2023년
- 테스트: 2024년

In [2]:
def add_y(df):
    df['rank'] = df['rank'].replace(0, 6)

    target_cols = [
        '단승', '복승', '삼복승'
    ]
    for i, col in enumerate(target_cols):
        condition_target = df['rank'] <= i+1
        df.loc[condition_target, col] = 1

    df[target_cols] = df[target_cols].fillna(0)
    df.drop(['rank'], axis=1, inplace=True)
    return df


ROOT_DIR = "data"
RANDOM_STATE = 999

train = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
test = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

train = add_y(train)
test = add_y(test)

print(train.shape, test.shape)

(57474, 45) (6036, 45)


## 사용하지 않는 feature drop

- 전일 성적을 어떻게 반영할지

In [4]:
def drop_columns_from_datasets(df):
    drop_cols = [
        '연도', '회차', '일차', '경주번호',
        '금일출주경주',
        '모터번호', '전탑승선수1', '전탑승선수2',
        '보트번호', '특이사항',
        '전일성적'
    ]

    df = df.drop(drop_cols, axis=1)

    return df


train = drop_columns_from_datasets(train)
test = drop_columns_from_datasets(test)

print(train.shape, test.shape)

(57474, 34) (6036, 34)


## 일부 숫자형 변수 변환

- 코스별 성적/경기수 분리

In [5]:
def separation_course(df):
    col_list = [
        '코스_1코스', '코스_2코스', '코스_3코스', '코스_4코스', '코스_5코스', '코스_6코스'
    ]
    for col in col_list:
        df[[f'{col[3:]}_성적', f'{col[3:]}_경기수']] = df[col].fillna('').str.split('/', expand=True)

    df.drop(col_list, axis=1, inplace=True)

    return df
        


train = separation_course(train)
test = separation_course(test)

print(train.shape, test.shape)

(57474, 40) (6036, 40)


- 코스별 성적 스무딩

In [6]:
def apply_laplace_smoothing(df, col, global_mean, alpha):
    # 경기수 0인 값이 너무 높게 나오는 경향이 있어 분모에 상수 1 추가(없애도 됨)
    encoded_value = (df[f'{col}_성적'] * df[f'{col}_경기수'] + global_mean * alpha) / (1 + df[f'{col}_경기수'] + alpha)
    df[f'{col}_성적'] = encoded_value

    return df

def laplace_smoothing_to_course(train, val=None, alpha=1):
    col_list = [
        '1코스', '2코스', '3코스', '4코스', '5코스', '6코스'
    ]
    for col in col_list:
        train[f'{col}_성적'] = train[f'{col}_성적'].astype(float)
        train[f'{col}_경기수'] = train[f'{col}_경기수'].astype(float)
        if val is not None:
            val[f'{col}_성적'] = val[f'{col}_성적'].astype(float)
            val[f'{col}_경기수'] = val[f'{col}_경기수'].astype(float)

    # Train 데이터에서 글로벌 평균 계산
    global_means = {col: train[f'{col}_성적'].mean() for col in col_list}

    for col in col_list:
        # Train 데이터에 라플라스 스무딩 적용
        train = apply_laplace_smoothing(train, col, global_means[col], alpha)
        train.drop(f'{col}_경기수', axis=1, inplace=True)

    if val is not None:
        for col in col_list:
            # Validation 데이터에 Train에서 구한 글로벌 평균으로 라플라스 스무딩 적용
            val = apply_laplace_smoothing(val, col, global_means[col], alpha)
            val.drop(f'{col}_경기수', axis=1, inplace=True)

    if val is not None:
        return train, val
    else:
        return train


train, test = laplace_smoothing_to_course(train, test, alpha=1) # 알파가 작을수록 빈도수에 가깝세, 알파가 클수록 전체 평균에 가깝게
print(train.shape, test.shape)

(57474, 34) (6036, 34)


In [7]:
def set_course_scores(df):
    # '코스_성적' 열을 초기화
    df['코스_성적'] = 0

    # '번호' 열에 따른 '코스_성적' 값 설정
    for i in range(1, 7):
        mask = df['번호'] == i
        df.loc[mask, '코스_성적'] = df.loc[mask, f'{i}코스_성적']

    # 제거할 열 목록
    drop_cols = [f'{i}코스_성적' for i in range(1, 7)]

    # 열 제거
    df = df.drop(columns=drop_cols)

    return df


from sklearn.decomposition import PCA
def set_course_scores_with_pca(df):
    # '코스_성적' 열을 초기화
    df['코스_성적'] = 0

    # '번호' 열에 따른 '코스_성적' 값 설정
    for i in range(1, 7):
        mask = df['번호'] == i
        df.loc[mask, '코스_성적'] = df.loc[mask, f'{i}코스_성적']

    # PCA를 적용할 열 목록
    course_cols = [f'{i}코스_성적' for i in range(1, 7)]

    # PCA 적용
    pca = PCA(n_components=n_components)
    pca_results = pca.fit_transform(df[course_cols])

    # PCA 결과를 새로운 열로 추가
    for i in range(n_components):
        df[f'PCA_{i+1}'] = pca_results[:, i]

    # 기존 코스 성적 열은 그대로 유지하고 '코스_성적' 열과 PCA 결과를 함께 반환
    return df


train = set_course_scores(train)
test = set_course_scores(test)

- 최근 8경기 착순 분리

- 최근 8경기를 어떻게 반영할지

In [8]:
def split_last_eight_rank(df):
    for i in range(0, 4):
        df[f'최근{i+1}경기_착순'] = df['최근8경주_착순'].str[i]
    for j in range(5, 9):
        df[f'최근{j}경기_착순'] = df['최근8경주_착순'].str[j]
        
    df.drop('최근8경주_착순', axis=1, inplace=True)
        
    return df

def adjust_for_top3(df):
    col_list = [
        '최근1경기_착순', '최근2경기_착순', '최근3경기_착순', '최근4경기_착순',
        '최근5경기_착순', '최근6경기_착순', '최근7경기_착순', '최근8경기_착순'
    ]
    
    for col in col_list:
    # 순위가 1, 2, 3이 아닌 경우, 결측인 경우, 6으로 조정
    # (일반화된 성능을 위해 + 3등내에 드는게 중요)
        df[col] = df[col].apply(lambda x: x if x in ['1', '2', '3'] else '-1')

    return df


train = split_last_eight_rank(train)
train = adjust_for_top3(train)

test = split_last_eight_rank(test)
test = adjust_for_top3(test)

print(train.shape, test.shape)

(57474, 36) (6036, 36)


## 범주형 변수 확인


In [9]:
def cal_cat_cols(train, val=None):
    objective_cols = []

    # 특정 문자열이 포함된 열을 범주형 변수로 지정
    cat_kerword_list = ['번호', '기수', '경기_착순']
    for col in train.columns:
        if any(sub in col for sub in cat_kerword_list):
            objective_cols.append(col)
            train[col] = train[col].astype('str')
            val[col] = val[col].astype('str')
            
    # 나머지 열에 대해 숫자형 변환 시도
    for col in train.columns:
        if col in objective_cols:
            continue  # 이미 범주형으로 처리된 열은 제외
        try:
            # 'float' 타입으로 변환 시도
            train[col] = train[col].astype('float')
            val[col] = val[col].astype('float')
        except:
            objective_cols.append(col)

    cat_features = list(set(objective_cols) - set(['rank', 'Race_ID']))
    cat_features = [feature for feature in cat_features if '단승' not in feature]
    cat_features = [feature for feature in cat_features if '복승' not in feature]
    cat_features = [feature for feature in cat_features if '삼복승' not in feature]

    return cat_features


# cat_features = cal_cat_cols(train, test)
# cat_features

In [10]:
plt.rcParams['font.family'] = 'Malgun Gothic'

cols = [
    '단승', '복승', '삼복승'
]
condition = train['Race_ID'].str[9:].astype(int) >= 15

for col in cols:
    mean_values = train.loc[condition, :].groupby('번호')[col].mean()

    plt.figure(figsize=(14, 7))
    mean_values.plot(kind='bar', color='skyblue')
    plt.title(f'번호 별 {col} 값의 평균')
    plt.xlabel('번호')
    plt.ylabel(f'{col} 값의 평균')
    plt.xticks(rotation=0)
    plt.show()

ValueError: invalid literal for int() with base 10: '_1'

## X, y 분리

In [12]:
drop_cols = [
    'Race_ID', '번호', '단승', '복승', '삼복승', '선수명', '기수'
]

X_train = train.drop(drop_cols, axis=1)
y_train = train[['단승']]
X_test = test.drop(drop_cols, axis=1)
y_test = test[['단승']]

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(57474, 29) (57474, 1) (6036, 29) (6036, 1)


In [13]:
def reshape_race_data(df, players_per_race=6):
    # numpy 배열로 변환
    values = df.values

    # 데이터 형태 변환 (6개의 행을 1개의 행으로)
    reshaped_values = values.reshape(-1, players_per_race * values.shape[1])

    # 새로운 컬럼 이름 생성
    columns = [f'{col}_{i+1}번선수' for i in range(players_per_race) for col in df.columns]

    # 재구성된 DataFrame 생성
    reshaped_df = pd.DataFrame(reshaped_values, columns=columns)

    return reshaped_df

X_train = reshape_race_data(X_train, players_per_race=6)
y_train = reshape_race_data(y_train, players_per_race=6)
y_train = np.array(y_train).astype(int)

X_test = reshape_race_data(X_test, players_per_race=6)
y_test = reshape_race_data(y_test, players_per_race=6)
y_test = np.array(y_test).astype(int)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(9579, 174) (9579, 6) (1006, 174) (1006, 6)


In [14]:
def low_to_others(train, val, threshold=5, is_train=True, verbose=False):
    cat_features = cal_cat_cols(train, val)
    if is_train==False:
        cat_features = list(set(cat_features) - set(['Race_ID_1번선수', 'Race_ID_2번선수', 'Race_ID_3번선수', 'Race_ID_4번선수', 'Race_ID_5번선수', 'Race_ID_6번선수']))

    for col in cat_features:
        unifier = low_frequency_to_others(threshold=threshold, verbose=verbose)
        train[col] = unifier.fit_transform(train[col])
        val[col] = unifier.transform(val[col])

    return train, val

X_train, X_test = low_to_others(X_train, X_test, threshold=5, verbose=False)

In [15]:
def all_precoess(train, val, target='단승', is_train=True):
    train = add_y(train)
    if is_train:
        val = add_y(val)
    
    train = drop_columns_from_datasets(train)
    val = drop_columns_from_datasets(val)

    train = separation_course(train)
    val = separation_course(val)

    train, val = laplace_smoothing_to_course(train, val, alpha=1)

    # train = set_course_scores(train)
    # val = set_course_scores(val)

    train = split_last_eight_rank(train)
    train = adjust_for_top3(train)
    
    val = split_last_eight_rank(val)
    val = adjust_for_top3(val)

    # train, val = low_to_others(train, val, threshold=10, verbose=False)
    
    if is_train:
        drop_cols = [
            'Race_ID', '번호', '단승', '복승', '삼복승', '선수명', '기수'
        ]
    else:
        drop_cols = [
            '번호', '선수명', '기수'
        ]
    
    X_train = train.drop(drop_cols, axis=1)
    y_train = train[[target]]
    X_train = reshape_race_data(X_train, players_per_race=6)
    y_train = reshape_race_data(y_train, players_per_race=6)
    y_train = np.array(y_train).astype(int)

    X_val = val.drop(drop_cols, axis=1)
    X_val = reshape_race_data(X_val, players_per_race=6)
    if is_train:
        y_val = val[[target]]
        y_val = reshape_race_data(y_val, players_per_race=6)
        y_val = np.array(y_val).astype(int)

    X_train, X_val = low_to_others(X_train, X_val, threshold=5, is_train=is_train, verbose=False)
    
    if is_train:
        return X_train, y_train, X_val, y_val
    else:
        return X_train, y_train, X_val
    
train = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
test = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

X_train, y_train, X_test, y_test = all_precoess(train, test, is_train=True)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(9579, 204) (9579, 6) (1006, 204) (1006, 6)


In [16]:
# ### 테스트용
# 
# from ctgan import CTGAN
# 
# # CTGAN 모델 생성
# ctgan = CTGAN(verbose=True)
# 
# cat_features = cal_cat_cols(X_train, X_test)
# 
# # 모델 학습 (target도 다른 변수와 함께 사용됨)
# ctgan.fit(X_train, discrete_columns=cat_features)
# 
# # 새로운 데이터 생성
# new_data = ctgan.sample(10)
# new_data

In [17]:
from sklearn.metrics import accuracy_score, f1_score

class custom_CatBoostClassifier():
    def __init__(self, params):
        self.models = [CatBoostClassifier(**params) for _ in range(6)]

    def fit(self, X, y, eval_set=None, cat_features=None):
        y = np.array(y)  # y를 numpy 배열로 변환
        if eval_set is not None:
            X_val, y_val = eval_set

        for i in range(6):
            y_i = y[:, i]  # i번째 레이블에 대한 y 값

            if eval_set is not None:
                y_val_i = y_val[:, i]  # i번째 레이블에 대한 y_val 값
                eval_set_i = (X_val, y_val_i)
            else:
                eval_set_i = None

            self.models[i].fit(
                X, y_i,
                eval_set=eval_set_i,
                cat_features=cat_features
            )
            # print(f'{i+1}번째 레이블 학습 완료')

    def predict(self, X):
        preds = [model.predict(X) for model in self.models]
        return np.vstack(preds).T

    def predict_proba(self, X):
        probas = [model.predict_proba(X)[:, 1] for model in self.models]
        return np.vstack(probas).T

def evaluate_(y_pred, y_val, target_value='단승'):
    y_pred_max = np.zeros_like(y_pred)

    # 각 샘플에 대해 가장 높은 확률의 인덱스를 선택
    for i in range(y_pred.shape[0]):  # 각 샘플에 대해 반복
        if target_value == '단승':
            max_indices = np.argsort(y_pred[i])[-1:]  # 가장 큰 값의 인덱스 찾기
        elif target_value == '복승':
            max_indices = np.argsort(y_pred[i])[-2:]
        elif target_value == '삼복승':
            max_indices = np.argsort(y_pred[i])[-3:]
        y_pred_max[i, max_indices] = 1  # 해당 인덱스에 1 설정

    accuracy = accuracy_score(y_pred_max, y_val)
    return accuracy
    

train = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
target_value = '복승'
if target_value=='단승':
    params = {
        'random_seed': RANDOM_STATE,
        'iterations': 3000,
        'learning_rate': 0.05,
        'early_stopping_rounds': 100,
        'auto_class_weights': 'SqrtBalanced',
        'eval_metric': 'Accuracy',
        'verbose': 0,
        'thread_count':-1
    }
elif target_value=='복승':
    params = {
        'random_seed': RANDOM_STATE,
        'iterations': 3000,
        'learning_rate': 0.05,
        'early_stopping_rounds': 100,
        'auto_class_weights': 'SqrtBalanced',
        'eval_metric': 'Accuracy',
        'verbose': 0,
        'thread_count':-1
    }
elif target_value=='삼복승':
    params = {
        'random_seed': RANDOM_STATE,
        'iterations': 3000,
        'learning_rate': 0.05,
        'early_stopping_rounds': 100,
        'auto_class_weights': 'SqrtBalanced',
        'eval_metric': 'Accuracy',
        'verbose': 0,
        'thread_count':-1
    }


seed = 42
unique_race_ids = train['Race_ID'].unique()
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

models = []
scores = []
for i, (train_race_ids, val_race_ids) in enumerate(kf.split(unique_race_ids)):
    train_ids = unique_race_ids[train_race_ids]
    val_ids = unique_race_ids[val_race_ids]

    train_fold = train[train['Race_ID'].isin(train_ids)].reset_index(drop=True)
    val_fold = train[train['Race_ID'].isin(val_ids)].reset_index(drop=True)

    X_train, y_train, X_val, y_val = all_precoess(train_fold, val_fold, target=target_value, is_train=True)

    model = custom_CatBoostClassifier(params)
    cat_features = cal_cat_cols(X_train, X_test)
    
    model.fit(
        X_train, y_train,
        eval_set=(X_test, y_test),
        cat_features=cat_features,
    )
    
    models.append(model)
    y_pred = models[i].predict_proba(X_val)
    accuracy = evaluate_(y_pred, y_val, target_value)
    scores.append(accuracy)
    print(f'folds {i+1}')
    print(f'Accuracy: {accuracy}')


print(f'최종 스코어: Avg. Accuracy of validset: {np.mean(scores)}, Std. Accuracy of validset: {np.std(scores)}')

folds 1
Accuracy: 0.2520876826722338
folds 2
Accuracy: 0.21450939457202506
folds 3
Accuracy: 0.23747390396659707
folds 4
Accuracy: 0.24321503131524008
folds 5
Accuracy: 0.25274151436031334
최종 스코어: Avg. Accuracy of validset: 0.24000550537728188, Std. Accuracy of validset: 0.013963858074654377


- Test 데이터

In [18]:
train = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
test = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))
unique_race_ids = train['Race_ID'].unique()
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

all_predictions = []
for i, (train_race_ids, val_race_ids) in enumerate(kf.split(unique_race_ids)):
    train_ids = unique_race_ids[train_race_ids]

    train_fold = train[train['Race_ID'].isin(train_ids)].reset_index(drop=True)
    test_fold = test.copy()

    X_train, y_train, X_test, y_test = all_precoess(train_fold, test_fold, target=target_value, is_train=True)
    
    y_pred = models[i].predict_proba(X_test)
    all_predictions.append(y_pred)


mean_pred = np.mean(all_predictions, axis=0)
accuracy = evaluate_(mean_pred, y_test, target_value)

print(f'최종 스코어: Accuracy of testset: {accuracy}')

최종 스코어: Accuracy of testset: 0.2644135188866799


- Base_line: F1 of testset: 0.26741803278688525

# 실제 경기 예측

In [25]:
from crawlling_entry import crawl_race_entries

train = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
연도 = 2024
회차 = 36
일차 = 2
sub = crawl_race_entries(연도, 회차, 일차)

unique_race_ids = train['Race_ID'].unique()
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

all_predictions = []
for i, (train_race_ids, val_race_ids) in enumerate(kf.split(unique_race_ids)):
    train_ids = unique_race_ids[train_race_ids]

    train_fold = train[train['Race_ID'].isin(train_ids)].reset_index(drop=True)
    test_fold = sub.copy()

    X_train, y_train, X_test = all_precoess(train_fold, test_fold, target=target_value, is_train=False)

    y_pred = models[i].predict_proba(X_test)
    all_predictions.append(y_pred)


mean_pred = np.mean(all_predictions, axis=0)
mean_pred_df = pd.DataFrame(mean_pred)
mean_pred_df.index = [f"{i+1}경기" for i in range(len(mean_pred_df))]
mean_pred_df.columns = [f"{i+1}번" for i in range(mean_pred_df.shape[1])]
mean_pred_df

Unnamed: 0,1번,2번,3번,4번,5번,6번
1경기,0.524258,0.478402,0.349909,0.381039,0.359977,0.353927
2경기,0.446437,0.748016,0.432206,0.348523,0.463073,0.362473
3경기,0.590354,0.330162,0.366036,0.390308,0.53425,0.304871
4경기,0.508091,0.497927,0.554491,0.460598,0.335899,0.400207
5경기,0.457585,0.407186,0.456112,0.497907,0.365661,0.403018
6경기,0.55216,0.595257,0.300297,0.535264,0.325186,0.327499
7경기,0.48418,0.452623,0.445703,0.470931,0.458778,0.406981
8경기,0.493261,0.523944,0.318981,0.532959,0.46678,0.302319
9경기,0.488885,0.548327,0.479107,0.511543,0.359405,0.307082
10경기,0.548443,0.549675,0.534804,0.402283,0.327305,0.305078


In [34]:
mean_pred_df.to_excel(f'./result/{연도}_{회차}_{일차}_{target_value}.xlsx', index=True, float_format="%.2f")