In [1]:
import pandas as pd
import numpy as np
import os

from encoding_function import low_frequency_to_others
from sklearn.model_selection import KFold
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings('ignore')

### 데이터 불러오기

In [2]:
def add_y(df):
    """Derive the three binary target columns from ``rank`` and drop ``rank``.

    A rank of 0 is first rewritten to 6. Then '단승' is set to 1 where
    rank <= 1, '복승' where rank <= 2 and '삼복승' where rank <= 3; all other
    rows get 0. ``df`` is modified in place and also returned.
    """
    df['rank'] = df['rank'].replace(0, 6)

    targets = ['단승', '복승', '삼복승']
    # The n-th target flags rows whose rank is at most n.
    for cutoff, name in enumerate(targets, start=1):
        df.loc[df['rank'] <= cutoff, name] = 1

    # Rows that were never flagged are still NaN at this point.
    df[targets] = df[targets].fillna(0)
    df.drop(columns=['rank'], inplace=True)
    return df

def add_y_(df):
    """Derive rank-valued target columns from ``rank`` and drop ``rank``.

    A rank of 0 is first rewritten to 6. For '단승' / '복승' / '삼복승' the
    row's own rank is stored where rank <= 1 / 2 / 3 respectively; all other
    rows get 0. ``df`` is modified in place and also returned.
    """
    df['rank'] = df['rank'].replace(0, 6)

    targets = ['단승', '복승', '삼복승']
    for cutoff, name in enumerate(targets, start=1):
        within_cutoff = df['rank'] <= cutoff
        # Store the actual finishing rank rather than a 0/1 flag.
        df.loc[within_cutoff, name] = df['rank']

    df[targets] = df[targets].fillna(0)
    df.drop(columns=['rank'], inplace=True)
    return df



# Directory holding train.csv / test.csv, and a seed constant.
ROOT_DIR = "data"
RANDOM_STATE = 42

train = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
test = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

# Encode sex as 0 ('남') / 1 ('여'); any other value becomes NaN.
# NOTE(review): re-running this cell on already-mapped frames turns every value into NaN.
train['성별'] = train['성별'].map({'남': 0, '여': 1})
test['성별'] = test['성별'].map({'남': 0, '여': 1})

# Replace 'rank' with the three target columns.
train = add_y(train)
test = add_y(test)

print(train.shape, test.shape)
train.head()

(58344, 46) (6528, 46)


Unnamed: 0,Race_ID,연도,회차,일차,경주번호,경기종류,번호,등급,기수,선수명,...,모터_연대율3,전탑승선수1,전탑승선수2,보트번호,보트_평균착순점,보트_연대율,특이사항,단승,복승,삼복승
0,2016_1_1_1,2016,1,1,1,플라잉,1,A1,1,정용진,...,0.0,,,27,0.0,0.0,,0.0,0.0,1.0
1,2016_1_1_1,2016,1,1,1,플라잉,2,A1,13,김민준,...,0.0,,,50,0.0,0.0,,1.0,1.0,1.0
2,2016_1_1_1,2016,1,1,1,플라잉,3,B2,11,김현덕,...,0.0,,,71,0.0,0.0,주선보류 후 출전,0.0,0.0,0.0
3,2016_1_1_1,2016,1,1,1,플라잉,4,A1,8,한종석,...,0.0,,,53,0.0,0.0,,0.0,1.0,1.0
4,2016_1_1_1,2016,1,1,1,플라잉,5,B1,9,정재용,...,0.0,,,64,0.0,0.0,,0.0,0.0,0.0


In [3]:
def add_weight_penalty(df):
    """Replace '체중' with two weight-penalty features.

    Adds, in place:
      * '중량부과여부' - 1 when the weight is below the limit for the row's
        sex (55 for '성별' == 0, 51 for '성별' == 1), else 0.
      * '부과된중량'   - how far the weight is below the limit, floored at 0.
        Rows whose '성별' is not 0 use the 51 limit; a missing weight gives 0.

    The '체중' column is dropped. The computation is vectorised, so it also
    works on an empty frame (row-wise ``DataFrame.apply`` does not return a
    Series there).

    Returns the same (mutated) DataFrame.
    """
    weight_limit_male = 55
    weight_limit_female = 51

    weight = df['체중'].astype(float)
    is_male = df['성별'] == 0
    is_female = df['성별'] == 1

    # Penalty flag: only rows with a recognised sex can be flagged.
    under_limit = (is_male & (weight < weight_limit_male)) | (is_female & (weight < weight_limit_female))
    df['중량부과여부'] = under_limit.astype(int)

    # Penalty amount: every non-male row falls back to the female limit.
    limit = pd.Series(np.where(is_male, weight_limit_male, weight_limit_female), index=df.index)
    df['부과된중량'] = (limit - weight).clip(lower=0).fillna(0.0)

    df.drop('체중', axis=1, inplace=True)

    return df

# Add the weight-penalty features to both splits (drops '체중').
train = add_weight_penalty(train)
test = add_weight_penalty(test)

In [4]:
def bin_age(df):
    """Replace '나이' with one of four age bands.

    Bands are left-closed: [20, 30) -> '20-30', [30, 40) -> '30-40',
    [40, 50) -> '40-50', [50, 100) -> '50+'. Ages outside [20, 100) become
    NaN. ``df`` is modified in place and also returned.
    """
    cut_options = {
        'bins': [20, 30, 40, 50, 100],
        'labels': ['20-30', '30-40', '40-50', '50+'],
        'right': False,  # left-closed intervals
    }
    df['나이'] = pd.cut(df['나이'], **cut_options)
    return df

# Bucket '나이' into age bands on both splits.
train = bin_age(train)
test = bin_age(test)

## 사용하지 않는 feature drop

In [5]:
def drop_columns_from_datasets(df):
    """Return a new DataFrame without the columns this pipeline does not use.

    Raises KeyError (from pandas) if any listed column is missing.
    """
    unused_columns = (
        # Original author's open question: drop '일차' as well?
        ['연도', '회차', '일차', '경주번호']
        + ['금일출주경주']
        + ['모터번호', '전탑승선수1', '전탑승선수2']
        + ['보트번호', '특이사항']
        # Original author's (unconfirmed) note: the number after F is the count of
        # disqualifications in the current half-year, the number after L the foul count?
        + ['FL']
    )
    return df.drop(columns=unused_columns)


# Remove the unused columns from both splits and confirm the resulting shapes.
train = drop_columns_from_datasets(train)
test = drop_columns_from_datasets(test)

print(train.shape, test.shape)

(58344, 36) (6528, 36)


In [6]:
def reverse_rank_values(df_train, df_val):
    """Flip five score columns around the training maximum and fill gaps with 0.

    For each listed column the value becomes ``train_max - value`` in both
    frames (the maximum always comes from ``df_train``), after which missing
    values are replaced by 0. Both frames are modified in place and returned.

    The result is assigned back to the column instead of calling
    ``df[col].fillna(..., inplace=True)``: that chained in-place form warns on
    pandas 2.2 and silently does nothing under Copy-on-Write.
    """
    cols_to_reverse = [
        '최근6회차_평균착순점', '최근6회차_평균득점',
        '연간성적_평균착순점',
        '모터_평균착순점',
        '보트_평균착순점'
    ]
    for col in cols_to_reverse:
        max_rank = df_train[col].max()
        df_train[col] = (max_rank - df_train[col]).fillna(0)
        df_val[col] = (max_rank - df_val[col]).fillna(0)

    return df_train, df_val


# Flip the score columns using maxima taken from the training split.
train, test = reverse_rank_values(train, test)

In [7]:
import re

def extract_numbers(result):
    """Return the digit runs that sit between two hyphens in ``result``.

    Matches are non-overlapping and returned as strings. Anything that is not
    a ``str`` (e.g. NaN) yields an empty list.
    """
    if not isinstance(result, str):
        return []
    return [match.group(1) for match in re.finditer(r'-(\d+)-', result)]

def calculate_mean(numbers):
    """Return the mean of the integer-parsed items, or NaN for an empty input."""
    parsed = list(map(int, numbers))
    if not parsed:
        return np.nan
    return np.mean(parsed)

def last_race_process(df_train, df_val):
    """Turn '전일성적' into a reversed mean score in both frames.

    Each raw value is reduced to the mean of the numbers returned by
    ``extract_numbers`` (NaN when there are none). The mean is then flipped to
    ``train_max - mean + 1`` using the maximum from ``df_train``, and rows
    without a value are set to 1. Both frames are modified in place and
    returned.

    The filled result is assigned back to the column instead of calling
    ``df[col].fillna(..., inplace=True)``: that chained in-place form warns on
    pandas 2.2 and silently does nothing under Copy-on-Write.
    """
    # Original author's open question: also extract the course number?
    for frame in (df_train, df_val):
        frame['전일성적'] = frame['전일성적'].apply(extract_numbers).apply(calculate_mean)

    max_rank = df_train['전일성적'].max()
    df_train['전일성적'] = (max_rank - df_train['전일성적'] + 1).fillna(1)
    df_val['전일성적'] = (max_rank - df_val['전일성적'] + 1).fillna(1)

    return df_train, df_val

# Convert '전일성적' to a numeric feature on both splits.
train, test = last_race_process(train, test)

## 일부 숫자형 변수 변환

- 코스별 성적/경기수 분리

In [8]:
def separation_course(df):
    """Split each '코스_N코스' column ("score/games") into two string columns.

    For N in 1..6 this adds 'N코스_성적' (text before the first '/') and
    'N코스_경기수' (text after it), then drops the six source columns.
    ``df`` is modified in place and also returned.

    Missing or empty parts are stored as NaN rather than '' so that a later
    ``astype(float)`` succeeds. Both output columns are always created, even
    when every value in a source column is missing (a plain
    ``str.split(expand=True)`` yields a single column in that case and the
    two-column assignment fails).
    """
    col_list = [
        '코스_1코스', '코스_2코스', '코스_3코스', '코스_4코스', '코스_5코스', '코스_6코스'
    ]
    for col in col_list:
        course = col[3:]  # strip the leading '코스_' -> 'N코스'
        raw = df[col].astype(object).fillna('').astype(str)
        # reindex guarantees columns 0 and 1 exist whatever the split produced.
        parts = raw.str.split('/', n=1, expand=True).reindex(columns=[0, 1])
        parts = parts.mask(parts == '')
        df[f'{course}_성적'] = parts[0]
        df[f'{course}_경기수'] = parts[1]

    df.drop(col_list, axis=1, inplace=True)

    return df
        


# Split the per-course "score/games" columns on both splits.
train = separation_course(train)
test = separation_course(test)

print(train.shape, test.shape)

(58344, 42) (6528, 42)


- 코스별 성적 스무딩

In [48]:
def apply_laplace_smoothing(df, col, global_mean, alpha):
    """Overwrite '{col}_성적' with a smoothed score and return ``df``.

    smoothed = (score * games + global_mean * alpha) / (1 + games + alpha)

    The extra 1 in the denominator was added by the original author because
    rows with zero games otherwise scored too high (it can be removed).
    """
    score_col = f'{col}_성적'
    games = df[f'{col}_경기수']

    weighted_total = df[score_col] * games + global_mean * alpha
    weight = 1 + games + alpha
    df[score_col] = weighted_total / weight

    return df

def laplace_smoothing_to_course(train, val=None, alpha=1):
    """Smooth the six per-course score columns and drop the games columns.

    Score and games columns are cast to float, the per-course mean score is
    taken from ``train`` (before smoothing), and ``apply_laplace_smoothing``
    is applied to ``train`` and, when given, to ``val`` with those same
    training means. Each '{course}_경기수' column is dropped afterwards.

    Returns ``(train, val)`` when ``val`` is provided, otherwise ``train``.
    """
    courses = [f'{i}코스' for i in range(1, 7)]
    has_val = val is not None
    frames = [train, val] if has_val else [train]

    # Cast to float in the same order as before: per course, train then val.
    for course in courses:
        for frame in frames:
            for suffix in ('성적', '경기수'):
                name = f'{course}_{suffix}'
                frame[name] = frame[name].astype(float)

    # Global means come from the training data only.
    global_means = {course: train[f'{course}_성적'].mean() for course in courses}

    for course in courses:
        train = apply_laplace_smoothing(train, course, global_means[course], alpha)
        train.drop(f'{course}_경기수', axis=1, inplace=True)

    if not has_val:
        return train

    # Validation data is smoothed with the means learned from train.
    for course in courses:
        val = apply_laplace_smoothing(val, course, global_means[course], alpha)
        val.drop(f'{course}_경기수', axis=1, inplace=True)

    return train, val


train, test = laplace_smoothing_to_course(train, test, alpha=1) # smaller alpha -> closer to the observed score, larger alpha -> closer to the global mean
print(train.shape, test.shape)

(58344, 36) (6528, 36)


In [49]:
def set_course_scores(df):
    """Collapse the six per-course scores into a single '코스_성적' column.

    For each row, '코스_성적' takes the value of '{번호}코스_성적' when '번호' is
    1..6 and stays 0.0 otherwise. The column is added to ``df`` in place; the
    returned DataFrame is a new object without the six '{i}코스_성적' columns.

    The column starts as float ``0.0`` because the values written into it are
    floats; writing them into an int column is an incompatible-dtype
    assignment that recent pandas versions reject or warn about.
    """
    df['코스_성적'] = 0.0

    # Pick the score of the course matching each row's '번호'.
    for i in range(1, 7):
        mask = df['번호'] == i
        df.loc[mask, '코스_성적'] = df.loc[mask, f'{i}코스_성적']

    drop_cols = [f'{i}코스_성적' for i in range(1, 7)]
    return df.drop(columns=drop_cols)


from sklearn.decomposition import PCA
def apply_pca(df_train, df_val, n_components=2):
    """Append PCA components of the six per-course score columns.

    The PCA is fitted on ``df_train`` only and then used to transform both
    frames. Component k (1-based) is stored as 'PCA_코스성적_{k}'. Both frames
    are modified in place and returned.

    Original author's note: n_components=2 looked best when inspecting the
    explained-variance ratios.
    """
    course_cols = [f'{i}코스_성적' for i in range(1, 7)]

    pca = PCA(n_components=n_components)
    pca.fit(df_train[course_cols])

    # Project both frames before writing any new column.
    projected = [
        (df_train, pca.transform(df_train[course_cols])),
        (df_val, pca.transform(df_val[course_cols])),
    ]
    for component in range(n_components):
        col_name = f'PCA_코스성적_{component+1}'
        for frame, scores in projected:
            frame[col_name] = scores[:, component]

    return df_train, df_val


# PCA must run first: set_course_scores drops the per-course columns it reads.
train, test = apply_pca(train, test, n_components=2)
train = set_course_scores(train)
test = set_course_scores(test)

- 최근 8경기 착순 분리

In [50]:
def split_last_eight_rank(df):
    """Split '최근8경주_착순' into eight single-character columns.

    Characters at positions 0-3 become '최근1경기_착순'..'최근4경기_착순' and
    characters at positions 5-8 become '최근5경기_착순'..'최근8경기_착순';
    position 4 is skipped. The source column is dropped. ``df`` is modified
    in place and also returned.
    """
    recent = df['최근8경주_착순']

    # race number -> character position in the source string
    char_position = {1: 0, 2: 1, 3: 2, 4: 3, 5: 5, 6: 6, 7: 7, 8: 8}
    for race_no, position in char_position.items():
        df[f'최근{race_no}경기_착순'] = recent.str[position]

    df.drop(columns='최근8경주_착순', inplace=True)
    return df

# def adjust_for_top3(df):
#     col_list = [
#         '최근1경기_착순', '최근2경기_착순', '최근3경기_착순', '최근4경기_착순',
#         '최근5경기_착순', '최근6경기_착순', '최근7경기_착순', '최근8경기_착순'
#     ]
# 
#     for col in col_list:
#     # 순위가 1, 2, 3이 아닌 경우, 결측인 경우, 6으로 조정
#     # (일반화된 성능을 위해 + 3등내에 드는게 중요)
#         df[col] = df[col].apply(lambda x: x if x in ['1', '2', '3'] else '-1')
# 
#     return df

def adjust_last_eight_rank(df_train, df_val):
    """Reduce the eight recent-rank columns to one mean feature.

    Each '최근{k}경기_착순' column has missing values set to 6, is cast to int
    and flipped to ``train_max - rank`` (maximum taken from ``df_train``).
    The mean of races 1-3 is stored as '최근3경기_평균' and the eight rank
    columns are dropped. Both frames are modified in place and returned.

    Original author's note: occasional 0 ranks are left untouched because
    that gave the best score.
    """
    rank_cols = [f'최근{k}경기_착순' for k in range(1, 9)]

    for col in rank_cols:
        df_train[col] = df_train[col].fillna(6).astype(int)
        df_val[col] = df_val[col].fillna(6).astype(int)

        top_rank = df_train[col].max()
        df_train[col] = top_rank - df_train[col]
        df_val[col] = top_rank - df_val[col]

    # Only the most recent races feed the mean; the rest are discarded.
    n_recent = 3
    recent_cols = rank_cols[:n_recent]
    mean_name = f'최근{n_recent}경기_평균'
    for frame in (df_train, df_val):
        frame[mean_name] = frame[recent_cols].mean(axis=1)

    for frame in (df_train, df_val):
        frame.drop(rank_cols, axis=1, inplace=True)

    return df_train, df_val


# Split the recent-rank string per race, then reduce it to the 3-race mean.
train = split_last_eight_rank(train)
# train = adjust_for_top3(train)

test = split_last_eight_rank(test)
# test = adjust_for_top3(test)

train, test = adjust_last_eight_rank(train, test)

print(train.shape, test.shape)

(58344, 33) (6528, 33)


In [91]:
def encode_categorical_features(df_train, df_val, target_value='복승'):
    """Encode '등급', '나이' and '경기종류' as numeric features.

    * '등급' is replaced by a target encoding of the string pair
      (번호, 등급); the encoder is fitted on ``df_train`` against
      ``target_value`` and applied to both frames.
    * '나이' bands are mapped to 25 / 35 / 45 / 55.
    * '경기종류' is replaced by two 0/1 columns, '플라잉' and '온라인', placed
      in front of the other columns.

    The '등급' and '나이' updates are written into the input frames; the
    returned frames are new objects.
    """
    # Temporary key combining the entry number and the grade.
    df_train['번호_등급'] = df_train['번호'].astype(str) + df_train['등급'].astype(str)
    df_val['번호_등급'] = df_val['번호'].astype(str) + df_val['등급'].astype(str)

    target_encoder = TargetEncoder(smoothing=2)
    target_encoder.fit(df_train['번호_등급'], df_train[target_value])

    df_train['등급'] = target_encoder.transform(df_train['번호_등급'])
    df_val['등급'] = target_encoder.transform(df_val['번호_등급'])

    for frame in (df_train, df_val):
        frame.drop('번호_등급', axis=1, inplace=True)

    # Age band -> a representative age for that band.
    map_age = {'20-30': 25, '30-40': 35, '40-50': 45, '50+': 55}
    df_train['나이'] = df_train['나이'].map(map_age)
    df_val['나이'] = df_val['나이'].map(map_age)

    def race_type_flags(frame):
        """Build the 0/1 indicator columns for the two known race types."""
        race_type = frame['경기종류']
        return pd.DataFrame(
            {
                '플라잉': (race_type == '플라잉').astype('int64'),
                '온라인': (race_type == '온라인').astype('int64'),
            },
            index=frame.index,
        )

    # Indicators go first, followed by every original column except '경기종류'.
    df_train = pd.concat([race_type_flags(df_train), df_train.drop('경기종류', axis=1)], axis=1)
    df_val = pd.concat([race_type_flags(df_val), df_val.drop('경기종류', axis=1)], axis=1)

    return df_train, df_val

# Uses the function's default target_value for the target encoding.
# NOTE(review): the table rendered below still shows columns that earlier cells drop,
# so it appears to come from a different execution order - re-run to confirm.
train, test = encode_categorical_features(train, test)
train

Unnamed: 0,플라잉,온라인,Race_ID,연도,회차,일차,경주번호,번호,등급,기수,...,전탑승선수2,보트번호,보트_평균착순점,보트_연대율,특이사항,단승,복승,삼복승,중량부과여부,부과된중량
0,1,0,2016_1_1_1,2016,1,1,1,1,0.755632,1,...,,27,0.00,0.0,,0.0,0.0,1.0,0,0.0
1,1,0,2016_1_1_1,2016,1,1,1,2,0.633299,13,...,,50,0.00,0.0,,1.0,1.0,1.0,0,0.0
2,1,0,2016_1_1_1,2016,1,1,1,3,0.219374,11,...,,71,0.00,0.0,주선보류 후 출전,0.0,0.0,0.0,0,0.0
3,1,0,2016_1_1_1,2016,1,1,1,4,0.451251,8,...,,53,0.00,0.0,,0.0,1.0,1.0,0,0.0
4,1,0,2016_1_1_1,2016,1,1,1,5,0.181429,9,...,,64,0.00,0.0,,0.0,0.0,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58339,1,0,2023_52_2_17,2023,52,2,17,2,0.373120,1,...,이경섭/662,35,5.02,29.4,,0.0,0.0,1.0,0,0.0
58340,1,0,2023_52_2_17,2023,52,2,17,3,0.388318,7,...,이진우/652,64,5.08,32.1,,0.0,0.0,0.0,0,0.0
58341,1,0,2023_52_2_17,2023,52,2,17,4,0.324412,12,...,조성인/411,14,4.66,26.1,,0.0,1.0,1.0,0,0.0
58342,1,0,2023_52_2_17,2023,52,2,17,5,0.142238,15,...,기광서/645,109,5.18,30.7,,0.0,0.0,0.0,0,0.0


## 범주형 변수 확인


In [92]:
def cal_cat_cols(train, val=None):
    """Cast columns in place and return the names of the categorical features.

    * Columns whose name contains '번호', '경기종류' or 'Race_ID' are cast to
      ``str`` and treated as categorical.
    * Every other column is cast to ``float`` (``train`` first, then ``val``);
      if a cast fails, or ``val`` lacks the column, it is treated as
      categorical and whatever was already converted stays converted.

    'rank', 'Race_ID' and any column whose name contains a target name
    ('단승', '복승', '삼복승') are excluded from the result.

    Parameters
    ----------
    train : DataFrame, modified in place.
    val : DataFrame or None, modified in place when given. ``None`` is now
        accepted; previously the default raised a TypeError.

    Returns
    -------
    list of str, in ``train`` column order (deterministic).
    """
    cat_keyword_list = ['번호', '경기종류', 'Race_ID']
    objective_cols = []

    for col in train.columns:
        if any(keyword in col for keyword in cat_keyword_list):
            objective_cols.append(col)
            train[col] = train[col].astype('str')
            if val is not None:
                val[col] = val[col].astype('str')
            continue

        try:
            train[col] = train[col].astype('float')
            if val is not None:
                val[col] = val[col].astype('float')
        except (ValueError, TypeError, KeyError):
            # Non-numeric content, or a train-only column (e.g. target columns
            # when the validation frame carries no labels).
            objective_cols.append(col)

    excluded_names = {'rank', 'Race_ID'}
    target_keywords = ('단승', '복승', '삼복승')
    return [
        col for col in objective_cols
        if col not in excluded_names
        and not any(keyword in col for keyword in target_keywords)
    ]


# Casts columns of train/test in place and lists the categorical feature names.
# NOTE(review): the list rendered below names columns that earlier cells drop, so the
# output looks stale - re-run to confirm.
cat_features = cal_cat_cols(train, test)
cat_features

['경주번호',
 '번호',
 '모터번호',
 '전일성적',
 '보트번호',
 '코스_6코스',
 '전탑승선수2',
 '코스_2코스',
 '코스_1코스',
 '코스_3코스',
 '선수명',
 'FL',
 '전탑승선수1',
 '코스_4코스',
 '최근8경주_착순',
 '특이사항',
 '코스_5코스']

## X, y 분리

In [93]:
# Identifier, target and race-level columns that must not appear among the per-entry features.
drop_cols = [
    'Race_ID', '번호', '단승', '복승', '삼복승', '선수명', '기수', '플라잉', '온라인'
]

# Race-level columns kept aside; they are re-attached once per race after reshaping.
train_type = train[['Race_ID', '플라잉', '온라인']]  # original author's open question: include '일차' too?
X_train = train.drop(drop_cols, axis=1)
y_train = train[['단승']]

test_type = test[['Race_ID', '플라잉', '온라인']]
X_test = test.drop(drop_cols, axis=1)
y_test = test[['단승']]

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(58344, 39) (58344, 1) (6528, 39) (6528, 1)


In [94]:
def reshape_race_data(df, players_per_race=6):
    """Fold every ``players_per_race`` consecutive rows into one wide row.

    Row order is preserved, so rows belonging to one race must be adjacent
    and in entry order. Output columns are named '{col}_{k}번선수' with k
    running from 1 to ``players_per_race``; within each k the original column
    order is kept. numpy raises ValueError when the number of rows is not a
    multiple of ``players_per_race``.
    """
    n_features = df.shape[1]
    wide_values = df.values.reshape(-1, players_per_race * n_features)

    wide_names = []
    for slot in range(1, players_per_race + 1):
        wide_names.extend(f'{col}_{slot}번선수' for col in df.columns)

    return pd.DataFrame(wide_values, columns=wide_names)

# Fold the per-entry rows into one row per race (6 entries per race).
X_train = reshape_race_data(X_train, players_per_race=6)
y_train = reshape_race_data(y_train, players_per_race=6)
y_train = np.array(y_train).astype(int)

X_test = reshape_race_data(X_test, players_per_race=6)
y_test = reshape_race_data(y_test, players_per_race=6)
y_test = np.array(y_test).astype(int)

# One race-level row per distinct (Race_ID, 플라잉, 온라인) combination.
# NOTE(review): the concat below aligns by position, so this relies on the races
# appearing in the same order as the reshaped rows.
train_type.drop_duplicates(inplace=True)
test_type.drop_duplicates(inplace=True)

train_type = train_type.reset_index(drop=True)
test_type = test_type.reset_index(drop=True)

X_train = pd.concat([train_type, X_train], axis=1)
X_test = pd.concat([test_type, X_test], axis=1)

# Race_ID was only needed for de-duplication.
X_train.drop('Race_ID', axis=1, inplace=True)
X_test.drop('Race_ID', axis=1, inplace=True)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(9724, 236) (9724, 6) (1088, 236) (1088, 6)


In [95]:
# Display the race-level feature matrix.
X_train

Unnamed: 0,플라잉,온라인,연도_1번선수,회차_1번선수,일차_1번선수,경주번호_1번선수,등급_1번선수,성별_1번선수,나이_1번선수,최근6회차_평균착순점_1번선수,...,모터_연대율2_6번선수,모터_연대율3_6번선수,전탑승선수1_6번선수,전탑승선수2_6번선수,보트번호_6번선수,보트_평균착순점_6번선수,보트_연대율_6번선수,특이사항_6번선수,중량부과여부_6번선수,부과된중량_6번선수
0,1.0,0.0,2016.0,1.0,1.0,1,0.755632,0.0,45.0,6.67,...,0.0,0.0,,,61,0.0,0.0,,1.0,3.0
1,1.0,0.0,2016.0,1.0,1.0,2,0.755632,0.0,35.0,6.44,...,0.0,0.0,,,2,0.0,0.0,,0.0,0.0
2,1.0,0.0,2016.0,1.0,1.0,3,0.616358,0.0,45.0,6.33,...,0.0,0.0,,,21,0.0,0.0,,0.0,0.0
3,1.0,0.0,2016.0,1.0,1.0,4,0.526792,0.0,45.0,2.8,...,0.0,0.0,,,8,0.0,0.0,,0.0,0.0
4,1.0,0.0,2016.0,1.0,1.0,5,0.755632,0.0,35.0,7.89,...,0.0,0.0,,,70,0.0,0.0,,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9719,1.0,0.0,2023.0,52.0,2.0,13,0.526792,0.0,35.0,5.81,...,26.9,41.9,김계영/333,최재원/342,75,5.8,40.7,,0.0,0.0
9720,1.0,0.0,2023.0,52.0,2.0,14,0.755632,0.0,55.0,6.47,...,29.3,50.3,김지영/343,이지은/55,93,5.45,37.9,,1.0,2.0
9721,1.0,0.0,2023.0,52.0,2.0,15,0.526792,0.0,35.0,4.94,...,28.9,46.0,김동경/551,류해광/24,81,4.59,27.6,,0.0,0.0
9722,1.0,0.0,2023.0,52.0,2.0,16,0.526792,0.0,45.0,3.63,...,30.5,45.6,정훈민/345,안지민/2242,55,5.47,37.7,,0.0,0.0


In [96]:
def low_to_others(train, val, threshold=5, is_train=True, verbose=False):
    """Pass every categorical column through ``low_frequency_to_others``.

    Categorical columns are discovered with ``cal_cat_cols`` (which also casts
    columns of both frames in place). For each one a fresh
    ``low_frequency_to_others(threshold=..., verbose=...)`` is fitted on the
    ``train`` column and applied to the ``val`` column. When ``is_train`` is
    False the six 'Race_ID_{k}번선수' names are removed from the list first.

    Returns the two (mutated) frames.
    """
    categorical = cal_cat_cols(train, val)
    if is_train == False:  # noqa: E712 - kept as an equality test on purpose
        race_id_cols = {f'Race_ID_{slot}번선수' for slot in range(1, 7)}
        categorical = list(set(categorical) - race_id_cols)

    for name in categorical:
        unifier = low_frequency_to_others(threshold=threshold, verbose=verbose)
        train[name] = unifier.fit_transform(train[name])
        val[name] = unifier.transform(val[name])

    return train, val

# Apply low_frequency_to_others (threshold=5) to each categorical column, fitted on the training data.
X_train, X_test = low_to_others(X_train, X_test, threshold=5, verbose=True)

Columns:(FL_6번선수) 변환 X
Columns:(FL_6번선수) 변환 X
Columns:(최근8경주_착순_3번선수) 'others'로 9701개 변환
Columns:(최근8경주_착순_3번선수) 'others'로 1088개 변환
Columns:(전탑승선수2_3번선수) 'others'로 9605개 변환
Columns:(전탑승선수2_3번선수) 'others'로 1085개 변환
Columns:(코스_2코스_4번선수) 'others'로 9개 변환
Columns:(코스_2코스_4번선수) 'others'로 2개 변환
Columns:(코스_5코스_5번선수) 'others'로 27개 변환
Columns:(코스_5코스_5번선수) 'others'로 13개 변환
Columns:(코스_1코스_3번선수) 'others'로 44개 변환
Columns:(코스_1코스_3번선수) 'others'로 4개 변환
Columns:(보트번호_1번선수) 변환 X
Columns:(보트번호_1번선수) 변환 X
Columns:(최근8경주_착순_2번선수) 'others'로 9696개 변환
Columns:(최근8경주_착순_2번선수) 'others'로 1088개 변환
Columns:(FL_5번선수) 'others'로 4개 변환
Columns:(FL_5번선수) 변환 X
Columns:(코스_3코스_2번선수) 'others'로 32개 변환
Columns:(코스_3코스_2번선수) 'others'로 17개 변환
Columns:(코스_2코스_5번선수) 'others'로 29개 변환
Columns:(코스_2코스_5번선수) 'others'로 12개 변환
Columns:(코스_3코스_4번선수) 'others'로 25개 변환
Columns:(코스_3코스_4번선수) 'others'로 30개 변환
Columns:(코스_4코스_1번선수) 'others'로 22개 변환
Columns:(코스_4코스_1번선수) 'others'로 13개 변환
Columns:(보트번호_4번선수) 변환 X
Columns:(보트번호_4번선수) 변환 X


In [132]:
def all_process(train, val, target='단승', is_train=True, is_dddd=False):
    """Run the whole preprocessing pipeline and return race-level matrices.

    Parameters
    ----------
    train, val : raw per-entry DataFrames. They are modified in place (the
        '성별' mapping below is written onto the objects passed in), so pass
        copies when the originals are needed afterwards. '성별' must still
        hold the raw '남' / '여' values.
    target : name of the target column used for ``y_train`` / ``y_val``.
    is_train : when True, ``val`` carries a 'rank' column; targets are derived
        for it and ``y_val`` is returned. When False, ``val`` is treated as
        unlabeled.
    is_dddd : when True (and ``is_train``), ``val`` targets are built with
        ``add_y_`` (rank values) instead of ``add_y`` (0/1 flags).

    Returns
    -------
    ``(X_train, y_train, X_val, y_val)`` if ``is_train`` else
    ``(X_train, y_train, X_val)``; the y values are the DataFrames produced
    by ``reshape_race_data``.
    """
    train['성별'] = train['성별'].map({'남': 0, '여': 1})
    val['성별'] = val['성별'].map({'남': 0, '여': 1}) # sex mapping: 남 -> 0, 여 -> 1

    train = add_y(train)
    if is_train:
        if is_dddd:
            val = add_y_(val)
        else:
            val = add_y(val)

    train = add_weight_penalty(train) # adds 중량부과여부 / 부과된중량 and drops 체중
    val = add_weight_penalty(val) # original author's note: remove this step to improve 연승 accuracy

    train = bin_age(train) # age binning
    val = bin_age(val)

    train = drop_columns_from_datasets(train)
    val = drop_columns_from_datasets(val)

    train, val = last_race_process(train, val) # mean of the previous day's results

    train, val = reverse_rank_values(train, val) # reverse the rank-like score columns

    train = separation_course(train)
    val = separation_course(val)

    train, val = laplace_smoothing_to_course(train, val, alpha=1)

    train, val = apply_pca(train, val, n_components=2) # PCA of the per-course scores, then keep only the matching course
    train = set_course_scores(train)
    val = set_course_scores(val) 

    train = split_last_eight_rank(train)
    # train = adjust_for_top3(train)

    val = split_last_eight_rank(val)
    # val = adjust_for_top3(val)

    train, val = adjust_last_eight_rank(train, val) # mean of 3 of the last 8 races; the rest are dropped

    # NOTE(review): called without target_value, so the encoder uses that function's
    # default target rather than `target` - confirm this is intended.
    train, val = encode_categorical_features(train, val)

    if is_train:
        drop_cols = [
            'Race_ID', '번호', '단승', '복승', '삼복승', '선수명', '기수', '플라잉', '온라인'
        ]
    else:
        # Target columns are not in this list, so they stay in X_train on this path.
        drop_cols = [
            'Race_ID', '번호', '선수명', '기수', '플라잉', '온라인'
        ]

    train_type = train[['Race_ID', '플라잉', '온라인']]  # original author's open question: include '일차' too?
    val_type = val[['Race_ID', '플라잉', '온라인']]

    X_train = train.drop(drop_cols, axis=1)
    y_train = train[[target]]
    X_train = reshape_race_data(X_train, players_per_race=6)
    y_train = reshape_race_data(y_train, players_per_race=6)
    # y_train = np.array(y_train).astype(int)

    X_val = val.drop(drop_cols, axis=1)
    X_val = reshape_race_data(X_val, players_per_race=6)
    if is_train:
        y_val = val[[target]]
        y_val = reshape_race_data(y_val, players_per_race=6)
        # y_val = np.array(y_val).astype(int)

    # One race-level row per distinct (Race_ID, 플라잉, 온라인); joined by position below.
    train_type.drop_duplicates(inplace=True)
    val_type.drop_duplicates(inplace=True)

    train_type = train_type.reset_index(drop=True)
    val_type = val_type.reset_index(drop=True)

    X_train = pd.concat([train_type, X_train], axis=1)
    X_val = pd.concat([val_type, X_val], axis=1)

    X_train.drop('Race_ID', axis=1, inplace=True)
    X_val.drop('Race_ID', axis=1, inplace=True)

    X_train, X_val = low_to_others(X_train, X_val, threshold=5, is_train=is_train, verbose=False)

    if is_train:
        return X_train, y_train, X_val, y_val
    else:
        return X_train, y_train, X_val
    
    
def cal_y_val_for_연승(train, test):
    """Return the race-level '복승' targets of ``test``.

    Runs ``all_process`` on copies of both frames, so the caller's DataFrames
    are left untouched, and keeps only the fourth result (``y_val``).
    """
    processed = all_process(train.copy(), test.copy(), target='복승', is_train=True)
    return processed[3]
    
    
# Reload the raw CSVs: all_process expects unprocessed input and mutates what it is given.
train = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
test = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

X_train, y_train, X_test, y_test = all_process(train, test, target='복승', is_train=True)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(9724, 152) (9724, 6) (1088, 152) (1088, 6)


In [102]:
# Display the feature matrix returned by all_process.
X_train

Unnamed: 0,플라잉,온라인,등급_1번선수,성별_1번선수,나이_1번선수,최근6회차_평균착순점_1번선수,최근6회차_평균득점_1번선수,최근6회차_승률_1번선수,최근6회차_연대율2_1번선수,최근6회차_연대율3_1번선수,...,모터_연대율2_6번선수,모터_연대율3_6번선수,보트_평균착순점_6번선수,보트_연대율_6번선수,중량부과여부_6번선수,부과된중량_6번선수,PCA_코스성적_1_6번선수,PCA_코스성적_2_6번선수,코스_성적_6번선수,최근3경기_평균_6번선수
0,1.0,0.0,0.755632,0.0,45.0,3.33,3.40,40.0,40.0,60.0,...,0.0,0.0,10.00,0.0,1.0,3.0,-15.966908,-36.511023,12.682293,3.000000
1,1.0,0.0,0.755632,0.0,35.0,3.56,3.72,12.5,43.8,75.0,...,0.0,0.0,10.00,0.0,0.0,0.0,-11.700030,8.850585,1.560071,1.333333
2,1.0,0.0,0.616358,0.0,45.0,3.67,3.67,20.0,53.3,66.7,...,0.0,0.0,10.00,0.0,0.0,0.0,14.724391,-17.554835,30.577234,2.000000
3,1.0,0.0,0.526792,0.0,45.0,7.20,7.20,6.7,6.7,13.3,...,0.0,0.0,10.00,0.0,0.0,0.0,-28.204553,-8.589447,2.340106,1.333333
4,1.0,0.0,0.755632,0.0,35.0,2.11,2.17,50.0,66.7,83.3,...,0.0,0.0,10.00,0.0,0.0,0.0,-31.171677,12.920544,22.788127,1.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9719,1.0,0.0,0.526792,0.0,35.0,4.19,4.25,31.3,50.0,56.3,...,26.9,41.9,4.20,40.7,0.0,0.0,29.333144,20.813775,14.280079,2.000000
9720,1.0,0.0,0.755632,0.0,55.0,3.53,3.71,29.4,52.9,70.6,...,29.3,50.3,4.55,37.9,1.0,2.0,-34.080408,12.627854,2.005805,3.333333
9721,1.0,0.0,0.526792,0.0,35.0,5.06,5.28,16.7,22.2,44.4,...,28.9,46.0,5.41,27.6,0.0,0.0,-25.679528,-15.868776,1.755079,2.333333
9722,1.0,0.0,0.526792,0.0,45.0,6.37,6.75,6.3,12.5,31.3,...,30.5,45.6,4.53,37.7,0.0,0.0,15.242146,-26.309298,12.682293,3.666667


In [103]:
# from ctgan import CTGAN

# ctgan_list = []
# for i in range(6):
#     y_i = y_train.iloc[:, i]  # i번째 레이블에 대한 y 값

#     Xy_train = pd.concat([X_train, y_i], axis=1)
#     ctgan = CTGAN(batch_size=600, discriminator_steps=1, verbose=True, epochs=100, pac=1, cuda=True)

#     cat_features = Xy_train.columns[Xy_train.columns.str.contains('성별|등급|나이|복승')].tolist()
#     # cat_features = Xy_train.columns[Xy_train.columns.str.contains('복승')].tolist()
#     print(cat_features)

#     ctgan.fit(Xy_train, discrete_columns=cat_features)
#     ctgan_list.append(ctgan)
    
# print(len(ctgan_list))

In [104]:
# new_data = ctgan_list[5].sample(3000)


In [172]:
# Target column and random seed used by the cross-validation and prediction cells below.
target_value = '삼복승'
seed = 42

## CatBoost

In [173]:
from catboost_function import cal_params
from catboost_function import custom_CatBoostClassifier
from catboost_function import evaluate_

train = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))

params_list = cal_params(target_value=target_value, seed=seed)

# Folds are built over race ids so all entries of a race stay in the same fold.
unique_race_ids = train['Race_ID'].unique()
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

# Not used in this cell; kept because later code may refer to it.
smote_strategies = []
for i in range(6):
    smote_strategies.append(1-0.15*i)

models_cb = []
scores = []
for i, (train_race_ids, val_race_ids) in enumerate(kf.split(unique_race_ids)):
    train_ids = unique_race_ids[train_race_ids]
    val_ids = unique_race_ids[val_race_ids]

    train_fold = train[train['Race_ID'].isin(train_ids)].reset_index(drop=True)
    val_fold = train[train['Race_ID'].isin(val_ids)].reset_index(drop=True)

    X_train, y_train, X_val, y_val = all_process(train_fold, val_fold, target=target_value, is_train=True)

    model = custom_CatBoostClassifier(params_list)
    # Inspect this fold's own validation frame; the earlier code passed the
    # X_test left over from a previous cell, which depended on hidden state.
    cat_features = cal_cat_cols(X_train, X_val)
    if i ==0:
        print('범주형 변수: ')
        print(cat_features)

    model.fit(X_train, y_train)

    models_cb.append(model)
    y_pred = models_cb[i].predict_proba(X_val)
    accuracy = evaluate_(y_pred, y_val, target_value)
    scores.append(accuracy)
    print(f'folds {i+1}')
    print(f'Accuracy: {accuracy}')


print(f'최종 스코어: Avg. Accuracy of validset: {np.mean(scores)}, Std. Accuracy of validset: {np.std(scores)}')

범주형 변수: 
[]
folds 1
Accuracy: 0.21131105398457584
folds 2
Accuracy: 0.21388174807197943
folds 3
Accuracy: 0.19794344473007713
folds 4
Accuracy: 0.21388174807197943
folds 5
Accuracy: 0.20473251028806586
최종 스코어: Avg. Accuracy of validset: 0.20835010102933554, Std. Accuracy of validset: 0.006186290302178567


In [174]:
def cal_mean_feature_importance(models):
    """Average the 'Importance' of each feature 'Group' over several models.

    Each model must provide ``print_feature_importance()`` returning a
    DataFrame with 'Group' and 'Importance' columns. Rows are matched on
    'Group', so the result does not depend on the row order or index of the
    individual frames (summing frames by index mixes up features when the
    models rank them differently).

    Returns a DataFrame with columns ['Group', 'Importance'], sorted by
    descending importance with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    if len(models) == 0:
        raise ValueError("cal_mean_feature_importance needs at least one model")

    per_model = [
        model.print_feature_importance()[['Group', 'Importance']]
        for model in models
    ]
    # NOTE(review): assumes each Group appears once per model; duplicates
    # would be averaged together.
    importance_avg = (
        pd.concat(per_model, ignore_index=True)
        .groupby('Group', as_index=False, sort=False)['Importance']
        .mean()
    )

    importance_avg = importance_avg.sort_values(by='Importance', ascending=False).reset_index(drop=True)
    return importance_avg
    
# Mean feature importance over the five CatBoost fold models.
importance_avg = cal_mean_feature_importance(models_cb)
importance_avg

Unnamed: 0,Group,Importance
0,모터_연대율2,1.31658
1,모터_평균착순점,1.266761
2,모터_연대율3,1.170961
3,전일성적,1.159058
4,최근6회차_평균착순점,0.950716
5,등급,0.918167
6,PCA_코스성적_1,0.869951
7,최근6회차_평균득점,0.798888
8,연간성적_평균착순점,0.791048
9,최근6회차_연대율3,0.758192


- Test 데이터

### RF

In [175]:
from rf_function import custom_randomforest, cal_rf_params

train = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))

# Same race-level split (same seed and fold count) as the CatBoost cell.
unique_race_ids = train['Race_ID'].unique()
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

models_rf = []
scores = []
for i, (train_race_ids, val_race_ids) in enumerate(kf.split(unique_race_ids)):
    train_ids = unique_race_ids[train_race_ids]
    val_ids = unique_race_ids[val_race_ids]

    train_fold = train[train['Race_ID'].isin(train_ids)].reset_index(drop=True)
    val_fold = train[train['Race_ID'].isin(val_ids)].reset_index(drop=True)

    X_train, y_train, X_val, y_val = all_process(train_fold, val_fold, target=target_value, is_train=True)

    params_list = cal_rf_params()
    model = custom_randomforest(params_list)
    model.fit(X_train, y_train)

    models_rf.append(model)

    y_pred = models_rf[i].predict_proba(X_val)

    # evaluate_ is imported in the CatBoost cell, so that cell must run first.
    accuracy = evaluate_(y_pred, y_val, target_value)
    scores.append(accuracy)
    print(f'folds {i+1}')
    print(f'Accuracy: {accuracy}')


print(f'최종 스코어: Avg. Accuracy of validset: {np.mean(scores)}, Std. Accuracy of validset: {np.std(scores)}')

folds 1
Accuracy: 0.19588688946015423
folds 2
Accuracy: 0.2
folds 3
Accuracy: 0.18868894601542416
folds 4
Accuracy: 0.2051413881748072
folds 5
Accuracy: 0.20267489711934156
최종 스코어: Avg. Accuracy of validset: 0.19847842415394545, Std. Accuracy of validset: 0.005777033879796884


In [176]:
# Mean feature importance over the five random-forest fold models.
importance_avg = cal_mean_feature_importance(models_rf)
importance_avg

Unnamed: 0,Group,Importance
0,PCA_코스성적_2,0.01155
1,PCA_코스성적_1,0.0112
2,모터_평균착순점,0.009839
3,모터_연대율2,0.009684
4,모터_연대율3,0.009362
5,최근6회차_평균착순점,0.009151
6,최근6회차_평균득점,0.009107
7,보트_평균착순점,0.008435
8,연간성적_평균착순점,0.008279
9,보트_연대율,0.008203


In [177]:
# Reload the raw data and build the race-level '복승' targets of the test set;
# they are passed to evaluate_ as y_val_연승 in the cells below.
train = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
test = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

y_test_for_연승 = cal_y_val_for_연승(train, test)
y_test_for_연승 = np.array(y_test_for_연승)

In [178]:
# One row per test race, in order of first appearance; used to label the predictions later.
race_id = test[['Race_ID']].copy()
race_id_unique = race_id.drop_duplicates().reset_index(drop=True)

In [182]:
# Re-create the CV split (same seed and fold count) so that models_cb[i] is paired
# with the training rows of fold i when the test features are rebuilt.
unique_race_ids = train['Race_ID'].unique()
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

all_predictions_cb = []
for i, (train_race_ids, val_race_ids) in enumerate(kf.split(unique_race_ids)):
    train_ids = unique_race_ids[train_race_ids]

    train_fold = train[train['Race_ID'].isin(train_ids)].reset_index(drop=True)
    test_fold = test.copy()

    # Preprocessing statistics come from this fold's training rows; no model is refitted here.
    X_train, y_train, X_test, y_test = all_process(train_fold, test_fold, target=target_value, is_train=True)

    y_pred_cb = models_cb[i].predict_proba(X_test)
    all_predictions_cb.append(y_pred_cb)

# Average the five fold models' predictions.
mean_pred_cb = np.mean(all_predictions_cb, axis=0)
# mean_pred_cb = mean_pred_cb / mean_pred_cb.sum(axis=1, keepdims=True)

y_test = np.array(y_test)
accuracy, 연승_score = evaluate_(mean_pred_cb, y_test, target_value, contain_연승=True, y_val_연승=y_test_for_연승)

print(f'최종 스코어: Accuracy of testset: {accuracy}, 연승 적중률: Accuracy of testset: {연승_score}')

최종 스코어: Accuracy of testset: 0.19393382352941177, 연승 적중률: Accuracy of testset: 0.6764705882352942


In [183]:
# Same procedure as the CatBoost test cell, using the random-forest fold models.
unique_race_ids = train['Race_ID'].unique()
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

all_predictions_rf = []
for i, (train_race_ids, val_race_ids) in enumerate(kf.split(unique_race_ids)):
    train_ids = unique_race_ids[train_race_ids]

    train_fold = train[train['Race_ID'].isin(train_ids)].reset_index(drop=True)
    test_fold = test.copy()

    # Preprocessing statistics come from this fold's training rows; no model is refitted here.
    X_train, y_train, X_test, y_test = all_process(train_fold, test_fold, target=target_value, is_train=True)

    y_pred_rf = models_rf[i].predict_proba(X_test)
    all_predictions_rf.append(y_pred_rf)

# Average the five fold models' predictions.
mean_pred_rf = np.mean(all_predictions_rf, axis=0)
# mean_pred_rf = mean_pred_rf / mean_pred_rf.sum(axis=1, keepdims=True)

y_test = np.array(y_test)
accuracy, 연승_score = evaluate_(mean_pred_rf, y_test, target_value, contain_연승=True, y_val_연승=y_test_for_연승)

print(f'최종 스코어: Accuracy of testset: {accuracy}, 연승 적중률: Accuracy of testset: {연승_score}')

최종 스코어: Accuracy of testset: 0.17555147058823528, 연승 적중률: Accuracy of testset: 0.6507352941176471


In [184]:
# Equal-weight ensemble of the CatBoost and random-forest predictions.
mean_pred = (mean_pred_cb + mean_pred_rf) / 2

y_test = np.array(y_test)
accuracy, 연승_score = evaluate_(mean_pred, y_test, target_value, contain_연승=True, y_val_연승=y_test_for_연승)

print(f'최종 스코어: Accuracy of testset: {accuracy}, 연승 적중률: Accuracy of testset: {연승_score}')

최종 스코어: Accuracy of testset: 0.1957720588235294, 연승 적중률: Accuracy of testset: 0.6663602941176471


In [135]:
# Ensemble predictions with one column per entry number.
pd.DataFrame(mean_pred, columns=['1번', '2번', '3번', '4번', '5번', '6번'])

Unnamed: 0,1번,2번,3번,4번,5번,6번
0,0.330745,0.204855,0.179442,0.127167,0.065141,0.092650
1,0.243238,0.199895,0.315870,0.072328,0.156935,0.011735
2,0.329255,0.270028,0.206466,0.119977,0.038387,0.035886
3,0.429965,0.308791,0.109290,0.056959,0.085995,0.008999
4,0.319250,0.211408,0.243234,0.125818,0.061081,0.039209
...,...,...,...,...,...,...
1027,0.395945,0.089820,0.095773,0.171510,0.246297,0.000656
1028,0.427923,0.310202,0.044439,0.109751,0.087406,0.020281
1029,0.511534,0.331300,0.035404,0.090000,0.030955,0.000807
1030,0.434724,0.248380,0.141987,0.131950,0.041165,0.001794


In [136]:
# Attach the Race_ID of each test race to its predictions (joined by position).
# NOTE(review): the variable name says 복승, but mean_pred holds whichever target the
# models were last trained on - confirm before relying on the name.
mean_pred_복승 = pd.concat([race_id_unique, pd.DataFrame(mean_pred, columns=['1번', '2번', '3번', '4번', '5번', '6번'])], axis=1)
mean_pred_복승

Unnamed: 0,Race_ID,1번,2번,3번,4번,5번,6번
0,2024_1_1_1,0.330745,0.204855,0.179442,0.127167,0.065141,0.092650
1,2024_1_1_2,0.243238,0.199895,0.315870,0.072328,0.156935,0.011735
2,2024_1_1_3,0.329255,0.270028,0.206466,0.119977,0.038387,0.035886
3,2024_1_1_4,0.429965,0.308791,0.109290,0.056959,0.085995,0.008999
4,2024_1_1_5,0.319250,0.211408,0.243234,0.125818,0.061081,0.039209
...,...,...,...,...,...,...,...
1027,2024_36_2_12,0.395945,0.089820,0.095773,0.171510,0.246297,0.000656
1028,2024_36_2_13,0.427923,0.310202,0.044439,0.109751,0.087406,0.020281
1029,2024_36_2_14,0.511534,0.331300,0.035404,0.090000,0.030955,0.000807
1030,2024_36_2_15,0.434724,0.248380,0.141987,0.131950,0.041165,0.001794


In [137]:
# NOTE(review): the file name says 단승 while the variable says 복승 - confirm which target this run produced.
mean_pred_복승.to_csv(f'./단승_확률값.csv', index=False, encoding='utf-8-sig')

In [148]:
# Reload the raw data; with is_dddd=True the test targets hold finishing ranks (1-3, else 0) instead of 0/1 flags.
train = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
test = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

X_train, y_train, X_test, y_test = all_process(train, test, target='삼복승', is_train=True, is_dddd=True)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(9724, 152) (9724, 6) (1088, 152) (1088, 6)


In [82]:
# Display the rank-valued test targets.
y_test

Unnamed: 0,삼복승_1번선수,삼복승_2번선수,삼복승_3번선수,삼복승_4번선수,삼복승_5번선수,삼복승_6번선수
0,0.0,3.0,2.0,0.0,1.0,0.0
1,2.0,0.0,1.0,0.0,0.0,3.0
2,0.0,2.0,3.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,2.0,3.0
4,1.0,0.0,0.0,2.0,0.0,3.0
...,...,...,...,...,...,...
1027,2.0,3.0,0.0,0.0,1.0,0.0
1028,0.0,1.0,2.0,3.0,0.0,0.0
1029,1.0,2.0,0.0,0.0,3.0,0.0
1030,1.0,2.0,0.0,0.0,3.0,0.0


In [83]:
# Save the rank-valued test targets.
y_test.to_csv(f'./삼복승_순위값.csv', index=False, encoding='utf-8-sig')

- Base_line_단승: Accuracy of testset: 0.4532940019665683

## 복승
- 최종 스코어: Accuracy of testset: 0.2693798449612403, 연승 적중률: Accuracy of testset: 0.6821705426356589 (범주형 변수 변환(등급, 나이))
- 최종 스코어: Accuracy of testset: 0.2751937984496124, 연승 적중률: Accuracy of testset: 0.6724806201550387 (범주형 변수 변환(등급, 나이)) 파라미터 optuna로 조정

- RF
- 최종 스코어: Accuracy of testset: 0.2761627906976744, 연승 적중률: Accuracy of testset: 0.6560077519379846

- 앙상블
- 최종 스코어: Accuracy of testset: 0.2800387596899225, 연승 적중률: Accuracy of testset: 0.6666666666666666


### 삼복승
- Base_line_삼복승: Accuracy of testset: 0.18682399213372664

# 실제 경기 예측

In [185]:
from crawlling_entry import crawl_race_entries

# Race meeting to predict: the 연도 / 회차 / 일차 values passed to crawl_race_entries.
연도 = 2024
회차 = 37
일차 = 2
sub = crawl_race_entries(연도, 회차, 일차)

# Best-effort numeric cast: columns that cannot be converted to float are left
# unchanged. Only conversion failures are caught, so KeyboardInterrupt and
# unrelated errors still surface instead of being swallowed by a bare `except`.
for col in sub.columns:
    try:
        sub[col] = sub[col].astype(float)
    except (ValueError, TypeError):
        continue

# Build Race_ID as "<연도>_<회차>_<일차>_<경주번호>"; int() removes the ".0" left by the float cast.
sub['Race_ID'] = sub.apply(lambda row: f"{int(row['연도'])}_{int(row['회차'])}_{int(row['일차'])}_{int(row['경주번호'])}", axis=1)
sub

Unnamed: 0,연도,회차,일차,경주번호,경기종류,번호,등급,기수,선수명,성별,...,모터_평균착순점,모터_연대율2,모터_연대율3,전탑승선수1,전탑승선수2,보트번호,보트_평균착순점,보트_연대율,특이사항,Race_ID
0,2024.0,37.0,2.0,1.0,플라잉,1.0,B2,17.0,박지윤,여,...,5.00,38.2,50.0,권명호/632,이용세/236,35.0,4.31,21.9,,2024_37_2_1
1,2024.0,37.0,2.0,1.0,플라잉,2.0,B2,17.0,김태훈,남,...,4.77,17.9,51.3,김동민/534,김기한/352,43.0,7.31,69.2,,2024_37_2_1
2,2024.0,37.0,2.0,1.0,플라잉,3.0,B2,17.0,김미연,여,...,5.51,40.0,54.3,최인원/15,우진수/646,45.0,4.63,20.0,,2024_37_2_1
3,2024.0,37.0,2.0,1.0,플라잉,4.0,B2,17.0,이현준,남,...,6.68,48.6,70.3,김종민/141,박원규/11,32.0,5.33,29.2,,2024_37_2_1
4,2024.0,37.0,2.0,1.0,플라잉,5.0,B2,17.0,조미화,여,...,5.38,40.0,50.0,서 휘/1223,배혜민/566,102.0,5.09,28.6,,2024_37_2_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,2024.0,37.0,2.0,17.0,온라인,2.0,B1,1.0,오세준,남,...,5.68,37.8,56.8,염윤정/44,이재학/114,67.0,5.77,46.2,,2024_37_2_17
98,2024.0,37.0,2.0,17.0,온라인,3.0,B1,4.0,구현구,남,...,4.26,23.5,35.3,김재윤/666,문주엽/3555,3.0,6.57,52.2,,2024_37_2_17
99,2024.0,37.0,2.0,17.0,온라인,4.0,A2,11.0,전정환,남,...,4.74,32.3,45.2,이경섭/53,조규태/561,87.0,6.96,56.5,F 후 출전,2024_37_2_17
100,2024.0,37.0,2.0,17.0,온라인,5.0,A1,1.0,서화모,남,...,5.08,35.1,48.6,서종원/565,박준호/465,29.0,6.67,45.8,,2024_37_2_17


In [186]:
# Reload the training data; all_process is called once per fold with that fold's training rows.
train = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))

# Split on unique Race_ID values so the folds are formed at race level, not row level.
# NOTE(review): presumably `seed` must match the value used when models_cb / models_rf
# were fitted so that fold i lines up with model i — confirm against the training cell.
unique_race_ids = train['Race_ID'].unique()
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

all_predictions_cb, all_predictions_rf = [], []
for i, (train_race_ids, val_race_ids) in enumerate(kf.split(unique_race_ids)):
    # Only the training side of the split is used; a copy of `sub` takes the test role.
    train_ids = unique_race_ids[train_race_ids]
    train_fold = train.loc[train['Race_ID'].isin(train_ids)].reset_index(drop=True)
    test_fold = sub.copy()

    X_train, y_train, X_test = all_process(train_fold, test_fold, target=target_value, is_train=False)

    # Score the same processed features with the i-th model of each set.
    y_pred_cb = models_cb[i].predict_proba(X_test)
    y_pred_rf = models_rf[i].predict_proba(X_test)
    all_predictions_cb.append(y_pred_cb)
    all_predictions_rf.append(y_pred_rf)

# Average across folds within each model set, then blend the two sets with equal weight.
# Row normalisation is applied once, on the final frame below.
mean_pred_cb = np.asarray(all_predictions_cb).mean(axis=0)
mean_pred_rf = np.asarray(all_predictions_rf).mean(axis=0)
mean_pred = (mean_pred_cb + mean_pred_rf) / 2

# Label rows "<k>경기" and columns "<k>번" by position, then rescale each row to sum to 1.
# NOTE(review): row labels are purely positional; if all_process drops or reorders
# races, "<k>경기" will not match the actual 경주번호 — confirm.
mean_pred_df = pd.DataFrame(mean_pred)
mean_pred_df.index = [f"{row_no}경기" for row_no in range(1, len(mean_pred_df) + 1)]
mean_pred_df.columns = [f"{lane_no}번" for lane_no in range(1, mean_pred_df.shape[1] + 1)]
mean_pred_df = mean_pred_df.div(mean_pred_df.sum(axis=1), axis='index')
mean_pred_df

Unnamed: 0,1번,2번,3번,4번,5번,6번
1경기,0.199887,0.171474,0.16297,0.198269,0.144743,0.122657
2경기,0.250998,0.208412,0.183113,0.123495,0.115578,0.118404
3경기,0.204015,0.19207,0.105075,0.161724,0.179477,0.157639
4경기,0.225451,0.219877,0.232464,0.108399,0.174131,0.039678
5경기,0.265736,0.164338,0.145411,0.110745,0.17567,0.138099
6경기,0.199789,0.186192,0.118456,0.142588,0.198839,0.154136
7경기,0.156265,0.198256,0.172374,0.234315,0.132305,0.106485
8경기,0.225353,0.194138,0.151535,0.115367,0.216255,0.097352
9경기,0.218999,0.200532,0.196015,0.194052,0.11771,0.072692
10경기,0.201496,0.243983,0.142349,0.156435,0.123862,0.131876


In [187]:
# Save mean_pred_df to Excel, keeping the row index and writing floats with three
# decimals. The output directory is created first so the export does not fail when
# ./result does not exist yet (e.g. on a fresh checkout).
os.makedirs('./result', exist_ok=True)
mean_pred_df.to_excel(f'./result/{연도}_{회차}_{일차}_{target_value}.xlsx', index=True, float_format="%.3f")

In [188]:
# Save mean_pred_df as CSV (UTF-8 with BOM) without the row index. The output
# directory is created first so the export does not fail when ./result does not
# exist yet (e.g. on a fresh checkout).
os.makedirs('./result', exist_ok=True)
mean_pred_df.to_csv(f'./result/{연도}_{회차}_{일차}_{target_value}_scaling.csv', index=False, encoding='utf-8-sig')