In [1]:
import pandas as pd
import numpy as np
import os
from encoding_function import low_frequency_to_others


## 데이터 불러오기
- 학습: 2023년도 이전
- 검증: 2023년
- 테스트: 2024년

In [2]:
# Directory holding the pre-split CSV files, and the seed used by the notebook.
ROOT_DIR = "data"
RANDOM_STATE = 999

# Read the three splits in one pass (train / validation / test).
train, val, test = (
    pd.read_csv(os.path.join(ROOT_DIR, file_name))
    for file_name in ("train.csv", "val.csv", "test.csv")
)

# Sanity check: (rows, columns) of the train and validation frames.
print(train.shape, val.shape)

(48522, 43) (8952, 43)


## 사용하지 않는 feature drop

In [3]:
def drop_columns_from_datasets(df):
    """Return a new frame without the columns this notebook does not use.

    Args:
        df: DataFrame that contains every column listed below.

    Returns:
        A new DataFrame with those columns removed; ``df`` itself is not modified.

    Raises:
        KeyError: if any of the listed columns is missing from ``df``.
    """
    unused_columns = [
        '연도', '회차', '일차', '경주번호', '금일출주경주',
        '모터번호', '전탑승선수1', '전탑승선수2', '보트번호', '특이사항',
    ]
    return df.drop(columns=unused_columns)


# Remove the unused columns from both splits (train first, then val).
train, val = map(drop_columns_from_datasets, (train, val))

# Sanity check: the column count should have shrunk by the number of dropped columns.
print(train.shape, val.shape)

(48522, 33) (8952, 33)


## 일부 숫자형 변수 변환

- 코스별 성적/경기수 분리

In [4]:
def separation_course(df):
    """Split each per-course ``"<score>/<games>"`` column into two columns.

    For every ``코스_N코스`` column (N = 1..6) two new columns are appended,
    ``N코스_성적`` (the part before ``/``) and ``N코스_경기수`` (the part after
    ``/``), and the six original columns are then dropped.

    Missing input values produce missing values in both new columns (rather
    than an empty string), so the result can be cast to float afterwards.
    A column in which no value contains ``/`` (for example an all-missing
    column or an empty frame) is handled as well.

    Args:
        df: DataFrame containing the six ``코스_N코스`` columns. It is modified
            in place.

    Returns:
        The same ``df`` object, for call chaining.
    """
    col_list = [
        '코스_1코스', '코스_2코스', '코스_3코스', '코스_4코스', '코스_5코스', '코스_6코스'
    ]
    for col in col_list:
        course = col[3:]  # strip the leading '코스_' prefix, e.g. '1코스'

        # reindex guarantees exactly the two expected parts even when the split
        # produced a single column because no value contained '/'.
        parts = (
            df[col].fillna('')
            .str.split('/', expand=True)
            .reindex(columns=[0, 1])
        )

        # An empty first part only arises from a missing/empty input value;
        # expose it as NaN instead of '' so numeric conversion does not fail.
        df[f'{course}_성적'] = parts[0].mask(parts[0].isin(['']))
        df[f'{course}_경기수'] = parts[1]

    df.drop(col_list, axis=1, inplace=True)

    return df
        


# Split the per-course "score/games" columns in both splits (train first, then val).
train, val = map(separation_course, (train, val))

# Sanity check: six columns were replaced by twelve.
print(train.shape, val.shape)

(48522, 39) (8952, 39)


- 코스별 성적 스무딩

In [5]:
def apply_laplace_smoothing(df, col, global_mean, alpha):
    """Overwrite ``{col}_성적`` with a value shrunk toward ``global_mean``.

    The new value is
    ``(score * games + global_mean * alpha) / (1 + games + alpha)``.

    Args:
        df: DataFrame with numeric ``{col}_성적`` and ``{col}_경기수`` columns;
            modified in place.
        col: Column prefix, e.g. ``'1코스'``.
        global_mean: Value the score is pulled toward.
        alpha: Weight given to ``global_mean``.

    Returns:
        The same ``df`` object.
    """
    score_key = f'{col}_성적'
    games = df[f'{col}_경기수']

    weighted_sum = df[score_key] * games + global_mean * alpha
    # The extra constant 1 is intentional: rows with zero games otherwise came
    # out too high (it can be removed if that is not wanted).
    total_weight = 1 + games + alpha

    df[score_key] = weighted_sum / total_weight
    return df

def laplace_smoothing_to_course(train, val=None, alpha=1):
    """Smooth the six per-course score columns and drop the game-count columns.

    The score and game-count columns are cast to float, the per-course mean of
    the *train* scores is computed, and ``apply_laplace_smoothing`` is applied
    with that mean to train and (if given) to val, so validation rows are
    smoothed with statistics taken from train only.

    Args:
        train: Training frame; modified in place.
        val: Optional validation frame; modified in place.
        alpha: Weight of the global mean in the smoothing formula.

    Returns:
        ``train`` when ``val`` is None, otherwise the tuple ``(train, val)``.
    """
    courses = ('1코스', '2코스', '3코스', '4코스', '5코스', '6코스')
    has_val = val is not None

    # Make sure both inputs of the formula are numeric.
    for course in courses:
        score_key, games_key = f'{course}_성적', f'{course}_경기수'
        train[score_key] = train[score_key].astype(float)
        train[games_key] = train[games_key].astype(float)
        if has_val:
            val[score_key] = val[score_key].astype(float)
            val[games_key] = val[games_key].astype(float)

    # Global means come from the (not yet smoothed) training scores.
    global_means = {}
    for course in courses:
        global_means[course] = train[f'{course}_성적'].mean()

    for course in courses:
        train = apply_laplace_smoothing(train, course, global_means[course], alpha)
        train.drop(columns=f'{course}_경기수', inplace=True)

    if not has_val:
        return train

    # Validation rows reuse the training means.
    for course in courses:
        val = apply_laplace_smoothing(val, course, global_means[course], alpha)
        val.drop(columns=f'{course}_경기수', inplace=True)

    return train, val


# Smooth the per-course scores of both splits using training-set means.
train, val = laplace_smoothing_to_course(train, val, alpha=1) # smaller alpha stays closer to the observed value, larger alpha pulls toward the global mean
print(train.shape, val.shape)

(48522, 33) (8952, 33)


- 최근 8경기 착순 분리

In [6]:
def split_last_eight_rank(df):
    """Expand ``최근8경주_착순`` into eight single-character columns.

    Characters at positions 0-3 become ``최근1경기_착순`` .. ``최근4경기_착순`` and
    characters at positions 5-8 become ``최근5경기_착순`` .. ``최근8경기_착순``.
    Position 4 is skipped (presumably a separator between the two groups of
    four -- confirm against the raw data). The source column is then dropped.

    Args:
        df: DataFrame with a string column ``최근8경주_착순``; modified in place.

    Returns:
        The same ``df`` object.
    """
    char_positions = (0, 1, 2, 3, 5, 6, 7, 8)
    recent = df['최근8경주_착순']

    for game_no, position in enumerate(char_positions, start=1):
        df[f'최근{game_no}경기_착순'] = recent.str[position]

    df.drop(columns='최근8경주_착순', inplace=True)
    return df

def adjust_for_top3(df):
    """Collapse every recent-race rank outside the top three into ``'6'``.

    For each of the eight ``최근N경기_착순`` columns, values equal to the strings
    ``'1'``, ``'2'`` or ``'3'`` are kept; anything else, including missing
    values, becomes ``'6'``. The notebook's rationale: better generalisation,
    and only finishing in the top three matters.

    Args:
        df: DataFrame holding the eight rank columns; modified in place.

    Returns:
        The same ``df`` object.
    """
    top3 = ['1', '2', '3']

    for game_no in range(1, 9):
        key = f'최근{game_no}경기_착순'
        # Keep top-3 labels, replace everything else by '6'.
        df[key] = df[key].where(df[key].isin(top3), '6')

    return df


# Expand the recent-rank string into eight columns, then bucket non-top-3 ranks.
train = adjust_for_top3(split_last_eight_rank(train))
val = adjust_for_top3(split_last_eight_rank(val))

# Sanity check: one column was replaced by eight.
print(train.shape, val.shape)

(48522, 40) (8952, 40)


## 범주형 변수 확인


In [7]:
def cal_cat_cols(train, val=None):
    """Classify columns as numeric or categorical and cast them accordingly.

    Columns whose name contains one of ``'번호'``, ``'기수'``, ``'경기_착순'`` or
    ``'Race_ID'`` are cast to ``str`` and treated as categorical. Matching is
    by substring, so e.g. any column name containing ``'기수'`` matches.
    Every other column is cast to ``float`` when that works for train *and*
    (if given) val; otherwise it is left untouched in both frames and treated
    as categorical.

    Args:
        train: Training frame; converted columns are written back in place.
        val: Optional validation frame with the same columns; also modified
            in place.

    Returns:
        ``(num_features, cat_features)``: two lists of column names in the
        order the columns appear in ``train``. ``'rank'`` and ``'Race_ID'``
        are never reported in ``cat_features``.
    """
    num_features = []
    objective_cols = []

    # Columns whose name contains one of these substrings are categorical.
    word_list = ['번호', '기수', '경기_착순', 'Race_ID']
    for col in train.columns:
        if any(sub in col for sub in word_list):
            objective_cols.append(col)
            train[col] = train[col].astype('str')
            if val is not None:
                val[col] = val[col].astype('str')

    # Try a float conversion for the remaining columns.
    for col in train.columns:
        if col in objective_cols:
            continue  # already handled as categorical
        try:
            # Convert both frames before writing anything back, so a failure
            # on val cannot leave train and val with different dtypes.
            train_as_float = train[col].astype('float')
            val_as_float = val[col].astype('float') if val is not None else None
        except (ValueError, TypeError, KeyError):
            # Not numeric (or absent from val): keep as categorical.
            objective_cols.append(col)
            continue

        train[col] = train_as_float
        if val is not None:
            val[col] = val_as_float
        num_features.append(col)

    # Preserve column order instead of going through an unordered set.
    excluded = {'rank', 'Race_ID'}
    cat_features = [col for col in objective_cols if col not in excluded]

    return num_features, cat_features


# Classify the columns (this also casts them in place in train and val).
num_features, cat_features = cal_cat_cols(train, val)
# Display the categorical feature names.
cat_features

['최근6경기_착순',
 '최근4경기_착순',
 '최근5경기_착순',
 '성별',
 '최근2경기_착순',
 '최근1경기_착순',
 '등급',
 '기수',
 'FL',
 '최근7경기_착순',
 '최근8경기_착순',
 '전일성적',
 '선수명',
 '번호',
 '최근3경기_착순']

- 낮은 빈도 데이터 통합

In [8]:
def low_to_others(train, val):
    """Apply ``low_frequency_to_others`` to every categorical column.

    The categorical columns are obtained from ``cal_cat_cols`` (which also
    casts columns in place). For each of them a fresh transformer is fitted on
    the train column and then applied to the val column.

    NOTE(review): ``low_frequency_to_others`` comes from ``encoding_function``;
    judging by its name and ``threshold=5`` it presumably merges rare
    categories into an "others" bucket -- confirm in that module.

    Args:
        train: Training frame; modified in place.
        val: Validation frame; modified in place.

    Returns:
        The tuple ``(train, val)``.
    """
    categorical = cal_cat_cols(train, val)[1]

    for feature in categorical:
        # One transformer per column, fitted on train only.
        unifier = low_frequency_to_others(threshold=5, verbose=False)
        train[feature] = unifier.fit_transform(train[feature])
        val[feature] = unifier.transform(val[feature])

    return train, val
        
# Apply the low-frequency transformer to the categorical columns of both splits.
train, val = low_to_others(train, val)

## X, y 분리

In [11]:
def add_y(df, target='연승', is_train=True):
    """Add a binary ``target`` column derived from the finishing ``rank``.

    ``rank`` values of 0 are first replaced by 6, so they can never satisfy a
    ``rank <= k`` rule. ``target`` is then set to 1 for rows whose rank is
    within the cut-off of the chosen bet type and to 0 otherwise:

    * ``'단승'``   -> rank <= 1 (finished first)
    * ``'연승'``   -> rank <= 2
    * ``'삼복승'`` -> rank <= 3

    Args:
        df: DataFrame with a numeric ``rank`` column; modified in place.
        target: One of ``'단승'``, ``'연승'``, ``'삼복승'``.
        is_train: When False no labels are computed and ``target`` is 0 for
            every row.

    Returns:
        The same ``df`` object.

    Raises:
        ValueError: if ``is_train`` is True and ``target`` is not one of the
            supported names.
    """
    # Worst finishing position that still counts as a positive label.
    max_rank_by_target = {'단승': 1, '연승': 2, '삼복승': 3}

    # Validate before touching the frame so a bad argument leaves it unchanged.
    if is_train and target not in max_rank_by_target:
        raise ValueError(
            f"unknown target {target!r}; expected one of {list(max_rank_by_target)}"
        )

    df['rank'] = df['rank'].replace(0, 6)
    df['target'] = 0

    if is_train:
        condition_target = df['rank'] <= max_rank_by_target[target]
        df.loc[condition_target, 'target'] = 1

    return df

# Columns removed from the feature matrices: the label, the rank it is derived
# from, and the '전일성적' feature.
drop_cols = ['전일성적', 'rank', 'target']

# Attach the binary label (default target of add_y) to both splits.
train, val = add_y(train), add_y(val)

# Features, plus a label frame that keeps the race id and player number as keys.
X_train, y_train = train.drop(columns=drop_cols), train[['Race_ID', '번호', 'target']]
X_val, y_val = val.drop(columns=drop_cols), val[['Race_ID', '번호', 'target']]

print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

(48522, 38) (48522, 3) (8952, 38) (8952, 3)


In [15]:
# Display the training feature frame (one row per player per race).
X_train

Unnamed: 0,Race_ID,번호,등급,기수,선수명,성별,나이,체중,최근6회차_평균착순점,최근6회차_평균득점,...,5코스_성적,6코스_성적,최근1경기_착순,최근2경기_착순,최근3경기_착순,최근4경기_착순,최근5경기_착순,최근6경기_착순,최근7경기_착순,최근8경기_착순
0,2016_1_1_1,1,A1,1,정용진,남,44.0,56.0,6.67,6.60,...,15.108977,30.622448,1,1,1,3,3,1,6,6
1,2016_1_1_1,2,A1,13,김민준,남,28.0,59.0,6.17,6.03,...,15.108977,14.319642,1,1,2,6,6,1,6,2
2,2016_1_1_1,3,B2,11,김현덕,남,32.0,59.0,3.31,2.56,...,10.335906,7.178567,6,3,6,6,1,6,6,6
3,2016_1_1_1,4,A1,8,한종석,남,33.0,58.0,6.86,6.81,...,38.206528,23.839682,1,1,2,2,3,1,2,3
4,2016_1_1_1,5,B1,9,정재용,남,35.0,61.0,4.06,3.94,...,3.445302,1.794642,6,6,6,2,6,6,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48517,2022_51_2_17,2,A1,1,정민수,남,47.0,61.0,5.83,5.50,...,65.058977,1.305194,6,3,6,6,6,6,2,1
48518,2022_51_2_17,3,A1,10,김완석,남,40.0,56.0,7.33,7.33,...,35.663535,31.435713,1,6,2,6,2,1,2,2
48519,2022_51_2_17,4,A2,2,손근성,남,50.0,54.0,5.29,5.18,...,31.524545,11.435713,6,3,6,2,6,6,6,2
48520,2022_51_2_17,5,A2,1,곽현성,남,50.0,55.0,6.94,6.71,...,40.083977,23.839682,2,1,3,6,6,2,1,2


In [18]:
def create_player_df(df, player_number):
    """Extract the rows of one player number and suffix their column names.

    Rows whose ``번호`` equals ``player_number`` are selected. The comparison is
    done on the string form of the column, so it works whether ``번호`` is
    stored as text (as after ``cal_cat_cols``) or as integers. Every column
    except ``Race_ID`` is renamed to ``<column>_<player_number>번선수``.

    Args:
        df: Long-format frame with ``Race_ID`` and ``번호`` columns; not modified.
        player_number: Player number to extract (e.g. 1..6).

    Returns:
        A new DataFrame with ``Race_ID`` first, followed by the renamed columns.
    """
    # Compare as strings so an integer-typed '번호' does not silently yield an
    # empty selection.
    is_player = df['번호'].astype(str) == str(player_number)
    player_df = df[is_player].copy()

    # Append the player number to every column name except the join key.
    new_columns = {
        col: f'{col}_{player_number}번선수'
        for col in player_df.columns if col != 'Race_ID'
    }
    player_df = player_df.rename(columns=new_columns)

    # Keep 'Race_ID' first, followed by the suffixed columns.
    return player_df[['Race_ID'] + list(new_columns.values())]

def merge_all_players(df):
    """Pivot the per-player rows into one wide row per race.

    A frame is built for each player number 1..6 with ``create_player_df`` and
    the six frames are inner-joined on ``Race_ID`` one after another. Because
    the join is an inner join, a race missing any of the six player numbers
    does not appear in the result.

    Args:
        df: Long-format frame with ``Race_ID`` and ``번호`` columns.

    Returns:
        A DataFrame with ``Race_ID`` plus the suffixed columns of all six players.
    """
    player_frames = [create_player_df(df, number) for number in range(1, 7)]

    merged_df = player_frames[0]
    for player_df in player_frames[1:]:
        merged_df = pd.merge(merged_df, player_df, on='Race_ID', how='inner')

    return merged_df

# Build the wide, one-row-per-race feature frame from the training features.
X_merged = merge_all_players(X_train)
# Display the merged frame.
X_merged

Unnamed: 0,Race_ID,번호_1번선수,등급_1번선수,기수_1번선수,선수명_1번선수,성별_1번선수,나이_1번선수,체중_1번선수,최근6회차_평균착순점_1번선수,최근6회차_평균득점_1번선수,...,5코스_성적_6번선수,6코스_성적_6번선수,최근1경기_착순_6번선수,최근2경기_착순_6번선수,최근3경기_착순_6번선수,최근4경기_착순_6번선수,최근5경기_착순_6번선수,최근6경기_착순_6번선수,최근7경기_착순_6번선수,최근8경기_착순_6번선수
0,2016_1_1_1,1,A1,1,정용진,남,44.0,56.0,6.67,6.60,...,2.953116,12.717459,1,6,6,3,3,6,6,6
1,2016_1_1_2,1,A1,6,양원준,남,39.0,55.0,6.44,6.28,...,24.541313,1.595237,3,6,6,6,1,6,6,6
2,2016_1_1_3,1,A2,1,김종목,남,42.0,55.0,6.33,6.33,...,40.083977,30.622448,6,6,1,6,1,3,1,6
3,2016_1_1_4,1,B1,4,경상수,남,45.0,56.0,2.80,2.80,...,15.108977,2.392856,6,6,6,1,3,6,6,1
4,2016_1_1_5,1,A1,4,어선규,남,38.0,57.0,7.89,7.83,...,4.134363,22.851427,6,6,2,6,6,6,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8082,2022_51_2_13,1,A1,6,안지민,여,37.0,52.0,5.47,5.11,...,32.067181,1.794642,6,1,6,6,3,6,6,6
8083,2022_51_2_14,1,A2,7,윤동오,남,39.0,61.0,5.00,4.72,...,3.445302,12.717459,6,1,6,1,1,3,3,6
8084,2022_51_2_15,1,A1,6,손지영,여,37.0,53.0,7.16,7.13,...,52.608977,23.839682,6,1,1,3,3,6,3,1
8085,2022_51_2_16,1,A2,1,강지환,남,44.0,61.0,5.17,4.81,...,27.558977,23.839682,1,6,6,1,3,2,1,1


In [None]:
def all_precoess(train=None, val=None, target='단승', root_dir="data"):
    """Run the full preprocessing pipeline, starting from the raw CSV files.

    Steps, in order: load ``train.csv`` / ``val.csv``, drop unused columns,
    split and smooth the per-course scores, expand and bucket the recent
    ranks, apply the low-frequency transformer to categorical columns, add the
    binary label, and split into features and labels.

    NOTE: ``train`` and ``val`` are accepted only so existing calls keep
    working. As before, the values passed in are ignored and both frames are
    always re-read from ``root_dir``.

    Args:
        train: Ignored (see note).
        val: Ignored (see note).
        target: Bet type forwarded to ``add_y``.
        root_dir: Directory containing ``train.csv`` and ``val.csv``.

    Returns:
        ``(X_train, y_train, X_val, y_val)``. The ``y`` frames hold the columns
        ``Race_ID``, ``번호`` and ``target``, matching the notebook's X/y split
        cell.
    """
    train = pd.read_csv(os.path.join(root_dir, "train.csv"))
    val = pd.read_csv(os.path.join(root_dir, "val.csv"))

    train = drop_columns_from_datasets(train)
    val = drop_columns_from_datasets(val)

    train = separation_course(train)
    val = separation_course(val)

    train, val = laplace_smoothing_to_course(train, val, alpha=1)

    train = split_last_eight_rank(train)
    train = adjust_for_top3(train)

    val = split_last_eight_rank(val)
    val = adjust_for_top3(val)

    train, val = low_to_others(train, val)

    train = add_y(train, target=target)
    val = add_y(val, target=target)

    # Defined locally so the function does not depend on a variable created in
    # another notebook cell.
    drop_cols = ['전일성적', 'rank', 'target']
    # add_y stores the label in 'target'; the pipeline creates no column named
    # after the bet type.
    label_cols = ['Race_ID', '번호', 'target']

    X_train = train.drop(drop_cols, axis=1)
    y_train = train[label_cols]
    X_val = val.drop(drop_cols, axis=1)
    y_val = val[label_cols]

    return X_train, y_train, X_val, y_val