작업 환경: Google Colab (파일 경로 등 수정해서 사용할 것)

본 코드는 Colab 무료 버전 상에서 작성되었습니다.

수행 시간: 3시간 이상 

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Import Libraries

사용할 라이브러리를 import 합니다 

In [2]:
import random
import os
import numpy as np
import pandas as pd
import gc

from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, make_scorer, log_loss
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Fixed Seed

# csv to parquet


메모리에 효율적인 데이터 유형을 사용하여 용량을 줄이고 빠른 작업이 가능합니다

In [4]:
def csv_to_parquet(csv_path, save_name):
    df = pd.read_csv(csv_path)
    df.to_parquet(f'./{save_name}.parquet')
    del df
    gc.collect()
    print(save_name, 'Done.')

In [5]:
# 주의: 반드시 본인 환경에 맞게 파일 경로 수정해서 사용할 것
csv_to_parquet('/content/drive/MyDrive/dataset/airplane/train.csv', 'train')
csv_to_parquet('/content/drive/MyDrive/dataset/airplane/test.csv', 'test')

train Done.
test Done.


# Data Load

데이터를 가져옵니다 

In [6]:
# 주의: 반드시 본인 환경에 맞게 파일 경로 수정해서 사용할 것
train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')
sample_submission = pd.read_csv('/content/drive/MyDrive/dataset/airplane/sample_submission.csv', index_col = 0)

# Data Preprocessing

## Estimated_Departure_Time, Estimated_Arrival_Time 형변환, 시간/분 분리 

In [None]:
train.dropna(subset=['Estimated_Departure_Time', 'Estimated_Arrival_Time'], how='all', axis='index', inplace =True)

In [7]:
import datetime
def to_time(time_list):
    Time = pd.Series(time_list).astype(str).str.zfill(4)
    Time = Time.replace('2400','0000')
    return [datetime.datetime.strptime(i, '%H%M').strftime("%H:%M") if i != '0nan' else np.NaN for i in Time] 

time_list = [i if str(i) == 'nan' else str(int(i))  for i in train['Estimated_Departure_Time'] ]
time_list1 = [i if str(i) == 'nan' else str(int(i))  for i in train['Estimated_Arrival_Time'] ]
train['Estimated_Departure_Time_HH:MM'] = to_time(time_list)
train['Estimated_Arrival_Time_HH:MM'] = to_time(time_list1)
train

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,...,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay,Estimated_Departure_Time_HH:MM,Estimated_Arrival_Time_HH:MM
0,TRAIN_000000,4,15,,,0,0,OKC,13851,Oklahoma,...,12191,Texas,419.0,Southwest Airlines Co.,WN,19393.0,N7858A,,,
1,TRAIN_000001,8,15,740.0,1024.0,0,0,ORD,13930,Illinois,...,14869,Utah,1250.0,SkyWest Airlines Inc.,UA,20304.0,N125SY,,07:40,10:24
2,TRAIN_000002,9,6,1610.0,1805.0,0,0,CLT,11057,North Carolina,...,12953,New York,544.0,American Airlines Inc.,AA,19805.0,N103US,,16:10,18:05
3,TRAIN_000003,7,10,905.0,1735.0,0,0,LAX,12892,California,...,11618,New Jersey,2454.0,United Air Lines Inc.,UA,,N595UA,,09:05,17:35
4,TRAIN_000004,1,11,900.0,1019.0,0,0,SFO,14771,California,...,10157,California,250.0,SkyWest Airlines Inc.,UA,20304.0,N161SY,,09:00,10:19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,TRAIN_999995,9,18,936.0,1243.0,0,0,ORD,13930,,...,14100,,678.0,United Air Lines Inc.,UA,19977.0,N477UA,,09:36,12:43
999996,TRAIN_999996,5,30,920.0,1028.0,0,0,FAR,11637,,...,13487,Minnesota,223.0,SkyWest Airlines Inc.,DL,,N439SW,,09:20,10:28
999997,TRAIN_999997,6,28,800.0,1340.0,0,0,OAK,13796,,...,12191,Texas,1642.0,Southwest Airlines Co.,WN,19393.0,N230WN,,08:00,13:40
999998,TRAIN_999998,9,27,1613.0,1824.0,0,0,BNA,10693,Tennessee,...,10397,,214.0,Delta Air Lines Inc.,DL,19790.0,N968DL,,16:13,18:24


In [8]:
time_list = [i if str(i) == 'nan' else str(int(i))  for i in test['Estimated_Departure_Time'] ]
time_list1 = [i if str(i) == 'nan' else str(int(i))  for i in test['Estimated_Arrival_Time'] ]

test['Estimated_Departure_Time_HH:MM'] = to_time(time_list)
test['Estimated_Arrival_Time_HH:MM'] = to_time(time_list1)

In [9]:
train['Estimated_Departure_HH'] = [int(i.split(':')[0]) if str(i) != 'nan' else np.NaN for i in train['Estimated_Departure_Time_HH:MM']]
train['Estimated_Departure_MM'] = [int(i.split(':')[1]) if str(i) != 'nan' else np.NaN for i in train['Estimated_Departure_Time_HH:MM']]

train['Estimated_Arrival_HH'] = [int(i.split(':')[0]) if str(i) != 'nan' else np.NaN for i in train['Estimated_Arrival_Time_HH:MM']]
train['Estimated_Arrival_MM'] = [int(i.split(':')[1]) if str(i) != 'nan' else np.NaN for i in train['Estimated_Arrival_Time_HH:MM']]
train

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,...,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay,Estimated_Departure_Time_HH:MM,Estimated_Arrival_Time_HH:MM,Estimated_Departure_HH,Estimated_Departure_MM,Estimated_Arrival_HH,Estimated_Arrival_MM
0,TRAIN_000000,4,15,,,0,0,OKC,13851,Oklahoma,...,WN,19393.0,N7858A,,,,,,,
1,TRAIN_000001,8,15,740.0,1024.0,0,0,ORD,13930,Illinois,...,UA,20304.0,N125SY,,07:40,10:24,7.0,40.0,10.0,24.0
2,TRAIN_000002,9,6,1610.0,1805.0,0,0,CLT,11057,North Carolina,...,AA,19805.0,N103US,,16:10,18:05,16.0,10.0,18.0,5.0
3,TRAIN_000003,7,10,905.0,1735.0,0,0,LAX,12892,California,...,UA,,N595UA,,09:05,17:35,9.0,5.0,17.0,35.0
4,TRAIN_000004,1,11,900.0,1019.0,0,0,SFO,14771,California,...,UA,20304.0,N161SY,,09:00,10:19,9.0,0.0,10.0,19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,TRAIN_999995,9,18,936.0,1243.0,0,0,ORD,13930,,...,UA,19977.0,N477UA,,09:36,12:43,9.0,36.0,12.0,43.0
999996,TRAIN_999996,5,30,920.0,1028.0,0,0,FAR,11637,,...,DL,,N439SW,,09:20,10:28,9.0,20.0,10.0,28.0
999997,TRAIN_999997,6,28,800.0,1340.0,0,0,OAK,13796,,...,WN,19393.0,N230WN,,08:00,13:40,8.0,0.0,13.0,40.0
999998,TRAIN_999998,9,27,1613.0,1824.0,0,0,BNA,10693,Tennessee,...,DL,19790.0,N968DL,,16:13,18:24,16.0,13.0,18.0,24.0


In [10]:
test['Estimated_Departure_HH'] = [int(i.split(':')[0]) if str(i) != 'nan' else np.NaN for i in test['Estimated_Departure_Time_HH:MM']]
test['Estimated_Departure_MM'] = [int(i.split(':')[1]) if str(i) != 'nan' else np.NaN for i in test['Estimated_Departure_Time_HH:MM']]

test['Estimated_Arrival_HH'] = [int(i.split(':')[0]) if str(i) != 'nan' else np.NaN for i in test['Estimated_Arrival_Time_HH:MM']]
test['Estimated_Arrival_MM'] = [int(i.split(':')[1]) if str(i) != 'nan' else np.NaN for i in test['Estimated_Arrival_Time_HH:MM']]

## Origin State, Destination State 결측치 보충 

In [11]:
# Origin_Airport(결측치 X), Origin_Airport_ID(결측치 X), Origin_State 열을 추출해서 일치하는 것으로 대체합니다.
origin_state = train[['Origin_Airport','Origin_Airport_ID','Origin_State']].dropna() # Origin_State가 빠진 행을 제거합니다
origin_state = origin_state.drop_duplicates() # 중복 행을 제거합니다.
origin_state.columns = ['Origin_Airport','Origin_Airport_ID','F_Origin_State']

# Destination_state에 대해서도 반복
Destination_state = train[['Destination_Airport','Destination_Airport_ID','Destination_State']].dropna()
Destination_state = Destination_state.drop_duplicates()
Destination_state.columns = ['Destination_Airport','Destination_Airport_ID','F_Destination_State']

# F_Origin_State, F_Destination_State 행을 합쳐줍니다
train = pd.merge(train, origin_state, how='left', on=['Origin_Airport','Origin_Airport_ID'], sort=False)
train = pd.merge(train, Destination_state, how='left', on=['Destination_Airport','Destination_Airport_ID'], sort=False)

# F_Destination_State에 결측치가 하나 남는데, 외부 정보를 활용해 대체합니다 (<< 되는지 모르겠음) 
mask = (train['Destination_Airport'] == 'YNG')&(train['Destination_Airport_ID'] == 16133)
value = 'Ohio'
train.loc[mask,'F_Destination_State'] = train.loc[mask,'F_Destination_State'].fillna(value)

# Origin_State, Destination_State 행을 대체합니다.
train = train.drop(columns=['Origin_State', 'Destination_State'])
train = train.rename(columns={'F_Origin_State': 'Origin_State', 'F_Destination_State': 'Destination_State'})

In [12]:
# test 데이터에 대해서도 반복합니다.
origin_state = test[['Origin_Airport','Origin_Airport_ID','Origin_State']].dropna()
origin_state = origin_state.drop_duplicates()
origin_state.columns = ['Origin_Airport','Origin_Airport_ID','F_Origin_State']

Destination_state = test[['Destination_Airport','Destination_Airport_ID','Destination_State']].dropna()
Destination_state = Destination_state.drop_duplicates()
Destination_state.columns = ['Destination_Airport','Destination_Airport_ID','F_Destination_State']

test = pd.merge(test, origin_state, how='left', on=['Origin_Airport','Origin_Airport_ID'], sort=False)
test = pd.merge(test, Destination_state, how='left', on=['Destination_Airport','Destination_Airport_ID'], sort=False)

mask = (test['Destination_Airport'] == 'YNG')&(test['Destination_Airport_ID'] == 16133)
value = 'Ohio'
test.loc[mask,'F_Destination_State'] = test.loc[mask,'F_Destination_State'].fillna(value)
test = test.drop(columns=['Origin_State', 'Destination_State'])
test = test.rename(columns={'F_Origin_State': 'Origin_State', 'F_Destination_State': 'Destination_State'})

In [13]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 0 to 999999
Data columns (total 25 columns):
 #   Column                          Non-Null Count    Dtype  
---  ------                          --------------    -----  
 0   ID                              1000000 non-null  object 
 1   Month                           1000000 non-null  int64  
 2   Day_of_Month                    1000000 non-null  int64  
 3   Estimated_Departure_Time        890981 non-null   float64
 4   Estimated_Arrival_Time          890960 non-null   float64
 5   Cancelled                       1000000 non-null  int64  
 6   Diverted                        1000000 non-null  int64  
 7   Origin_Airport                  1000000 non-null  object 
 8   Origin_Airport_ID               1000000 non-null  int64  
 9   Destination_Airport             1000000 non-null  object 
 10  Destination_Airport_ID          1000000 non-null  int64  
 11  Distance                        1000000 non-null  float64
 12  A

In [14]:
train.isnull().sum()

ID                                     0
Month                                  0
Day_of_Month                           0
Estimated_Departure_Time          109019
Estimated_Arrival_Time            109040
Cancelled                              0
Diverted                               0
Origin_Airport                         0
Origin_Airport_ID                      0
Destination_Airport                    0
Destination_Airport_ID                 0
Distance                               0
Airline                           108920
Carrier_Code(IATA)                108990
Carrier_ID(DOT)                   108997
Tail_Number                            0
Delay                             744999
Estimated_Departure_Time_HH:MM    109019
Estimated_Arrival_Time_HH:MM      109040
Estimated_Departure_HH            109019
Estimated_Departure_MM            109019
Estimated_Arrival_HH              109040
Estimated_Arrival_MM              109040
Origin_State                           0
Destination_Stat

## Train/test 공통 Feature 준지도학습 

In [15]:
# 각 열 별로 준지도학습 적용

In [16]:
train_lb = train.__deepcopy__()
test_lb = test.__deepcopy__()

In [17]:
fill_list = ['Origin_State', 'Destination_State', 'Airline', 'Carrier_ID(DOT)','Carrier_Code(IATA)',
             'Estimated_Departure_HH','Estimated_Departure_MM', 'Estimated_Arrival_HH', 'Estimated_Arrival_MM']

not_labeling_list = ['Carrier_ID(DOT)','Estimated_Departure_HH','Estimated_Departure_MM', 'Estimated_Arrival_HH','Estimated_Arrival_MM']

qual_col = ['Origin_Airport', 'Destination_Airport', 'Tail_Number']

oridinary_col = ['Cancelled','Diverted','Origin_Airport','Origin_Airport_ID', 'Destination_Airport', 'Destination_Airport_ID', 
                 'Tail_Number', 'Month', 'Day_of_Month','Distance']

# Train 데이터에서 null이 없는 열들 중 라벨링(not int)이 필요한 열들 순차적으로 정리
add_list = {}
for i in range(len(fill_list)): 
    if i == 0:
        add_list[i] = qual_col
    else:       
        add_list[i] = add_list[i-1] + [fill_list[i-1]]
        add_list[i] = list(set(add_list[i]) - set(not_labeling_list))

add_list

{0: ['Origin_Airport', 'Destination_Airport', 'Tail_Number'],
 1: ['Origin_State', 'Destination_Airport', 'Tail_Number', 'Origin_Airport'],
 2: ['Tail_Number',
  'Destination_State',
  'Destination_Airport',
  'Origin_Airport',
  'Origin_State'],
 3: ['Tail_Number',
  'Destination_State',
  'Destination_Airport',
  'Origin_Airport',
  'Origin_State',
  'Airline'],
 4: ['Tail_Number',
  'Destination_State',
  'Destination_Airport',
  'Origin_Airport',
  'Origin_State',
  'Airline'],
 5: ['Tail_Number',
  'Destination_State',
  'Destination_Airport',
  'Origin_Airport',
  'Origin_State',
  'Airline',
  'Carrier_Code(IATA)'],
 6: ['Tail_Number',
  'Destination_State',
  'Destination_Airport',
  'Origin_Airport',
  'Origin_State',
  'Airline',
  'Carrier_Code(IATA)'],
 7: ['Tail_Number',
  'Destination_State',
  'Destination_Airport',
  'Origin_Airport',
  'Origin_State',
  'Airline',
  'Carrier_Code(IATA)'],
 8: ['Tail_Number',
  'Destination_State',
  'Destination_Airport',
  'Origin_Air

In [18]:
for i in range(len(fill_list)): 

    # 라벨링 준비
    label_st = train_lb.drop(columns = [fill_list[i]])
    
    # 결측치를 따로 분리
    train_st = train_lb[(train_lb[fill_list[i]].astype(str) != 'None')&(train_lb[fill_list[i]].astype(str) != 'nan')].drop(columns = [fill_list[i]])
    X_nan = train_lb[(train_lb[fill_list[i]].astype(str) == 'None')|(train_lb[fill_list[i]].astype(str) == 'nan')].drop(columns = [fill_list[i]])

    
    if fill_list[i] == 'Origin_State':
      # 첫번째 열('Origin_State')일 경우 첫번째 열을 분리
      test_nan = test_lb[(test_lb[fill_list[i]].astype(str) == 'None')|(test_lb[fill_list[i]].astype(str) == 'nan')].drop(columns = [fill_list[i]])
    else:
      # 그 외의 경우 이전에 채운 열이 여전히 비어있을 때 그 행을 추가로 제거
      test_nan = test_lb[(test_lb[fill_list[i]].astype(str) == 'None')|
                           (test_lb[fill_list[i]].astype(str) == 'nan')].drop(columns = [fill_list[i]]).dropna(subset = [fill_list[i-1]])
    
    print('시작 리스트: ',add_list[i])
    print('채워야하는 열: ',fill_list[i])
    
    # 라벨링이 안된 원소들 라벨링
    # 참고) 범주형 feature에서 그 수가 많을 때에는 LabelEncoder 보다 OrdinalEncoder를 쓰는 것이 더 효과적입니다 
    for j in add_list[i]:
        # train에는 없고, test에는 있는 원소는 -2 처리
        oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-2)
        # -1과 1 사이로 변환 
        oe=oe.fit(label_st[j].to_numpy().reshape(-1, 1))
        train_st[j]=oe.transform(train_st[j].to_numpy().reshape(-1, 1))

        if fill_list[i] == 'Origin_State':
            pass
        else:
            if len(X_nan[fill_list[i-1]]) != 0:
                X_nan[j]=oe.transform(X_nan[j].to_numpy().reshape(-1, 1))
            else:
                pass

        test_nan[j]= oe.transform(test_nan[j].to_numpy().reshape(-1, 1))

    # 라벨링을 통해 null이 없었던 columns을 통해서 self_training
    X = train_st[oridinary_col]
    target_col = oridinary_col+ [fill_list[i]]
    y = train_lb[(train_lb[fill_list[i]].astype(str) != 'None')&(train_lb[fill_list[i]].astype(str) != 'nan')][target_col][fill_list[i]]

    stclf = SelfTrainingClassifier(
        base_estimator = RandomForestClassifier(n_estimators = 15, random_state=42),
        verbose = True)
    stclf.fit(X, y)   

    # train에서 NaN이었던 행에 대해 값을 채웁니다
    if fill_list[i] == 'Origin_State': # Origin_State는 앞에서 채웠으니 통과 
        pass
    else:
        if len(X_nan[fill_list[i-1]]) != 0:
            X_nan[fill_list[i]+'_pred'] = stclf.predict(X_nan[oridinary_col])
        else:
            pass
    
    # train에서 NaN이었던 행에 대해 값을 채웁니다 (test에는 delay 열이 없으니 건너뜁니다)
    test_nan[fill_list[i]+'_pred'] = stclf.predict(test_nan[oridinary_col])
    
    # train 데이터에 결측치였던 값들을 채운 값으로 대체합니다 
    if fill_list[i] == 'Origin_State':
        pass
    else:
        if len(X_nan[fill_list[i-1]]) != 0:
            train_lb.loc[(train_lb[fill_list[i]].astype(str) == 'None')|(train_lb[fill_list[i]].astype(str) == 'nan'), fill_list[i]] = X_nan[fill_list[i]+'_pred']
        else:    
            pass
    
    # test 데이터에 결측치였던 값들을 채운 값으로 대체합니다
    test_lb.loc[(test_lb[fill_list[i]].astype(str) == 'None')|(test_lb[fill_list[i]].astype(str) ==  'nan'), fill_list[i]] = test_nan[fill_list[i]+'_pred']
    
    # 다음 self_training을 위해 열 변경
    oridinary_col = oridinary_col + [fill_list[i]]
    print('다음 훈련 리스트: ', oridinary_col)
    print()

시작 리스트:  ['Origin_Airport', 'Destination_Airport', 'Tail_Number']
채워야하는 열:  Origin_State




다음 훈련 리스트:  ['Cancelled', 'Diverted', 'Origin_Airport', 'Origin_Airport_ID', 'Destination_Airport', 'Destination_Airport_ID', 'Tail_Number', 'Month', 'Day_of_Month', 'Distance', 'Origin_State']

시작 리스트:  ['Origin_State', 'Destination_Airport', 'Tail_Number', 'Origin_Airport']
채워야하는 열:  Destination_State




다음 훈련 리스트:  ['Cancelled', 'Diverted', 'Origin_Airport', 'Origin_Airport_ID', 'Destination_Airport', 'Destination_Airport_ID', 'Tail_Number', 'Month', 'Day_of_Month', 'Distance', 'Origin_State', 'Destination_State']

시작 리스트:  ['Tail_Number', 'Destination_State', 'Destination_Airport', 'Origin_Airport', 'Origin_State']
채워야하는 열:  Airline




다음 훈련 리스트:  ['Cancelled', 'Diverted', 'Origin_Airport', 'Origin_Airport_ID', 'Destination_Airport', 'Destination_Airport_ID', 'Tail_Number', 'Month', 'Day_of_Month', 'Distance', 'Origin_State', 'Destination_State', 'Airline']

시작 리스트:  ['Tail_Number', 'Destination_State', 'Destination_Airport', 'Origin_Airport', 'Origin_State', 'Airline']
채워야하는 열:  Carrier_ID(DOT)




다음 훈련 리스트:  ['Cancelled', 'Diverted', 'Origin_Airport', 'Origin_Airport_ID', 'Destination_Airport', 'Destination_Airport_ID', 'Tail_Number', 'Month', 'Day_of_Month', 'Distance', 'Origin_State', 'Destination_State', 'Airline', 'Carrier_ID(DOT)']

시작 리스트:  ['Tail_Number', 'Destination_State', 'Destination_Airport', 'Origin_Airport', 'Origin_State', 'Airline']
채워야하는 열:  Carrier_Code(IATA)




다음 훈련 리스트:  ['Cancelled', 'Diverted', 'Origin_Airport', 'Origin_Airport_ID', 'Destination_Airport', 'Destination_Airport_ID', 'Tail_Number', 'Month', 'Day_of_Month', 'Distance', 'Origin_State', 'Destination_State', 'Airline', 'Carrier_ID(DOT)', 'Carrier_Code(IATA)']

시작 리스트:  ['Tail_Number', 'Destination_State', 'Destination_Airport', 'Origin_Airport', 'Origin_State', 'Airline', 'Carrier_Code(IATA)']
채워야하는 열:  Estimated_Departure_HH




다음 훈련 리스트:  ['Cancelled', 'Diverted', 'Origin_Airport', 'Origin_Airport_ID', 'Destination_Airport', 'Destination_Airport_ID', 'Tail_Number', 'Month', 'Day_of_Month', 'Distance', 'Origin_State', 'Destination_State', 'Airline', 'Carrier_ID(DOT)', 'Carrier_Code(IATA)', 'Estimated_Departure_HH']

시작 리스트:  ['Tail_Number', 'Destination_State', 'Destination_Airport', 'Origin_Airport', 'Origin_State', 'Airline', 'Carrier_Code(IATA)']
채워야하는 열:  Estimated_Departure_MM




다음 훈련 리스트:  ['Cancelled', 'Diverted', 'Origin_Airport', 'Origin_Airport_ID', 'Destination_Airport', 'Destination_Airport_ID', 'Tail_Number', 'Month', 'Day_of_Month', 'Distance', 'Origin_State', 'Destination_State', 'Airline', 'Carrier_ID(DOT)', 'Carrier_Code(IATA)', 'Estimated_Departure_HH', 'Estimated_Departure_MM']

시작 리스트:  ['Tail_Number', 'Destination_State', 'Destination_Airport', 'Origin_Airport', 'Origin_State', 'Airline', 'Carrier_Code(IATA)']
채워야하는 열:  Estimated_Arrival_HH




다음 훈련 리스트:  ['Cancelled', 'Diverted', 'Origin_Airport', 'Origin_Airport_ID', 'Destination_Airport', 'Destination_Airport_ID', 'Tail_Number', 'Month', 'Day_of_Month', 'Distance', 'Origin_State', 'Destination_State', 'Airline', 'Carrier_ID(DOT)', 'Carrier_Code(IATA)', 'Estimated_Departure_HH', 'Estimated_Departure_MM', 'Estimated_Arrival_HH']

시작 리스트:  ['Tail_Number', 'Destination_State', 'Destination_Airport', 'Origin_Airport', 'Origin_State', 'Airline', 'Carrier_Code(IATA)']
채워야하는 열:  Estimated_Arrival_MM




다음 훈련 리스트:  ['Cancelled', 'Diverted', 'Origin_Airport', 'Origin_Airport_ID', 'Destination_Airport', 'Destination_Airport_ID', 'Tail_Number', 'Month', 'Day_of_Month', 'Distance', 'Origin_State', 'Destination_State', 'Airline', 'Carrier_ID(DOT)', 'Carrier_Code(IATA)', 'Estimated_Departure_HH', 'Estimated_Departure_MM', 'Estimated_Arrival_HH', 'Estimated_Arrival_MM']



## 시, 분 병합하여 시간 형태로 표현

In [19]:
train_lb = train_lb.drop(columns = ['Estimated_Departure_Time','Estimated_Arrival_Time','Estimated_Departure_Time_HH:MM', 'Estimated_Arrival_Time_HH:MM'])
train_lb['Estimated_Departure_Time'] = [int(str(int(i)).zfill(2) + str(int(j)).zfill(2)) for i, j in zip(train_lb['Estimated_Departure_HH'], train_lb['Estimated_Departure_MM'])]
train_lb['Estimated_Arrival_Time'] = [int(str(int(i)).zfill(2) + str(int(j)).zfill(2)) for i, j in zip(train_lb['Estimated_Arrival_HH'], train_lb['Estimated_Arrival_MM'])]
train_lb

Unnamed: 0,ID,Month,Day_of_Month,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Destination_Airport,Destination_Airport_ID,Distance,...,Tail_Number,Delay,Estimated_Departure_HH,Estimated_Departure_MM,Estimated_Arrival_HH,Estimated_Arrival_MM,Origin_State,Destination_State,Estimated_Departure_Time,Estimated_Arrival_Time
0,TRAIN_000000,4,15,0,0,OKC,13851,HOU,12191,419.0,...,N7858A,,5.0,35.0,7.0,0.0,Oklahoma,Texas,535,700
1,TRAIN_000001,8,15,0,0,ORD,13930,SLC,14869,1250.0,...,N125SY,,7.0,40.0,10.0,24.0,Illinois,Utah,740,1024
2,TRAIN_000002,9,6,0,0,CLT,11057,LGA,12953,544.0,...,N103US,,16.0,10.0,18.0,5.0,North Carolina,New York,1610,1805
3,TRAIN_000003,7,10,0,0,LAX,12892,EWR,11618,2454.0,...,N595UA,,9.0,5.0,17.0,35.0,California,New Jersey,905,1735
4,TRAIN_000004,1,11,0,0,SFO,14771,ACV,10157,250.0,...,N161SY,,9.0,0.0,10.0,19.0,California,California,900,1019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,TRAIN_999995,9,18,0,0,ORD,13930,PHL,14100,678.0,...,N477UA,,9.0,36.0,12.0,43.0,Illinois,Pennsylvania,936,1243
999996,TRAIN_999996,5,30,0,0,FAR,11637,MSP,13487,223.0,...,N439SW,,9.0,20.0,10.0,28.0,North Dakota,Minnesota,920,1028
999997,TRAIN_999997,6,28,0,0,OAK,13796,HOU,12191,1642.0,...,N230WN,,8.0,0.0,13.0,40.0,California,Texas,800,1340
999998,TRAIN_999998,9,27,0,0,BNA,10693,ATL,10397,214.0,...,N968DL,,16.0,13.0,18.0,24.0,Tennessee,Georgia,1613,1824


In [20]:
test_lb = test_lb.drop(columns = ['Estimated_Departure_Time','Estimated_Arrival_Time','Estimated_Departure_Time_HH:MM', 'Estimated_Arrival_Time_HH:MM'])
test_lb['Estimated_Departure_Time'] = [int(str(int(i)).zfill(2) + str(int(j)).zfill(2)) for i, j in zip(test_lb['Estimated_Departure_HH'], test_lb['Estimated_Departure_MM'])]
test_lb['Estimated_Arrival_Time'] = [int(str(int(i)).zfill(2) + str(int(j)).zfill(2)) for i, j in zip(test_lb['Estimated_Arrival_HH'], test_lb['Estimated_Arrival_MM'])]

## 거리에 따라 그룹화

단거리, 중거리일수록 Delay가 잘 생기고, 장거리 노선은 Delay가 거의 생기지 않음.

이걸 활용하면 좀 더 정확한 예측이 가능할 듯 

In [21]:
def distance_group(x):
		if x < 700 : # 단거리: 700마일 미만
			return 'short'
		elif x>=700 and x<3000:  # 중거리: 700마일 이상 3000마일 미만
			return 'mid'
		elif x>=3000: # 장거리: 3000마일 이상 
			return 'long'  
		else:
			return 'dummy'
	
train_lb['route_gb'] = train_lb['Distance'].apply(distance_group)
test_lb['route_gb'] = test_lb['Distance'].apply(distance_group)

## 시간에 따라 그룹화 

In [22]:
# 예상 출발, 도착시간 그룹 화
def time_gb(x):		
		if x >= 600 and x <= 659:
			return '0600-0659'
		elif x>=1400 and x<=1459:
			return '1400-1459'
		elif x>=1200 and x<=1259:
			return '1200-1259'
		elif x>=1500 and x<=1559:
			return '1500-1559'
		elif x>=1900 and x<=1959:
			return '1900-1959'
		elif x>=900 and x<=959:
			return '0900-0959'
		elif x>=1000 and x<=1059:
			return  '1000-1059'
		elif x>=2000 and x<=2059:
			return '2000-2059'
		elif x>=1300 and x<=1359:
			return '1300-1359'
		elif x>=1100 and x<=1159:
			return '1100-1159'
		elif x>=800 and x<=859:
			return '0800-0859'
		elif x>=2200 and x<=2259:
			return '2200-2259'
		elif x>=1600 and x<=1659:
			return '1600-1659'
		elif x>=1700 and x<=1759:
			return '1700-1759'
		elif x>=2100 and x<=2159:
			return '2100-2159'
		elif x>=700 and x<=759:
			return '0700-0759'
		elif x>=1800 and x<=1859:
			return '1800-1859'
		elif x>=1 and x<=559:
			return '0001-0559'
		elif x>=2300 and x<=2400:
			return '2300-2400'
		else:
			return 'dummy'

train_lb['Dep_time_gb'] = train_lb['Estimated_Departure_Time'].dropna().apply(time_gb)
train_lb['Arr_time_gb'] = train_lb['Estimated_Arrival_Time'].dropna().apply(time_gb)

test_lb['Dep_time_gb'] = test_lb['Estimated_Departure_Time'].dropna().apply(time_gb)
test_lb['Arr_time_gb'] = test_lb['Estimated_Arrival_Time'].dropna().apply(time_gb)

## Delay(Target) 준지도학습 

In [23]:
label_st = train_lb.drop(columns = ['Delay'])
train_st = train_lb[(train_lb['Delay'].astype(str) != 'None') & (train_lb['Delay'].astype(str) != 'nan')].drop(columns = ['Delay'])
X_nan = train_lb[(train_lb['Delay'].astype(str) == 'None')|(train_lb['Delay'].astype(str) == 'nan')].drop(columns = ['Delay'])

qual_col = ['Airline', 'Tail_Number', 'Destination_Airport', 'Origin_State', 'Carrier_Code(IATA)', 'Carrier_ID(DOT)', 'Destination_State', 
            'Origin_Airport', 'Estimated_Departure_Time','Estimated_Arrival_Time','route_gb', 'Dep_time_gb', 'Arr_time_gb']

for i in qual_col:
    oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-2)
    oe=oe.fit(label_st[i].to_numpy().reshape(-1, 1))
    train_st[i]=oe.transform(train_st[i].to_numpy().reshape(-1, 1))
    X_nan[i]=oe.transform(X_nan[i].to_numpy().reshape(-1, 1))

original_col = ['Cancelled','Diverted','Origin_Airport','Origin_Airport_ID', 'Origin_State', 
                'Destination_Airport', 'Destination_Airport_ID', 'Destination_State', 'Tail_Number', 'Month', 'Day_of_Month', 'Distance', 
                'route_gb', 'Dep_time_gb', 'Arr_time_gb']

X = train_st[original_col]
y = train_lb[(train_lb['Delay'].astype(str) != 'None')][original_col + ['Delay']]['Delay']

stclf = SelfTrainingClassifier(
        base_estimator = RandomForestClassifier(n_estimators = 15, random_state=42),
        verbose = True)
stclf.fit(X, y)

X_nan['Delay_pred'] = stclf.predict(X_nan[original_col])
train_lb.loc[train_lb['Delay'].astype(str) == 'None', 'Delay'] = X_nan['Delay_pred']

print('Done.')



Done.


In [24]:
train_lb

Unnamed: 0,ID,Month,Day_of_Month,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Destination_Airport,Destination_Airport_ID,Distance,...,Estimated_Departure_MM,Estimated_Arrival_HH,Estimated_Arrival_MM,Origin_State,Destination_State,Estimated_Departure_Time,Estimated_Arrival_Time,route_gb,Dep_time_gb,Arr_time_gb
0,TRAIN_000000,4,15,0,0,OKC,13851,HOU,12191,419.0,...,35.0,7.0,0.0,Oklahoma,Texas,535,700,short,0001-0559,0700-0759
1,TRAIN_000001,8,15,0,0,ORD,13930,SLC,14869,1250.0,...,40.0,10.0,24.0,Illinois,Utah,740,1024,mid,0700-0759,1000-1059
2,TRAIN_000002,9,6,0,0,CLT,11057,LGA,12953,544.0,...,10.0,18.0,5.0,North Carolina,New York,1610,1805,short,1600-1659,1800-1859
3,TRAIN_000003,7,10,0,0,LAX,12892,EWR,11618,2454.0,...,5.0,17.0,35.0,California,New Jersey,905,1735,mid,0900-0959,1700-1759
4,TRAIN_000004,1,11,0,0,SFO,14771,ACV,10157,250.0,...,0.0,10.0,19.0,California,California,900,1019,short,0900-0959,1000-1059
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,TRAIN_999995,9,18,0,0,ORD,13930,PHL,14100,678.0,...,36.0,12.0,43.0,Illinois,Pennsylvania,936,1243,short,0900-0959,1200-1259
999996,TRAIN_999996,5,30,0,0,FAR,11637,MSP,13487,223.0,...,20.0,10.0,28.0,North Dakota,Minnesota,920,1028,short,0900-0959,1000-1059
999997,TRAIN_999997,6,28,0,0,OAK,13796,HOU,12191,1642.0,...,0.0,13.0,40.0,California,Texas,800,1340,mid,0800-0859,1300-1359
999998,TRAIN_999998,9,27,0,0,BNA,10693,ATL,10397,214.0,...,13.0,18.0,24.0,Tennessee,Georgia,1613,1824,short,1600-1659,1800-1859


In [25]:
print(train_lb[train_lb['Delay'].astype(str) == 'Delayed'].shape[0])
print(train_lb[train_lb['route_gb'].astype(str) == 'short'].shape[0])
print(train_lb[(train_lb['route_gb'].astype(str) == 'short')&(train_lb['Delay'].astype(str) == 'Delayed')].shape[0])
print(train_lb[train_lb['route_gb'].astype(str) == 'mid'].shape[0])
print(train_lb[(train_lb['route_gb'].astype(str) == 'mid')&(train_lb['Delay'].astype(str) == 'Delayed')].shape[0])
print(train_lb[train_lb['route_gb'].astype(str) == 'long'].shape[0])
print(train_lb[(train_lb['route_gb'].astype(str) == 'long')&(train_lb['Delay'].astype(str) == 'Delayed')].shape[0])

82580
560786
41432
437630
41033
1584
115


## Ordinal Encoding

In [26]:
train = train_lb
test = test_lb

In [27]:
# delayed, not_delyed -> 0, 1로 변경

map = {
    'Not_Delayed' :    1,
    'Delayed'     :    0, 
}
train['Delay'] = train.Delay.map(map)

print('Done.')

Done.


In [28]:
# Quantify qualitative variables
# 정성적 변수는 OrdinalEncoder를 사용하여 숫자로 인코딩됩니다.
qual_col = ['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 
            'Airline', 'Carrier_ID(DOT)', 'Carrier_Code(IATA)', 'Tail_Number', 
            'Estimated_Departure_Time','Estimated_Arrival_Time', 'route_gb', 'Dep_time_gb', 'Arr_time_gb']

for i in qual_col:
    oe = OrdinalEncoder(handle_unknown='use_encoded_value',
                         unknown_value=-2)
    oe=oe.fit(train[i].to_numpy().reshape(-1, 1))
    train[i] = oe.transform(train[i].to_numpy().reshape(-1, 1))
    test[i] = oe.transform(test[i].to_numpy().reshape(-1, 1))

print('Done.')

Done.


# Model Training

## Train, Validation 분리 

In [29]:
train_x = train.drop(columns = ['ID','Delay'], axis=1)
train_y = train['Delay']
test_x = test.drop(columns = ['ID'], axis=1)

In [30]:
# 교육 데이터는 교육 및 검증 세트로 분할되고 수치 기능은 StandardScaler를 사용하여 정규화됩니다.
# 모델은 GridSearchCV와 5겹 교차 검증을 사용하여 수행되는 하이퍼파라미터 튜닝과 함께 XGBClassifier를 사용하여 훈련됩니다.
# Split the training dataset into a training set and a validation set
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

## Catboost, Optuna 라이브러리 설치

In [31]:
# Colab에서 기본 제공하지 않는 라이브러리라 별도 설치가 필요합니다.
!pip install catboost # CatBoost
!pip install optuna # 하이퍼파라미터 최적화 프레임워크

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-3.1.1-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.7/365.7 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0
  Downloading alembic-1.10.4-py3-none-any.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.9/212.9 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.9.1
  Do

## CatBoost 

In [32]:
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from catboost import CatBoostClassifier, Pool

In [33]:
# catboost, optuna 파라미터, 함수 정의

sampler = TPESampler(seed=113)

# define function
def objective(trial):

    # Catboost parameter [출처: https://catboost.ai/en/docs/concepts/python-reference_catboostclassifier ]
    cbrm_param = {
        "objective": "Logloss",
        'l2_leaf_reg':trial.suggest_float("l2_leaf_reg", 2, 20),
        'max_bin': trial.suggest_int("max_bin", 100, 500),
        'learning_rate' : trial.suggest_loguniform('learning_rate', 0.001, 0.01),
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15]),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 300),
        'iterations':trial.suggest_int("iterations", 5, 10),
        'subsample': trial.suggest_uniform('subsample', 0.4, 1.0),
        'random_state': 42,
        'use_best_model': True,
        'early_stopping_rounds' : 5
    }

    # 모델 학습에 사용할 feature
    FEATURE = ['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 
              'Airline', 'Carrier_ID(DOT)', 'Carrier_Code(IATA)', 'Tail_Number', 
              'Estimated_Departure_Time','Estimated_Arrival_Time', 'route_gb', 'Dep_time_gb', 'Arr_time_gb']
    
    # train, validation X, y를 Pool로 합칩니다
    train_pool = Pool(data=train_x.astype('int'), label=train_y, cat_features=FEATURE)
    valid_pool = Pool(data=val_x.astype('int'), label=val_y, cat_features=FEATURE)

    # model 생성, 학습
    model_cbrm = CatBoostClassifier(**cbrm_param)
    model_cbrm = model_cbrm.fit(train_pool, eval_set=valid_pool, 
                           verbose=0)

    # validation data의 logloss 계산                
    logloss = log_loss(val_y, model_cbrm.predict(val_x.astype('int')))
    return logloss

# optun를 이용해 파라미터 최적화를 진행합니다. (50회)
optuna_cbrm = optuna.create_study(direction='minimize', sampler=sampler)
optuna_cbrm.optimize(objective, n_trials=10, timeout = 1800)

[32m[I 2023-05-06 00:35:06,293][0m A new study created in memory with name: no-name-da01b60e-2d1e-4c2a-80ce-a8568b34f6ef[0m
  'learning_rate' : trial.suggest_loguniform('learning_rate', 0.001, 0.01),
  'subsample': trial.suggest_uniform('subsample', 0.4, 1.0),
[32m[I 2023-05-06 00:35:44,257][0m Trial 0 finished with value: 2.9824320996824984 and parameters: {'l2_leaf_reg': 17.335738869141586, 'max_bin': 129, 'learning_rate': 0.007851122663573896, 'max_depth': 11, 'min_data_in_leaf': 206, 'iterations': 9, 'subsample': 0.8678165576938894}. Best is trial 0 with value: 2.9824320996824984.[0m
  'learning_rate' : trial.suggest_loguniform('learning_rate', 0.001, 0.01),
  'subsample': trial.suggest_uniform('subsample', 0.4, 1.0),
[32m[I 2023-05-06 00:36:05,246][0m Trial 1 finished with value: 2.9824320996824984 and parameters: {'l2_leaf_reg': 13.569488628767122, 'max_bin': 485, 'learning_rate': 0.0012837039728039537, 'max_depth': 11, 'min_data_in_leaf': 30, 'iterations': 10, 'subsample

In [45]:
# 가장 최적의 결과를 선택합니다.
cbrm_trial = optuna_cbrm.best_trial
cbrm_trial_params = cbrm_trial.params
print('Best Trial: score {},\nparams {}'.format(cbrm_trial.value, cbrm_trial_params)) # learning rate가 0.01을 넘어가는지 확인

cbrm_trial_params['use_best_model'] = True
cbrm_trial_params['random_state'] = 42
cbrm_trial_params['iterations'] = 45 
cbrm_trial_params

Best Trial: score 2.9824320996824984,
params {'l2_leaf_reg': 17.335738869141586, 'max_bin': 129, 'learning_rate': 0.007851122663573896, 'max_depth': 11, 'min_data_in_leaf': 206, 'iterations': 9, 'subsample': 0.8678165576938894}


{'l2_leaf_reg': 17.335738869141586,
 'max_bin': 129,
 'learning_rate': 0.007851122663573896,
 'max_depth': 11,
 'min_data_in_leaf': 206,
 'iterations': 45,
 'subsample': 0.8678165576938894,
 'use_best_model': True,
 'random_state': 42}

In [46]:
# 앞의 파라미터 최적화로 학습된 최적 파라미터를 params로 옮깁니다
params = cbrm_trial_params

In [47]:
# 최적의 파라미터를 사용하는 학습 모델을 생성합니다
from catboost import CatBoostClassifier 

cbrm_model = CatBoostClassifier(**params)

In [48]:
from sklearn.model_selection import KFold, StratifiedKFold
from catboost import Pool

# 학습 데이터를 StratifiedKFold로 분할합니다
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 예측 결과를 담을 배열 정의
y_valid_pred = 0*train_y
y_preds1_cbrm = np.zeros(test_x.shape[0]) # Not_Delayed
y_preds2_cbrm = np.zeros(test_x.shape[0]) # Delayed

qual_col = ['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 
            'Airline', 'Carrier_ID(DOT)', 'Carrier_Code(IATA)', 'Tail_Number', 
            'Estimated_Departure_Time','Estimated_Arrival_Time', 'route_gb']
            
for i, (train_index, test_index) in enumerate(skfold.split(train_x, train_y)):    
    # Fold 별로 데이터 분할
    X_train_fold, X_valid_fold = train_x.iloc[train_index], train_x.iloc[test_index]
    y_train_fold, y_valid_fold = train_y.iloc[train_index], train_y.iloc[test_index]
    # Train, Validation Pool
    train_data = Pool(data=X_train_fold.astype('int'), label=y_train_fold, cat_features=qual_col)
    valid_data = Pool(data=X_valid_fold.astype('int'), label=y_valid_fold, cat_features=qual_col)

    print( "\nFold ", i)
    # 모델 학습
    cbrm_fit_model = cbrm_model.fit(train_data, eval_set=valid_data, use_best_model=True, verbose=5)
    print( "  N trees = ", cbrm_model.tree_count_ )
    
    # Predict value Clipping
    y_preds1_cbrm +=  cbrm_fit_model.predict_proba(test_x[X_valid_fold.columns].astype('int'))[:,0]
    y_preds2_cbrm +=  cbrm_fit_model.predict_proba(test_x[X_valid_fold.columns].astype('int'))[:,1]
    
y_preds1_cbrm /= 5  # 예측 확률의 평균
y_preds2_cbrm /= 5


Fold  0
0:	learn: 0.6847966	test: 0.6847962	best: 0.6847962 (0)	total: 931ms	remaining: 41s
5:	learn: 0.6458115	test: 0.6457882	best: 0.6457882 (5)	total: 7.25s	remaining: 47.1s
10:	learn: 0.6106456	test: 0.6105655	best: 0.6105655 (10)	total: 11s	remaining: 33.9s
15:	learn: 0.5786421	test: 0.5785407	best: 0.5785407 (15)	total: 15.8s	remaining: 28.6s
20:	learn: 0.5501577	test: 0.5500634	best: 0.5500634 (20)	total: 21.2s	remaining: 24.3s
25:	learn: 0.5249293	test: 0.5248275	best: 0.5248275 (25)	total: 24.2s	remaining: 17.7s
30:	learn: 0.5012945	test: 0.5011704	best: 0.5011704 (30)	total: 29.9s	remaining: 13.5s
35:	learn: 0.4802183	test: 0.4801027	best: 0.4801027 (35)	total: 36.8s	remaining: 9.21s
40:	learn: 0.4609434	test: 0.4608161	best: 0.4608161 (40)	total: 42.2s	remaining: 4.12s
44:	learn: 0.4469767	test: 0.4468495	best: 0.4468495 (44)	total: 46.6s	remaining: 0us

bestTest = 0.4468495268
bestIteration = 44

  N trees =  45

Fold  1
0:	learn: 0.6847914	test: 0.6847629	best: 0.6847629

In [49]:
y_preds1_cbrm

array([0.30384826, 0.33003256, 0.35325829, ..., 0.35343333, 0.34069821,
       0.30650201])

In [50]:
y_preds2_cbrm

array([0.69615174, 0.66996744, 0.64674171, ..., 0.64656667, 0.65930179,
       0.69349799])

# 결과 파일 제출 

In [51]:
# 예측 결과가 아래에 저장됩니다.
test_x['Not_Delayed'] =  y_preds1_cbrm
test_x['Delayed'] =  y_preds2_cbrm

In [52]:
submission = sample_submission.reset_index()
submission = submission.drop(columns = ['Not_Delayed','Delayed'])

In [53]:
pred_y = test_x[['Not_Delayed', 'Delayed']]

In [54]:
pred_y_np = pred_y.values
submission = pd.DataFrame(data=pred_y_np, columns=sample_submission.columns, index=sample_submission.index)
submission

Unnamed: 0_level_0,Not_Delayed,Delayed
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
TEST_000000,0.303848,0.696152
TEST_000001,0.330033,0.669967
TEST_000002,0.353258,0.646742
TEST_000003,0.356374,0.643626
TEST_000004,0.372572,0.627428
...,...,...
TEST_999995,0.353801,0.646199
TEST_999996,0.290173,0.709827
TEST_999997,0.353433,0.646567
TEST_999998,0.340698,0.659302


In [55]:
submission.to_csv('/content/drive/MyDrive/dataset/airplane/model_ensemble_submission_feature_add.csv', index=True) # 결과 파일 제출