In [1]:
import pandas as pd
import numpy as np
import random
import os
import gc
import matplotlib.pyplot as plt
from pycaret.classification import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import log_loss

def seed_everything(seed):
    """Seed the global random number generators used by this notebook.

    Sets ``PYTHONHASHSEED`` in ``os.environ`` and seeds both Python's
    ``random`` module and NumPy's legacy global generator with ``seed``.

    NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    setting it here presumably only affects child processes -- confirm.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
seed_everything(42)  # fix the seed

def csv_to_parquet(csv_path, save_name):
    """Convert a CSV file to ``./<save_name>.parquet`` in the working directory.

    Parameters
    ----------
    csv_path : path of the CSV file to read.
    save_name : base name (without extension) of the parquet file to write;
        also echoed in the completion message.
    """
    target = f'./{save_name}.parquet'
    frame = pd.read_csv(csv_path)
    frame.to_parquet(target)
    # Release the temporary frame right away; the caller re-reads the parquet file.
    del frame
    gc.collect()
    print(save_name, 'Done.')
# Convert both splits to parquet, then load them from the parquet files.
for split_name in ('train', 'test'):
    csv_to_parquet(f'./{split_name}.csv', split_name)
train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')
# The first CSV column becomes the index of the submission template.
sample_submission = pd.read_csv('sample_submission.csv', index_col=0)

train Done.
test Done.


In [2]:
# Days in each month of a non-leap year.
mom = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
# DAYS_BEFORE_MONTH[m - 1] == number of days that precede the 1st of month m.
DAYS_BEFORE_MONTH = np.concatenate(([0], np.cumsum(mom)[:-1]))


def encode_day_of_year(df):
    """Return sin(pi * day_of_year / 365) for every row of ``df``.

    The day of the year is built from the integer columns ``Month`` (1-12)
    and ``Day_of_Month``. This is a half-period sine: it peaks mid-year and
    days d and 365 - d map to the same value.
    """
    month_idx = df['Month'].astype(int).to_numpy() - 1
    day_of_year = DAYS_BEFORE_MONTH[month_idx] + df['Day_of_Month'].astype(int)
    return np.sin((day_of_year / 365) * np.pi)


def encode_hhmm(times):
    """Return sin(pi * minutes_since_midnight / 1439) for HHMM-coded times.

    ``times`` holds clock times written as the number HHMM (e.g. 1530 for
    15:30). Missing values stay missing (NaN) in the result.
    """
    hhmm = np.trunc(times)  # mirrors int(): drop any fractional part
    minutes = (hhmm // 100) * 60 + (hhmm % 100)
    return np.sin((minutes / 1439) * np.pi)


# Vectorised replacement for the per-row apply(axis=1) lambdas: same arithmetic,
# applied column-wise to both splits.
for frame in (train, test):
    frame['Day_of_Month'] = encode_day_of_year(frame)
    for time_col in ('Estimated_Departure_Time', 'Estimated_Arrival_Time'):
        frame[time_col] = encode_hhmm(frame[time_col])

In [3]:
# Columns removed from both splits before modelling.
unused_columns = ['ID', 'Month', 'Origin_Airport', 'Destination_Airport', 'Cancelled', 'Diverted']
train = train.drop(columns=unused_columns)
train.info()
test = test.drop(columns=unused_columns)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 13 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   Day_of_Month              1000000 non-null  float64
 1   Estimated_Departure_Time  890981 non-null   float64
 2   Estimated_Arrival_Time    890960 non-null   float64
 3   Origin_Airport_ID         1000000 non-null  int64  
 4   Origin_State              890985 non-null   object 
 5   Destination_Airport_ID    1000000 non-null  int64  
 6   Destination_State         890921 non-null   object 
 7   Distance                  1000000 non-null  float64
 8   Airline                   891080 non-null   object 
 9   Carrier_Code(IATA)        891010 non-null   object 
 10  Carrier_ID(DOT)           891003 non-null   float64
 11  Tail_Number               1000000 non-null  object 
 12  Delay                     255001 non-null   object 
dtypes: float64(5), int64(2), obj

In [4]:
# # Replace missing values in every variable except the label (Delay) with the mode of the training data
# # i.e. each NaN is filled with that column's most frequent value
# NaN_col = ['Origin_State','Destination_State','Airline','Estimated_Departure_Time', 'Estimated_Arrival_Time','Carrier_Code(IATA)','Carrier_ID(DOT)']

# for col in NaN_col:
#     mode = train[col].mode()[0]
#     train[col] = train[col].fillna(mode)
    
#     if col in test.columns:
#         test[col] = test[col].fillna(mode)
# print('Done.')

# Drop training rows whose category value occurs fewer than `threshold` times.
# Columns are processed one after another, so each count is taken on the frame
# as already filtered by the previous columns.
threshold = 10
category_column = ['Origin_Airport_ID', 'Origin_State', 'Destination_Airport_ID', 'Destination_State', 'Airline','Carrier_Code(IATA)', 'Carrier_ID(DOT)', 'Tail_Number']
for column_name in category_column:
    # Frequency of every value in this column.
    counts = train[column_name].value_counts()

    # Values seen fewer than `threshold` times.
    rare_values = counts.index[(counts < threshold).to_numpy()].tolist()

    # Keep only the rows whose value is not one of the rare ones.
    keep_mask = ~train[column_name].isin(rare_values)
    train = train[keep_mask]
    
def to_number(x):
    """Map a raw ``Delay`` label to an integer code.

    Returns
    -------
    -1 when the label is missing (``None``, NaN or ``pd.NA``),
     1 when the label is the string ``'Delayed'``,
     0 for any other value.
    """
    # pd.isna covers None as well as NaN / pd.NA; a plain `== None` test would
    # let NaN fall through to the "not delayed" branch.
    if pd.isna(x):
        return -1
    if x == 'Delayed':
        return 1
    return 0
    
# Replace the column instead of writing through .loc[:, ...]: newer pandas sets
# .loc[:, col] values in place and would keep the original object dtype.
train = train.assign(Delay=train['Delay'].map(to_number).astype('int64')) # 0 : Not Delayed  //  1 : Delayed

# Quantify qualitative variables
# Qualitative variables are encoded as integers with LabelEncoder.
qual_col = ['Origin_State', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train[i])
    train[i] = le.transform(train[i])

    # Labels that only occur in the test split are appended to the encoder's
    # classes so that transform() does not fail on them. They are taken in
    # order of first appearance (pd.unique) rather than in set() order, so the
    # code given to each unseen label is the same on every run.
    known = set(le.classes_)
    unseen = [label for label in pd.unique(test[i]) if label not in known]
    if unseen:
        le.classes_ = np.append(le.classes_, np.asarray(unseen, dtype=object))
    test[i] = le.transform(test[i])

# Drop every training row that still has a missing value in any column.
train = train.dropna()
print('Done.')

Done.


In [5]:
# Label distribution after encoding: -1 = label missing, 0 = not delayed, 1 = delayed.
train['Delay'].value_counts()

-1    525968
 0    148460
 1     31763
Name: Delay, dtype: int64

In [6]:
# Balance the labelled rows: the smaller of the two classes (Delay == 0 / 1) is
# oversampled with replacement until it has as many rows as the larger one.
# random_state is passed explicitly so the draw does not depend on how much of
# the global NumPy RNG earlier cells have consumed; re-running this cell then
# yields the same rows.
# NOTE(review): the rows are duplicated before the model's own holdout split,
# so copies of one row may end up on both sides of that split -- confirm this
# is acceptable for the reported validation metric.
selected_row_0 = train.loc[train['Delay'] == 0]
selected_row_1 = train.loc[train['Delay'] == 1]
if len(selected_row_0) < len(selected_row_1):
    selected_row_0 = selected_row_0.sample(n=len(selected_row_1), replace=True, random_state=42)
else:
    selected_row_1 = selected_row_1.sample(n=len(selected_row_0), replace=True, random_state=42)
train_0to1 = pd.concat([selected_row_0, selected_row_1], ignore_index=True)
# Shuffle so the two classes are interleaved.
train_0to1 = train_0to1.sample(frac=1, random_state=42)
# selected_row_null = train.loc[train['Delay'] == -1]
# selected_row_null = selected_row_null.sample(n = 31813)
# train_sorted = pd.concat([selected_row_0, selected_row_1, selected_row_null], ignore_index=True)
# train_sorted = train_sorted.sample(frac=1)

In [7]:
# Sanity check: both classes should now have the same number of rows.
train_0to1['Delay'].value_counts()

0    148460
1    148460
Name: Delay, dtype: int64

In [8]:
# Separate the target column from the features for the first-stage model.
y_train = train_0to1['Delay']
X_train = train_0to1.drop(columns='Delay')

In [9]:
from flaml import AutoML

# First-stage model: trained on the balanced, labelled rows only, with a
# 10-minute search budget.
automl = AutoML()
automl.fit(
    X_train=X_train,
    y_train=y_train,
    task="classification",
    ensemble=True,
    early_stop=True,
    time_budget=10 * 60,
    seed=42,
)

[flaml.automl.logger: 04-10 17:12:11] {1768} INFO - task = classification
[flaml.automl.logger: 04-10 17:12:11] {1775} INFO - Data split method: stratified
[flaml.automl.logger: 04-10 17:12:11] {1778} INFO - Evaluation method: holdout
[flaml.automl.logger: 04-10 17:12:11] {1891} INFO - Minimizing error metric: 1-roc_auc
[flaml.automl.logger: 04-10 17:12:11] {2011} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
[flaml.automl.logger: 04-10 17:12:11] {2341} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 04-10 17:12:11] {2480} INFO - Estimated sufficient time budget=11757s. Estimated necessary time budget=289s.
[flaml.automl.logger: 04-10 17:12:11] {2532} INFO -  at 0.6s,	estimator lgbm's best error=0.3925,	best estimator lgbm's best error=0.3925
[flaml.automl.logger: 04-10 17:12:11] {2341} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 04-10 17:12:11] {2532} INFO -  at 0.7s,	estimator 

In [10]:
# Rows whose label is missing (encoded as -1): remove the placeholder label
# column and let the first-stage model predict a label for each of them.
selected_row_null = train.loc[train['Delay'] == -1].drop(columns=['Delay'])
preds = automl.predict(selected_row_null)

In [11]:
# Attach the predicted labels to the previously unlabelled rows and combine
# them with the rows that carry a real label.
selected_row_null['Delay'] = preds
selected_row_0 = train.loc[train['Delay'] == 0]
selected_row_1 = train.loc[train['Delay'] == 1]
train_filled = pd.concat([selected_row_0, selected_row_1, selected_row_null], ignore_index=True)
# Shuffle with an explicit seed: the global NumPy RNG state at this point
# depends on everything executed before, including the AutoML search.
train_filled = train_filled.sample(frac=1, random_state=42)

In [12]:
# Label distribution once the unlabelled rows carry the first-stage model's predictions.
train_filled['Delay'].value_counts()

0    549342
1    156849
Name: Delay, dtype: int64

In [13]:
# Balance the pseudo-labelled training set: the smaller class is oversampled
# with replacement up to the size of the larger one.
# random_state is passed explicitly because the global NumPy RNG state at this
# point depends on everything executed before, including the AutoML search.
selected_row_0 = train_filled.loc[train_filled['Delay'] == 0]
selected_row_1 = train_filled.loc[train_filled['Delay'] == 1]
if len(selected_row_0) < len(selected_row_1):
    selected_row_0 = selected_row_0.sample(n=len(selected_row_1), replace=True, random_state=42)
else:
    selected_row_1 = selected_row_1.sample(n=len(selected_row_0), replace=True, random_state=42)
train_filled = pd.concat([selected_row_0, selected_row_1], ignore_index=True)
# Shuffle so the two classes are interleaved.
train_filled = train_filled.sample(frac=1, random_state=42)

In [14]:
# Sanity check: the oversampled frame should be balanced between the two classes.
train_filled['Delay'].value_counts()

0    549342
1    549342
Name: Delay, dtype: int64

In [15]:
# Separate the target column from the features for the second-stage model.
y_train = train_filled['Delay']
X_train = train_filled.drop(columns='Delay')

In [16]:
from flaml import AutoML

# Second-stage model: trained on the balanced real + pseudo-labelled rows,
# with a 30-minute search budget.
automl_2 = AutoML()
automl_2.fit(
    X_train=X_train,
    y_train=y_train,
    task="classification",
    ensemble=True,
    early_stop=True,
    time_budget=30 * 60,
    seed=42,
)

[flaml.automl.logger: 04-10 17:26:05] {1768} INFO - task = classification
[flaml.automl.logger: 04-10 17:26:05] {1775} INFO - Data split method: stratified
[flaml.automl.logger: 04-10 17:26:05] {1778} INFO - Evaluation method: holdout
[flaml.automl.logger: 04-10 17:26:05] {1891} INFO - Minimizing error metric: 1-roc_auc
[flaml.automl.logger: 04-10 17:26:05] {2011} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
[flaml.automl.logger: 04-10 17:26:05] {2341} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 04-10 17:26:06] {2480} INFO - Estimated sufficient time budget=54382s. Estimated necessary time budget=1335s.
[flaml.automl.logger: 04-10 17:26:06] {2532} INFO -  at 2.3s,	estimator lgbm's best error=0.1723,	best estimator lgbm's best error=0.1723
[flaml.automl.logger: 04-10 17:26:06] {2341} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 04-10 17:26:06] {2532} INFO -  at 2.4s,	estimator

In [17]:
# Hard labels and class probabilities for the test split; only the
# probabilities are used by the visible cells that follow.
label = automl_2.predict(test)
y_pred = automl_2.predict_proba(test)

In [18]:
# Lay the predicted probabilities out with the sample submission's index and
# column names, then write them to disk.
submission = pd.DataFrame(
    y_pred,
    index=sample_submission.index,
    columns=sample_submission.columns,
)
submission.to_csv('submission.csv')  # the index is written by default