In [1]:
import pandas as pd
import numpy as np
import random
import os
import gc
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import log_loss

def seed_everything(seed):
    """Seed every global random source this notebook relies on.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both Python's ``random`` module and NumPy's global RNG with ``seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
seed_everything(42) # fix the seed

def csv_to_parquet(csv_path, save_name):
    """Convert a CSV file to a parquet file in the current directory.

    Args:
        csv_path: Path of the CSV file to read.
        save_name: Base name of the output; the file is written to
            ``./<save_name>.parquet``.

    Prints ``<save_name> Done.`` once the file has been written.
    """
    parquet_path = f'./{save_name}.parquet'
    frame = pd.read_csv(csv_path)
    frame.to_parquet(parquet_path)
    # Drop the only reference and collect immediately so the frame is
    # released before the next (possibly large) file is converted.
    del frame
    gc.collect()
    print(save_name, 'Done.')
# Convert the raw CSVs to parquet only when the parquet file is missing, so the
# notebook runs from a fresh checkout without un-commenting the conversion and
# skips the slow CSV parse on later runs.
for _split in ('train', 'test'):
    if not os.path.exists(f'./{_split}.parquet'):
        csv_to_parquet(f'./{_split}.csv', _split)
train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')
# index_col=0 turns the first CSV column into the index; that index and the
# remaining column names are reused when the submission file is built.
sample_submission = pd.read_csv('sample_submission.csv', index_col = 0)

train Done.
test Done.


In [2]:
# Days per month of a non-leap year; used to turn (Month, Day_of_Month) into a day-of-year.
mom = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]


def _day_of_year_sin(df):
    """Return sin(pi * day_of_year / 365) for every row of ``df``.

    Reads the ``Month`` (1-based) and ``Day_of_Month`` columns. The day-of-year
    is the number of days in all preceding months plus the day of the month.
    The result is a float Series aligned with ``df.index``.
    """
    # days_before[m - 1] == sum(mom[:m - 1]), i.e. the days preceding month m.
    days_before = np.concatenate(([0], np.cumsum(mom)))
    month_idx = df['Month'].astype(int).to_numpy() - 1
    day_of_year = days_before[month_idx] + df['Day_of_Month'].astype(int).to_numpy()
    return pd.Series(np.sin((day_of_year / 365) * np.pi), index=df.index)


def _clock_time_sin(times):
    """Return sin(pi * minutes_since_midnight / 1439) for a Series of clock times.

    Each value is split as ``value // 100`` hours and ``value % 100`` minutes
    (presumably an HHMM-encoded time - confirm against the dataset description).
    1439 is 23 * 60 + 59, the last minute of a day. Missing values stay NaN.
    """
    # trunc mirrors the int() conversion of the row-wise version this replaces.
    hhmm = np.trunc(pd.to_numeric(times).astype('float64'))
    minutes = (hhmm // 100) * 60 + (hhmm % 100)
    return np.sin((minutes / 1439) * np.pi)


# Apply the same vectorised encodings to both frames instead of six copy-pasted
# row-wise apply() calls; each column is overwritten with its encoded value.
for _frame in (train, test):
    _frame['Day_of_Month'] = _day_of_year_sin(_frame)
    for _col in ('Estimated_Departure_Time', 'Estimated_Arrival_Time'):
        _frame[_col] = _clock_time_sin(_frame[_col])

In [None]:
# Columns left out of modelling; the same list is removed from both frames.
unused_columns = ['ID', 'Month', 'Origin_Airport', 'Destination_Airport', 'Cancelled', 'Diverted']
train, test = (frame.drop(columns=unused_columns) for frame in (train, test))

In [None]:
# Print a summary of `train` (columns, dtypes, non-null counts).
train.info()

In [None]:
# # Replace missing values in every variable except the label (Delay) with the mode of the training data
# # i.e. fill each NaN with the most frequent value
# NaN_col = ['Origin_State','Destination_State','Airline','Estimated_Departure_Time', 'Estimated_Arrival_Time','Carrier_Code(IATA)','Carrier_ID(DOT)']

# for col in NaN_col:
#     mode = train[col].mode()[0]
#     train[col] = train[col].fillna(mode)
    
#     if col in test.columns:
#         test[col] = test[col].fillna(mode)
# print('Done.')

threshold = 10
category_column = ['Origin_Airport_ID', 'Origin_State', 'Destination_Airport_ID', 'Destination_State', 'Airline','Carrier_Code(IATA)', 'Carrier_ID(DOT)', 'Tail_Number']
for column_name in category_column:
    # Frequency of each value of this column in the current (already filtered) train set.
    value_counts = train[column_name].value_counts()

    # Values that occur fewer than `threshold` times.
    to_remove = [value for value, count in value_counts.items() if count < threshold]

    # Keep only the rows whose value is not one of those rare values.
    is_rare = train[column_name].isin(to_remove)
    train = train.loc[~is_rare]
    
def to_number(x):
    """Map a raw ``Delay`` label to an integer code.

    Returns:
        -1 if the label is missing (None, NaN, pd.NA or NaT),
         1 if the label is the string 'Delayed',
         0 for any other value.
    """
    # pd.isna also catches NaN / pd.NA; the previous `x == None` check only
    # matched a literal None, so a NaN label was silently coded as 0.
    if pd.isna(x): return -1
    elif x == 'Delayed': return 1
    else: return 0
    
# `train` is a boolean-mask selection of the loaded frame at this point. Take an
# explicit copy so the column assignments below write to an independent frame
# instead of raising SettingWithCopyWarning.
train = train.copy()

# 0 : Not Delayed  //  1 : Delayed  //  -1 : missing label (see to_number)
# Plain column assignment replaces the column, so it gets an integer dtype
# rather than staying `object` as an in-place `.loc[:, 'Delay'] = ...` write can.
train['Delay'] = train['Delay'].apply(to_number)

# Quantify qualitative variables
# Qualitative variables are encoded as integers with LabelEncoder.
qual_col = ['Origin_State', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train[i])
    train[i] = le.transform(train[i])

    # Labels that occur only in the test set are appended to the fitted
    # classes so that transform() assigns them a code instead of failing.
    for label in set(test[i]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    test[i] = le.transform(test[i])

# Drop every training row that still has a missing value in any column.
train = train.dropna()
print('Done.')

In [None]:
# Row count per Delay code (-1 missing label, 0 not delayed, 1 delayed, as produced by to_number).
train['Delay'].value_counts()

In [None]:
selected_row_0 = train.loc[train['Delay'] == 0]
selected_row_1 = train.loc[train['Delay'] == 1]
# When class 0 is the larger group, down-sample it to one third of the class-1 count.
# random_state pins the draw so re-running this cell alone selects the same rows
# (42 matches the value passed to seed_everything).
if len(selected_row_0) > len(selected_row_1):
    selected_row_0 = selected_row_0.sample(n=len(selected_row_1)//3, random_state=42)
# else: selected_row_1 = selected_row_1.sample(n=len(selected_row_0), replace=True)
train_0to1 = pd.concat([selected_row_0, selected_row_1], ignore_index=True)
# Shuffle the rows, with a fixed seed for the same reason as above.
train_0to1 = train_0to1.sample(frac=1, random_state=42)
# selected_row_null = train.loc[train['Delay'] == -1]
# selected_row_null = selected_row_null.sample(n = 31813)
# train_sorted = pd.concat([selected_row_0, selected_row_1, selected_row_null], ignore_index=True)
# train_sorted = train_sorted.sample(frac=1)

In [None]:
# Row count per Delay code in the resampled, labelled-only training set.
train_0to1['Delay'].value_counts()

In [None]:
#### 1. Prepare for the h2o analysis ####
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import time
import h2o
from h2o.automl import H2OAutoML
from h2o.estimators.gbm import H2OGradientBoostingEstimator
# Initialise the H2O session and turn off its progress output.
h2o.init()
h2o.no_progress()

# ---------------------------------------------------------------
# Build the training frame for the first AutoML run.
# `y` is the response column; `x` is every other column of train_0to1.
y = "Delay"
x = [column for column in train_0to1.columns if column != y]

# # Split data_df 8:2, 50 : 13
# train, valid = train_test_split(train,
#                                 test_size=0.2,
#                                 shuffle=True)
h2o_train = h2o.H2OFrame(train_0to1)
# h2o_valid = h2o.H2OFrame(valid)

# The response must be a factor for this to be treated as binary classification.
h2o_train[y] = h2o_train[y].asfactor()
# h2o_valid[y] = h2o_valid[y].asfactor()

In [None]:
# First AutoML run: trained on the labelled, resampled frame only.
automl_settings_1 = dict(max_models=25, balance_classes=True, seed=1)
aml_1 = H2OAutoML(**automl_settings_1)
aml_1.train(x=x, y=y, training_frame=h2o_train)
lb_1 = aml_1.leaderboard

In [None]:
# Rows whose label was missing (coded -1), with the label column removed so
# only the predictors are sent to the model.
is_unlabeled = train['Delay'] == -1
selected_row_null = train.loc[is_unlabeled].drop(columns=['Delay'])
h2o_null = h2o.H2OFrame(selected_row_null)

# Predict labels for those rows with the best model of the first AutoML run.
m_1 = aml_1.get_best_model()
preds = m_1.predict(h2o_null)

In [None]:
# Convert the H2O predictions into a pandas DataFrame.
# NOTE: this rebinds `preds`, so the cell cannot be re-run without re-running the prediction cell first.
preds = preds.as_data_frame()

In [None]:
# Use the model's predicted class as the label of the previously unlabeled rows.
selected_row_null['Delay'] = preds['predict'].values
selected_row_0 = train.loc[train['Delay'] == 0]
selected_row_1 = train.loc[train['Delay'] == 1]
# Combine all originally labelled rows with the pseudo-labelled rows.
train_filled = pd.concat([selected_row_0, selected_row_1, selected_row_null], ignore_index=True)
# Shuffle with a fixed seed so re-running this cell yields the same row order
# (42 matches the value passed to seed_everything).
train_filled = train_filled.sample(frac=1, random_state=42)

In [None]:
# Row count per Delay value after the pseudo-labelled rows were added.
train_filled['Delay'].value_counts()

In [None]:
# selected_row_0 = train_filled.loc[train_filled['Delay'] == 0]
# selected_row_1 = train_filled.loc[train_filled['Delay'] == 1]
# # if len(selected_row_0) < len(selected_row_1): selected_row_1 = selected_row_1.sample(n=len(selected_row_0)//2)
# # else: selected_row_0 = selected_row_0.sample(n=len(selected_row_1)//2)
# train_filled = pd.concat([selected_row_0, selected_row_1], ignore_index=True)
# train_filled = train_filled.sample(frac=1)

In [None]:
# train_filled['Delay'].value_counts()

In [None]:
# ---------------------------------------------------------------
# Build the training frame for the second AutoML run.
# `y` is the response column; `x` is every other column of train_filled.
y = "Delay"
x = [column for column in train_filled.columns if column != y]

# # Split data_df 8:2, 50 : 13
# train, valid = train_test_split(train,
#                                 test_size=0.2,
#                                 shuffle=True)
h2o_train = h2o.H2OFrame(train_filled)
# h2o_valid = h2o.H2OFrame(valid)

# The response must be a factor for this to be treated as binary classification.
h2o_train[y] = h2o_train[y].asfactor()
# h2o_valid[y] = h2o_valid[y].asfactor()

In [None]:
# Second AutoML run: trained on the frame that includes the pseudo-labelled rows.
automl_settings = dict(max_models=25, balance_classes=True, seed=1)
aml = H2OAutoML(**automl_settings)
aml.train(x=x, y=y, training_frame=h2o_train)
lb = aml.leaderboard

In [None]:
# Pick the model that ranks best by logloss and persist it.
m = aml.get_best_model(criterion="logloss")
h2o.save_model(model=m)

# Score the test set with that model.
h2o_test = h2o.H2OFrame(test)
preds = m.predict(h2o_test)

In [None]:
# Display the test-set predictions returned by the model.
preds

In [None]:
# Keep everything except the hard class label ('predict') as a plain array.
pred_frame = preds.as_data_frame()
y_pred = pred_frame.drop(columns=['predict']).values

In [None]:
# Reuse the sample submission's index and column names so the output has the
# same layout, then write it (index included) to submission.csv.
submission = pd.DataFrame(
    y_pred,
    index=sample_submission.index,
    columns=sample_submission.columns,
)
submission.to_csv('submission.csv', index=True)