In [1]:
import pandas as pd
import numpy as np
import random
import os
import gc
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정
def csv_to_parquet(csv_path, save_name):
    df = pd.read_csv(csv_path)
    df.to_parquet(f'./{save_name}.parquet')
    del df
    gc.collect()
    print(save_name, 'Done.')
csv_to_parquet('./train.csv', 'train')
csv_to_parquet('./test.csv', 'test')

train Done.
test Done.


In [2]:
train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')
sample_submission = pd.read_csv('sample_submission.csv', index_col = 0)

In [4]:
train['Delay'].value_counts()

Not_Delayed    210001
Delayed         45000
Name: Delay, dtype: int64

In [3]:
train = train[['Estimated_Departure_Time', 'Estimated_Arrival_Time', 'Carrier_ID(DOT)', 'Distance', 'Origin_Airport_ID', 'Destination_Airport_ID', 'Airline', 'Month', 'Carrier_Code(IATA)', 'Delay']]
train.info()
test = test[['Estimated_Departure_Time', 'Estimated_Arrival_Time', 'Carrier_ID(DOT)', 'Distance', 'Origin_Airport_ID', 'Destination_Airport_ID', 'Airline', 'Month', 'Carrier_Code(IATA)']]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 10 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   Estimated_Departure_Time  890981 non-null   float64
 1   Estimated_Arrival_Time    890960 non-null   float64
 2   Carrier_ID(DOT)           891003 non-null   float64
 3   Distance                  1000000 non-null  float64
 4   Origin_Airport_ID         1000000 non-null  int64  
 5   Destination_Airport_ID    1000000 non-null  int64  
 6   Airline                   891080 non-null   object 
 7   Month                     1000000 non-null  int64  
 8   Carrier_Code(IATA)        891010 non-null   object 
 9   Delay                     255001 non-null   object 
dtypes: float64(4), int64(3), object(3)
memory usage: 76.3+ MB


In [4]:
def to_number(x):
    if x == None: return 1 # Delayed
    else: return 0 # Not Delayed

train.loc[:, 'Delay_num'] = train['Delay'].apply(lambda x: to_number(x))
train = train.drop(columns=['Delay'])
train = train.dropna()
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 561520 entries, 1 to 999998
Data columns (total 10 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Estimated_Departure_Time  561520 non-null  float64
 1   Estimated_Arrival_Time    561520 non-null  float64
 2   Carrier_ID(DOT)           561520 non-null  float64
 3   Distance                  561520 non-null  float64
 4   Origin_Airport_ID         561520 non-null  int64  
 5   Destination_Airport_ID    561520 non-null  int64  
 6   Airline                   561520 non-null  object 
 7   Month                     561520 non-null  int64  
 8   Carrier_Code(IATA)        561520 non-null  object 
 9   Delay_num                 561520 non-null  int64  
dtypes: float64(4), int64(4), object(2)
memory usage: 47.1+ MB


In [5]:
#### 1. h2o 분석 준비하기 ####
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import time
import h2o
from h2o.automl import H2OAutoML
from h2o.estimators.gbm import H2OGradientBoostingEstimator
h2o.init()
h2o.no_progress()
################################################################
## make dataset
# Identify the response and set of predictors
y = "Delay_num"
x = list(train.columns)  #if x is defined as all columns except the response, then x is not required
x.remove(y)

# # data_df을 8:2로 나눈다, 50 : 13
# train, valid = train_test_split(train,
#                                 test_size=0.2, 
#                                 shuffle=True)
h2o_train = h2o.H2OFrame(train)
# h2o_valid = h2o.H2OFrame(valid)

# For binary classification, response should be a factor
h2o_train[y] = h2o_train[y].asfactor()
# h2o_valid[y] = h2o_valid[y].asfactor()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM 18.9 (build 11.0.18+9-LTS-195, mixed mode)
  Starting server from c:\Python\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\KDY\AppData\Local\Temp\tmpq2f4bpny
  JVM stdout: C:\Users\KDY\AppData\Local\Temp\tmpq2f4bpny\h2o_KDY_started_from_python.out
  JVM stderr: C:\Users\KDY\AppData\Local\Temp\tmpq2f4bpny\h2o_KDY_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Asia/Seoul
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.40.0.3
H2O_cluster_version_age:,3 days
H2O_cluster_name:,H2O_from_python_KDY_71hfe8
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.982 Gb
H2O_cluster_total_cores:,0
H2O_cluster_allowed_cores:,0


In [6]:
aml = H2OAutoML(max_models = 25,
                balance_classes=True,
		        seed = 1)
aml.train(x = x, y = y, training_frame=h2o_train)
lb = aml.leaderboard


00:29:09.656: AutoML: XGBoost is not available; skipping it.



In [7]:
lb.head(rows=lb.nrows)

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
GBM_grid_1_AutoML_1_20230408_02909_model_2,0.522941,0.56727,0.758735,0.5,0.435657,0.189797
GBM_grid_1_AutoML_1_20230408_02909_model_7,0.522438,0.567328,0.758539,0.499994,0.435682,0.189819
GBM_grid_1_AutoML_1_20230408_02909_model_3,0.52195,0.567337,0.758062,0.5,0.435686,0.189822
GBM_5_AutoML_1_20230408_02909,0.521873,0.567361,0.757772,0.5,0.435696,0.189831
GBM_2_AutoML_1_20230408_02909,0.521606,0.567378,0.757783,0.5,0.435703,0.189837
GBM_grid_1_AutoML_1_20230408_02909_model_6,0.520708,0.567426,0.756732,0.499979,0.435725,0.189856
GBM_3_AutoML_1_20230408_02909,0.52009,0.567542,0.756477,0.5,0.435771,0.189896
StackedEnsemble_AllModels_1_AutoML_1_20230408_02909,0.520024,0.567617,0.756805,0.499997,0.435808,0.189928
GLM_1_AutoML_1_20230408_02909,0.519109,0.567471,0.755417,0.5,0.435744,0.189873
GBM_4_AutoML_1_20230408_02909,0.517737,0.568043,0.755068,0.499992,0.43598,0.190078


In [13]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 9 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   Estimated_Departure_Time  891016 non-null   float64
 1   Estimated_Arrival_Time    890952 non-null   float64
 2   Carrier_ID(DOT)           890994 non-null   float64
 3   Distance                  1000000 non-null  float64
 4   Origin_Airport_ID         1000000 non-null  int64  
 5   Destination_Airport_ID    1000000 non-null  int64  
 6   Airline                   893473 non-null   object 
 7   Month                     1000000 non-null  int64  
 8   Carrier_Code(IATA)        891007 non-null   object 
dtypes: float64(4), int64(3), object(2)
memory usage: 68.7+ MB


In [32]:
h2o_test = h2o.H2OFrame(test)
m = aml.get_best_model(criterion="logloss")
h2o.save_model(model=m)
preds = m.predict(h2o_test)



In [13]:
y_pred = preds.as_data_frame().drop(columns=['predict']).values

In [20]:
y_pred = y_pred.values

In [None]:
submission = pd.DataFrame(data=y_pred, columns=sample_submission.columns, index=sample_submission.index)
submission.to_csv('H2O_submission.csv', index=True)

In [23]:
# import h2o
# from h2o.automl import H2OAutoML
# h2o.init()
# h2o_test = h2o.H2OFrame(test)
saved_model = h2o.load_model('./GBM_grid_1_AutoML_1_20230408_02909_model_2')
preds = saved_model.predict(h2o_test)

gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%




In [24]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 9 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   Estimated_Departure_Time  891016 non-null   float64
 1   Estimated_Arrival_Time    890952 non-null   float64
 2   Carrier_ID(DOT)           890994 non-null   float64
 3   Distance                  1000000 non-null  float64
 4   Origin_Airport_ID         1000000 non-null  int64  
 5   Destination_Airport_ID    1000000 non-null  int64  
 6   Airline                   893473 non-null   object 
 7   Month                     1000000 non-null  int64  
 8   Carrier_Code(IATA)        891007 non-null   object 
dtypes: float64(4), int64(3), object(2)
memory usage: 68.7+ MB


In [25]:
y_pred = preds.as_data_frame().drop(columns=['predict']).values

In [26]:
y_pred = y_pred[1:]

In [27]:
y_pred.shape

(1000000, 2)

In [28]:
submission = pd.DataFrame(data=y_pred, columns=sample_submission.columns, index=sample_submission.index)
submission.to_csv('H2O_submission.csv', index=True)