In [1]:
import pandas as pd
import numpy as np
import random
import os
import gc

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

def csv_to_parquet(csv_path, save_name):
    df = pd.read_csv(csv_path)
    df.to_parquet(f'./{save_name}.parquet')
    del df
    gc.collect()
    print(save_name, 'Done.')

In [2]:
csv_to_parquet('./train.csv', 'train')
csv_to_parquet('./test.csv', 'test')

train Done.
test Done.


In [3]:
train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')
sample_submission = pd.read_csv('sample_submission.csv', index_col = 0)

In [4]:
train = train[['Estimated_Departure_Time', 'Estimated_Arrival_Time', 'Carrier_ID(DOT)', 'Distance', 'Origin_Airport_ID', 'Destination_Airport_ID', 'Delay']]
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 7 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   Estimated_Departure_Time  890981 non-null   float64
 1   Estimated_Arrival_Time    890960 non-null   float64
 2   Carrier_ID(DOT)           891003 non-null   float64
 3   Distance                  1000000 non-null  float64
 4   Origin_Airport_ID         1000000 non-null  int64  
 5   Destination_Airport_ID    1000000 non-null  int64  
 6   Delay                     255001 non-null   object 
dtypes: float64(4), int64(2), object(1)
memory usage: 53.4+ MB


In [5]:
def to_number(x):
    if x == None:
        return 0
    else:
        return 1

train.loc[:, 'Delay_num'] = train['Delay'].apply(lambda x: to_number(x))
print('Done.')

Done.


In [6]:
train = train.drop(columns=['Delay'])
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 7 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   Estimated_Departure_Time  890981 non-null   float64
 1   Estimated_Arrival_Time    890960 non-null   float64
 2   Carrier_ID(DOT)           891003 non-null   float64
 3   Distance                  1000000 non-null  float64
 4   Origin_Airport_ID         1000000 non-null  int64  
 5   Destination_Airport_ID    1000000 non-null  int64  
 6   Delay_num                 1000000 non-null  int64  
dtypes: float64(4), int64(3)
memory usage: 53.4 MB


In [7]:
train_drop = train.dropna()
train_drop.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 707317 entries, 1 to 999999
Data columns (total 7 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Estimated_Departure_Time  707317 non-null  float64
 1   Estimated_Arrival_Time    707317 non-null  float64
 2   Carrier_ID(DOT)           707317 non-null  float64
 3   Distance                  707317 non-null  float64
 4   Origin_Airport_ID         707317 non-null  int64  
 5   Destination_Airport_ID    707317 non-null  int64  
 6   Delay_num                 707317 non-null  int64  
dtypes: float64(4), int64(3)
memory usage: 43.2 MB


In [8]:
train_x = train_drop.drop(columns=['Delay_num'])
train_y = train_drop['Delay_num']
test_x = test[['Estimated_Departure_Time', 'Estimated_Arrival_Time', 'Carrier_ID(DOT)', 'Distance', 'Origin_Airport_ID', 'Destination_Airport_ID']]

In [13]:
from pycaret.classification import *
from sklearn.metrics import log_loss

exp_name = setup(data=train_x, target=train_y, use_gpu=True)
add_metric('logloss', 'Log Loss', log_loss, greater_is_better = False)
best_model = compare_models(fold=5, sort = 'logloss', 
                                    include=['lr', 'ridge', 'lda', 'dummy', 'knn', 'qda'])

In [14]:
best_model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=3807, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
best_tune = tune_model(best_model, n_iter=50, choose_better = True)
evaluate_model(best_tune)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Log Loss
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.7448,0.5176,0.0,0.0,0.0,0.0,0.0,8.8131
1,0.7448,0.5201,0.0,0.0,0.0,0.0,0.0,8.8126
2,0.7448,0.5157,0.0,0.0,0.0,0.0,0.0,8.8126
3,0.7448,0.5161,0.0,0.0,0.0,0.0,0.0,8.8126
4,0.7448,0.5137,0.0,0.0,0.0,0.0,0.0,8.8126
5,0.7448,0.5234,0.0,0.0,0.0,0.0,0.0,8.8126
6,0.7448,0.5159,0.0,0.0,0.0,0.0,0.0,8.8126
7,0.7448,0.52,0.0,0.0,0.0,0.0,0.0,8.8126
8,0.7448,0.5217,0.0,0.0,0.0,0.0,0.0,8.8126
9,0.7448,0.5211,0.0,0.0,0.0,0.0,0.0,8.8133


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Log Loss
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.7448,0.5176,0.0,0.0,0.0,0.0,0.0,8.8131
1,0.7448,0.5201,0.0,0.0,0.0,0.0,0.0,8.8126
2,0.7448,0.5157,0.0,0.0,0.0,0.0,0.0,8.8126
3,0.7448,0.5161,0.0,0.0,0.0,0.0,0.0,8.8126
4,0.7448,0.5137,0.0,0.0,0.0,0.0,0.0,8.8126
5,0.7448,0.5234,0.0,0.0,0.0,0.0,0.0,8.8126
6,0.7448,0.5159,0.0,0.0,0.0,0.0,0.0,8.8126
7,0.7448,0.52,0.0,0.0,0.0,0.0,0.0,8.8126
8,0.7448,0.5217,0.0,0.0,0.0,0.0,0.0,8.8126
9,0.7448,0.5211,0.0,0.0,0.0,0.0,0.0,8.8133


Fitting 10 folds for each of 50 candidates, totalling 500 fits


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [None]:
best_model_bagged = ensemble_model(best_model, 
                                    method = 'Bagging', 
                                    fold = 5,
                                    n_estimators = 100)
evaluate_model(best_model_bagged)

best_model_bossted = ensemble_model(best_model, 
                                    method = 'Boosting', 
                                    fold = 5,
                                    n_estimators = 100)
evaluate_model(best_model_bossted)

In [14]:
y_pred = predict_model(best_tune, data= test_x)[['prediction_label']]
submission = pd.DataFrame(data=y_pred, columns=sample_submission.columns, index=sample_submission.index)
submission.to_csv('baseline_submission.csv', index=True)