In [16]:
import pandas as pd
import numpy as np
import random
import os
import gc

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

def csv_to_parquet(csv_path, save_name):
    df = pd.read_csv(csv_path)
    df.to_parquet(f'./{save_name}.parquet')
    del df
    gc.collect()
    print(save_name, 'Done.')

In [17]:
csv_to_parquet('./train.csv', 'train')
csv_to_parquet('./test.csv', 'test')

train Done.
test Done.


In [18]:
train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')
sample_submission = pd.read_csv('sample_submission.csv', index_col = 0)

In [20]:
#레이블(Delay)을 제외한 결측값이 존재하는 변수들을 학습 데이터의 최빈값으로 대체합니다
#가장 많이 나온 값들로 NaN을 대체함
NaN_col = ['Origin_State','Destination_State','Airline','Estimated_Departure_Time', 'Estimated_Arrival_Time','Carrier_Code(IATA)','Carrier_ID(DOT)']

for col in NaN_col:
    mode = train[col].mode()[0]
    train[col] = train[col].fillna(mode)
    
    if col in test.columns:
        test[col] = test[col].fillna(mode)
print('Done.')

#레이블이 없는 데이터들을 제거합니다
train = train.dropna()
train.info()

Done.
<class 'pandas.core.frame.DataFrame'>
Int64Index: 255001 entries, 5 to 999992
Data columns (total 19 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ID                        255001 non-null  object 
 1   Month                     255001 non-null  int64  
 2   Day_of_Month              255001 non-null  int64  
 3   Estimated_Departure_Time  255001 non-null  float64
 4   Estimated_Arrival_Time    255001 non-null  float64
 5   Cancelled                 255001 non-null  int64  
 6   Diverted                  255001 non-null  int64  
 7   Origin_Airport            255001 non-null  object 
 8   Origin_Airport_ID         255001 non-null  int64  
 9   Origin_State              255001 non-null  object 
 10  Destination_Airport       255001 non-null  object 
 11  Destination_Airport_ID    255001 non-null  int64  
 12  Destination_State         255001 non-null  object 
 13  Distance                  255001 non-n

In [21]:
column_number = {}
for i, column in enumerate(sample_submission.columns):
    column_number[column] = i
    
def to_number(x, dic):
    return dic[x]

train.loc[:, 'Delay_num'] = train['Delay'].apply(lambda x: to_number(x, column_number))
print('Done.')

Done.


In [22]:
train_x = train.drop(columns=['ID', 'Delay_num'])
train_y = train['Delay_num']
test_x = test.drop(columns=['ID'])

In [None]:
from pycaret.classification import *
from sklearn.metrics import log_loss

exp_name = setup(data=train_x, target=train_y, use_gpu=True)
add_metric('logloss', 'Log Loss', log_loss, greater_is_better = False)
best_logloss_model = compare_models(fold = 5, n_select = 1, sort = 'logloss',
                                    include=['lr', 'ridge', 'lda', 'dummy', 'knn', 'qda'])

In [15]:
best_logloss_model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1974, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
best_logloss_model = tune_model(best_logloss_model)
evaluate_model(best_logloss_model)

# best_tune_blender = tune_model(blender)
# evaluate_model(best_tune_blender)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Log Loss
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.745,0.5145,0.0,0.0,0.0,0.0,0.0,8.8074
1,0.745,0.5168,0.0,0.0,0.0,0.0,0.0,8.8074
2,0.745,0.5201,0.0,0.0,0.0,0.0,0.0,8.8074
3,0.745,0.5147,0.0,0.0,0.0,0.0,0.0,8.8074
4,0.745,0.5132,0.0,0.0,0.0,0.0,0.0,8.8074
5,0.745,0.5141,0.0,0.0,0.0,0.0,0.0,8.8074
6,0.745,0.5197,0.0,0.0,0.0,0.0,0.0,8.8074
7,0.745,0.5132,0.0,0.0,0.0,0.0,0.0,8.8074
8,0.745,0.5196,0.0,0.0,0.0,0.0,0.0,8.8074
9,0.745,0.5148,0.0,0.0,0.0,0.0,0.0,8.8079


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Log Loss
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.745,0.5145,0.0,0.0,0.0,0.0,0.0,8.8074
1,0.745,0.5168,0.0,0.0,0.0,0.0,0.0,8.8074
2,0.745,0.5201,0.0,0.0,0.0,0.0,0.0,8.8074
3,0.745,0.5147,0.0,0.0,0.0,0.0,0.0,8.8074
4,0.745,0.5132,0.0,0.0,0.0,0.0,0.0,8.8074
5,0.745,0.5141,0.0,0.0,0.0,0.0,0.0,8.8074
6,0.745,0.5197,0.0,0.0,0.0,0.0,0.0,8.8074
7,0.745,0.5132,0.0,0.0,0.0,0.0,0.0,8.8074
8,0.745,0.5196,0.0,0.0,0.0,0.0,0.0,8.8074
9,0.745,0.5148,0.0,0.0,0.0,0.0,0.0,8.8079


Fitting 10 folds for each of 10 candidates, totalling 100 fits


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [13]:
y_pred_logloss = predict_model(best_logloss_model, data= test_x)[['prediction_label']]
# y_pred_blender = predict_model(best_tune_blender, data= test_x)[['prediction_label']]

In [14]:
submission = pd.DataFrame(data=y_pred_logloss, columns=sample_submission.columns, index=sample_submission.index)
submission.to_csv('logloss_submission.csv', index=True)

# submission = pd.DataFrame(data=y_pred_blender, columns=sample_submission.columns, index=sample_submission.index)
# submission.to_csv('blender_submission.csv', index=True)