In [39]:
import random
import os
import numpy as np
import pandas as pd
import gc
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score,  precision_score, recall_score, make_scorer,f1_score
from xgboost import XGBClassifier

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Fixed Seed

def csv_to_parquet(csv_path, save_name):
    df = pd.read_csv(csv_path)
    df.to_parquet(f'./{save_name}.parquet')
    del df
    gc.collect()
    print(save_name, 'Done.')

csv_to_parquet('/Users/soyoung/ml_project/airplane/train.csv', 'train')
csv_to_parquet('/Users/soyoung/ml_project/airplane/test.csv', 'test')


train Done.
test Done.


In [40]:
train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')
sample_submission = pd.read_csv('/Users/soyoung/ml_project/airplane/sample_submission.csv', index_col = 0)

airline_list = train[['Airline',"Carrier_Code(IATA)","Carrier_ID(DOT)"]]
airline_list.dropna(inplace=True)
airline_list.drop_duplicates(inplace=True)

def missing_values(df):
    missing_values = df["Carrier_ID(DOT)"].isnull()
    for idx, value in enumerate(missing_values):
        if value:
            subset = airline_list[(airline_list["Airline"] == df["Airline"][idx]) | (airline_list["Carrier_Code(IATA)"] == df["Carrier_Code(IATA)"][idx])]
            if len(subset) > 0:
                df.at[idx, "Carrier_ID(DOT)"] = subset["Carrier_ID(DOT)"].iloc[0]
                continue
    return df

train = missing_values(train)
test = missing_values(test)

# Replace variables with missing values except for the label (Delay) with the most frequent values of the training data
# 컬럼의 누락된 값은 훈련 데이터에서 해당 컬럼의 최빈값으로 대체됩니다.
NaN_col = ['Estimated_Departure_Time', 'Estimated_Arrival_Time']

for col in NaN_col:
    mode = train[col].mode()[0]
    train[col] = train[col].fillna(mode)
    
    if col in test.columns:
        test[col] = test[col].fillna(mode)
print('Done.')




Done.


In [41]:
# "Carrier_ID(DOT)" 컬럼 값이 null인 행을 선택합니다.
train[train["Carrier_ID(DOT)"].isnull()]
train.isnull().sum()

ID                               0
Month                            0
Day_of_Month                     0
Estimated_Departure_Time         0
Estimated_Arrival_Time           0
Cancelled                        0
Diverted                         0
Origin_Airport                   0
Origin_Airport_ID                0
Origin_State                109015
Destination_Airport              0
Destination_Airport_ID           0
Destination_State           109079
Distance                         0
Airline                     108920
Carrier_Code(IATA)          108990
Carrier_ID(DOT)               1273
Tail_Number                      0
Delay                       744999
dtype: int64

In [None]:
droplist_first = ['ID','Origin_State','Destination_State','Carrier_Code(IATA)','Airline']
train.drop(columns=droplist_first,inplace=True)
test.drop(columns=droplist_first,inplace=True)

In [43]:

# Quantify qualitative variables
# 정성적 변수는 LabelEncoder를 사용하여 숫자로 인코딩됩니다.
qual_col = ['Origin_Airport', 'Destination_Airport', 'Tail_Number']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train[i])
    train[i] = le.transform(train[i])
    
    for label in np.unique(test[i]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    test[i] = le.transform(test[i])
print('Done.')

Done.


In [48]:
import pandas as pd
import numpy as np
import torch
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.metrics import Metric
from sklearn.metrics import f1_score
train.dropna(inplace=True)
train.replace('Delayed',1,inplace=True)
train.replace('Not_Delayed',0,inplace=True)

# 라벨이 있는 데이터셋
labeled_train_df = train.dropna(subset=['Delay'])

# 라벨이 없는 데이터셋
unlabeled_train_df = train[train['Delay'].isna()]

# 라벨이 있는 데이터셋으로 pretraining 모델 학습
unsupervised_model = TabNetPretrainer(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type='entmax'
)

unsupervised_model.fit(
    X_train=labeled_train_df.drop('Delay', axis=1).values,
    eval_set=[labeled_train_df.drop('Delay', axis=1).values],
    pretraining_ratio=0.8,
)

# 라벨이 없는 데이터셋에서 예측값 생성
unlabeled_train_x = unlabeled_train_df.drop('Delay', axis=1).values
unlabeled_pred = unsupervised_model.predict(unlabeled_train_x)
unlabeled_train_x = np.hstack((unlabeled_train_x, unlabeled_pred))

# 라벨이 있는 데이터셋
labeled_train_x = labeled_train_df.drop('Delay', axis=1).values
labeled_train_y = labeled_train_df['Delay'].values

# 라벨이 결측된 데이터 제외한 라벨이 없는 데이터셋
unlabeled_train_x = unlabeled_train_x[~np.isnan(unlabeled_train_df['Delay'])]

# 라벨이 결측된 데이터 제외한 라벨이 있는 데이터셋
labeled_train_x = labeled_train_x[~np.isnan(labeled_train_y)]
labeled_train_y = labeled_train_y[~np.isnan(labeled_train_y)]

# Fine-tuning을 위해 새로운 feature를 추가한 train_x, train_y 생성
train_x = np.vstack((labeled_train_x, unlabeled_train_x))
train_y = np.concatenate((labeled_train_y, unlabeled_train_df['Delay'].values))

# TabNetClassifier 모델 학습
clf = TabNetClassifier(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='sparsemax'
    
)
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

# EarlyStopping 객체 생성
early_stop_callback = EarlyStopping(
    monitor='logloss', # 모니터링할 metric 선택
    min_delta=0.1, # metric 개선 여부를 판단하는 최소 차이 값
    patience=10, # 개선이 없을 경우 학습을 중지하기까지의 epoch 수
    verbose=False, # True일 경우 early stopping이 실행될 때마다 메시지를 출력합니다.
    mode='min' # 모니터링할 metric이 감소해야 하는지, 증가해야 하는지를 선택합니다.
)

# val_0_unsup_loss_numpy는 unsupervised pretraining 단계에서 계산된 unsupervised loss 값을 나타냅니다. 크다고 나쁜거 아님.
clf.fit(
    X_train=train_x,
    y_train=train_y,
    eval_set=[(train_x, train_y)],
    eval_name=['train'],
    eval_metric=['logloss','f1'],
    from_unsupervised=unsupervised_model,
    callbacks=[early_stop_callback] # EarlyStopping 객체 전달

)


epoch 0  | loss: 192.40995| val_0_unsup_loss_numpy: 204.34649658203125|  0:00:41s
epoch 1  | loss: 149.485 | val_0_unsup_loss_numpy: 204.2398223876953|  0:01:21s
epoch 2  | loss: 108.30113| val_0_unsup_loss_numpy: 204.20974731445312|  0:02:02s
epoch 3  | loss: 72.64008| val_0_unsup_loss_numpy: 204.18020629882812|  0:02:43s
epoch 4  | loss: 45.32415| val_0_unsup_loss_numpy: 204.19676208496094|  0:03:23s
epoch 5  | loss: 27.28641| val_0_unsup_loss_numpy: 203.5570068359375|  0:04:04s
epoch 6  | loss: 15.73953| val_0_unsup_loss_numpy: 203.27999877929688|  0:04:46s
epoch 7  | loss: 9.3982  | val_0_unsup_loss_numpy: 203.13485717773438|  0:05:28s
epoch 8  | loss: 6.16895 | val_0_unsup_loss_numpy: 202.59197998046875|  0:06:09s
epoch 9  | loss: 4.69144 | val_0_unsup_loss_numpy: 202.02589416503906|  0:06:50s
epoch 10 | loss: 4.24826 | val_0_unsup_loss_numpy: 201.63180541992188|  0:07:31s
epoch 11 | loss: 3.70833 | val_0_unsup_loss_numpy: 201.35052490234375|  0:08:12s
epoch 12 | loss: 2.80764 | v

KeyboardInterrupt: 

In [46]:
y_pred = clf.predict_proba(test)

NameError: name 'clf' is not defined

In [None]:
submission = pd.DataFrame(data=y_pred, columns=sample_submission.columns, index=sample_submission.index)
submission.to_csv('submission_tabnet.csv', index=True)