In [1]:
import pandas as pd
import numpy as np
import random
import os
import gc
import matplotlib.pyplot as plt
from pycaret.classification import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import log_loss

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

def csv_to_parquet(csv_path, save_name):
    df = pd.read_csv(csv_path)
    df.to_parquet(f'./{save_name}.parquet')
    del df
    gc.collect()
    print(save_name, 'Done.')
csv_to_parquet('./train.csv', 'train')
csv_to_parquet('./test.csv', 'test')
train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')
sample_submission = pd.read_csv('sample_submission.csv', index_col = 0)

train Done.
test Done.


In [2]:
mom = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
train['Day_of_Month'] = train.apply(lambda x: np.sin(((sum(mom[:(int(x['Month'])-1)]) + int(x['Day_of_Month'])) / 365) * np.pi), axis=1)
test['Day_of_Month'] = test.apply(lambda x: np.sin(((sum(mom[:(int(x['Month'])-1)]) + int(x['Day_of_Month'])) / 365) * np.pi), axis=1)
train['Estimated_Departure_Time'] = train.apply(lambda x: np.sin((((int(x['Estimated_Departure_Time']) // 100) * 60 + (int(x['Estimated_Departure_Time']) % 100)) / 1439) * np.pi) if pd.notnull(x['Estimated_Departure_Time']) else None, axis=1)
test['Estimated_Departure_Time'] = test.apply(lambda x: np.sin((((int(x['Estimated_Departure_Time']) // 100) * 60 + (int(x['Estimated_Departure_Time']) % 100)) / 1439) * np.pi) if pd.notnull(x['Estimated_Departure_Time']) else None, axis=1)
train['Estimated_Arrival_Time'] = train.apply(lambda x: np.sin((((int(x['Estimated_Arrival_Time']) // 100) * 60 + (int(x['Estimated_Arrival_Time']) % 100)) / 1439) * np.pi) if pd.notnull(x['Estimated_Arrival_Time']) else None, axis=1)
test['Estimated_Arrival_Time'] = test.apply(lambda x: np.sin((((int(x['Estimated_Arrival_Time']) // 100) * 60 + (int(x['Estimated_Arrival_Time']) % 100)) / 1439) * np.pi) if pd.notnull(x['Estimated_Arrival_Time']) else None, axis=1)

In [3]:
train = train.drop(columns=['ID', 'Month', 'Origin_Airport', 'Destination_Airport', 'Cancelled', 'Diverted'])
train.info()
test = test.drop(columns=['ID', 'Month', 'Origin_Airport', 'Destination_Airport', 'Cancelled', 'Diverted'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 13 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   Day_of_Month              1000000 non-null  float64
 1   Estimated_Departure_Time  890981 non-null   float64
 2   Estimated_Arrival_Time    890960 non-null   float64
 3   Origin_Airport_ID         1000000 non-null  int64  
 4   Origin_State              890985 non-null   object 
 5   Destination_Airport_ID    1000000 non-null  int64  
 6   Destination_State         890921 non-null   object 
 7   Distance                  1000000 non-null  float64
 8   Airline                   891080 non-null   object 
 9   Carrier_Code(IATA)        891010 non-null   object 
 10  Carrier_ID(DOT)           891003 non-null   float64
 11  Tail_Number               1000000 non-null  object 
 12  Delay                     255001 non-null   object 
dtypes: float64(5), int64(2), obj

In [4]:
# #레이블(Delay)을 제외한 결측값이 존재하는 변수들을 학습 데이터의 최빈값으로 대체합니다
# #가장 많이 나온 값들로 NaN을 대체함
# NaN_col = ['Origin_State','Destination_State','Airline','Estimated_Departure_Time', 'Estimated_Arrival_Time','Carrier_Code(IATA)','Carrier_ID(DOT)']

# for col in NaN_col:
#     mode = train[col].mode()[0]
#     train[col] = train[col].fillna(mode)
    
#     if col in test.columns:
#         test[col] = test[col].fillna(mode)
# print('Done.')

def to_number(x):
    if x == None: return -1
    elif x == 'Delayed': return 1
    else: return 0
    
train.loc[:, 'Delay'] = train['Delay'].apply(lambda x: to_number(x))

# Quantify qualitative variables
# 정성적 변수는 LabelEncoder를 사용하여 숫자로 인코딩됩니다.
qual_col = ['Origin_State', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train[i])
    train[i] = le.transform(train[i])
    
    for label in set(test[i]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    test[i] = le.transform(test[i])

train = train.dropna()
print('Done.')

Done.


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 707317 entries, 1 to 999999
Data columns (total 13 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Day_of_Month              707317 non-null  float64
 1   Estimated_Departure_Time  707317 non-null  float64
 2   Estimated_Arrival_Time    707317 non-null  float64
 3   Origin_Airport_ID         707317 non-null  int64  
 4   Origin_State              707317 non-null  int32  
 5   Destination_Airport_ID    707317 non-null  int64  
 6   Destination_State         707317 non-null  int32  
 7   Distance                  707317 non-null  float64
 8   Airline                   707317 non-null  int32  
 9   Carrier_Code(IATA)        707317 non-null  int32  
 10  Carrier_ID(DOT)           707317 non-null  float64
 11  Tail_Number               707317 non-null  int32  
 12  Delay                     707317 non-null  int64  
dtypes: float64(5), int32(5), int64(3)
memory usa

In [6]:
threshold = 10
category_column = ['Origin_Airport_ID', 'Origin_State', 'Destination_Airport_ID', 'Destination_State', 'Airline','Carrier_Code(IATA)', 'Carrier_ID(DOT)', 'Tail_Number']
for column_name in category_column:
    # value_counts() 메서드를 사용하여 해당 column에서 각 값의 빈도수 계산
    value_counts = train[column_name].value_counts()

    # 빈도수가 threshold보다 작은 값들의 인덱스를 추출하여 리스트로 저장
    to_remove = value_counts[value_counts < threshold].index.tolist()

    # to_remove 리스트에 속하지 않은 row들로 이루어진 새로운 dataframe 생성
    train = train[~train[column_name].isin(to_remove)]

In [7]:
train['Delay'].value_counts()

-1    526842
 0    148658
 1     31817
Name: Delay, dtype: int64

In [8]:
selected_row_0 = train.loc[train['Delay'] == 0]
selected_row_1 = train.loc[train['Delay'] == 1]
if len(selected_row_0) > len(selected_row_1): selected_row_0 = selected_row_0.sample(n=len(selected_row_1))
else: selected_row_1 = selected_row_1.sample(n=len(selected_row_0))
train_0to1 = pd.concat([selected_row_0, selected_row_1], ignore_index=True)
train_0to1 = train_0to1.sample(frac=1)
# selected_row_null = train.loc[train['Delay'] == -1]
# selected_row_null = selected_row_null.sample(n = 31813)
# train_sorted = pd.concat([selected_row_0, selected_row_1, selected_row_null], ignore_index=True)
# train_sorted = train_sorted.sample(frac=1)

In [9]:
train_0to1['Delay'].value_counts()

0    31817
1    31817
Name: Delay, dtype: int64

In [10]:
# from sklearn.cluster import KMeans
# # selected_row_null = selected_row_null.drop(columns=['Delay'])
# clusterer = KMeans(n_clusters=2, random_state=10)
# cluster_labels = clusterer.fit_predict(selected_row_null)


In [11]:
# list(clusterer.fit_predict(selected_row_1)).count(1)

In [12]:
exp_name = setup(data=train_0to1, target='Delay', 
                use_gpu=True)
add_metric('logloss', 'Log Loss', log_loss, greater_is_better = False)
best_model_1 = compare_models(n_select = 1, sort='logloss', fold=5)

In [13]:
selected_row_null = train.loc[train['Delay'] == -1]
selected_row_null = selected_row_null.drop(columns=['Delay'])
selected_row_null['Delay'] = predict_model(best_model_1, data= selected_row_null)['prediction_label'].values
selected_row_0 = train.loc[train['Delay'] == 0]
selected_row_1 = train.loc[train['Delay'] == 1]
train_filled = pd.concat([selected_row_0, selected_row_1, selected_row_null], ignore_index=True)
train_filled = train_filled.sample(frac=1)

In [14]:
train_filled['Delay'].value_counts()

0    432605
1    274712
Name: Delay, dtype: int64

In [25]:
selected_row_0 = train_filled.loc[train_filled['Delay'] == 0]
selected_row_1 = train_filled.loc[train_filled['Delay'] == 1]
if len(selected_row_0) > len(selected_row_1): selected_row_0 = selected_row_0.sample(n=len(selected_row_1)//2)
else: selected_row_1 = selected_row_1.sample(n=len(selected_row_0))
train_filled = pd.concat([selected_row_0, selected_row_1], ignore_index=True)
train_filled = train_filled.sample(frac=1)

In [26]:
train_filled['Delay'].value_counts()

1    274712
0    137356
Name: Delay, dtype: int64

In [17]:
# from sklearn.semi_supervised import LabelPropagation
# # Semi-supervised learning을 위한 LabelPropagation 모델 학습
# X_train = train_sorted.drop(columns=['Delay'])
# y_train = train_sorted['Delay']
# label_prop_model = LabelPropagation()
# label_prop_model.fit(X_train, y_train)

# # 학습된 LabelPropagation 모델로 Unlabeled 데이터 라벨링
# y_unlabeled_pred = label_prop_model.predict(X_train)

In [27]:
exp_name = setup(data=train_filled, target='Delay', 
                use_gpu=True)
add_metric('logloss', 'Log Loss', log_loss, greater_is_better = False)

best_model = compare_models(n_select = 1, sort='logloss', fold=5)

Unnamed: 0,Description,Value
0,Session id,4656
1,Target,Delay
2,Target type,Binary
3,Original data shape,"(412068, 13)"
4,Transformed data shape,"(412068, 13)"
5,Transformed train set shape,"(288447, 13)"
6,Transformed test set shape,"(123621, 13)"
7,Numeric features,12
8,Preprocess,True
9,Imputation type,simple


In [28]:
# best_model = tune_model(best_model, optimize='logloss', 
#                         n_iter=50,
#                         choose_better = True)
# evaluate_model(best_model)

In [29]:
# from pycaret.classification import *
# from sklearn.metrics import log_loss
# exp_name = setup(data=train_x, target=train_y, use_gpu=True)
# add_metric('logloss', 'Log Loss', log_loss, greater_is_better = False)
# top5 = compare_models(n_select = 3, sort='logloss', fold=5, 
#                     include=['lr', 'ridge', 'lda', 'dummy', 'knn', 'qda'])
# stacker = stack_models(top5)

In [30]:
predict_model(best_model, data= test)

Unnamed: 0,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Origin_Airport_ID,Origin_State,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,prediction_label,prediction_score
0,0.128748,0.999971,,12266,42,14683,42,191.0,26,8,,4387,1,0.7830
1,0.811539,0.923357,0.772014,11618,28,10397,52,746.0,9,3,19790.0,1936,1,0.6282
2,0.530730,0.865297,0.581053,13930,11,12953,30,733.0,26,8,19977.0,2147,1,0.9715
3,0.927542,0.572135,0.410987,13796,4,12892,4,337.0,23,10,19393.0,5486,1,0.9680
4,0.998880,0.581053,0.273725,11697,7,12892,4,2343.0,18,2,20409.0,5965,1,0.9679
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0.967938,,0.171617,12892,4,11292,5,862.0,23,10,19393.0,3831,1,0.9525
999996,0.979614,0.707493,0.777535,10792,30,13930,11,473.0,26,8,19977.0,2069,0,0.8496
999997,0.984474,0.778906,0.595179,12264,52,11433,20,383.0,19,8,20378.0,2619,1,0.9205
999998,0.369725,,0.609122,14679,4,10721,19,2588.0,18,2,20409.0,6343,1,0.9648


In [31]:
y_pred_label = predict_model(best_model, data= test)['prediction_label'].values
y_pred_score = predict_model(best_model, data= test)['prediction_score'].values
y_pred = []  # 0 : Not Delayed  //  1 : Delayed
for i in range(len(test)):
    if y_pred_label[i] == 0: y_pred.append([y_pred_score[i], 1.0 - y_pred_score[i]])
    else: y_pred.append([1.0 - y_pred_score[i], y_pred_score[i]])

In [32]:
submission = pd.DataFrame(data=y_pred, columns=sample_submission.columns, index=sample_submission.index)
submission.to_csv('submission.csv', index=True)