# Import

In [364]:
import pandas as pd
import numpy as np
import random
import os
import gc
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

In [365]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

In [366]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

# csv to parquet
메모리에 효율적인 데이터 유형을 사용하여 용량을 줄이고 빠른 작업이 가능합니다

In [367]:
def csv_to_parquet(csv_path, save_name):
    df = pd.read_csv(csv_path)
    df.to_parquet(f'./{save_name}.parquet')
    del df
    gc.collect()
    print(save_name, 'Done.')

In [368]:
csv_to_parquet('./train.csv', 'train')
csv_to_parquet('./test.csv', 'test')

train Done.
test Done.


# Data Load

In [383]:
train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')
sample_submission = pd.read_csv('sample_submission.csv', index_col = 0)

# Data Pre-Processing

In [384]:
# Carrier ID를 채울때 Airline 혹은 Carriercode가 같은 행의 데이터로 채운다.
train_df = pd.DataFrame(train)
test_df = pd.DataFrame(test)

In [385]:
airline_list = train_df[['Airline',"Carrier_Code(IATA)","Carrier_ID(DOT)"]]
airline_list.dropna(inplace=True)
airline_list.drop_duplicates(inplace=True)


In [386]:
def missing_values(df):
    missing_values = df["Carrier_ID(DOT)"].isnull()
    for idx, value in enumerate(missing_values):
        if value:
            subset = airline_list[(airline_list["Airline"] == df["Airline"][idx]) | (airline_list["Carrier_Code(IATA)"] == df["Carrier_Code(IATA)"][idx])]
            if len(subset) > 0:
                df.at[idx, "Carrier_ID(DOT)"] = subset["Carrier_ID(DOT)"].iloc[0]
                continue
    return df

train_df = missing_values(train_df)
test_df = missing_values(test_df)

In [387]:
# missing_values = train_df["Carrier_ID(DOT)"].isnull()

# for idx, value in enumerate(missing_values):
#     if value:
#         subset = airline_list[(airline_list["Airline"] == train_df["Airline"][idx]) | (airline_list["Carrier_Code(IATA)"] == train_df["Carrier_Code(IATA)"][idx])]
#         if len(subset) > 0:
#             train_df.at[idx, "Carrier_ID(DOT)"] = subset["Carrier_ID(DOT)"].iloc[0]
#             continue

In [388]:
train_df['date'] = train_df['Month'].astype(str)+'-'+train_df['Day_of_Month'].astype(str)
train_df['date'] = pd.to_datetime(train_df['date'], format='%m-%d')
test_df['date'] = test_df['Month'].astype(str)+'-'+test_df['Day_of_Month'].astype(str)
test_df['date'] = pd.to_datetime(test_df['date'], format='%m-%d')

In [390]:
#질적 변수들을 수치화합니다
# qual_col = ['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number']
qual_col = ['date','Tail_Number']
for i in qual_col:
    le = LabelEncoder()
    test_df[i] = le.fit_transform(test_df[i])
    le=le.fit(train_df[i])
    train_df[i]=le.transform(train_df[i])
    


#     for label in np.unique(test_df[i]):
#         if label not in le.classes_: 
#             le.classes_ = np.append(le.classes_, label)
#     test_df[i]=le.transform(test_df[i])
# print('Done.')

In [391]:
#레이블이 없는 데이터들을 제거합니다
droplist = ['ID','Month','Day_of_Month', 'Origin_Airport','Origin_State','Destination_Airport','Destination_State','Airline','Carrier_Code(IATA)','Cancelled',	'Diverted']
train_df.replace('Delayed',1,inplace=True)
train_df.replace('Not_Delayed',0,inplace=True)
train_df['Delay_Num'] = pd.to_numeric(train_df['Delay'])
train_df.drop(columns=droplist+['Delay'],inplace=True,axis=1)

In [392]:
train_df.dropna(inplace=True)
train_x = train_df.drop(columns=['Delay_Num'])
train_y = train_df['Delay_Num']
test_x = test_df.drop(columns=droplist)

In [393]:
scaler.fit(train_x)
train_x = scaler.transform(train_x)
scaler.fit(test_x)
test_x = scaler.transform(test_x)

In [394]:
param_grid = {
    'learning_rate':0.05,
    'max_depth': 7,
    'min_child_weight':5,
    'subsample':1,
    'colsample_bytree':1
}
xgb = XGBClassifier(**param_grid,n_estimaters = 10000,early_stopping_rounds=100)
X_train, X_test, y_train, y_test = train_test_split(
    train_x, train_y, test_size=0.2, random_state=42
)


In [395]:
xgb.fit(X_train, y_train,eval_set=[(X_test, y_test)])

Parameters: { "n_estimaters" } are not used.

[0]	validation_0-logloss:0.67179
[1]	validation_0-logloss:0.65249
[2]	validation_0-logloss:0.63499
[3]	validation_0-logloss:0.61912
[4]	validation_0-logloss:0.60469
[5]	validation_0-logloss:0.59156
[6]	validation_0-logloss:0.57954
[7]	validation_0-logloss:0.56859
[8]	validation_0-logloss:0.55858
[9]	validation_0-logloss:0.54945
[10]	validation_0-logloss:0.54111
[11]	validation_0-logloss:0.53344
[12]	validation_0-logloss:0.52637
[13]	validation_0-logloss:0.51994
[14]	validation_0-logloss:0.51404
[15]	validation_0-logloss:0.50862
[16]	validation_0-logloss:0.50364
[17]	validation_0-logloss:0.49909
[18]	validation_0-logloss:0.49488
[19]	validation_0-logloss:0.49105
[20]	validation_0-logloss:0.48745
[21]	validation_0-logloss:0.48421
[22]	validation_0-logloss:0.48121
[23]	validation_0-logloss:0.47844
[24]	validation_0-logloss:0.47594
[25]	validation_0-logloss:0.47365
[26]	validation_0-logloss:0.47155
[27]	validation_0-logloss:0.46958
[28]	validat

In [396]:
# Make predictions for the test set
y_pred_test = xgb.predict(X_test)
# View accuracy score
accuracy_score(y_test, y_pred_test)

0.8217870741235227

In [397]:
y_pred = xgb.predict_proba(test_x)

# Submit

In [315]:
submission = pd.DataFrame(data=y_pred, columns=sample_submission.columns, index=sample_submission.index)

In [316]:
submission.to_csv('baseline_submission.csv', index=True)