In [35]:
import pandas as pd
import numpy as np
import random
import os
import gc

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, log_loss

In [20]:
def seed_everything(seed):
    """Seed the global random number generators used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both Python's ``random`` module and NumPy's legacy global RNG.

    Parameters
    ----------
    seed : int
        Value handed to ``random.seed`` and ``np.random.seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The two seeders are independent of each other, so order is irrelevant.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)

In [21]:
def csv_to_parquet(csv_path, save_name):
    """Convert a CSV file into ``./<save_name>.parquet``.

    Parameters
    ----------
    csv_path : str
        Location of the CSV file to read.
    save_name : str
        Stem of the Parquet file written to the current working directory.

    Prints ``<save_name> Done.`` once the file has been written.
    """
    # The frame is never bound to a name, so it becomes unreachable as soon
    # as the write returns; the explicit collection then reclaims it.
    pd.read_csv(csv_path).to_parquet(f'./{save_name}.parquet')
    gc.collect()
    print(save_name, 'Done.')

In [22]:
# Convert both competition CSVs to Parquet files in the working directory.
for csv_path, save_name in (
    ('/kaggle/input/airplane/train.csv', 'train'),
    ('/kaggle/input/airplane/test.csv', 'test'),
):
    csv_to_parquet(csv_path, save_name)

train Done.
test Done.


In [23]:
# Load the Parquet copies written by csv_to_parquet, then the submission
# template (its first CSV column becomes the index).
train, test = map(pd.read_parquet, ('./train.parquet', './test.parquet'))
sample_submission = pd.read_csv(
    '/kaggle/input/airplane/sample_submission.csv',
    index_col=0,
)

In [24]:
# Summary statistics for the numeric columns of the raw training frame
# (the count row reveals which numeric columns have missing values).
train.describe()

Unnamed: 0,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport_ID,Destination_Airport_ID,Distance,Carrier_ID(DOT)
count,1000000.0,1000000.0,890981.0,890960.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,891003.0
mean,6.945156,15.764842,1341.153019,1493.295934,0.0,0.0,12696.278484,12701.813986,784.078499,19997.388093
std,3.462506,8.763515,489.814011,520.803494,0.0,0.0,1514.938441,1515.213044,590.790469,404.268639
min,1.0,1.0,1.0,1.0,0.0,0.0,10135.0,10135.0,16.0,19393.0
25%,4.0,8.0,925.0,1105.0,0.0,0.0,11292.0,11292.0,350.0,19790.0
50%,7.0,16.0,1332.0,1524.0,0.0,0.0,12889.0,12889.0,623.0,19977.0
75%,10.0,23.0,1742.0,1924.0,0.0,0.0,14057.0,14057.0,1020.0,20378.0
max,12.0,31.0,2359.0,2400.0,0.0,0.0,16869.0,16869.0,5095.0,21171.0


In [25]:
# Show the first rows of the raw training frame to see every column,
# including the non-numeric ones and the Delay target.
train.head()

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay
0,TRAIN_000000,4,15,,,0,0,OKC,13851,Oklahoma,HOU,12191,Texas,419.0,Southwest Airlines Co.,WN,19393.0,N7858A,
1,TRAIN_000001,8,15,740.0,1024.0,0,0,ORD,13930,Illinois,SLC,14869,Utah,1250.0,SkyWest Airlines Inc.,UA,20304.0,N125SY,
2,TRAIN_000002,9,6,1610.0,1805.0,0,0,CLT,11057,North Carolina,LGA,12953,New York,544.0,American Airlines Inc.,AA,19805.0,N103US,
3,TRAIN_000003,7,10,905.0,1735.0,0,0,LAX,12892,California,EWR,11618,New Jersey,2454.0,United Air Lines Inc.,UA,,N595UA,
4,TRAIN_000004,1,11,900.0,1019.0,0,0,SFO,14771,California,ACV,10157,California,250.0,SkyWest Airlines Inc.,UA,20304.0,N161SY,


In [26]:
# Per-column dtype and non-null count of the raw training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 19 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   ID                        1000000 non-null  object 
 1   Month                     1000000 non-null  int64  
 2   Day_of_Month              1000000 non-null  int64  
 3   Estimated_Departure_Time  890981 non-null   float64
 4   Estimated_Arrival_Time    890960 non-null   float64
 5   Cancelled                 1000000 non-null  int64  
 6   Diverted                  1000000 non-null  int64  
 7   Origin_Airport            1000000 non-null  object 
 8   Origin_Airport_ID         1000000 non-null  int64  
 9   Origin_State              890985 non-null   object 
 10  Destination_Airport       1000000 non-null  object 
 11  Destination_Airport_ID    1000000 non-null  int64  
 12  Destination_State         890921 non-null   object 
 13  Distance                  10

In [27]:
NaN_col = ['Estimated_Departure_Time','Estimated_Arrival_Time','Origin_State', 'Destination_State','Airline','Carrier_Code(IATA)','Carrier_ID(DOT)']

# Impute each listed column with its most frequent *training* value.
# The same value is reused for the test frame, so no statistic is ever
# computed from test data.
for col in NaN_col:
    fill_value = train[col].mode()[0]
    for frame in (train, test):
        # The membership check only matters for test; train is known to
        # contain the column because its mode was just computed.
        if col in frame.columns:
            frame[col] = frame[col].fillna(fill_value)

In [28]:
Qual_col = ['Origin_Airport','Origin_State','Destination_Airport','Destination_State','Airline','Carrier_Code(IATA)','Tail_Number']

# Integer-encode every categorical column. Each encoder is fitted on the
# training values only; labels that occur solely in the test frame are
# appended to the encoder's classes so that transforming test does not fail.
for col in Qual_col:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])

    # One vectorised set difference replaces the per-label
    # `label not in le.classes_` scan and the repeated np.append calls.
    # np.setdiff1d returns the unseen labels unique and sorted, which is the
    # same order in which the loop over np.unique(test[col]) appended them.
    unseen = np.setdiff1d(test[col], le.classes_)
    if unseen.size:
        # NOTE(review): after this append classes_ is no longer globally
        # sorted (the original code had the same property) — confirm that
        # transform() tolerates this in the installed scikit-learn version.
        le.classes_ = np.append(le.classes_, unseen)
    test[col] = le.transform(test[col])

In [29]:
# Keep only rows with no missing value in any column (the defaults of
# dropna, spelled out). The train.info() cell that follows reports 255001
# remaining rows.
# NOTE(review): presumably the dropped rows are the ones without a Delay
# label, since the other columns were imputed or encoded above — confirm.
train = train.dropna(axis=0, how='any')

In [30]:
# Re-inspect dtypes and non-null counts after imputation, label encoding
# and dropna().
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 255001 entries, 5 to 999992
Data columns (total 19 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ID                        255001 non-null  object 
 1   Month                     255001 non-null  int64  
 2   Day_of_Month              255001 non-null  int64  
 3   Estimated_Departure_Time  255001 non-null  float64
 4   Estimated_Arrival_Time    255001 non-null  float64
 5   Cancelled                 255001 non-null  int64  
 6   Diverted                  255001 non-null  int64  
 7   Origin_Airport            255001 non-null  int64  
 8   Origin_Airport_ID         255001 non-null  int64  
 9   Origin_State              255001 non-null  int64  
 10  Destination_Airport       255001 non-null  int64  
 11  Destination_Airport_ID    255001 non-null  int64  
 12  Destination_State         255001 non-null  int64  
 13  Distance                  255001 non-null  float6

In [31]:
# Map each sample-submission column name to its positional index.
column_number = {column: i for i, column in enumerate(sample_submission.columns)}


def to_number(x, dic):
    """Return the value stored under key ``x`` in ``dic``.

    Raises ``KeyError`` when ``x`` is not a key of ``dic``.
    """
    return dic[x]


# Translate every Delay label into its class index; a label that is not a
# submission column raises KeyError rather than silently becoming NaN.
train.loc[:, 'Delay_num'] = train['Delay'].apply(to_number, args=(column_number,))

In [32]:
# Target: the numeric class index derived from Delay.
train_y = train['Delay_num']
# Features: everything except the row identifier and both target columns.
train_x = train.drop(['ID', 'Delay', 'Delay_num'], axis=1)
# The test frame only needs its identifier removed.
test_x = test.drop(['ID'], axis=1)

In [34]:
# Baseline random forest with default hyperparameters.
# random_state is pinned to the same value passed to seed_everything(42) so
# that re-running this cell reproduces the same forest; without it the
# result depends on how much of NumPy's global RNG stream earlier cell
# executions have already consumed.
rf = RandomForestClassifier(random_state=42)
rf.fit(train_x, train_y)

In [42]:
# Class-probability predictions of the random forest for the test features.
# NOTE(review): these columns are later labelled with
# sample_submission.columns, which presumes the probability columns come out
# in Delay_num order — confirm against rf.classes_.
y_pred_rf = rf.predict_proba(test_x)

In [38]:
# Baseline CatBoost classifier constructed with no arguments and fitted on
# the same features and target as the random forest.
# NOTE(review): no cat_features are passed, so the label-encoded columns are
# presumably handled as ordinary numeric features — confirm this is intended.
cat = CatBoostClassifier()
# Left as a bare final expression so the fitted estimator is echoed as the
# cell's output.
cat.fit(train_x, train_y)

Learning rate set to 0.109781
0:	learn: 0.6383903	total: 99ms	remaining: 1m 38s
1:	learn: 0.5966872	total: 138ms	remaining: 1m 9s
2:	learn: 0.5651279	total: 180ms	remaining: 59.9s
3:	learn: 0.5401110	total: 233ms	remaining: 57.9s
4:	learn: 0.5214741	total: 279ms	remaining: 55.5s
5:	learn: 0.5067219	total: 323ms	remaining: 53.6s
6:	learn: 0.4950679	total: 366ms	remaining: 51.9s
7:	learn: 0.4859896	total: 410ms	remaining: 50.8s
8:	learn: 0.4789850	total: 451ms	remaining: 49.6s
9:	learn: 0.4735459	total: 492ms	remaining: 48.7s
10:	learn: 0.4692905	total: 535ms	remaining: 48.1s
11:	learn: 0.4658664	total: 577ms	remaining: 47.5s
12:	learn: 0.4629944	total: 620ms	remaining: 47s
13:	learn: 0.4607254	total: 659ms	remaining: 46.4s
14:	learn: 0.4587585	total: 698ms	remaining: 45.9s
15:	learn: 0.4572426	total: 739ms	remaining: 45.5s
16:	learn: 0.4561311	total: 778ms	remaining: 45s
17:	learn: 0.4550464	total: 824ms	remaining: 44.9s
18:	learn: 0.4541618	total: 861ms	remaining: 44.5s
19:	learn: 0.45

<catboost.core.CatBoostClassifier at 0x7d93dfeda140>

In [40]:
# Class-probability predictions of the CatBoost model for the test features.
# NOTE(review): the columns are later labelled with
# sample_submission.columns; presumably their order matches the Delay_num
# ids — confirm against cat.classes_.
y_pred_cat = cat.predict_proba(test_x)

In [45]:
# Wrap the random-forest probabilities in the submission layout: same row
# index and column labels as the sample submission.
submission_rf = pd.DataFrame(
    y_pred_rf,
    index=sample_submission.index,
    columns=sample_submission.columns,
)

In [44]:
# Wrap the CatBoost probabilities in the submission layout: same row index
# and column labels as the sample submission.
submission_cat = pd.DataFrame(
    y_pred_cat,
    index=sample_submission.index,
    columns=sample_submission.columns,
)

In [46]:
# Write the random-forest submission, keeping the index as the first column.
submission_rf.to_csv(path_or_buf='rf_basic_submission.csv', index=True)

In [47]:
# Write the CatBoost submission, keeping the index as the first column.
submission_cat.to_csv(path_or_buf='cat_basic_submission.csv', index=True)