In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier

In [2]:
# Base URL of the raw `data/` directory in the Yorko/mlcourse.ai GitHub repository;
# CSV file names are appended to it when loading.
DATA_PATH = "https://raw.githubusercontent.com/Yorko/mlcourse.ai/main/data/"

In [3]:
# Download the labelled and unlabelled flight-delay tables (train first, then test).
train, test = (
    pd.read_csv(DATA_PATH + file_name)
    for file_name in ("flight_delays_train.csv", "flight_delays_test.csv")
)

In [4]:
# Preview the first rows of the training frame (includes the `dep_delayed_15min` label).
train.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [5]:
# Preview the first rows of the test frame (same features, no label column).
test.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,c-7,c-25,c-3,615,YV,MRY,PHX,598
1,c-4,c-17,c-2,739,WN,LAS,HOU,1235
2,c-12,c-2,c-7,651,MQ,GSP,ORD,577
3,c-3,c-25,c-7,1614,WN,BWI,MHT,377
4,c-6,c-6,c-3,1505,UA,ORD,STL,258


In [6]:
# Build one integer encoding for every airport code that appears as an origin or
# destination in either split, so train and test share the same ids.
# `Series.append` was removed in pandas 2.0; `pd.concat` stacks the same four
# columns in the same order, so ids (order of first appearance) are unchanged.
places = pd.Series(
    pd.concat(
        [test['Origin'], test['Dest'], train['Origin'], train['Dest']],
        ignore_index=True,
    ).unique()
).to_dict()  # position -> airport code
# Invert to airport code -> integer id, the form used for `.map(...)`.
places_map = {v: k for k, v in places.items()}
places_map

{'MRY': 0,
 'LAS': 1,
 'GSP': 2,
 'BWI': 3,
 'ORD': 4,
 'ORF': 5,
 'IAD': 6,
 'MSP': 7,
 'PIT': 8,
 'DFW': 9,
 'JFK': 10,
 'ASE': 11,
 'GRR': 12,
 'DEN': 13,
 'DAB': 14,
 'ATL': 15,
 'CMH': 16,
 'MDT': 17,
 'IAH': 18,
 'MIA': 19,
 'LAX': 20,
 'TYS': 21,
 'MCI': 22,
 'STL': 23,
 'LGA': 24,
 'DTW': 25,
 'CLT': 26,
 'DCA': 27,
 'SFO': 28,
 'MSN': 29,
 'IND': 30,
 'ALB': 31,
 'HNL': 32,
 'PWM': 33,
 'MDW': 34,
 'PHX': 35,
 'BOS': 36,
 'GEG': 37,
 'ABE': 38,
 'DAL': 39,
 'EWR': 40,
 'LEX': 41,
 'OAK': 42,
 'SAN': 43,
 'SGF': 44,
 'OMA': 45,
 'MAF': 46,
 'CLD': 47,
 'SNA': 48,
 'BUR': 49,
 'PHL': 50,
 'PDX': 51,
 'BDL': 52,
 'ANC': 53,
 'JAN': 54,
 'OKC': 55,
 'MLU': 56,
 'RDU': 57,
 'ONT': 58,
 'SLC': 59,
 'TWF': 60,
 'FLL': 61,
 'ABI': 62,
 'CLE': 63,
 'CVG': 64,
 'AZO': 65,
 'SWF': 66,
 'SJU': 67,
 'COS': 68,
 'GSO': 69,
 'MEM': 70,
 'MCO': 71,
 'RSW': 72,
 'TPA': 73,
 'FAT': 74,
 'BUF': 75,
 'JAX': 76,
 'ITO': 77,
 'ERI': 78,
 'PIA': 79,
 'LIT': 80,
 'SBA': 81,
 'CID': 82,
 'BHM': 83,
 '

In [8]:
# Build one integer encoding for every carrier code seen in either split.
# `Series.append` was removed in pandas 2.0; `pd.concat` stacks the columns in the
# same order (test first, then train), so ids follow the same first-seen order.
carriers = pd.Series(
    pd.concat(
        [test['UniqueCarrier'], train['UniqueCarrier']],
        ignore_index=True,
    ).unique()
).to_dict()  # position -> carrier code
# Invert to carrier code -> integer id, the form used for `.map(...)`.
carriers_map = {v: k for k, v in carriers.items()}
carriers_map

{'YV': 0,
 'WN': 1,
 'MQ': 2,
 'UA': 3,
 'NW': 4,
 'B6': 5,
 'US': 6,
 'AA': 7,
 'OH': 8,
 'OO': 9,
 'FL': 10,
 'DL': 11,
 'EV': 12,
 'CO': 13,
 'XE': 14,
 '9E': 15,
 'HA': 16,
 'AS': 17,
 'AQ': 18,
 'F9': 19,
 'DH': 20,
 'TZ': 21,
 'HP': 22}

In [9]:
def prepare_df(df):
    """Return an integer-encoded copy of a flight-delays frame.

    The input frame is not modified. In the copy:

    * ``Dest`` and ``Origin`` are mapped through the module-level ``places_map``;
    * ``UniqueCarrier`` is mapped through the module-level ``carriers_map``;
    * ``Month``, ``DayofMonth`` and ``DayOfWeek`` have their ``'c-'`` prefix
      stripped and are cast to ``int``.

    Values missing from a mapping become NaN (standard ``Series.map`` behaviour).
    All other columns are passed through unchanged.
    """
    encoded = df.copy()

    # Airport columns share a single code -> id dictionary.
    for airport_col in ('Dest', 'Origin'):
        encoded[airport_col] = encoded[airport_col].map(places_map)

    encoded['UniqueCarrier'] = encoded['UniqueCarrier'].map(carriers_map)

    # Calendar columns arrive as strings such as 'c-8'; keep only the number.
    for calendar_col in ('Month', 'DayofMonth', 'DayOfWeek'):
        encoded[calendar_col] = (
            encoded[calendar_col].str.replace('c-', '').astype(int)
        )

    return encoded

In [10]:
# Binary target: 'Y' (delayed) -> 1, 'N' -> 0, as a NumPy array.
y_train = train['dep_delayed_15min'].map({'Y': 1, 'N': 0}).values
# Encoded training features with the label column removed.
X_train = prepare_df(train).drop(columns=['dep_delayed_15min'])
# The test frame goes through the same encoding.
X_test = prepare_df(test)

In [11]:
# Hold out 30% of the labelled rows to measure ROC AUC before the final fit.
X_train_part, X_valid, y_train_part, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    random_state=17,
)

model = CatBoostClassifier(
    random_state=17,
    learning_rate=0.1,
    max_depth=5,
    verbose=False,
)
model.fit(X_train_part, y_train_part)

# Column 1 of predict_proba is the probability of class 1 (delayed).
model_valid_pred = model.predict_proba(X_valid)[:, 1]

roc_auc_score(y_valid, model_valid_pred)

0.742409983767691

In [12]:
# Refit the same estimator on every labelled row, then score the test set.
model.fit(X_train, y_train)
model_test_pred = model.predict_proba(X_test)[:, 1]

# Write the submission: an `id` index column plus the predicted probability.
(
    pd.Series(model_test_pred, name='dep_delayed_15min')
    .to_csv('catboost_solution1.csv', index_label='id', header=True)
)