In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np
from catboost import CatBoostRegressor

In [2]:
def rmse(y_true, y_pred):
    result = np.sqrt(np.mean((y_true - y_pred)**2))
    return result

In [3]:
data = pd.read_parquet('/kaggle/input/flight-delay')
data.head()

Unnamed: 0,FlightDate,Airline,Origin,Dest,Cancelled,Diverted,CRSDepTime,DepTime,DepDelayMinutes,DepDelay,...,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrDelay,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,DistanceGroup,DivAirportLandings
0,2021-03-03,SkyWest Airlines Inc.,SGU,PHX,False,False,724,714.0,0.0,-10.0,...,724.0,813.0,5.0,843,-25.0,0.0,-2.0,0800-0859,2,0.0
1,2021-03-03,SkyWest Airlines Inc.,PHX,SGU,False,False,922,917.0,0.0,-5.0,...,940.0,1028.0,3.0,1040,-9.0,0.0,-1.0,1000-1059,2,0.0
2,2021-03-03,SkyWest Airlines Inc.,MHT,ORD,False,False,1330,1321.0,0.0,-9.0,...,1336.0,1445.0,16.0,1530,-29.0,0.0,-2.0,1500-1559,4,0.0
3,2021-03-03,SkyWest Airlines Inc.,DFW,TRI,False,False,1645,1636.0,0.0,-9.0,...,1703.0,1955.0,7.0,2010,-8.0,0.0,-1.0,2000-2059,4,0.0
4,2021-03-03,SkyWest Airlines Inc.,PHX,BFL,False,False,1844,1838.0,0.0,-6.0,...,1851.0,1900.0,3.0,1925,-22.0,0.0,-2.0,1900-1959,2,0.0


In [6]:
data = data[data['DepDelay'].isna() == False]

In [7]:
data['FlightDate'] = pd.to_datetime(data['FlightDate'], yearfirst=True)
data.sort_values(by='FlightDate', inplace=True)

In [8]:
data.head()

Unnamed: 0,FlightDate,Airline,Origin,Dest,Cancelled,Diverted,CRSDepTime,DepTime,DepDelayMinutes,DepDelay,...,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrDelay,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,DistanceGroup,DivAirportLandings
84462,2021-01-01,Delta Air Lines Inc.,ATL,EWR,False,False,1354,1350.0,0.0,-4.0,...,1404.0,1541.0,7.0,1554,-6.0,0.0,-1.0,1500-1559,3,0.0
208924,2021-01-01,SkyWest Airlines Inc.,SJC,SAN,False,False,1525,1515.0,0.0,-10.0,...,1523.0,1626.0,2.0,1643,-15.0,0.0,-1.0,1600-1659,2,0.0
194036,2021-01-01,SkyWest Airlines Inc.,BIL,PHX,False,False,523,517.0,0.0,-6.0,...,526.0,708.0,4.0,802,-50.0,0.0,-2.0,0800-0859,4,0.0
194037,2021-01-01,SkyWest Airlines Inc.,PHX,YUM,False,False,923,922.0,0.0,-1.0,...,947.0,1018.0,2.0,1027,-7.0,0.0,-1.0,1000-1059,1,0.0
7896,2021-01-01,Endeavor Air Inc.,BUF,ATL,False,False,1355,1350.0,0.0,-5.0,...,1358.0,1555.0,8.0,1613,-10.0,0.0,-1.0,1600-1659,3,0.0


In [9]:
data['DepDelayMinutes'].describe()

count    6.203458e+06
mean     1.276132e+01
std      4.736319e+01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      6.000000e+00
max      3.095000e+03
Name: DepDelayMinutes, dtype: float64

In [9]:
data.columns

Index(['FlightDate', 'Airline', 'Origin', 'Dest', 'Cancelled', 'Diverted',
       'CRSDepTime', 'DepTime', 'DepDelayMinutes', 'DepDelay', 'ArrTime',
       'ArrDelayMinutes', 'AirTime', 'CRSElapsedTime', 'ActualElapsedTime',
       'Distance', 'Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek',
       'Marketing_Airline_Network', 'Operated_or_Branded_Code_Share_Partners',
       'DOT_ID_Marketing_Airline', 'IATA_Code_Marketing_Airline',
       'Flight_Number_Marketing_Airline', 'Operating_Airline',
       'DOT_ID_Operating_Airline', 'IATA_Code_Operating_Airline',
       'Tail_Number', 'Flight_Number_Operating_Airline', 'OriginAirportID',
       'OriginAirportSeqID', 'OriginCityMarketID', 'OriginCityName',
       'OriginState', 'OriginStateFips', 'OriginStateName', 'OriginWac',
       'DestAirportID', 'DestAirportSeqID', 'DestCityMarketID', 'DestCityName',
       'DestState', 'DestStateFips', 'DestStateName', 'DestWac', 'DepDel15',
       'DepartureDelayGroups', 'DepTimeBlk', 'TaxiOu

In [13]:
data = data[data['DepDelayMinutes'] <= 600]

In [14]:
X = data[['Month', 'DayOfWeek', 'DayofMonth', 'DOT_ID_Operating_Airline',
          'Airline', 'OriginAirportID', 'DestCityName', 'DestAirportID', 'Diverted', 'Cancelled',
          'IATA_Code_Operating_Airline', 'IATA_Code_Marketing_Airline']]
y = data['DepDelayMinutes']

for col in X:
    print(X[col].isna().value_counts())

Month
False    6196399
Name: count, dtype: int64
DayOfWeek
False    6196399
Name: count, dtype: int64
DayofMonth
False    6196399
Name: count, dtype: int64
DOT_ID_Operating_Airline
False    6196399
Name: count, dtype: int64
Airline
False    6196399
Name: count, dtype: int64
OriginAirportID
False    6196399
Name: count, dtype: int64
DestCityName
False    6196399
Name: count, dtype: int64
DestAirportID
False    6196399
Name: count, dtype: int64
Diverted
False    6196399
Name: count, dtype: int64
Cancelled
False    6196399
Name: count, dtype: int64
IATA_Code_Operating_Airline
False    6196399
Name: count, dtype: int64
IATA_Code_Marketing_Airline
False    6196399
Name: count, dtype: int64


In [15]:
X['Airline'] = pd.Categorical(X['Airline'])
X['DOT_ID_Operating_Airline'] = pd.Categorical(X['DOT_ID_Operating_Airline'])
X['OriginAirportID'] = pd.Categorical(X['OriginAirportID'])
X['DestCityName'] = pd.Categorical(X['DestCityName'])
X['IATA_Code_Operating_Airline'] = pd.Categorical(X['IATA_Code_Operating_Airline'])
X['IATA_Code_Marketing_Airline'] = pd.Categorical(X['IATA_Code_Marketing_Airline'])
X['Diverted'] = pd.Categorical(X['Diverted'])
X['Cancelled'] = pd.Categorical(X['Cancelled'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Airline'] = pd.Categorical(X['Airline'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['DOT_ID_Operating_Airline'] = pd.Categorical(X['DOT_ID_Operating_Airline'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['OriginAirportID'] = pd.Categorical(X['OriginAirportID'])
A value is trying to be

In [18]:
cat_features = ['DOT_ID_Operating_Airline', 'Airline', 'OriginAirportID', 
                'DestCityName', 'DestAirportID','IATA_Code_Operating_Airline', 
                'IATA_Code_Marketing_Airline', 'Diverted', 'Cancelled']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

model = CatBoostRegressor(random_seed=42, cat_features=cat_features, n_estimators=500)

In [19]:
model.fit(X_train, y_train, eval_set=(X_valid, y_valid))

Learning rate set to 0.285404
0:	learn: 35.4488308	test: 35.1631915	best: 35.1631915 (0)	total: 2.25s	remaining: 18m 44s
1:	learn: 35.3523210	test: 35.0673267	best: 35.0673267 (1)	total: 4.26s	remaining: 17m 40s
2:	learn: 35.2924509	test: 35.0087047	best: 35.0087047 (2)	total: 7.91s	remaining: 21m 49s
3:	learn: 35.2472304	test: 34.9651679	best: 34.9651679 (3)	total: 10.6s	remaining: 21m 51s
4:	learn: 35.2191052	test: 34.9385771	best: 34.9385771 (4)	total: 13.4s	remaining: 22m 7s
5:	learn: 35.1991076	test: 34.9197736	best: 34.9197736 (5)	total: 15.8s	remaining: 21m 44s
6:	learn: 35.1799845	test: 34.9004621	best: 34.9004621 (6)	total: 18.6s	remaining: 21m 49s
7:	learn: 35.1539164	test: 34.8744609	best: 34.8744609 (7)	total: 21s	remaining: 21m 33s
8:	learn: 35.1189359	test: 34.8400504	best: 34.8400504 (8)	total: 24.2s	remaining: 21m 58s
9:	learn: 35.1031225	test: 34.8248263	best: 34.8248263 (9)	total: 26.9s	remaining: 21m 56s
10:	learn: 35.0876996	test: 34.8093738	best: 34.8093738 (10)	to

<catboost.core.CatBoostRegressor at 0x794e73b9d360>

In [20]:
preds = model.predict(X_test)

In [21]:
print(rmse(y_test, preds))
print(r2_score(y_test, preds))
print(mean_absolute_error(y_test, preds))

34.48167023923854
0.06429132574957142
16.134389994594127


In [22]:
compare_df = pd.DataFrame({'real': y_test, 'pred': preds})
compare_df['error'] = abs(compare_df['real'] - compare_df['pred'])
compare_df.sort_values('error')

Unnamed: 0,real,pred,error
146413,0.0,0.000026,0.000026
579449,20.0,20.000099,0.000099
148563,0.0,-0.000163,0.000163
383229,0.0,0.000186,0.000186
405404,3.0,2.999781,0.000219
...,...,...,...
192743,600.0,8.830734,591.169266
283181,597.0,5.745466,591.254534
397769,595.0,3.536245,591.463755
397912,597.0,4.616910,592.383090


In [23]:
compare_df[(compare_df['real'] > 120) & (compare_df['real'] < 240)]['error'].median()

138.4704905730102

In [18]:
compare_df['error'].describe()

count    1.240692e+06
mean     1.782825e+01
std      4.330925e+01
min      1.571538e-04
25%      5.633730e+00
50%      1.000213e+01
75%      1.662360e+01
max      3.050583e+03
Name: error, dtype: float64

In [19]:
import pickle

with open('catboost.pkl', 'wb') as f:
    pickle.dump(model, f)