In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np
from catboost import CatBoostRegressor

In [3]:
def rmse(y_true, y_pred):
    result = np.sqrt((y_true - y_pred)**2)
    return np.mean(result)

In [4]:
data = pd.read_parquet('/kaggle/input/flight-delay')
data.head()

Unnamed: 0,FlightDate,Airline,Origin,Dest,Cancelled,Diverted,CRSDepTime,DepTime,DepDelayMinutes,DepDelay,...,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrDelay,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,DistanceGroup,DivAirportLandings
0,2021-03-03,SkyWest Airlines Inc.,SGU,PHX,False,False,724,714.0,0.0,-10.0,...,724.0,813.0,5.0,843,-25.0,0.0,-2.0,0800-0859,2,0.0
1,2021-03-03,SkyWest Airlines Inc.,PHX,SGU,False,False,922,917.0,0.0,-5.0,...,940.0,1028.0,3.0,1040,-9.0,0.0,-1.0,1000-1059,2,0.0
2,2021-03-03,SkyWest Airlines Inc.,MHT,ORD,False,False,1330,1321.0,0.0,-9.0,...,1336.0,1445.0,16.0,1530,-29.0,0.0,-2.0,1500-1559,4,0.0
3,2021-03-03,SkyWest Airlines Inc.,DFW,TRI,False,False,1645,1636.0,0.0,-9.0,...,1703.0,1955.0,7.0,2010,-8.0,0.0,-1.0,2000-2059,4,0.0
4,2021-03-03,SkyWest Airlines Inc.,PHX,BFL,False,False,1844,1838.0,0.0,-6.0,...,1851.0,1900.0,3.0,1925,-22.0,0.0,-2.0,1900-1959,2,0.0


In [29]:
data = data[data['DepDelay'].isna() == False]

In [30]:
data['FlightDate'] = pd.to_datetime(data['FlightDate'], yearfirst=True)
data.sort_values(by='FlightDate', inplace=True)

In [5]:
data.head()

Unnamed: 0,FlightDate,Airline,Origin,Dest,Cancelled,Diverted,CRSDepTime,DepTime,DepDelayMinutes,DepDelay,...,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrDelay,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,DistanceGroup,DivAirportLandings
428764,2022-01-01,SkyWest Airlines Inc.,DEN,LAX,False,False,600,852.0,172.0,172.0,...,942.0,1042.0,7.0,756,173.0,1.0,11.0,0700-0759,4,0
160310,2022-01-01,Southwest Airlines Co.,BDL,BNA,False,False,1430,1428.0,0.0,-2.0,...,1438.0,1608.0,10.0,1610,8.0,0.0,0.0,1600-1659,4,0
160311,2022-01-01,Southwest Airlines Co.,BDL,BWI,False,False,1235,1313.0,38.0,38.0,...,1324.0,1421.0,5.0,1350,36.0,1.0,2.0,1300-1359,2,0
160312,2022-01-01,Southwest Airlines Co.,BDL,BWI,False,False,525,531.0,6.0,6.0,...,541.0,633.0,20.0,645,8.0,0.0,0.0,0600-0659,2,0
160313,2022-01-01,Southwest Airlines Co.,BDL,BWI,True,False,1710,,,,...,,,,1830,,,,1800-1859,2,0


In [7]:
data['DepDelayMinutes'].describe()

count    6.203458e+06
mean     1.276132e+01
std      4.736319e+01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      6.000000e+00
max      3.095000e+03
Name: DepDelayMinutes, dtype: float64

In [5]:
data.columns

Index(['FlightDate', 'Airline', 'Origin', 'Dest', 'Cancelled', 'Diverted',
       'CRSDepTime', 'DepTime', 'DepDelayMinutes', 'DepDelay', 'ArrTime',
       'ArrDelayMinutes', 'AirTime', 'CRSElapsedTime', 'ActualElapsedTime',
       'Distance', 'Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek',
       'Marketing_Airline_Network', 'Operated_or_Branded_Code_Share_Partners',
       'DOT_ID_Marketing_Airline', 'IATA_Code_Marketing_Airline',
       'Flight_Number_Marketing_Airline', 'Operating_Airline',
       'DOT_ID_Operating_Airline', 'IATA_Code_Operating_Airline',
       'Tail_Number', 'Flight_Number_Operating_Airline', 'OriginAirportID',
       'OriginAirportSeqID', 'OriginCityMarketID', 'OriginCityName',
       'OriginState', 'OriginStateFips', 'OriginStateName', 'OriginWac',
       'DestAirportID', 'DestAirportSeqID', 'DestCityMarketID', 'DestCityName',
       'DestState', 'DestStateFips', 'DestStateName', 'DestWac', 'DepDel15',
       'DepartureDelayGroups', 'DepTimeBlk', 'TaxiOu

In [61]:
X = data[['Month', 'DayOfWeek', 'DayofMonth', 'DOT_ID_Operating_Airline',
          'Airline', 'OriginAirportID', 'DestCityName', 'DestAirportID', 'Diverted', 'Cancelled',
          'IATA_Code_Operating_Airline', 'IATA_Code_Marketing_Airline']]
y = data['DepDelayMinutes']

for col in X:
    print(X[col].isna().value_counts())

Unnamed: 0,Month,DayOfWeek,DayofMonth,DOT_ID_Operating_Airline,Airline,OriginAirportID,DestCityName,DestAirportID,Diverted,Cancelled,DepartureDelayGroups,IATA_Code_Operating_Airline,IATA_Code_Marketing_Airline
84462,1,5,1,19790,Delta Air Lines Inc.,10397,"Newark, NJ",11618,False,False,-1.0,DL,DL
221718,1,5,1,20304,SkyWest Airlines Inc.,12892,"Sun Valley/Hailey/Ketchum, ID",15041,False,False,-1.0,OO,DL
221717,1,5,1,20304,SkyWest Airlines Inc.,14869,"Idaho Falls, ID",12280,False,False,-1.0,OO,DL
221723,1,5,1,20304,SkyWest Airlines Inc.,14869,"Jackson, WY",12441,False,False,-1.0,OO,DL
346800,1,5,1,20378,Mesa Airlines Inc.,12266,"Washington, DC",12264,False,False,-1.0,YV,UA
...,...,...,...,...,...,...,...,...,...,...,...,...,...
196413,12,5,31,19805,American Airlines Inc.,13495,"Miami, FL",13303,False,False,-1.0,AA,AA
196444,12,5,31,19805,American Airlines Inc.,11298,"Phoenix, AZ",14107,False,False,3.0,AA,AA
442142,12,5,31,20397,Comair Inc.,10868,"Washington, DC",11278,False,False,-1.0,OH,AA
381145,12,5,31,19690,Hawaiian Airlines Inc.,12402,"Honolulu, HI",12173,False,False,0.0,HA,HA


In [37]:
X['Airline'] = pd.Categorical(X['Airline'])
X['DOT_ID_Operating_Airline'] = pd.Categorical(X['DOT_ID_Operating_Airline'])
X['OriginAirportID'] = pd.Categorical(X['OriginAirportID'])
X['DestCityName'] = pd.Categorical(X['DestCityName'])
X['IATA_Code_Operating_Airline'] = pd.Categorical(X['IATA_Code_Operating_Airline'])
X['IATA_Code_Marketing_Airline'] = pd.Categorical(X['IATA_Code_Marketing_Airline'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Airline'] = pd.Categorical(X['Airline'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['DOT_ID_Operating_Airline'] = pd.Categorical(X['DOT_ID_Operating_Airline'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['OriginAirportID'] = pd.Categorical(X['OriginAirportID'])
A value is trying to be

In [62]:
cat_features = ['DOT_ID_Operating_Airline', 'Airline', 'OriginAirportID', 
                'DestCityName', 'DestAirportID','IATA_Code_Operating_Airline', 'IATA_Code_Marketing_Airline']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

model = CatBoostRegressor(random_seed=42, cat_features=cat_features)

In [None]:
model.fit(X_train, y_train, eval_set=(X_valid, y_valid))

Learning rate set to 0.187029
0:	learn: 40.8922906	test: 41.3492323	best: 41.3492323 (0)	total: 3.37s	remaining: 56m 4s
1:	learn: 36.0069778	test: 36.5197874	best: 36.5197874 (1)	total: 6.34s	remaining: 52m 46s
2:	learn: 32.3466497	test: 32.9034907	best: 32.9034907 (2)	total: 9.43s	remaining: 52m 13s
3:	learn: 29.6765294	test: 30.2676682	best: 30.2676682 (3)	total: 12.7s	remaining: 52m 36s
4:	learn: 27.7464795	test: 28.3620654	best: 28.3620654 (4)	total: 15.7s	remaining: 52m 10s
5:	learn: 26.3989081	test: 27.0291043	best: 27.0291043 (5)	total: 18.9s	remaining: 52m 5s
6:	learn: 25.4566629	test: 26.0991148	best: 26.0991148 (6)	total: 21.5s	remaining: 50m 46s
7:	learn: 24.7980429	test: 25.4441763	best: 25.4441763 (7)	total: 24.4s	remaining: 50m 24s
8:	learn: 24.3311604	test: 24.9774116	best: 24.9774116 (8)	total: 26.9s	remaining: 49m 27s
9:	learn: 24.0230034	test: 24.6678189	best: 24.6678189 (9)	total: 29.8s	remaining: 49m 8s
10:	learn: 23.8107993	test: 24.4523777	best: 24.4523777 (10)	to

In [40]:
preds = model.predict(X_test)

In [None]:
print(rmse(y_test, preds))
print(r2_score(y_test, preds))
print(mean_absolute_error(y_test, preds))

In [42]:
compare_df = pd.DataFrame({'real': y_test, 'pred': preds})
compare_df['error'] = abs(compare_df['real'] - compare_df['pred'])
compare_df.sort_values('error')

Unnamed: 0,real,pred,error
156065,0.0,1.195446e-07,1.195446e-07
490570,0.0,-7.761486e-07,7.761486e-07
490615,0.0,-7.761486e-07,7.761486e-07
450798,0.0,-1.064435e-06,1.064435e-06
121011,0.0,-1.069933e-06,1.069933e-06
...,...,...,...
184328,1865.0,4.792259e+02,1.385774e+03
208500,1961.0,4.675377e+02,1.493462e+03
338368,2070.0,5.336581e+02,1.536342e+03
96573,1915.0,3.027265e+02,1.612273e+03


In [57]:
compare_df[(compare_df['real'] > 120) & (compare_df['real'] < 240)]['error'].median()

538691     0.512366
570165    17.712749
359923     6.115742
9938       6.688728
395536     3.996039
            ...    
359635     3.330473
381616     0.020480
553561     1.650025
555014     0.892544
540286     5.572227
Name: error, Length: 20736, dtype: float64

In [43]:
compare_df['error'].describe()

count    1.240692e+06
mean     2.960916e+00
std      2.256353e+01
min      1.195446e-07
25%      3.935965e-02
50%      1.013808e-01
75%      2.709740e+00
max      1.731311e+03
Name: error, dtype: float64

In [45]:
import pickle

with open('catboost.pkl', 'wb') as f:
    pickle.dump(model, f)