### Importing libraries

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from datetime import datetime
%matplotlib inline

### Loading the data

In [2]:
train=pd.read_csv('Train.csv')
test=pd.read_csv('Test.csv')
Submission=pd.read_csv('SampleSubmission.csv')
train.sample(5)

Unnamed: 0,ID,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC,target
87840,train_id_87840,2018-08-15,TU 0856,TUN,MXP,2018-08-15 14:00:00,2018-08-15 15.45.00,ATA,TU 31BIMO,30.0
89286,train_id_89286,2018-10-08,TU 0708,TUN,ORN,2018-10-08 13:15:00,2018-10-08 15.00.00,ATA,TU 32AIMI,14.0
27496,train_id_27496,2016-09-26,TU 0711,TUN,CMN,2016-09-26 07:00:00,2016-09-26 09.40.00,ATA,TU 320IMV,11.0
18888,train_id_18888,2016-04-07,TU 0716,TUN,ORY,2016-04-07 06:50:00,2016-04-07 09.10.00,ATA,TU 320IMS,25.0
29690,train_id_29690,2016-11-10,WKL 0000,TUN,TUN,2016-11-10 22:00:00,2016-11-11 01.00.00,SCH,TU 32AIML,0.0


In [3]:
test.sample(5)

Unnamed: 0,ID,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC
6667,test_id_6667,2018-09-11,TU 0514,TUN,BCN,2018-09-11 08:25:00,2018-09-11 10.10.00,ATA,TU 32AIML
8715,test_id_8715,2018-09-14,TU 0711,TUN,CMN,2018-09-14 07:25:00,2018-09-14 10.10.00,ATA,TU 320IMS
5217,test_id_5217,2017-02-24,WKL 0000,TUN,TUN,2017-02-24 22:00:00,2017-02-25 01.00.00,SCH,TU 32AIMM
921,test_id_921,2016-05-12,TU 0440,MIR,ORY,2016-05-12 11:00:00,2016-05-12 13.30.00,ATA,TU 31AIMJ
6409,test_id_6409,2018-09-26,TU 0215,IST,TUN,2018-09-26 10:55:00,2018-09-26 13.45.00,ATA,TU 32AIMH


### Variable definitions
DATOP - Date of flight

FLTID - Flight number

DEPSTN - Departure point

ARRSTN - Arrival point

STD - Scheduled Time departure

STA - Scheduled Time arrival

STATUS - Flight status

ETD - Expected Time departure

ETA - Expected Time arrival

ATD - Actual Time of Departure

ATA - Actual Time of arrival

DELAY1 - Delay code 1

DUR1 - delay time 1

DELAY2 - Delay code 2

DUR2 - delay time 2

DELAY3 - Delay code 3

DUR3 - delay time 3

DELAY4 - Delay code 4

DUR4 - delay time 4

AC - Aircraft Code

In [5]:
print('dimension of the train set: ',train.shape)
print('dimension of the test set : ',test.shape)

dimension of the train set:  (107833, 10)
dimension of the test set :  (9333, 9)


### Data Preprocessing

In [233]:
#Combining train and test set for easy preprocessing:

ntrain = train.shape[0] # will be used to split train and test set from the combined dataframe
data = pd.concat((train, test)).reset_index(drop=True)
print(f'The shape of the combined dataframe is: {data.shape}')

The shape of the combined dataframe is: (117166, 10)


In [234]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117166 entries, 0 to 117165
Data columns (total 10 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   ID      117166 non-null  object 
 1   DATOP   117166 non-null  object 
 2   FLTID   117166 non-null  object 
 3   DEPSTN  117166 non-null  object 
 4   ARRSTN  117166 non-null  object 
 5   STD     117166 non-null  object 
 6   STA     117166 non-null  object 
 7   STATUS  117166 non-null  object 
 8   AC      117166 non-null  object 
 9   target  107833 non-null  float64
dtypes: float64(1), object(9)
memory usage: 8.9+ MB


In [235]:
#Convert to datetime
data['STA']=data['STA'].map(lambda x:x.replace('.',':'))
data[['DATOP','STD','STA']]=data[['DATOP','STD','STA']].apply(pd.to_datetime,errors='coerce')

In [236]:
#data['DATOP_day']=data['DATOP'].dt.day
#data['DATOP_month']=data['DATOP'].dt.month
#data['DATOP_year']=data['DATOP'].dt.year
data.drop(['DATOP'],axis=1,inplace=True)

In [237]:
def date_split(column):
    """
    split DateTime Data to create multiple feature
    """
    data[column+'_day']=data[column].dt.day
    data[column+'_month']=data[column].dt.month
    data[column+'_year']=data[column].dt.year
    data[column+'_hours']=data[column].dt.hour
    #data[column+'_minutes']=data[column].dt.minute
    #data[column+'_seconds']=data[column].dt.second

In [238]:
#apply date split to STD and STA
date_split('STD')
date_split('STA')
data.drop(['STD','STA'],axis=1,inplace=True)

##### Category columns

In [239]:
cat_cols=['FLTID','DEPSTN','ARRSTN','AC','STATUS']

In [240]:
# Change columns to their respective datatypes
data[cat_cols]=data[cat_cols].astype('category')

In [241]:
#Number of unique values per categorical columns
for col in cat_cols:
    print(col,data[col].nunique())

FLTID 1912
DEPSTN 134
ARRSTN 130
AC 70
STATUS 5


In [242]:
data=pd.get_dummies(data =data, columns = [cat_cols[-1]])

In [243]:
# Encode categorical features
#Label Encoding
from sklearn.preprocessing import LabelEncoder
for col in cat_cols[:-1]:
    data[col]=LabelEncoder().fit_transform(data[col])

In [244]:
data.head()

Unnamed: 0,ID,FLTID,DEPSTN,ARRSTN,AC,target,STD_day,STD_month,STD_year,STD_hours,STA_day,STA_month,STA_year,STA_hours,STATUS_ATA,STATUS_DEL,STATUS_DEP,STATUS_RTR,STATUS_SCH
0,train_id_0,239,32,121,47,260.0,3,1,2016,10,3,1,2016,12,1,0,0,0,0
1,train_id_1,266,88,121,29,20.0,13,1,2016,15,13,1,2016,16,1,0,0,0,0
2,train_id_2,93,125,59,47,0.0,16,1,2016,4,16,1,2016,6,1,0,0,0,0
3,train_id_3,173,38,93,51,0.0,17,1,2016,14,17,1,2016,17,1,0,0,0,0
4,train_id_4,145,125,4,35,22.0,17,1,2016,14,17,1,2016,15,1,0,0,0,0


In [245]:
data.shape

(117166, 19)

### Training and making predictions

In [246]:
# Separate train and test data from the combined dataframe
train_df =data[:ntrain]
test_df =data[ntrain:]

# Check the shapes of the split dataset
train_df.shape, test_df.shape

((107833, 19), (9333, 19))

In [247]:
main_cols=data.columns.difference(['ID','target'])
X=train_df[main_cols]
y=train_df.target

In [321]:
# Split data into train and test sets
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=44)

#### Linear Regression

In [322]:
from sklearn.linear_model import LinearRegression

model=LinearRegression()
model.fit(X_train,y_train)
print('done')

done


In [323]:
y_preds=model.predict(X_test)

In [324]:
mean_squared_error(y_test,y_preds)

12076.133456398426

In [325]:
model.score(X_test,y_test)

0.04089814787595847

In [326]:
rms = mean_squared_error(y_test, y_preds, squared=False)
rms

109.89146216334746

#### Catboost

In [327]:
from catboost import CatBoostRegressor

In [328]:
model = CatBoostRegressor(iterations=1000,learning_rate=0.1, depth=12,l2_leaf_reg=0.5, loss_function='RMSE')
model =model.fit(X_train,y_train)

0:	learn: 117.3391695	total: 95.7ms	remaining: 1m 35s
1:	learn: 116.5816417	total: 187ms	remaining: 1m 33s
2:	learn: 115.9083616	total: 286ms	remaining: 1m 34s
3:	learn: 115.1945565	total: 393ms	remaining: 1m 37s
4:	learn: 114.6555521	total: 484ms	remaining: 1m 36s
5:	learn: 114.2046967	total: 582ms	remaining: 1m 36s
6:	learn: 113.7964837	total: 677ms	remaining: 1m 36s
7:	learn: 113.4569771	total: 775ms	remaining: 1m 36s
8:	learn: 113.0206915	total: 864ms	remaining: 1m 35s
9:	learn: 112.6426152	total: 998ms	remaining: 1m 38s
10:	learn: 112.2546474	total: 1.1s	remaining: 1m 39s
11:	learn: 112.0420660	total: 1.21s	remaining: 1m 39s
12:	learn: 111.6665344	total: 1.3s	remaining: 1m 39s
13:	learn: 111.4822957	total: 1.41s	remaining: 1m 39s
14:	learn: 111.1852421	total: 1.53s	remaining: 1m 40s
15:	learn: 110.8915383	total: 1.63s	remaining: 1m 40s
16:	learn: 110.6308268	total: 1.73s	remaining: 1m 39s
17:	learn: 110.3037545	total: 1.83s	remaining: 1m 39s
18:	learn: 110.0344695	total: 1.91s	rem

153:	learn: 90.4190425	total: 14.9s	remaining: 1m 21s
154:	learn: 90.3541080	total: 15s	remaining: 1m 21s
155:	learn: 90.1593484	total: 15.1s	remaining: 1m 21s
156:	learn: 90.0658704	total: 15.2s	remaining: 1m 21s
157:	learn: 89.8817806	total: 15.3s	remaining: 1m 21s
158:	learn: 89.7280769	total: 15.4s	remaining: 1m 21s
159:	learn: 89.6274769	total: 15.5s	remaining: 1m 21s
160:	learn: 89.5649685	total: 15.6s	remaining: 1m 21s
161:	learn: 89.4363623	total: 15.7s	remaining: 1m 21s
162:	learn: 89.3598284	total: 15.8s	remaining: 1m 20s
163:	learn: 89.2829268	total: 15.9s	remaining: 1m 20s
164:	learn: 89.1761362	total: 15.9s	remaining: 1m 20s
165:	learn: 89.0906145	total: 16s	remaining: 1m 20s
166:	learn: 89.0327674	total: 16.1s	remaining: 1m 20s
167:	learn: 88.9273807	total: 16.2s	remaining: 1m 20s
168:	learn: 88.8348179	total: 16.3s	remaining: 1m 20s
169:	learn: 88.7794661	total: 16.4s	remaining: 1m 20s
170:	learn: 88.7185357	total: 16.5s	remaining: 1m 20s
171:	learn: 88.6447781	total: 16

307:	learn: 78.3082031	total: 30.1s	remaining: 1m 7s
308:	learn: 78.1636951	total: 30.2s	remaining: 1m 7s
309:	learn: 78.1322709	total: 30.3s	remaining: 1m 7s
310:	learn: 78.0804008	total: 30.3s	remaining: 1m 7s
311:	learn: 78.0426777	total: 30.4s	remaining: 1m 7s
312:	learn: 77.9100788	total: 30.5s	remaining: 1m 7s
313:	learn: 77.8367267	total: 30.6s	remaining: 1m 6s
314:	learn: 77.7497290	total: 30.7s	remaining: 1m 6s
315:	learn: 77.6847210	total: 30.8s	remaining: 1m 6s
316:	learn: 77.6588264	total: 30.9s	remaining: 1m 6s
317:	learn: 77.6102784	total: 31s	remaining: 1m 6s
318:	learn: 77.5397566	total: 31.1s	remaining: 1m 6s
319:	learn: 77.4718485	total: 31.2s	remaining: 1m 6s
320:	learn: 77.4187312	total: 31.3s	remaining: 1m 6s
321:	learn: 77.3667106	total: 31.4s	remaining: 1m 6s
322:	learn: 77.2604541	total: 31.5s	remaining: 1m 6s
323:	learn: 77.2102222	total: 31.6s	remaining: 1m 5s
324:	learn: 77.1564808	total: 31.7s	remaining: 1m 5s
325:	learn: 77.1090099	total: 31.8s	remaining: 1

464:	learn: 70.2237108	total: 45.1s	remaining: 51.9s
465:	learn: 70.1764476	total: 45.2s	remaining: 51.8s
466:	learn: 70.1509875	total: 45.3s	remaining: 51.7s
467:	learn: 70.1012914	total: 45.3s	remaining: 51.5s
468:	learn: 70.0621448	total: 45.4s	remaining: 51.4s
469:	learn: 70.0046609	total: 45.5s	remaining: 51.3s
470:	learn: 69.9573372	total: 45.6s	remaining: 51.2s
471:	learn: 69.9305354	total: 45.7s	remaining: 51.1s
472:	learn: 69.8988183	total: 45.8s	remaining: 51s
473:	learn: 69.8494266	total: 45.9s	remaining: 50.9s
474:	learn: 69.7767476	total: 46s	remaining: 50.8s
475:	learn: 69.7081190	total: 46.1s	remaining: 50.7s
476:	learn: 69.6575663	total: 46.2s	remaining: 50.6s
477:	learn: 69.6098921	total: 46.3s	remaining: 50.5s
478:	learn: 69.5518773	total: 46.4s	remaining: 50.5s
479:	learn: 69.4827951	total: 46.5s	remaining: 50.4s
480:	learn: 69.4466911	total: 46.6s	remaining: 50.3s
481:	learn: 69.4179995	total: 46.8s	remaining: 50.2s
482:	learn: 69.3932395	total: 46.9s	remaining: 50.

621:	learn: 63.8571577	total: 1m 1s	remaining: 37.1s
622:	learn: 63.8097576	total: 1m 1s	remaining: 37.1s
623:	learn: 63.7753951	total: 1m 1s	remaining: 37s
624:	learn: 63.7273054	total: 1m 1s	remaining: 36.9s
625:	learn: 63.6975806	total: 1m 1s	remaining: 36.8s
626:	learn: 63.6637754	total: 1m 1s	remaining: 36.7s
627:	learn: 63.6308611	total: 1m 1s	remaining: 36.7s
628:	learn: 63.6011372	total: 1m 2s	remaining: 36.6s
629:	learn: 63.5755030	total: 1m 2s	remaining: 36.5s
630:	learn: 63.5415222	total: 1m 2s	remaining: 36.4s
631:	learn: 63.5254054	total: 1m 2s	remaining: 36.3s
632:	learn: 63.4848770	total: 1m 2s	remaining: 36.2s
633:	learn: 63.4529793	total: 1m 2s	remaining: 36.1s
634:	learn: 63.4354813	total: 1m 2s	remaining: 36.1s
635:	learn: 63.3959727	total: 1m 2s	remaining: 36s
636:	learn: 63.3417345	total: 1m 3s	remaining: 35.9s
637:	learn: 63.2977761	total: 1m 3s	remaining: 35.8s
638:	learn: 63.2672234	total: 1m 3s	remaining: 35.7s
639:	learn: 63.2321642	total: 1m 3s	remaining: 35.

775:	learn: 59.0541568	total: 1m 20s	remaining: 23.2s
776:	learn: 59.0244600	total: 1m 20s	remaining: 23.1s
777:	learn: 59.0072819	total: 1m 20s	remaining: 23s
778:	learn: 58.9848385	total: 1m 20s	remaining: 22.9s
779:	learn: 58.9619692	total: 1m 20s	remaining: 22.8s
780:	learn: 58.9241438	total: 1m 20s	remaining: 22.7s
781:	learn: 58.9119456	total: 1m 21s	remaining: 22.6s
782:	learn: 58.8768312	total: 1m 21s	remaining: 22.5s
783:	learn: 58.8515224	total: 1m 21s	remaining: 22.4s
784:	learn: 58.8356769	total: 1m 21s	remaining: 22.3s
785:	learn: 58.8099957	total: 1m 21s	remaining: 22.2s
786:	learn: 58.7676217	total: 1m 21s	remaining: 22.1s
787:	learn: 58.7393553	total: 1m 21s	remaining: 22s
788:	learn: 58.7203236	total: 1m 22s	remaining: 21.9s
789:	learn: 58.6915927	total: 1m 22s	remaining: 21.8s
790:	learn: 58.6686113	total: 1m 22s	remaining: 21.7s
791:	learn: 58.6467285	total: 1m 22s	remaining: 21.6s
792:	learn: 58.6265041	total: 1m 22s	remaining: 21.6s
793:	learn: 58.5671843	total: 1m

929:	learn: 54.9201140	total: 1m 39s	remaining: 7.46s
930:	learn: 54.8931848	total: 1m 39s	remaining: 7.35s
931:	learn: 54.8742496	total: 1m 39s	remaining: 7.24s
932:	learn: 54.8577899	total: 1m 39s	remaining: 7.14s
933:	learn: 54.8136695	total: 1m 39s	remaining: 7.03s
934:	learn: 54.7907943	total: 1m 39s	remaining: 6.92s
935:	learn: 54.7683285	total: 1m 39s	remaining: 6.82s
936:	learn: 54.7533612	total: 1m 39s	remaining: 6.71s
937:	learn: 54.7435272	total: 1m 39s	remaining: 6.6s
938:	learn: 54.7309862	total: 1m 40s	remaining: 6.5s
939:	learn: 54.7075206	total: 1m 40s	remaining: 6.39s
940:	learn: 54.6909528	total: 1m 40s	remaining: 6.28s
941:	learn: 54.6687540	total: 1m 40s	remaining: 6.18s
942:	learn: 54.6532872	total: 1m 40s	remaining: 6.07s
943:	learn: 54.6281479	total: 1m 40s	remaining: 5.96s
944:	learn: 54.5910285	total: 1m 40s	remaining: 5.86s
945:	learn: 54.5602603	total: 1m 40s	remaining: 5.75s
946:	learn: 54.5344598	total: 1m 40s	remaining: 5.65s
947:	learn: 54.5109511	total: 

In [329]:
y_preds=model.predict(X_test)
rms = mean_squared_error(y_test, y_preds, squared=False)
rms

99.01462091011365

In [330]:
test_df = test_df[main_cols]
predictions = model.predict(test_df)

In [331]:
sub_file = Submission.copy()
sub_file.target = predictions
# Create a csv file and upload to zindi 
sub_file.to_csv('Baseline.csv', index = False)
sub_file.head()

Unnamed: 0,ID,target
0,test_id_0,5.288105
1,test_id_1,26.701231
2,test_id_2,13.84095
3,test_id_3,-1.628446
4,test_id_4,29.663225


105.25957312917205