### Importing libraries

In [23]:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from datetime import datetime
%matplotlib inline

### Loading the data

In [8]:
# Read both CSV files in one expression, then preview five random training rows.
train, test = (pd.read_csv(csv_path) for csv_path in ('Train.csv', 'Test.csv'))
train.sample(n=5)

Unnamed: 0,ID,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC,target
1785,train_id_1785,2016-07-18,TU 0318,TUN,LIS,2016-07-18 14:35:00,2016-07-18 17.30.00,ATA,TU 736ION,19.0
9476,train_id_9476,2016-04-16,TU 0789,BRU,TUN,2016-04-16 11:00:00,2016-04-16 13.40.00,ATA,TU 320IMT,5.0
57210,train_id_57210,2017-12-22,TU 0606,TUN,MAD,2017-12-22 07:10:00,2017-12-22 09.15.00,ATA,TU 32AIML,30.0
12227,train_id_12227,2016-08-25,TU 0695,MRS,DJE,2016-08-25 14:35:00,2016-08-25 16.35.00,ATA,TU 736IOK,0.0
57437,train_id_57437,2017-10-19,TU 0814,CAI,TUN,2017-10-19 10:55:00,2017-10-19 14.10.00,ATA,TU 32AIMP,665.0


In [7]:
# Preview five random rows of the test set (no `target` column here).
test.sample(n=5)

Unnamed: 0,ID,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC
7818,test_id_7818,2018-09-23,TU 0856,TUN,MXP,2018-09-23 14:00:00,2018-09-23 15.45.00,ATA,TU 31BIMQ
6020,test_id_6020,2018-09-20,TU 0694,DJE,MRS,2018-09-20 07:25:00,2018-09-20 09.25.00,ATA,TU 736IOR
2592,test_id_2592,2016-05-01,UG 1311,CDG,TUN,2016-05-01 16:40:00,2016-05-01 18.55.00,SCH,TU CR9ISA
3222,test_id_3222,2017-02-04,TU 6497,HAM,NBE,2017-02-04 11:40:00,2017-02-04 14.40.00,ATA,TU 736IOK
5699,test_id_5699,2018-09-10,TU 0724,TUN,ORY,2018-09-10 18:20:00,2018-09-10 20.40.00,ATA,TU 32AIMI


### Variable definitions
ID : flight ID

DATOP: operation date

FLTID : flight number (e.g. `TU 0318`)

DEPSTN : departure station (airport code, e.g. `TUN`)

ARRSTN : arrival station (airport code, e.g. `LIS`)

STD : departure time

STA : arrival time

STATUS : flight status

target : delay time

In [10]:
# Report (rows, columns) for both splits; one line per split.
print(
    *(
        f'{label} {frame.shape}'
        for label, frame in (
            ('dimension of the train set: ', train),
            ('dimension of the test set : ', test),
        )
    ),
    sep='\n',
)

dimension of the train set:  (107833, 10)
dimension of the test set :  (9333, 9)


### Data Preprocessing

In [126]:
# Stack train and test so every preprocessing step is applied to both at once.

# Number of training rows; kept so the combined frame can be split back later.
ntrain = len(train)
# ignore_index=True renumbers the rows 0..n-1 across both parts.
data = pd.concat([train, test], ignore_index=True)
print(f'The shape of the combined dataframe is: {data.shape}')

The shape of the combined dataframe is: (117166, 10)


In [127]:
# Summarise column dtypes and non-null counts of the combined frame
# (`target` is only populated for the training rows).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117166 entries, 0 to 117165
Data columns (total 10 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   ID      117166 non-null  object 
 1   DATOP   117166 non-null  object 
 2   FLTID   117166 non-null  object 
 3   DEPSTN  117166 non-null  object 
 4   ARRSTN  117166 non-null  object 
 5   STD     117166 non-null  object 
 6   STA     117166 non-null  object 
 7   STATUS  117166 non-null  object 
 8   AC      117166 non-null  object 
 9   target  107833 non-null  float64
dtypes: float64(1), object(9)
memory usage: 8.9+ MB


In [128]:
# Convert to datetime
# STA separates its time fields with '.' (e.g. "2016-07-18 17.30.00") while STD
# uses ':', so normalise STA first. regex=False keeps '.' a literal dot, and the
# vectorised .str accessor passes missing values through instead of raising, so
# errors='coerce' below is what turns unparseable entries into NaT.
data['STA'] = data['STA'].str.replace('.', ':', regex=False)
data[['DATOP', 'STD', 'STA']] = data[['DATOP', 'STD', 'STA']].apply(pd.to_datetime, errors='coerce')

In [129]:
# Break the operation date into day / month / year columns, then drop the
# original DATOP column in the same chain.
data = (
    data
    .assign(
        DATOP_day=data['DATOP'].dt.day,
        DATOP_month=data['DATOP'].dt.month,
        DATOP_year=data['DATOP'].dt.year,
    )
    .drop(columns=['DATOP'])
)

In [130]:
def date_split(column, frame=None):
    """
    Split a datetime column into separate numeric feature columns.

    Adds ``<column>_day``, ``<column>_month``, ``<column>_year``,
    ``<column>_hours``, ``<column>_minutes`` and ``<column>_seconds`` to the
    target DataFrame, in that order. The DataFrame is modified in place and
    the source column is left untouched.

    Parameters
    ----------
    column : str
        Name of a datetime-typed column (it must support the ``.dt`` accessor).
    frame : pandas.DataFrame, optional
        DataFrame to modify. When omitted, the notebook-level ``data`` frame
        is used, which keeps existing ``date_split('STD')`` calls working.

    Returns
    -------
    None
    """
    if frame is None:
        # Backward-compatible default: operate on the notebook's combined frame.
        frame = data

    stamps = frame[column].dt
    # (column suffix, .dt attribute) pairs; the suffixes are plural for the
    # time parts while the pandas attributes are singular.
    parts = (
        ('day', 'day'),
        ('month', 'month'),
        ('year', 'year'),
        ('hours', 'hour'),
        ('minutes', 'minute'),
        ('seconds', 'second'),
    )
    for suffix, attribute in parts:
        frame[column + '_' + suffix] = getattr(stamps, attribute)

In [131]:
# Expand both timestamp columns into numeric parts, then discard the raw columns.
for stamp_col in ('STD', 'STA'):
    date_split(stamp_col)
data = data.drop(columns=['STD', 'STA'])

##### Category columns

In [139]:
# Names of the categorical feature columns. Kept as a list: indexing a
# DataFrame with a tuple would be treated as a single key.
cat_cols = [
    'FLTID',
    'DEPSTN',
    'ARRSTN',
    'STATUS',
    'AC',
]

In [140]:
# Cast every categorical feature to pandas' `category` dtype in a single call.
data = data.astype({name: 'category' for name in cat_cols})

In [141]:
# Cardinality of each categorical column, printed as "<column> <count>".
for name, n_unique in data[cat_cols].nunique().items():
    print(name, n_unique)

FLTID 1912
DEPSTN 134
ARRSTN 130
STATUS 5
AC 70


In [147]:
# Label-encode the categorical features: each column's values are replaced
# by integer codes from a fresh encoder fitted on that column alone.
from sklearn.preprocessing import LabelEncoder

for name in cat_cols:
    encoder = LabelEncoder()
    data[name] = encoder.fit_transform(data[name])

In [148]:
# Inspect the first five rows after encoding and date expansion.
data.head(n=5)

Unnamed: 0,ID,FLTID,DEPSTN,ARRSTN,STATUS,AC,target,DATOP_day,DATOP_month,DATOP_year,...,STD_year,STD_hours,STD_minutes,STD_seconds,STA_day,STA_month,STA_year,STA_hours,STA_minutes,STA_seconds
0,train_id_0,239,32,121,0,47,260.0,3,1,2016,...,2016,10,30,0,3,1,2016,12,55,0
1,train_id_1,266,88,121,0,29,20.0,13,1,2016,...,2016,15,5,0,13,1,2016,16,55,0
2,train_id_2,93,125,59,0,47,0.0,16,1,2016,...,2016,4,10,0,16,1,2016,6,45,0
3,train_id_3,173,38,93,0,51,0.0,17,1,2016,...,2016,14,10,0,17,1,2016,17,0,0
4,train_id_4,145,125,4,0,35,22.0,17,1,2016,...,2016,14,30,0,17,1,2016,15,50,0
