In [1]:
#importing necessary libraries
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the raw training set from 'Train.csv' in the working directory.
train_df = pd.read_csv(filepath_or_buffer='Train.csv')

In [3]:
# Preview the first rows of the raw training data.
train_df.head()

Unnamed: 0,FlightNumber,Year,Month,DayofMonth,DayOfWeek,ScheduledDepTime,ScheduledArrTime,ScheduledTravelTime,Origin,Destination,Distance,ActualArrivalTimeStamp
0,BOS_CLE_29089,2004,9,20,1,1000,1154,114,BOS,CLE,563,20/09/04 11:54
1,CLE_ATL_44346,2004,11,19,5,1440,1634,114,CLE,ATL,554,19/11/04 18:55
2,SNA_LAS_44378,2004,5,4,2,1800,1902,62,SNA,LAS,226,04/05/04 19:02
3,CLT_TPA_51502,2004,7,6,2,1400,1533,93,CLT,TPA,508,06/07/04 15:33
4,MSP_ORD_44884,2004,9,23,4,830,946,76,MSP,ORD,334,23/09/04 09:46


In [4]:
# Summarise the training frame: row count, per-column non-null counts and dtypes.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7861 entries, 0 to 7860
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   FlightNumber            7861 non-null   object
 1   Year                    7861 non-null   int64 
 2   Month                   7861 non-null   int64 
 3   DayofMonth              7861 non-null   int64 
 4   DayOfWeek               7861 non-null   int64 
 5   ScheduledDepTime        7861 non-null   int64 
 6   ScheduledArrTime        7861 non-null   int64 
 7   ScheduledTravelTime     7861 non-null   int64 
 8   Origin                  7861 non-null   object
 9   Destination             7861 non-null   object
 10  Distance                7861 non-null   int64 
 11  ActualArrivalTimeStamp  7861 non-null   object
dtypes: int64(8), object(4)
memory usage: 737.1+ KB


In [5]:
# Count missing values per column.
train_df.isnull().sum()

FlightNumber              0
Year                      0
Month                     0
DayofMonth                0
DayOfWeek                 0
ScheduledDepTime          0
ScheduledArrTime          0
ScheduledTravelTime       0
Origin                    0
Destination               0
Distance                  0
ActualArrivalTimeStamp    0
dtype: int64

In [6]:
# Order rows by month, then day of month, then destination;
# ignore_index=True renumbers the rows 0..n-1 after sorting.
train_df = train_df.sort_values(
    by=['Month', 'DayofMonth', 'Destination'],
    ignore_index=True,
)

In [7]:
# (rows, columns) of the sorted frame.
train_df.shape

(7861, 12)

In [8]:
# Column dtypes before any conversion; ActualArrivalTimeStamp is still an object column here.
train_df.dtypes

FlightNumber              object
Year                       int64
Month                      int64
DayofMonth                 int64
DayOfWeek                  int64
ScheduledDepTime           int64
ScheduledArrTime           int64
ScheduledTravelTime        int64
Origin                    object
Destination               object
Distance                   int64
ActualArrivalTimeStamp    object
dtype: object

In [9]:
# Parse the actual arrival timestamps (day-first text such as '20/09/04 11:54')
# into datetime64 values.
actual_arrival_time_stamp = train_df['ActualArrivalTimeStamp'].astype(str)
train_df['ActualArrivalTimeStamp'] = pd.to_datetime(
    actual_arrival_time_stamp,
    dayfirst=True,
)

In [10]:
# Preview the sorted frame with the parsed arrival timestamps.
train_df.head()

Unnamed: 0,FlightNumber,Year,Month,DayofMonth,DayOfWeek,ScheduledDepTime,ScheduledArrTime,ScheduledTravelTime,Origin,Destination,Distance,ActualArrivalTimeStamp
0,SMF_ATL_37955,2004,1,1,4,1150,1911,261,SMF,ATL,2092,2004-01-01 19:11:00
1,VLD_ATL_48545,2004,1,1,4,1329,1455,86,VLD,ATL,208,2004-01-01 14:55:00
2,DTW_BOS_31626,2004,1,1,4,1507,1657,110,DTW,BOS,632,2004-01-01 18:33:00
3,MKE_BWI_39603,2004,1,1,4,1240,1525,105,MKE,BWI,641,2004-01-01 15:25:00
4,DFW_CLT_33648,2004,1,1,4,1456,1814,138,DFW,CLT,936,2004-01-01 18:14:00


In [11]:
# Check that ActualArrivalTimeStamp is now a datetime64 column.
train_df.dtypes

FlightNumber                      object
Year                               int64
Month                              int64
DayofMonth                         int64
DayOfWeek                          int64
ScheduledDepTime                   int64
ScheduledArrTime                   int64
ScheduledTravelTime                int64
Origin                            object
Destination                       object
Distance                           int64
ActualArrivalTimeStamp    datetime64[ns]
dtype: object

In [12]:
# Turn the date/time parts into zero-padded strings so they can be concatenated
# into one fixed-width timestamp string (month/day -> 2 digits, HHMM -> 4 digits).
pad_widths = {'Month': 2, 'DayofMonth': 2, 'ScheduledArrTime': 4}
for column, width in pad_widths.items():
    train_df[column] = train_df[column].astype(str).str.zfill(width)

In [13]:
# Preview the frame after zero-padding Month, DayofMonth and ScheduledArrTime.
train_df.head()

Unnamed: 0,FlightNumber,Year,Month,DayofMonth,DayOfWeek,ScheduledDepTime,ScheduledArrTime,ScheduledTravelTime,Origin,Destination,Distance,ActualArrivalTimeStamp
0,SMF_ATL_37955,2004,1,1,4,1150,1911,261,SMF,ATL,2092,2004-01-01 19:11:00
1,VLD_ATL_48545,2004,1,1,4,1329,1455,86,VLD,ATL,208,2004-01-01 14:55:00
2,DTW_BOS_31626,2004,1,1,4,1507,1657,110,DTW,BOS,632,2004-01-01 18:33:00
3,MKE_BWI_39603,2004,1,1,4,1240,1525,105,MKE,BWI,641,2004-01-01 15:25:00
4,DFW_CLT_33648,2004,1,1,4,1456,1814,138,DFW,CLT,936,2004-01-01 18:14:00


In [14]:
# Concatenate year + month + day + HHMM into a single 'YYYYMMDDHHMM' string per row.
date_time_parts = [train_df['Month'], train_df['DayofMonth'], train_df['ScheduledArrTime']]
train_df['ScheduledArrTime_c'] = train_df['Year'].astype(str).str.cat(date_time_parts)

In [15]:
# Parse the concatenated 'YYYYMMDDHHMM' text into datetime64 values.
scheduled_arrival_text = train_df['ScheduledArrTime_c']
train_df['ScheduledArrTime_c'] = pd.to_datetime(scheduled_arrival_text, format='%Y%m%d%H%M')

In [16]:
# Allow pandas to display up to 200 rows before truncating, then preview the frame.
pd.set_option('display.max_rows', 200)
train_df.head()

Unnamed: 0,FlightNumber,Year,Month,DayofMonth,DayOfWeek,ScheduledDepTime,ScheduledArrTime,ScheduledTravelTime,Origin,Destination,Distance,ActualArrivalTimeStamp,ScheduledArrTime_c
0,SMF_ATL_37955,2004,1,1,4,1150,1911,261,SMF,ATL,2092,2004-01-01 19:11:00,2004-01-01 19:11:00
1,VLD_ATL_48545,2004,1,1,4,1329,1455,86,VLD,ATL,208,2004-01-01 14:55:00,2004-01-01 14:55:00
2,DTW_BOS_31626,2004,1,1,4,1507,1657,110,DTW,BOS,632,2004-01-01 18:33:00,2004-01-01 16:57:00
3,MKE_BWI_39603,2004,1,1,4,1240,1525,105,MKE,BWI,641,2004-01-01 15:25:00,2004-01-01 15:25:00
4,DFW_CLT_33648,2004,1,1,4,1456,1814,138,DFW,CLT,936,2004-01-01 18:14:00,2004-01-01 18:14:00


In [17]:
# Repeats the datetime conversion of ScheduledArrTime_c. The column was already
# converted by the same call in the earlier cell, and the preview that follows
# shows the same values as before.
train_df['ScheduledArrTime_c'] = pd.to_datetime(
    train_df['ScheduledArrTime_c'],
    format='%Y%m%d%H%M',
)

In [18]:
# Preview the frame after the conversion.
train_df.head()

Unnamed: 0,FlightNumber,Year,Month,DayofMonth,DayOfWeek,ScheduledDepTime,ScheduledArrTime,ScheduledTravelTime,Origin,Destination,Distance,ActualArrivalTimeStamp,ScheduledArrTime_c
0,SMF_ATL_37955,2004,1,1,4,1150,1911,261,SMF,ATL,2092,2004-01-01 19:11:00,2004-01-01 19:11:00
1,VLD_ATL_48545,2004,1,1,4,1329,1455,86,VLD,ATL,208,2004-01-01 14:55:00,2004-01-01 14:55:00
2,DTW_BOS_31626,2004,1,1,4,1507,1657,110,DTW,BOS,632,2004-01-01 18:33:00,2004-01-01 16:57:00
3,MKE_BWI_39603,2004,1,1,4,1240,1525,105,MKE,BWI,641,2004-01-01 15:25:00,2004-01-01 15:25:00
4,DFW_CLT_33648,2004,1,1,4,1456,1814,138,DFW,CLT,936,2004-01-01 18:14:00,2004-01-01 18:14:00


In [19]:
# Arrival delay as a timedelta: actual arrival minus scheduled arrival.
# (Despite its name, FlightDelayinmin holds timedelta values, not minutes.)
train_df['FlightDelayinmin'] = train_df['ActualArrivalTimeStamp'].sub(
    train_df['ScheduledArrTime_c']
)

In [20]:
# Preview the new FlightDelayinmin timedelta column.
train_df.head()

Unnamed: 0,FlightNumber,Year,Month,DayofMonth,DayOfWeek,ScheduledDepTime,ScheduledArrTime,ScheduledTravelTime,Origin,Destination,Distance,ActualArrivalTimeStamp,ScheduledArrTime_c,FlightDelayinmin
0,SMF_ATL_37955,2004,1,1,4,1150,1911,261,SMF,ATL,2092,2004-01-01 19:11:00,2004-01-01 19:11:00,0 days 00:00:00
1,VLD_ATL_48545,2004,1,1,4,1329,1455,86,VLD,ATL,208,2004-01-01 14:55:00,2004-01-01 14:55:00,0 days 00:00:00
2,DTW_BOS_31626,2004,1,1,4,1507,1657,110,DTW,BOS,632,2004-01-01 18:33:00,2004-01-01 16:57:00,0 days 01:36:00
3,MKE_BWI_39603,2004,1,1,4,1240,1525,105,MKE,BWI,641,2004-01-01 15:25:00,2004-01-01 15:25:00,0 days 00:00:00
4,DFW_CLT_33648,2004,1,1,4,1456,1814,138,DFW,CLT,936,2004-01-01 18:14:00,2004-01-01 18:14:00,0 days 00:00:00


In [21]:
# Keep only the minutes component of the delay (e.g. 36 for '0 days 01:36:00').
delay_parts = train_df['FlightDelayinmin'].dt.components
train_df['FlightDelayin_min'] = delay_parts['minutes']

In [22]:
# Preview the extracted minutes component.
train_df.head()

Unnamed: 0,FlightNumber,Year,Month,DayofMonth,DayOfWeek,ScheduledDepTime,ScheduledArrTime,ScheduledTravelTime,Origin,Destination,Distance,ActualArrivalTimeStamp,ScheduledArrTime_c,FlightDelayinmin,FlightDelayin_min
0,SMF_ATL_37955,2004,1,1,4,1150,1911,261,SMF,ATL,2092,2004-01-01 19:11:00,2004-01-01 19:11:00,0 days 00:00:00,0
1,VLD_ATL_48545,2004,1,1,4,1329,1455,86,VLD,ATL,208,2004-01-01 14:55:00,2004-01-01 14:55:00,0 days 00:00:00,0
2,DTW_BOS_31626,2004,1,1,4,1507,1657,110,DTW,BOS,632,2004-01-01 18:33:00,2004-01-01 16:57:00,0 days 01:36:00,36
3,MKE_BWI_39603,2004,1,1,4,1240,1525,105,MKE,BWI,641,2004-01-01 15:25:00,2004-01-01 15:25:00,0 days 00:00:00,0
4,DFW_CLT_33648,2004,1,1,4,1456,1814,138,DFW,CLT,936,2004-01-01 18:14:00,2004-01-01 18:14:00,0 days 00:00:00,0


In [23]:
# Keep only the hours component of the delay (e.g. 1 for '0 days 01:36:00').
delay_components = train_df['FlightDelayinmin'].dt.components
train_df['FlightDelayin_hour'] = delay_components.loc[:, 'hours']

In [24]:
# Preview the extracted hours component.
train_df.head()

Unnamed: 0,FlightNumber,Year,Month,DayofMonth,DayOfWeek,ScheduledDepTime,ScheduledArrTime,ScheduledTravelTime,Origin,Destination,Distance,ActualArrivalTimeStamp,ScheduledArrTime_c,FlightDelayinmin,FlightDelayin_min,FlightDelayin_hour
0,SMF_ATL_37955,2004,1,1,4,1150,1911,261,SMF,ATL,2092,2004-01-01 19:11:00,2004-01-01 19:11:00,0 days 00:00:00,0,0
1,VLD_ATL_48545,2004,1,1,4,1329,1455,86,VLD,ATL,208,2004-01-01 14:55:00,2004-01-01 14:55:00,0 days 00:00:00,0,0
2,DTW_BOS_31626,2004,1,1,4,1507,1657,110,DTW,BOS,632,2004-01-01 18:33:00,2004-01-01 16:57:00,0 days 01:36:00,36,1
3,MKE_BWI_39603,2004,1,1,4,1240,1525,105,MKE,BWI,641,2004-01-01 15:25:00,2004-01-01 15:25:00,0 days 00:00:00,0,0
4,DFW_CLT_33648,2004,1,1,4,1456,1814,138,DFW,CLT,936,2004-01-01 18:14:00,2004-01-01 18:14:00,0 days 00:00:00,0,0


In [25]:
# Check dtypes: the delay is a timedelta64 column and its extracted components are int64.
train_df.dtypes

FlightNumber                       object
Year                                int64
Month                              object
DayofMonth                         object
DayOfWeek                           int64
ScheduledDepTime                    int64
ScheduledArrTime                   object
ScheduledTravelTime                 int64
Origin                             object
Destination                        object
Distance                            int64
ActualArrivalTimeStamp     datetime64[ns]
ScheduledArrTime_c         datetime64[ns]
FlightDelayinmin          timedelta64[ns]
FlightDelayin_min                   int64
FlightDelayin_hour                  int64
dtype: object

In [26]:
# Express the hours component in minutes (hours * 60).
# Note: the column is overwritten, so re-running this cell multiplies by 60 again.
train_df['FlightDelayin_hour'] = train_df['FlightDelayin_hour'].mul(60)

In [27]:
# Preview FlightDelayin_hour, which now holds minutes.
train_df.head()

Unnamed: 0,FlightNumber,Year,Month,DayofMonth,DayOfWeek,ScheduledDepTime,ScheduledArrTime,ScheduledTravelTime,Origin,Destination,Distance,ActualArrivalTimeStamp,ScheduledArrTime_c,FlightDelayinmin,FlightDelayin_min,FlightDelayin_hour
0,SMF_ATL_37955,2004,1,1,4,1150,1911,261,SMF,ATL,2092,2004-01-01 19:11:00,2004-01-01 19:11:00,0 days 00:00:00,0,0
1,VLD_ATL_48545,2004,1,1,4,1329,1455,86,VLD,ATL,208,2004-01-01 14:55:00,2004-01-01 14:55:00,0 days 00:00:00,0,0
2,DTW_BOS_31626,2004,1,1,4,1507,1657,110,DTW,BOS,632,2004-01-01 18:33:00,2004-01-01 16:57:00,0 days 01:36:00,36,60
3,MKE_BWI_39603,2004,1,1,4,1240,1525,105,MKE,BWI,641,2004-01-01 15:25:00,2004-01-01 15:25:00,0 days 00:00:00,0,0
4,DFW_CLT_33648,2004,1,1,4,1456,1814,138,DFW,CLT,936,2004-01-01 18:14:00,2004-01-01 18:14:00,0 days 00:00:00,0,0


In [28]:
# Total arrival delay in whole minutes, taken directly from the timedelta column.
# Summing only the hours and minutes components would drop the `days` component
# and misreport negative timedeltas (an arrival 5 minutes early has components
# days=-1, hours=23, minutes=55, i.e. 1435 instead of -5).
train_df['TotalFlightDelay'] = (
    train_df['FlightDelayinmin'].dt.total_seconds().floordiv(60).astype('int64')
)

In [29]:
# Preview TotalFlightDelay (delay in minutes).
train_df.head()

Unnamed: 0,FlightNumber,Year,Month,DayofMonth,DayOfWeek,ScheduledDepTime,ScheduledArrTime,ScheduledTravelTime,Origin,Destination,Distance,ActualArrivalTimeStamp,ScheduledArrTime_c,FlightDelayinmin,FlightDelayin_min,FlightDelayin_hour,TotalFlightDelay
0,SMF_ATL_37955,2004,1,1,4,1150,1911,261,SMF,ATL,2092,2004-01-01 19:11:00,2004-01-01 19:11:00,0 days 00:00:00,0,0,0
1,VLD_ATL_48545,2004,1,1,4,1329,1455,86,VLD,ATL,208,2004-01-01 14:55:00,2004-01-01 14:55:00,0 days 00:00:00,0,0,0
2,DTW_BOS_31626,2004,1,1,4,1507,1657,110,DTW,BOS,632,2004-01-01 18:33:00,2004-01-01 16:57:00,0 days 01:36:00,36,60,96
3,MKE_BWI_39603,2004,1,1,4,1240,1525,105,MKE,BWI,641,2004-01-01 15:25:00,2004-01-01 15:25:00,0 days 00:00:00,0,0,0
4,DFW_CLT_33648,2004,1,1,4,1456,1814,138,DFW,CLT,936,2004-01-01 18:14:00,2004-01-01 18:14:00,0 days 00:00:00,0,0,0


In [30]:
# Print the distinct delay values, a blank line, then the maximum and minimum delay.
total_delay = train_df['TotalFlightDelay']
print(total_delay.unique(), end='\n\n')
print(total_delay.max(), total_delay.min(), sep='\n')

[  0  96  69 120  72 154  29  32  57  31 101 110  88 121  70  98 109  48
  40  44 147  67  54  80  85  83 173  60  36  99 162  39  23 140  49  56
 133 112 128 116 151 169  47  91 145 158  93  46 124 149 107  74 100 117
  78  42  41  59  97 134 122 130  34  35 127  43  87 119  84 179  92 168
 180 143 141 102 108  66 160 137  81 105 103 111 156 150 123  65 171  90
  58  33 157 161 155 167  89  71  94  45 163 131 166 159 177  76  51 139
 104 113  77 132 125 144 146  73 174 135 152  68  50 172 176 118 153 175
 170 148  24 106  22  37  82 164  95  75 138 178  86 114 142 136  27  30
  28  52 165 126 129 115  21  79  55  26  53]

180
0


In [31]:
# Print the delays in ascending order (pandas truncates the middle of the listing).
delays_ascending = train_df['TotalFlightDelay'].sort_values(ascending=True)
print(delays_ascending)

0         0
4658      0
4657      0
4656      0
4655      0
       ... 
3675    180
5583    180
5431    180
7844    180
7086    180
Name: TotalFlightDelay, Length: 7861, dtype: int64


In [32]:
# Bin the arrival delay into the target label FlightDelayStatus.
#
# The bin edges are in MINUTES, so the timedelta column is converted to minutes
# first. Passing the raw timedelta64[ns] column to np.digitize would compare
# nanosecond counts against these edges, making the cut-off 15 ns, not 15 minutes.
bins = [-1, 15]      # [-1, 15) minutes -> first label, [15, inf) minutes -> second label
names = ['2', '1']   # '2' = delay below 15 minutes, '1' = delay of 15 minutes or more

delay_minutes = train_df['FlightDelayinmin'].dt.total_seconds() / 60

# np.digitize returns 0 for values below the first edge (arrivals more than a
# minute early); clamp those into the first bin so every row gets a valid label.
bin_index = np.clip(np.digitize(delay_minutes, bins), 1, len(names))

train_df['FlightDelayStatus'] = np.asarray(names)[bin_index - 1]

In [33]:
# Preview the new FlightDelayStatus label column.
train_df.head()

Unnamed: 0,FlightNumber,Year,Month,DayofMonth,DayOfWeek,ScheduledDepTime,ScheduledArrTime,ScheduledTravelTime,Origin,Destination,Distance,ActualArrivalTimeStamp,ScheduledArrTime_c,FlightDelayinmin,FlightDelayin_min,FlightDelayin_hour,TotalFlightDelay,FlightDelayStatus
0,SMF_ATL_37955,2004,1,1,4,1150,1911,261,SMF,ATL,2092,2004-01-01 19:11:00,2004-01-01 19:11:00,0 days 00:00:00,0,0,0,2
1,VLD_ATL_48545,2004,1,1,4,1329,1455,86,VLD,ATL,208,2004-01-01 14:55:00,2004-01-01 14:55:00,0 days 00:00:00,0,0,0,2
2,DTW_BOS_31626,2004,1,1,4,1507,1657,110,DTW,BOS,632,2004-01-01 18:33:00,2004-01-01 16:57:00,0 days 01:36:00,36,60,96,1
3,MKE_BWI_39603,2004,1,1,4,1240,1525,105,MKE,BWI,641,2004-01-01 15:25:00,2004-01-01 15:25:00,0 days 00:00:00,0,0,0,2
4,DFW_CLT_33648,2004,1,1,4,1456,1814,138,DFW,CLT,936,2004-01-01 18:14:00,2004-01-01 18:14:00,0 days 00:00:00,0,0,0,2


In [34]:
# Show the rows labelled '2' (the label is still a string at this point).
train_df.loc[train_df['FlightDelayStatus'] == '2']

Unnamed: 0,FlightNumber,Year,Month,DayofMonth,DayOfWeek,ScheduledDepTime,ScheduledArrTime,ScheduledTravelTime,Origin,Destination,Distance,ActualArrivalTimeStamp,ScheduledArrTime_c,FlightDelayinmin,FlightDelayin_min,FlightDelayin_hour,TotalFlightDelay,FlightDelayStatus
0,SMF_ATL_37955,2004,01,01,4,1150,1911,261,SMF,ATL,2092,2004-01-01 19:11:00,2004-01-01 19:11:00,0 days,0,0,0,2
1,VLD_ATL_48545,2004,01,01,4,1329,1455,86,VLD,ATL,208,2004-01-01 14:55:00,2004-01-01 14:55:00,0 days,0,0,0,2
3,MKE_BWI_39603,2004,01,01,4,1240,1525,105,MKE,BWI,641,2004-01-01 15:25:00,2004-01-01 15:25:00,0 days,0,0,0,2
4,DFW_CLT_33648,2004,01,01,4,1456,1814,138,DFW,CLT,936,2004-01-01 18:14:00,2004-01-01 18:14:00,0 days,0,0,0,2
5,PHL_CVG_49031,2004,01,01,4,620,0814,114,PHL,CVG,507,2004-01-01 08:14:00,2004-01-01 08:14:00,0 days,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7853,OAK_SAN_44227,2004,11,30,2,1040,1205,85,OAK,SAN,446,2004-11-30 12:05:00,2004-11-30 12:05:00,0 days,0,0,0,2
7856,SFO_SLC_44569,2004,11,30,2,1115,1357,102,SFO,SLC,599,2004-11-30 13:57:00,2004-11-30 13:57:00,0 days,0,0,0,2
7857,PHX_SMF_33953,2004,11,30,2,810,0914,124,PHX,SMF,647,2004-11-30 09:14:00,2004-11-30 09:14:00,0 days,0,0,0,2
7858,BUR_SMF_54057,2004,11,30,2,705,0825,80,BUR,SMF,358,2004-11-30 08:25:00,2004-11-30 08:25:00,0 days,0,0,0,2


In [35]:
# Check dtypes: FlightDelayStatus is an object (string) column here.
train_df.dtypes

FlightNumber                       object
Year                                int64
Month                              object
DayofMonth                         object
DayOfWeek                           int64
ScheduledDepTime                    int64
ScheduledArrTime                   object
ScheduledTravelTime                 int64
Origin                             object
Destination                        object
Distance                            int64
ActualArrivalTimeStamp     datetime64[ns]
ScheduledArrTime_c         datetime64[ns]
FlightDelayinmin          timedelta64[ns]
FlightDelayin_min                   int64
FlightDelayin_hour                  int64
TotalFlightDelay                    int64
FlightDelayStatus                  object
dtype: object

In [36]:
# Store the delay label as a pandas categorical.
train_df['FlightDelayStatus'] = train_df['FlightDelayStatus'].astype('category')

In [37]:
# Disabled alternative (kept for reference, not executed):
# label-encode FlightDelayStatus with scikit-learn's LabelEncoder.
#labelencoder = LabelEncoder()
#train_df['FlightDelayStatus'] = labelencoder.fit_transform(train_df['FlightDelayStatus'])

In [38]:
# Preview the frame after the category conversion.
train_df.head()

Unnamed: 0,FlightNumber,Year,Month,DayofMonth,DayOfWeek,ScheduledDepTime,ScheduledArrTime,ScheduledTravelTime,Origin,Destination,Distance,ActualArrivalTimeStamp,ScheduledArrTime_c,FlightDelayinmin,FlightDelayin_min,FlightDelayin_hour,TotalFlightDelay,FlightDelayStatus
0,SMF_ATL_37955,2004,1,1,4,1150,1911,261,SMF,ATL,2092,2004-01-01 19:11:00,2004-01-01 19:11:00,0 days 00:00:00,0,0,0,2
1,VLD_ATL_48545,2004,1,1,4,1329,1455,86,VLD,ATL,208,2004-01-01 14:55:00,2004-01-01 14:55:00,0 days 00:00:00,0,0,0,2
2,DTW_BOS_31626,2004,1,1,4,1507,1657,110,DTW,BOS,632,2004-01-01 18:33:00,2004-01-01 16:57:00,0 days 01:36:00,36,60,96,1
3,MKE_BWI_39603,2004,1,1,4,1240,1525,105,MKE,BWI,641,2004-01-01 15:25:00,2004-01-01 15:25:00,0 days 00:00:00,0,0,0,2
4,DFW_CLT_33648,2004,1,1,4,1456,1814,138,DFW,CLT,936,2004-01-01 18:14:00,2004-01-01 18:14:00,0 days 00:00:00,0,0,0,2


In [39]:
# Check that FlightDelayStatus is now a category column.
train_df.dtypes

FlightNumber                       object
Year                                int64
Month                              object
DayofMonth                         object
DayOfWeek                           int64
ScheduledDepTime                    int64
ScheduledArrTime                   object
ScheduledTravelTime                 int64
Origin                             object
Destination                        object
Distance                            int64
ActualArrivalTimeStamp     datetime64[ns]
ScheduledArrTime_c         datetime64[ns]
FlightDelayinmin          timedelta64[ns]
FlightDelayin_min                   int64
FlightDelayin_hour                  int64
TotalFlightDelay                    int64
FlightDelayStatus                category
dtype: object

In [40]:
# Drop the intermediate delay columns now that the FlightDelayStatus label exists.
delay_intermediates = ['FlightDelayinmin', 'FlightDelayin_min', 'FlightDelayin_hour', 'TotalFlightDelay']
train_df = train_df.drop(columns=delay_intermediates)

In [41]:
# Drop the raw date/time columns that were combined into ScheduledArrTime_c,
# together with ScheduledDepTime (which was not part of that derivation).
source_columns = ['Year', 'Month', 'DayofMonth', 'ScheduledArrTime', 'ScheduledDepTime']
train_df = train_df.drop(columns=source_columns)

In [42]:
# Preview the frame with only the remaining columns.
train_df.head()

Unnamed: 0,FlightNumber,DayOfWeek,ScheduledTravelTime,Origin,Destination,Distance,ActualArrivalTimeStamp,ScheduledArrTime_c,FlightDelayStatus
0,SMF_ATL_37955,4,261,SMF,ATL,2092,2004-01-01 19:11:00,2004-01-01 19:11:00,2
1,VLD_ATL_48545,4,86,VLD,ATL,208,2004-01-01 14:55:00,2004-01-01 14:55:00,2
2,DTW_BOS_31626,4,110,DTW,BOS,632,2004-01-01 18:33:00,2004-01-01 16:57:00,1
3,MKE_BWI_39603,4,105,MKE,BWI,641,2004-01-01 15:25:00,2004-01-01 15:25:00,2
4,DFW_CLT_33648,4,138,DFW,CLT,936,2004-01-01 18:14:00,2004-01-01 18:14:00,2


In [43]:
# Correlation matrix of the numeric columns of the training frame.
# The numeric columns are selected explicitly: the frame also holds string,
# datetime and categorical columns, and DataFrame.corr() raises on those in
# pandas >= 2.0 instead of silently skipping them.
corr = train_df.select_dtypes(include='number').corr()
# Styler.set_precision is deprecated (and removed in pandas 2.0);
# Styler.format(precision=...) is its replacement.
corr.style.background_gradient(cmap='Spectral').format(precision=2)

  corr.style.background_gradient(cmap='Spectral').set_precision(2)


Unnamed: 0,DayOfWeek,ScheduledTravelTime,Distance
DayOfWeek,1.0,0.03,0.03
ScheduledTravelTime,0.03,1.0,0.98
Distance,0.03,0.98,1.0


In [44]:
# ScheduledTravelTime correlates 0.98 with Distance in the matrix above,
# so a copy of the frame without it is used from here on.
train_df_v1 = train_df.drop(columns=['ScheduledTravelTime'])

In [45]:
# (rows, columns) after dropping ScheduledTravelTime.
train_df_v1.shape

(7861, 8)

In [46]:
# Preview the reduced frame.
train_df_v1.head()

Unnamed: 0,FlightNumber,DayOfWeek,Origin,Destination,Distance,ActualArrivalTimeStamp,ScheduledArrTime_c,FlightDelayStatus
0,SMF_ATL_37955,4,SMF,ATL,2092,2004-01-01 19:11:00,2004-01-01 19:11:00,2
1,VLD_ATL_48545,4,VLD,ATL,208,2004-01-01 14:55:00,2004-01-01 14:55:00,2
2,DTW_BOS_31626,4,DTW,BOS,632,2004-01-01 18:33:00,2004-01-01 16:57:00,1
3,MKE_BWI_39603,4,MKE,BWI,641,2004-01-01 15:25:00,2004-01-01 15:25:00,2
4,DFW_CLT_33648,4,DFW,CLT,936,2004-01-01 18:14:00,2004-01-01 18:14:00,2


In [47]:
# Rows with DayOfWeek == 1, shown before the column is label-encoded.
train_df_v1.loc[train_df_v1['DayOfWeek'] == 1]

Unnamed: 0,FlightNumber,DayOfWeek,Origin,Destination,Distance,ActualArrivalTimeStamp,ScheduledArrTime_c,FlightDelayStatus
230,BTR_ATL_37937,1,BTR,ATL,449,2004-03-01 08:45:00,2004-03-01 08:45:00,2
231,BTR_ATL_35648,1,BTR,ATL,449,2004-03-01 10:00:00,2004-03-01 10:00:00,2
232,RSW_BOS_42049,1,RSW,BOS,1249,2004-03-01 14:27:00,2004-03-01 14:27:00,2
233,DCA_BOS_49472,1,DCA,BOS,399,2004-03-01 15:53:00,2004-03-01 15:53:00,2
234,FAI_BRW_28361,1,FAI,BRW,503,2004-03-01 17:54:00,2004-03-01 17:54:00,2
...,...,...,...,...,...,...,...,...
7799,BNA_SAT_36670,1,BNA,SAT,822,2004-11-29 11:50:00,2004-11-29 11:50:00,2
7800,SEA_SJC_48582,1,SEA,SJC,697,2004-11-29 12:09:00,2004-11-29 12:09:00,2
7801,GJT_SLC_48677,1,GJT,SLC,217,2004-11-29 07:42:00,2004-11-29 07:42:00,2
7802,HOU_STL_55270,1,HOU,STL,687,2004-11-29 12:15:00,2004-11-29 11:40:00,1


In [48]:
# Label-encode DayOfWeek: fit the encoder on the column, then replace the column
# with the encoded integer codes.
labelencoder = LabelEncoder()
day_of_week = train_df_v1['DayOfWeek']
labelencoder.fit(day_of_week)
train_df_v1['DayOfWeek'] = labelencoder.transform(day_of_week)

In [49]:
# Cast the delay labels to integers.
train_df_v1['FlightDelayStatus'] = train_df_v1['FlightDelayStatus'].astype(int)

In [50]:
# Store the integer delay labels as a pandas categorical again.
train_df_v1['FlightDelayStatus'] = train_df_v1['FlightDelayStatus'].astype('category')

In [51]:
# Check the dtypes of the final columns.
train_df_v1.dtypes

FlightNumber                      object
DayOfWeek                          int64
Origin                            object
Destination                       object
Distance                           int64
ActualArrivalTimeStamp    datetime64[ns]
ScheduledArrTime_c        datetime64[ns]
FlightDelayStatus               category
dtype: object

In [52]:
# Show the rows whose FlightDelayStatus is 1.
train_df_v1.loc[train_df_v1['FlightDelayStatus'] == 1]

Unnamed: 0,FlightNumber,DayOfWeek,Origin,Destination,Distance,ActualArrivalTimeStamp,ScheduledArrTime_c,FlightDelayStatus
2,DTW_BOS_31626,3,DTW,BOS,632,2004-01-01 18:33:00,2004-01-01 16:57:00,1
6,DTW_DCA_46311,3,DTW,DCA,405,2004-01-01 16:18:00,2004-01-01 15:09:00,1
8,CLL_DFW_31275,3,CLL,DFW,164,2004-01-01 17:30:00,2004-01-01 15:30:00,1
10,DEN_GJT_26718,3,DEN,GJT,212,2004-01-01 13:35:00,2004-01-01 12:23:00,1
11,SFO_HNL_29125,3,SFO,HNL,2398,2004-01-01 17:58:00,2004-01-01 15:24:00,1
...,...,...,...,...,...,...,...,...
7849,GEG_PHX_40327,1,GEG,PHX,1020,2004-11-30 14:38:00,2004-11-30 11:58:00,1
7852,ORD_PVD_51672,1,ORD,PVD,849,2004-11-30 23:25:00,2004-11-30 20:48:00,1
7854,ATL_SBN_43428,1,ATL,SBN,566,2004-11-30 21:39:00,2004-11-30 18:48:00,1
7855,IAD_SDF_42429,1,IAD,SDF,451,2004-11-30 09:53:00,2004-11-30 09:29:00,1


In [53]:
# Rows with encoded DayOfWeek == 0; these are the rows that had DayOfWeek == 1 before encoding.
train_df_v1.loc[train_df_v1['DayOfWeek'] == 0]

Unnamed: 0,FlightNumber,DayOfWeek,Origin,Destination,Distance,ActualArrivalTimeStamp,ScheduledArrTime_c,FlightDelayStatus
230,BTR_ATL_37937,0,BTR,ATL,449,2004-03-01 08:45:00,2004-03-01 08:45:00,2
231,BTR_ATL_35648,0,BTR,ATL,449,2004-03-01 10:00:00,2004-03-01 10:00:00,2
232,RSW_BOS_42049,0,RSW,BOS,1249,2004-03-01 14:27:00,2004-03-01 14:27:00,2
233,DCA_BOS_49472,0,DCA,BOS,399,2004-03-01 15:53:00,2004-03-01 15:53:00,2
234,FAI_BRW_28361,0,FAI,BRW,503,2004-03-01 17:54:00,2004-03-01 17:54:00,2
...,...,...,...,...,...,...,...,...
7799,BNA_SAT_36670,0,BNA,SAT,822,2004-11-29 11:50:00,2004-11-29 11:50:00,2
7800,SEA_SJC_48582,0,SEA,SJC,697,2004-11-29 12:09:00,2004-11-29 12:09:00,2
7801,GJT_SLC_48677,0,GJT,SLC,217,2004-11-29 07:42:00,2004-11-29 07:42:00,2
7802,HOU_STL_55270,0,HOU,STL,687,2004-11-29 12:15:00,2004-11-29 11:40:00,1


In [54]:
# Number of distinct values per column.
train_df_v1.nunique()

FlightNumber              7861
DayOfWeek                    7
Origin                     223
Destination                215
Distance                   984
ActualArrivalTimeStamp    7594
ScheduledArrTime_c        7562
FlightDelayStatus            2
dtype: int64

In [57]:
# Final summary of the prepared frame: non-null counts and dtypes.
train_df_v1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7861 entries, 0 to 7860
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   FlightNumber            7861 non-null   object        
 1   DayOfWeek               7861 non-null   int64         
 2   Origin                  7861 non-null   object        
 3   Destination             7861 non-null   object        
 4   Distance                7861 non-null   int64         
 5   ActualArrivalTimeStamp  7861 non-null   datetime64[ns]
 6   ScheduledArrTime_c      7861 non-null   datetime64[ns]
 7   FlightDelayStatus       7861 non-null   category      
dtypes: category(1), datetime64[ns](2), int64(2), object(3)
memory usage: 437.8+ KB


In [55]:
#train_df.to_csv('train_target')

In [56]:
# Save the prepared dataset as 'train_target.csv' in the working directory.
# Per the pandas docs, index_label=False still writes the index values in every
# row but prints no header field for the index column.
# NOTE(review): if the row index is not wanted in the file, index=False is
# probably what was intended — confirm against whatever reads this file.
train_df_v1.to_csv('train_target.csv',index_label = False)