In [1]:
#Importing all Necessary libraries For EDA

import numpy as np
import pandas as pd
pd.set_option("display.max_columns",None)
pd.set_option("display.max_rows",None)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
import warnings
warnings.filterwarnings("ignore")

In [2]:
#Reading Required Data

data_train = pd.read_excel('Data_Train.xlsx')
data_test = pd.read_excel('Test_set.xlsx')

data_train.shape

(10683, 11)

In [3]:
data_train.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [4]:
#Basic Builtin EDA functions are pretty handy sometimes
data_train.describe()

Unnamed: 0,Price
count,10683.0
mean,9087.064121
std,4611.359167
min,1759.0
25%,5277.0
50%,8372.0
75%,12373.0
max,79512.0


In [5]:
statistics_of_data = []
for col in data_train.columns:
  statistics_of_data.append((col,
                             data_train[col].nunique(),
                             data_train[col].isnull().sum()*100/data_train.shape[0],
                             data_train[col].value_counts(normalize=True, dropna=False).values[0] * 100, 
                             data_train[col].dtype
                             ))
stats_df = pd.DataFrame(statistics_of_data, columns=['Feature', 'Uniq_val', 'missing_val', 'val_biggest_cat', 'type'])

In [6]:
stats_df.sort_values('missing_val', ascending=False)

Unnamed: 0,Feature,Uniq_val,missing_val,val_biggest_cat,type
4,Route,128,0.009361,22.240944,object
8,Total_Stops,5,0.009361,52.653749,object
0,Airline,12,0.0,36.029205,object
1,Date_of_Journey,44,0.0,4.717776,object
2,Source,5,0.0,42.469344,object
3,Destination,6,0.0,42.469344,object
5,Dep_Time,222,0.0,2.181035,object
6,Arrival_Time,1343,0.0,3.959562,object
7,Duration,368,0.0,5.148367,object
9,Additional_Info,10,0.0,78.114762,object


### Observations:

- There are only 0.009% missing values are there, we will remove them.
- We have all object or categorical data other than price which is int

In [7]:

print("Original Length of Training Set : ", len(data_train))

data_train = data_train.dropna() # dropping the NaN value.  we chose to drop it as there is only one NaN value. 

print("Length of Training Set after dropping NaN: ", len(data_train))

Original Length of Training Set :  10683
Length of Training Set after dropping NaN:  10682


In [8]:
data_train.head(10)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302
5,SpiceJet,24/06/2019,Kolkata,Banglore,CCU → BLR,09:00,11:25,2h 25m,non-stop,No info,3873
6,Jet Airways,12/03/2019,Banglore,New Delhi,BLR → BOM → DEL,18:55,10:25 13 Mar,15h 30m,1 stop,In-flight meal not included,11087
7,Jet Airways,01/03/2019,Banglore,New Delhi,BLR → BOM → DEL,08:00,05:05 02 Mar,21h 5m,1 stop,No info,22270
8,Jet Airways,12/03/2019,Banglore,New Delhi,BLR → BOM → DEL,08:55,10:25 13 Mar,25h 30m,1 stop,In-flight meal not included,11087
9,Multiple carriers,27/05/2019,Delhi,Cochin,DEL → BOM → COK,11:25,19:15,7h 50m,1 stop,No info,8625


### Observations::

- We donot pretty concerned about Airline, Source, Destination,Route, Total_stops, and additional info. Because we will label encode all of them

- We need to extract information from date_of_Journey column

- We need to extract from Duration, Hours and minutes Both

- We need to extract hours and minutes from dep, and arrival time

In [9]:
data_train["Date_of_Journey"].head()

0    24/03/2019
1     1/05/2019
2     9/06/2019
3    12/05/2019
4    01/03/2019
Name: Date_of_Journey, dtype: object

In [10]:
#We need to transform it first in date format

data_train['Journey_Day'] = pd.to_datetime(data_train.Date_of_Journey, format='%d/%m/%Y').dt.day
data_train.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Journey_Day
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,24
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662,1
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882,9
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218,12
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302,1


In [11]:
data_train['Journey_Month'] = pd.to_datetime(data_train.Date_of_Journey, format='%d/%m/%Y').dt.month
data_train['Journey_Year'] = pd.to_datetime(data_train.Date_of_Journey, format='%d/%m/%Y').dt.year
data_train.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Journey_Day,Journey_Month,Journey_Year
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,24,3,2019
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662,1,5,2019
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882,9,6,2019
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218,12,5,2019
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302,1,3,2019


In [12]:
data_train.drop(['Date_of_Journey'],axis =1,inplace = True)

In [13]:
data_train.head()

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Journey_Day,Journey_Month,Journey_Year
0,IndiGo,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,24,3,2019
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662,1,5,2019
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882,9,6,2019
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218,12,5,2019
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302,1,3,2019


In [14]:
#Extracting hours and minutes from Duration

data_train['Duration'].head()

0    2h 50m
1    7h 25m
2       19h
3    5h 25m
4    4h 45m
Name: Duration, dtype: object

In [15]:
duration = list(data_train['Duration'])


for i in range(len(duration)) :
    if len(duration[i].split()) != 2: 
        if 'h' in duration[i] :
            duration[i] = duration[i].strip() + ' 0m'
        elif 'm' in duration[i] :
            duration[i] = '0h {}'.format(duration[i].strip())

dur_hours = []
dur_minutes = []  

for i in range(len(duration)) :
    dur_hours.append(int(duration[i].split()[0][:-1])) #for examole if duration is 49 mintutes 4 sec then it will reflect like 
    dur_minutes.append(int(duration[i].split()[1][:-1]))#0:49:4 and if 2 hours 10 seconds then it will reflect like 2:0:10
    
data_train['Duration_hours'] = dur_hours
data_train['Duration_minutes'] =dur_minutes

data_train.drop(labels = 'Duration', axis = 1, inplace = True) # dropping the original duration column from training set


In [16]:
data_train.head()

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Total_Stops,Additional_Info,Price,Journey_Day,Journey_Month,Journey_Year,Duration_hours,Duration_minutes
0,IndiGo,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,non-stop,No info,3897,24,3,2019,2,50
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,2 stops,No info,7662,1,5,2019,7,25
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,2 stops,No info,13882,9,6,2019,19,0
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,1 stop,No info,6218,12,5,2019,5,25
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,1 stop,No info,13302,1,3,2019,4,45


In [17]:
#Now we need to extract hours and minutes from Dep_time and Arrival_time

data_train['Dep_Time'].head()

0    22:20
1    05:50
2    09:25
3    18:05
4    16:50
Name: Dep_Time, dtype: object

In [18]:
#Extract Directly .....!!!!

data_train['departure_Hour'] = pd.to_datetime(data_train.Dep_Time).dt.hour
data_train['departure_Min'] = pd.to_datetime(data_train.Dep_Time).dt.minute
data_train.head()

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Total_Stops,Additional_Info,Price,Journey_Day,Journey_Month,Journey_Year,Duration_hours,Duration_minutes,departure_Hour,departure_Min
0,IndiGo,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,non-stop,No info,3897,24,3,2019,2,50,22,20
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,2 stops,No info,7662,1,5,2019,7,25,5,50
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,2 stops,No info,13882,9,6,2019,19,0,9,25
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,1 stop,No info,6218,12,5,2019,5,25,18,5
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,1 stop,No info,13302,1,3,2019,4,45,16,50


In [19]:
data_train.drop(['Dep_Time'],axis=1,inplace=True)

In [20]:
#Now Do this exactly with Arrival_Time, We just need Hour and min

data_train['Arr_Hour'] = pd.to_datetime(data_train.Arrival_Time).dt.hour
data_train['Arr_Min'] = pd.to_datetime(data_train.Arrival_Time).dt.minute
data_train.drop(['Arrival_Time'],axis=1,inplace=True)
data_train.head()

Unnamed: 0,Airline,Source,Destination,Route,Total_Stops,Additional_Info,Price,Journey_Day,Journey_Month,Journey_Year,Duration_hours,Duration_minutes,departure_Hour,departure_Min,Arr_Hour,Arr_Min
0,IndiGo,Banglore,New Delhi,BLR → DEL,non-stop,No info,3897,24,3,2019,2,50,22,20,1,10
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,2 stops,No info,7662,1,5,2019,7,25,5,50,13,15
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,2 stops,No info,13882,9,6,2019,19,0,9,25,4,25
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,1 stop,No info,6218,12,5,2019,5,25,18,5,23,30
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,1 stop,No info,13302,1,3,2019,4,45,16,50,21,35


In [21]:
# Formatting Total Stops Feature
combine = [data_train]
titlemapping = {'non-stop':0, '1 stop':1,'2 stop':2, '3 stop':3,'4 stop':4}
for row in combine:
    row["Total_Stops"] = row["Total_Stops"].map(titlemapping)
    row['Total_Stops'] = row['Total_Stops'].fillna(0)
    row['Total_Stops'] = row['Total_Stops'].astype(int)

In [22]:
data_train['Route_1']=''
data_train['Route_2']=''
data_train['Route_3']=''
data_train['Route_4']=''
data_train['Route_5']=''
combine = [data_train]
for row in combine:
    row['Route_1']=row['Route'].str.split('→ ').str[0]
    row['Route_2']=row['Route'].str.split('→ ').str[1]
    row['Route_3']=row['Route'].str.split('→ ').str[2]
    row['Route_4']=row['Route'].str.split('→ ').str[3]
    row['Route_5']=row['Route'].str.split('→ ').str[4]

In [23]:
for row in combine:
    row['Route_1'].fillna("None",inplace=True)
    row['Route_2'].fillna("None",inplace=True)
    row['Route_3'].fillna("None",inplace=True)
    row['Route_4'].fillna("None",inplace=True)
    row['Route_5'].fillna("None",inplace=True)

In [24]:
data_train.head()

Unnamed: 0,Airline,Source,Destination,Route,Total_Stops,Additional_Info,Price,Journey_Day,Journey_Month,Journey_Year,Duration_hours,Duration_minutes,departure_Hour,departure_Min,Arr_Hour,Arr_Min,Route_1,Route_2,Route_3,Route_4,Route_5
0,IndiGo,Banglore,New Delhi,BLR → DEL,0,No info,3897,24,3,2019,2,50,22,20,1,10,BLR,DEL,,,
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,0,No info,7662,1,5,2019,7,25,5,50,13,15,CCU,IXR,BBI,BLR,
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,0,No info,13882,9,6,2019,19,0,9,25,4,25,DEL,LKO,BOM,COK,
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,1,No info,6218,12,5,2019,5,25,18,5,23,30,CCU,NAG,BLR,,
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,1,No info,13302,1,3,2019,4,45,16,50,21,35,BLR,NAG,DEL,,


In [25]:
data_train.drop(['Route'],inplace = True,axis =1)

In [26]:
data_train.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Price,Journey_Day,Journey_Month,Journey_Year,Duration_hours,Duration_minutes,departure_Hour,departure_Min,Arr_Hour,Arr_Min,Route_1,Route_2,Route_3,Route_4,Route_5
0,IndiGo,Banglore,New Delhi,0,No info,3897,24,3,2019,2,50,22,20,1,10,BLR,DEL,,,
1,Air India,Kolkata,Banglore,0,No info,7662,1,5,2019,7,25,5,50,13,15,CCU,IXR,BBI,BLR,
2,Jet Airways,Delhi,Cochin,0,No info,13882,9,6,2019,19,0,9,25,4,25,DEL,LKO,BOM,COK,
3,IndiGo,Kolkata,Banglore,1,No info,6218,12,5,2019,5,25,18,5,23,30,CCU,NAG,BLR,,
4,IndiGo,Banglore,New Delhi,1,No info,13302,1,3,2019,4,45,16,50,21,35,BLR,NAG,DEL,,


In [27]:
data_train.shape

(10682, 20)

In [28]:
data_train['Airline'].unique()

array(['IndiGo', 'Air India', 'Jet Airways', 'SpiceJet',
       'Multiple carriers', 'GoAir', 'Vistara', 'Air Asia',
       'Vistara Premium economy', 'Jet Airways Business',
       'Multiple carriers Premium economy', 'Trujet'], dtype=object)

In [29]:
data_train.Airline = data_train.Airline.apply(lambda x: x.strip())
Airline_stats = data_train['Airline'].value_counts(ascending=False)
Airline_stats

Jet Airways                          3849
IndiGo                               2053
Air India                            1751
Multiple carriers                    1196
SpiceJet                              818
Vistara                               479
Air Asia                              319
GoAir                                 194
Multiple carriers Premium economy      13
Jet Airways Business                    6
Vistara Premium economy                 3
Trujet                                  1
Name: Airline, dtype: int64

In [30]:
#Any Airline having less than 6 data points termed as other category, We can called this as dimension reduction
Airline_stats_less_than_10 = Airline_stats[Airline_stats<=6]
Airline_stats_less_than_10


Jet Airways Business       6
Vistara Premium economy    3
Trujet                     1
Name: Airline, dtype: int64

In [31]:
data_train.Airline  = data_train.Airline.apply(lambda x: 'other' if x in Airline_stats_less_than_10 else x)
len(data_train.Airline.unique())

10

In [32]:
data_train['Source'].unique()

array(['Banglore', 'Kolkata', 'Delhi', 'Chennai', 'Mumbai'], dtype=object)

In [33]:
data_train['Destination'].unique()

array(['New Delhi', 'Banglore', 'Cochin', 'Kolkata', 'Delhi', 'Hyderabad'],
      dtype=object)

In [34]:
#apply OneHotEncoding::
dummies = pd.get_dummies(data_train.Airline)
dummies.head(3)



Unnamed: 0,Air Asia,Air India,GoAir,IndiGo,Jet Airways,Multiple carriers,Multiple carriers Premium economy,SpiceJet,Vistara,other
0,0,0,0,1,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0


In [35]:
data_train = pd.concat([data_train,dummies],axis='columns')
data_train.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Price,Journey_Day,Journey_Month,Journey_Year,Duration_hours,Duration_minutes,departure_Hour,departure_Min,Arr_Hour,Arr_Min,Route_1,Route_2,Route_3,Route_4,Route_5,Air Asia,Air India,GoAir,IndiGo,Jet Airways,Multiple carriers,Multiple carriers Premium economy,SpiceJet,Vistara,other
0,IndiGo,Banglore,New Delhi,0,No info,3897,24,3,2019,2,50,22,20,1,10,BLR,DEL,,,,0,0,0,1,0,0,0,0,0,0
1,Air India,Kolkata,Banglore,0,No info,7662,1,5,2019,7,25,5,50,13,15,CCU,IXR,BBI,BLR,,0,1,0,0,0,0,0,0,0,0
2,Jet Airways,Delhi,Cochin,0,No info,13882,9,6,2019,19,0,9,25,4,25,DEL,LKO,BOM,COK,,0,0,0,0,1,0,0,0,0,0
3,IndiGo,Kolkata,Banglore,1,No info,6218,12,5,2019,5,25,18,5,23,30,CCU,NAG,BLR,,,0,0,0,1,0,0,0,0,0,0
4,IndiGo,Banglore,New Delhi,1,No info,13302,1,3,2019,4,45,16,50,21,35,BLR,NAG,DEL,,,0,0,0,1,0,0,0,0,0,0


In [36]:
#Apply oHE in Source
#apply OneHotEncoding::
dummies_Source = pd.get_dummies(data_train.Source)
dummies_Source.head(3)


Unnamed: 0,Banglore,Chennai,Delhi,Kolkata,Mumbai
0,1,0,0,0,0
1,0,0,0,1,0
2,0,0,1,0,0


In [37]:
data_train = pd.concat([data_train,dummies_Source],axis='columns')
data_train.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Price,Journey_Day,Journey_Month,Journey_Year,Duration_hours,Duration_minutes,departure_Hour,departure_Min,Arr_Hour,Arr_Min,Route_1,Route_2,Route_3,Route_4,Route_5,Air Asia,Air India,GoAir,IndiGo,Jet Airways,Multiple carriers,Multiple carriers Premium economy,SpiceJet,Vistara,other,Banglore,Chennai,Delhi,Kolkata,Mumbai
0,IndiGo,Banglore,New Delhi,0,No info,3897,24,3,2019,2,50,22,20,1,10,BLR,DEL,,,,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
1,Air India,Kolkata,Banglore,0,No info,7662,1,5,2019,7,25,5,50,13,15,CCU,IXR,BBI,BLR,,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
2,Jet Airways,Delhi,Cochin,0,No info,13882,9,6,2019,19,0,9,25,4,25,DEL,LKO,BOM,COK,,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
3,IndiGo,Kolkata,Banglore,1,No info,6218,12,5,2019,5,25,18,5,23,30,CCU,NAG,BLR,,,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
4,IndiGo,Banglore,New Delhi,1,No info,13302,1,3,2019,4,45,16,50,21,35,BLR,NAG,DEL,,,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0


In [38]:
#Apply oHE in Destination
#apply OneHotEncoding::
dummies_Destination = pd.get_dummies(data_train.Destination)
dummies_Destination.head(3)

Unnamed: 0,Banglore,Cochin,Delhi,Hyderabad,Kolkata,New Delhi
0,0,0,0,0,0,1
1,1,0,0,0,0,0
2,0,1,0,0,0,0


In [39]:
data_train = pd.concat([data_train,dummies_Destination],axis='columns')
data_train.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Price,Journey_Day,Journey_Month,Journey_Year,Duration_hours,Duration_minutes,departure_Hour,departure_Min,Arr_Hour,Arr_Min,Route_1,Route_2,Route_3,Route_4,Route_5,Air Asia,Air India,GoAir,IndiGo,Jet Airways,Multiple carriers,Multiple carriers Premium economy,SpiceJet,Vistara,other,Banglore,Chennai,Delhi,Kolkata,Mumbai,Banglore.1,Cochin,Delhi.1,Hyderabad,Kolkata.1,New Delhi
0,IndiGo,Banglore,New Delhi,0,No info,3897,24,3,2019,2,50,22,20,1,10,BLR,DEL,,,,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
1,Air India,Kolkata,Banglore,0,No info,7662,1,5,2019,7,25,5,50,13,15,CCU,IXR,BBI,BLR,,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
2,Jet Airways,Delhi,Cochin,0,No info,13882,9,6,2019,19,0,9,25,4,25,DEL,LKO,BOM,COK,,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
3,IndiGo,Kolkata,Banglore,1,No info,6218,12,5,2019,5,25,18,5,23,30,CCU,NAG,BLR,,,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
4,IndiGo,Banglore,New Delhi,1,No info,13302,1,3,2019,4,45,16,50,21,35,BLR,NAG,DEL,,,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1


In [40]:
data_train=data_train.drop(['Airline','Source','Destination'],axis=1)


In [41]:
#So We will Use Label Encoder for Encoding Technique as we have text in our columns.
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()

In [42]:

#data_train["Additional_Info"]=encoder.fit_transform(data_train['Additional_Info'])

data_train["Route_1"]=encoder.fit_transform(data_train['Route_1'])
data_train["Route_2"]=encoder.fit_transform(data_train['Route_2'])
data_train["Route_3"]=encoder.fit_transform(data_train['Route_3'])
data_train["Route_4"]=encoder.fit_transform(data_train['Route_4'])
data_train["Route_5"]=encoder.fit_transform(data_train['Route_5'])

In [43]:
data_train.head()

Unnamed: 0,Total_Stops,Additional_Info,Price,Journey_Day,Journey_Month,Journey_Year,Duration_hours,Duration_minutes,departure_Hour,departure_Min,Arr_Hour,Arr_Min,Route_1,Route_2,Route_3,Route_4,Route_5,Air Asia,Air India,GoAir,IndiGo,Jet Airways,Multiple carriers,Multiple carriers Premium economy,SpiceJet,Vistara,other,Banglore,Chennai,Delhi,Kolkata,Mumbai,Banglore.1,Cochin,Delhi.1,Hyderabad,Kolkata.1,New Delhi
0,0,No info,3897,24,3,2019,2,50,22,20,1,10,0,13,24,12,4,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
1,0,No info,7662,1,5,2019,7,25,5,50,13,15,2,25,1,3,4,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
2,0,No info,13882,9,6,2019,19,0,9,25,4,25,3,32,4,5,4,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
3,1,No info,6218,12,5,2019,5,25,18,5,23,30,2,34,3,12,4,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
4,1,No info,13302,1,3,2019,4,45,16,50,21,35,0,34,8,12,4,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1


In [44]:
data_train = data_train.drop(['Journey_Year','Additional_Info','Duration_minutes','departure_Min','Arr_Hour','Arr_Min','Route_1','Route_2','Route_3','Route_4','Route_5'],axis=1)

In [45]:
### Feature Selection through Lasso

from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [46]:
X_Train =data_train.drop(['Price'],axis=1)
Y_Train =data_train.Price
X_Test  = data_test

In [47]:
X_Train.shape

(10682, 26)

In [48]:
model=SelectFromModel(Lasso(alpha=0.005,random_state=0))

In [49]:
model.fit(X_Train,Y_Train)

SelectFromModel(estimator=Lasso(alpha=0.005, copy_X=True, fit_intercept=True,
                                max_iter=1000, normalize=False, positive=False,
                                precompute=False, random_state=0,
                                selection='cyclic', tol=0.0001,
                                warm_start=False),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [50]:
selected_features=X_Train.columns[(model.get_support())]

In [51]:
selected_features

Index(['Total_Stops', 'Journey_Day', 'Journey_Month', 'Duration_hours',
       'departure_Hour', 'Air Asia', 'Air India', 'GoAir', 'IndiGo',
       'Jet Airways', 'Multiple carriers', 'Multiple carriers Premium economy',
       'SpiceJet', 'Vistara', 'other', 'Banglore', 'Chennai', 'Delhi',
       'Kolkata', 'Mumbai', 'Banglore', 'Cochin', 'Delhi', 'Hyderabad',
       'New Delhi'],
      dtype='object')

In [52]:
len(selected_features)

25

In [53]:
X_Train.shape

(10682, 26)

### RandomForestRegressor¶

In [54]:

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
#Randomized Search CV

In [55]:

rf = RandomForestRegressor(random_state = 42)
from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [56]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)
{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

In [57]:

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
#l#rf_random.fit(X_train, y_train)

In [58]:
rf_random.fit(X_Train, Y_Train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 20.6min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                              

In [59]:
acc_rf = round(rf_random.score(X_Train, Y_Train) * 100, 2)
acc_rf


88.73

In [60]:
chk =X_Train[2:3]
chk.head()

Unnamed: 0,Total_Stops,Journey_Day,Journey_Month,Duration_hours,departure_Hour,Air Asia,Air India,GoAir,IndiGo,Jet Airways,Multiple carriers,Multiple carriers Premium economy,SpiceJet,Vistara,other,Banglore,Chennai,Delhi,Kolkata,Mumbai,Banglore.1,Cochin,Delhi.1,Hyderabad,Kolkata.1,New Delhi
2,0,9,6,19,9,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0


In [61]:
data_train.head()

Unnamed: 0,Total_Stops,Price,Journey_Day,Journey_Month,Duration_hours,departure_Hour,Air Asia,Air India,GoAir,IndiGo,Jet Airways,Multiple carriers,Multiple carriers Premium economy,SpiceJet,Vistara,other,Banglore,Chennai,Delhi,Kolkata,Mumbai,Banglore.1,Cochin,Delhi.1,Hyderabad,Kolkata.1,New Delhi
0,0,3897,24,3,2,22,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
1,0,7662,1,5,7,5,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
2,0,13882,9,6,19,9,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
3,1,6218,12,5,5,18,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
4,1,13302,1,3,4,16,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1


In [62]:
Y_pred = rf_random.predict(chk)
Y_pred

array([13459.65431803])

In [68]:
## Testing

def predict_Flightprice(stops,Journey_Day,Journey_Month,duration,depature_time,airline,Source,Dest):    
    airline_index = np.where(X_Train.columns==airline)[0][0]
    Source_index = np.where(X_Train.columns==Source)[0][0]
    Dest_index = np.where(X_Train.columns==Dest)[0][0]

    x = np.zeros(len(X_Train.columns))
    x[0] = stops
    x[1] = Journey_Day
    x[2] = Journey_Month
    x[3] = duration
    x[4] = depature_time
    
    if airline_index >= 0:
        x[airline_index] = 1
    if Source_index >= 0:
        x[Source_index] = 1
    if Dest_index >= 0:
        x[Dest_index] = 1
        
    return rf_random.predict([x])[0]

In [70]:
predict_Flightprice(0,9,6,19,9,'Jet Airways','Delhi','Cochin')

13459.654318034354

In [71]:
predict_Flightprice(1,15,2,6,11,'Jet Airways','Kolkata','Banglore')

10772.67093911423

In [72]:
#Saving Model Using Pickle
#import pickle
# save the model to disk
#pickle.dump(rf_random, open("rf_random.dat", "wb"))


## Exporting our tested model

import pickle
with open('Flight_prices_model.pickle','wb') as f:
    pickle.dump(rf_random,f)

### Export location and column information to a file that will be useful later on in our prediction application

In [73]:
import json
columns = {
    'data_columns' : [col.lower() for col in X_Train.columns]
}
with open("columns.json","w") as f:
    f.write(json.dumps(columns))

In [74]:
columns

{'data_columns': ['total_stops',
  'journey_day',
  'journey_month',
  'duration_hours',
  'departure_hour',
  'air asia',
  'air india',
  'goair',
  'indigo',
  'jet airways',
  'multiple carriers',
  'multiple carriers premium economy',
  'spicejet',
  'vistara',
  'other',
  'banglore',
  'chennai',
  'delhi',
  'kolkata',
  'mumbai',
  'banglore',
  'cochin',
  'delhi',
  'hyderabad',
  'kolkata',
  'new delhi']}

In [None]:
### We got our model yayy, Moving towards building a Python Flask Server..