# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sb

# Import and suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Importing the Data

In [2]:
fpp = pd.read_excel("Flight_Fare.xlsx")
fpp

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302
...,...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,9/04/2019,Kolkata,Banglore,CCU → BLR,19:55,22:25,2h 30m,non-stop,No info,4107
10679,Air India,27/04/2019,Kolkata,Banglore,CCU → BLR,20:45,23:20,2h 35m,non-stop,No info,4145
10680,Jet Airways,27/04/2019,Banglore,Delhi,BLR → DEL,08:20,11:20,3h,non-stop,No info,7229
10681,Vistara,01/03/2019,Banglore,New Delhi,BLR → DEL,11:30,14:10,2h 40m,non-stop,No info,12648


In [3]:
#Basic checks: head, tail, datatypes, shape and describe
fpp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB


In [4]:
fpp.shape

(10683, 11)

In [5]:
fpp.describe()

Unnamed: 0,Price
count,10683.0
mean,9087.064121
std,4611.359167
min,1759.0
25%,5277.0
50%,8372.0
75%,12373.0
max,79512.0


In [9]:
#To see the Distribution of Categorical features
fpp.describe(include=['1'])

TypeError: data type '' not understood

# EDA

In [7]:
from pandas_profiling import ProfileReport

PydanticImportError: `BaseSettings` has been moved to the `pydantic-settings` package. See https://docs.pydantic.dev/2.1.1/migration/#basesettings-has-moved-to-pydantic-settings for more details.

For further information visit https://errors.pydantic.dev/2.1.1/u/import-error

In [None]:
profile = ProfileReport(fpp, title = 'Flight Price Model Profile', html = { 'style': {'full_width' : True}})
profile

In [None]:
sb.boxplot(fpp.Price)

Price is having outliers. So we should use IQR method for this feature to treating outliers.

# Feature Engineering

# Treating Outliers

In [None]:
#treating outlier with censoring / capping
#bringing extreme high outlier values in upper values
#bringing extreme low outlier values in lower values
def find_boundaries(variable):
    
    q1  = fpp[variable].quantile(0.25)
    q3  = fpp[variable].quantile(0.75)
    iqr = q3  - q1
    lower_range  = q1  - 1.5 * iqr #used to find the minimum value
    upper_range  = q3 +  1.5 * iqr #max value
    return lower_range , upper_range

In [None]:
#defining lower range and upper range
lower_Price, upper_Price = find_boundaries('Price')

In [None]:
#perform capping
#where function need to check condition
#data.column = numpy.fn(original col > upper range of col, need to change the value as upper range, in that column)
fpp.Price = np.where(fpp.Price > upper_Price, upper_Price, fpp.Price)
fpp.Price = np.where(fpp.Price > lower_Price, lower_Price, fpp.Price)

In [None]:
#After treating outliers verify the Price
sb.boxplot(fpp.Price)

In [None]:
#checking for Null values
fpp.isnull().sum() 

In [None]:
fpp.dropna(inplace = True)

In [None]:
#recheck null values
fpp.isnull().sum()

In [None]:
#Checking for NaN values
fpp.isna().sum() 

In [None]:
fpp.shape

In [None]:
fpp.head()

# Handling Categorical Data

Airline, Source and Destination are Nominal data.So we can use OneHotEncoder to changing from categorical data into
Numerical data.

# Airline

In [None]:
fpp["Airline"].unique()

In [None]:
fpp["Airline"].value_counts()

Drop_first = True removes the first column which is created for the first unique value of a column.
If we do not use drop_first = True, then n dummy variables will be created, and these predictors(n dummy variables)
are themselves correlated which is known as multicollinearity and it, in turn, leads to Dummy Variable Trap.

In [None]:
Airline = fpp[["Airline"]]

Airline = pd.get_dummies(Airline, drop_first= True)

Airline.head()


# Source

In [None]:
fpp["Source"].unique()

In [None]:
fpp["Source"].value_counts()

In [None]:
Source = fpp[["Source"]]

Source = pd.get_dummies(Source, drop_first= True)

Source.head()

# Destination

In [None]:
fpp["Destination"].unique()

In [None]:
fpp["Destination"].value_counts()

In [None]:
Destination = fpp[["Destination"]]

Destination = pd.get_dummies(Destination, drop_first= True)

Destination.head()

# Concatinating all the Nominal features

In [None]:
#concating all the encoded dataframes
## when 2 or 3 dataframes have different columns so we need to use axis = 1
fpp_ohen =pd.concat([Airline,Source, Destination],axis=1)
fpp_ohen

# Total_Stops

Total_Stops feature is ordinal type.So perform Label encoding for this feature.

In [None]:
fpp.Total_Stops.unique()

In [None]:
fpp.Total_Stops.value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

In [None]:
fpp['Total_Stops'] = le.fit_transform(fpp['Total_Stops'])

In [None]:
fpp.Total_Stops.unique()

In [None]:
fpp.Total_Stops.value_counts()

#  Concatination all the categorical features (Nominal + Ordinal)

In [None]:
fpp_c = pd.concat([fpp_ohen, fpp.Total_Stops], axis = 1)
fpp_c.head(10)

# Route

Route is categorical feature which is ordinal with special character. we need to perform split function and can perform label encoder to convert from categorical to numerical.

In [None]:
Route = fpp[["Route"]]
Route.head()

Route and Total_Stops are similar each other. Here 5 stops are there from index 0 to 4.

In [None]:
Route['Rt_0'] = Route['Route'].str.split('→').str[0]
Route['Rt_1'] = Route['Route'].str.split('→').str[1]
Route['Rt_2'] = Route['Route'].str.split('→').str[2]
Route['Rt_3'] = Route['Route'].str.split('→').str[3]
Route['Rt_4'] = Route['Route'].str.split('→').str[4]
Route.head()

In [None]:
Route.fillna('None', inplace = True)
Route.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

le=LabelEncoder()

for i in range(0,5):
    col = 'Rt_' + str(i)
    Route[col] = le.fit_transform(Route[col])
    
Route.head()

In [None]:
Route.drop('Route', axis = 1, inplace = True)
Route.head()

# Handling Time-Series Data

# Duration

In [None]:
def convert_duration(duration):
    if len(duration.split()) == 2:
        hours = int(duration.split()[0][:-1])
        minutes = int(duration.split()[1][:-1])
        return hours * 60 + minutes
    else:
        return int(duration[:-1]) * 60
    

In [None]:
fpp['Duration'] = fpp['Duration'].apply(convert_duration)
fpp.head()

# Dep_Time

First coverting object to datatime format.

In [None]:
fpp['Dep_Time'] = pd.to_datetime(fpp['Dep_Time'])

In [None]:
fpp.dtypes

Then extracting the hour and minutes from Dep_Time and Arrival_Time

In [None]:
fpp['Dep_Time_in_hours'] = fpp['Dep_Time'].dt.hour
fpp['Dep_Time_in_min'] = fpp['Dep_Time'].dt.minute

# Arrival_Time

In [None]:
fpp['Arrival_Time'] = pd.to_datetime(fpp['Arrival_Time'])

In [None]:
fpp.dtypes

In [None]:
fpp['Arrival_Time_in_hours'] = fpp['Arrival_Time'].dt.hour
fpp['Arrival_Time_in_min'] = fpp['Arrival_Time'].dt.minute

In [None]:
fpp.head()

Dep_Time, Arrival_Time features are not required now. so we will remove these features. 

In [None]:
fpp.drop(['Dep_Time','Arrival_Time'], axis = 1, inplace = True)

In [None]:
fpp.head()

# Date_of_Journey

First converting data/month/year to datetime format.

In [None]:
fpp['Date_of_Journey'] = pd.to_datetime(fpp['Date_of_Journey'])

In [None]:
fpp.head()

In [None]:
fpp['Date_of_Journey'].dt.year.unique()

2019 in year is common for all the values.so we can eliminate this year column.No need to split the year.we can do only for Days and month.Extracting Days and month from Date_of_Journey feature.

In [None]:
fpp['Day'] = fpp['Date_of_Journey'].dt.day
fpp['Month'] = fpp['Date_of_Journey'].dt.month

In [None]:
fpp.head()

In [None]:
fpp.drop(['Date_of_Journey'], axis = 1, inplace = True)

In [None]:
fpp.head()

# Additional_Info

In [None]:
fpp['Additional_Info'].unique()

In [None]:
fpp['Additional_Info'].value_counts()

There is no valid information. So we can discard this feature.

In [None]:
fpp.drop('Additional_Info', axis = 1, inplace = True)

In [None]:
fpp.head()

As we encoded categorical features which are Airline, Source, Destination, Route so that we can drop all these features.

In [None]:
fpp.drop(['Airline','Source','Destination','Route'], axis = 1, inplace = True)

In [None]:
fpp.head()

# Concatinating all the preprocessed data

In [None]:
#fpp_c = pd.concat([fpp_ohen, fpp.Total_Stops], axis = 1)
fpp_fe = pd.concat([fpp_c, Route, fpp], axis = 1)
fpp_fe.head()

# Feature Selection for Independent and dependent variables

In [None]:
fpp_fe.info()

In [None]:
X = fpp_fe.drop('Price', axis =1)
X

In [None]:
y = fpp_fe[['Price']]
y

# Model Selection

In [None]:
#Splitting the train data and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

# 1. Linear Regression

In [None]:
#model creation
from sklearn.linear_model import LinearRegression

#initialize the model
fpp_fe_lr = LinearRegression()

In [None]:
#train the model ---> make your model to learn
#x_train, y_train
#x_test ----> model_prediction(sales)

fpp_fe_lr.fit(X_train,y_train)

In [None]:
#model evaluation
#we are passing 20% input data to test the trained model
y_predict = fpp_fe_lr.predict(X_test)

In [None]:
#we need to use evalution metrics to see the model performance
#mse
#mae
#r squared
#adjusted r squared
from sklearn.metrics import mean_squared_error, r2_score

mse = (mean_squared_error(y_test , y_predict))
print(np.sqrt(mse)) #RMSE
print(r2_score(y_test, y_predict))

# 2. Decision Tree Classifier

In [None]:
#import model 
from sklearn.tree import DecisionTreeClassifier

In [None]:
#gini, entropy, information gain --->important metrics for ASM
fpp_fe_dt  = DecisionTreeClassifier(criterion='gini', max_depth=20)

In [None]:
#fit data to the model 
fpp_fe_dt.fit(X_train,y_train)

In [None]:
y_test_pred  =  fpp_fe_dt.predict(X_test) #CHECK ON 20 PERCENT TEST DATA
y_train_pred  = fpp_fe_dt.predict(X_train) #CHECK ON 80 PERCENT TRAIN DATA

In [None]:
from sklearn.metrics import accuracy_score , precision_score , recall_score , f1_score, classification_report

In [None]:
print(classification_report(y_test, y_test_pred))

In [None]:
accuracy_score(y_test, y_test_pred)

# 3. Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
fpp_fe_rf  =  RandomForestClassifier(n_estimators=10,criterion='entropy',max_depth=20)

In [None]:
fpp_fe_rf.fit(X_train, y_train)

In [None]:
y_train_pred1  = fpp_fe_rf.predict(X_train)
y_test_pred1  =  fpp_fe_rf.predict(X_test)

In [None]:
print(accuracy_score(y_train, y_train_pred1))

In [None]:
print(accuracy_score(y_test , y_test_pred1))

# 4. ANN

In [None]:
## model creation
from sklearn.neural_network import MLPClassifier
fpp_fe_ann = MLPClassifier( hidden_layer_sizes=(50,3),
                       learning_rate_init=0.1,
                       max_iter=100,
                       random_state=150) ## model object creation max_iter=Stopping parameter
fpp_fe_ann.fit(X_train,y_train) ## training the data
y_predict_proba = fpp_fe_ann.predict_proba(X_test) ## predicting the pro
## bability of class
y_predict1 = fpp_fe_ann.predict(X_test)
y_train_predict = fpp_fe_ann.predict(X_train)

In [None]:
## Evaluating the model created
print("Train accuracy :",accuracy_score(y_train,y_train_predict))
print("Test accuracy :",accuracy_score(y_test,y_predict1))

In [None]:
print(classification_report(y_test,y_predict1))