<a href="https://colab.research.google.com/github/Dheemanth2610/Flight-Price-Prediction-Project/blob/main/Flight_Price_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Import Libraries and Dataset**

In [72]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV,cross_val_score
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor
pd.set_option('display.max_columns',None)

In [73]:
#reading dataset
data=pd.read_csv(r"flightdataset.csv")

In [74]:
#checking the dataset
data.head()

Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


**Exploratory Data Analysis**

In [75]:
#checking for null values
data.isnull().sum()

Unnamed: 0          0
airline             0
flight              0
source_city         0
departure_time      0
stops               0
arrival_time        0
destination_city    0
class               0
duration            0
days_left           0
price               0
dtype: int64

In [66]:
#data.drop(data.index[218247])

Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1.0,5953.0
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1.0,5953.0
2,2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1.0,5956.0
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1.0,5955.0
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1.0,5955.0
...,...,...,...,...,...,...,...,...,...,...,...,...
218242,218242,Air_India,AI-889,Delhi,Morning,one,Evening,Kolkata,Business,8.58,37.0,42212.0
218243,218243,Air_India,AI-889,Delhi,Morning,one,Afternoon,Kolkata,Business,27.58,37.0,42212.0
218244,218244,Air_India,AI-839,Delhi,Night,one,Morning,Kolkata,Business,10.67,37.0,43376.0
218245,218245,Air_India,AI-544,Delhi,Evening,one,Morning,Kolkata,Business,14.83,37.0,43376.0


In [76]:
data.shape

(300153, 12)

In [77]:
np.where(data['stops'].isnull())[0]

array([], dtype=int64)

In [78]:
#checking for duplicates
data.duplicated().sum()

0

In [79]:
#checking for outliers
fig = make_subplots(rows=1, cols=3)

fig.add_trace(go.Box(y=data['duration'], notched=True, name='Duration', marker=dict(color='#6699ff'), 
                     boxmean=True, boxpoints='suspectedoutliers'), 1, 1)

fig.add_trace(go.Box(y=data['days_left'], notched=True, name='Days Left', marker_color = '#ff0066', 
                     boxmean=True, boxpoints='suspectedoutliers'), 1, 2)

fig.add_trace(go.Box(y=data['price'], notched=True, name='Price', marker=dict(color='lightseagreen'), 
                     boxmean=True, boxpoints='suspectedoutliers'), 1, 3)

fig.update_layout(title_text='<b>Box Plots for Numerical Variables<b>')

fig.show()

In [80]:
#function for detecting outliers
def detect_outliers(d):
  for i in d:
    Q3, Q1 = np.percentile(data[i], [75 ,25])
    IQR = Q3 - Q1

    ul = Q3+1.5*IQR
    ll = Q1-1.5*IQR

    outliers = data[i][(data[i] > ul) | (data[i] < ll)]
    print(f'*** {i} outlier points***', '\n', outliers, '\n')

In [81]:
detect_outliers(['duration','days_left','price'])

*** duration outlier points*** 
 10534     31.25
10535     33.17
10540     36.92
10891     31.25
10892     33.17
          ...  
296064    30.33
296297    30.33
296391    30.33
296716    30.33
297661    30.33
Name: duration, Length: 2110, dtype: float64 

*** days_left outlier points*** 
 Series([], Name: days_left, dtype: int64) 

*** price outlier points*** 
 215858    114434
215859    116562
216025    100395
216094     99129
216095    101369
           ...  
293474    107597
296001    102832
296081    102384
296170    104624
296404    102384
Name: price, Length: 123, dtype: int64 



In [82]:
data.head(2)

Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953


In [83]:
data.drop(['flight'],axis=1,inplace=True)

In [84]:
#splitting the data into train and test data
Train,Test=train_test_split(data,test_size=0.05,random_state=42)

In [85]:
Train.shape

(285145, 11)

In [86]:
Test.shape

(15008, 11)

In [87]:
categorical = [var for var in data.columns if data[var].dtype=='O']

In [88]:
#segregation using ordinal encoding
def category(Train, Test):
    for var in categorical:
        ordered_labels = Train.groupby([var])['price'].mean().sort_values().index

        ordinal_label = {k:i for i, k in enumerate(ordered_labels, 0)} 
        ordinal_label
        Train[var] = Train[var].map(ordinal_label)
        Test[var] = Test[var].map(ordinal_label)

category(Train, Test)

In [89]:
Train.head(3)

Unnamed: 0.1,Unnamed: 0,airline,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
74494,74494,2,3,1,2,5,1,0,7.08,30,7522
8488,8488,4,0,2,2,5,2,0,11.17,42,5852
19025,19025,2,0,1,2,3,3,0,6.92,44,6719


In [101]:
#Train.drop(['Unnamed: 0'],axis=1,inplace=True)
Test.drop(['Unnamed: 0'],axis=1,inplace=True)

In [102]:
x_train=Train.iloc[:,:-1] 
y_train=Train.iloc[:,9] 
x_test=Test.iloc[:,:-1] 
y_test=Test.iloc[:,-1] 

In [103]:
x_train.shape,y_train.shape,

((285145, 9), (285145,))

In [104]:
x_test.shape,y_test.shape

((15008, 9), (15008,))

**Decision Tree**

In [105]:
dt=DecisionTreeRegressor()
dt.fit(x_train,y_train)

In [111]:
y_pred1=dt.predict(x_test)
Test['Price1']=y_pred1

In [112]:
print("R2 Score= ",r2_score(y_test,y_pred1))
print("Mean Squared Error= ",mean_squared_error(y_test,y_pred1,squared=True))
print("Root Mean Squared Error= ",mean_squared_error(y_test,y_pred1,squared=False))
print("Mean Absolute Error= ",mean_absolute_error(y_test,y_pred1))

R2 Score=  0.9764666246746603
Mean Squared Error=  12300456.339069163
Root Mean Squared Error=  3507.2006414046464
Mean Absolute Error=  1154.3110007995735


In [113]:
fig = px.scatter(x=y_test, y=y_pred1)

fig.update_traces(marker=dict(size=10, color='#a64dff', line=dict(width=2, color='DarkSlateGrey')), selector=dict(mode='markers'),
                  hovertemplate = '<b>True Y</b>: %{x}'+ '<br><b>Predicted Y</b>: %{y}<br>')

fig.update_layout(title_text='<b>True Y Vs. Predicted Y for Decision Tree<b>')
fig.update_xaxes(title_text='True Y', showspikes=True)
fig.update_yaxes(title_text='Predicted Y', showspikes=True)

fig.show()

 **Random Forest**

In [114]:
RF_S = RandomForestRegressor()
params_RF = {'n_estimators': list(range(50,100)), 'min_samples_leaf': list(range(1,10)), 'min_samples_split': list(range(2,10))}
grid_RF = RandomizedSearchCV(RF_S, param_distributions=params_RF, cv=5, n_jobs=-1, n_iter=20, random_state=42, return_train_score=True)
grid_RF.fit(x_train, y_train)
print('Best parameters:', grid_RF.best_estimator_)

Best parameters: RandomForestRegressor(min_samples_split=8, n_estimators=80)


In [115]:
rf=RandomForestRegressor(min_samples_leaf=2,min_samples_split=8,n_estimators=80)
rf.fit(x_train,y_train)

In [116]:
y_pred2=rf.predict(x_test)
Test['Price2']=y_pred2

In [117]:
print("R2 Score= ",r2_score(y_test,y_pred2))
print("Mean Squared Error= ",mean_squared_error(y_test,y_pred2,squared=True))
print("Root Mean Squared Error= ",mean_squared_error(y_test,y_pred2,squared=False))
print("Mean Absolute Error= ",mean_absolute_error(y_test,y_pred2))

R2 Score=  0.986382451381195
Mean Squared Error=  7117638.669128937
Root Mean Squared Error=  2667.8903030538822
Mean Absolute Error=  1092.5172717387957


In [118]:
fig = px.scatter(x=y_test, y=y_pred2)

fig.update_traces(marker=dict(size=10, color='salmon', line=dict(width=2, color='DarkSlateGrey')), selector=dict(mode='markers'),
                  hovertemplate = '<b>True Y</b>: %{x}'+ '<br><b>Predicted Y</b>: %{y}<br>')

fig.update_layout(title_text='<b>True Y Vs. Predicted Y for Random Forest<b>')
fig.update_xaxes(title_text='True Y', showspikes=True)
fig.update_yaxes(title_text='Predicted Y', showspikes=True)

fig.show()

**Extra Tree**

In [119]:
et=ExtraTreesRegressor()
et.fit(x_train,y_train)

In [120]:
y_pred3=et.predict(x_test)
Test['Price3']=y_pred3

In [121]:
print("R2 Score= ",r2_score(y_test,y_pred3))
print("Mean Squared Error= ",mean_squared_error(y_test,y_pred3,squared=True))
print("Root Mean Squared Error= ",mean_squared_error(y_test,y_pred3,squared=False))
print("Mean Absolute Error= ",mean_absolute_error(y_test,y_pred3))

R2 Score=  0.9834054892252868
Mean Squared Error=  8673641.261854364
Root Mean Squared Error=  2945.1046266396656
Mean Absolute Error=  1130.332965517669


In [122]:
fig = px.scatter(x=y_test, y=y_pred3)

fig.update_traces(marker=dict(size=10, color='red', line=dict(width=2, color='DarkSlateGrey')), selector=dict(mode='markers'),
                  hovertemplate = '<b>True Y</b>: %{x}'+ '<br><b>Predicted Y</b>: %{y}<br>')

fig.update_layout(title_text='<b>True Y Vs. Predicted Y for Extra Tree Regressor<b>')
fig.update_xaxes(title_text='True Y', showspikes=True)
fig.update_yaxes(title_text='Predicted Y', showspikes=True)

fig.show()

**Feature Importance**

In [123]:
d={'Features':x_train.columns,'Feature Importance':rf.feature_importances_}
df=pd.DataFrame(d)
df_sorted=df.sort_values(by='Feature Importance',ascending=True)
df_sorted

Unnamed: 0,Features,Feature Importance
3,stops,0.001918
2,departure_time,0.003761
4,arrival_time,0.004911
5,destination_city,0.009314
1,source_city,0.009729
0,airline,0.011469
8,days_left,0.015649
7,duration,0.058319
6,class,0.884929


In [124]:
fig = px.bar(x=df_sorted['Feature Importance'], y=df_sorted['Features'], color_continuous_scale=px.colors.sequential.Blues,
             title='<b>Feature Importance Based on Random Forest<b>', text_auto='.4f', color=df_sorted['Feature Importance'])

fig.update_traces(marker=dict(line=dict(color='black', width=2)))
fig.update_layout({'yaxis': {'title':'Features'}, 'xaxis': {'title':'Feature Importance'}})

fig.show()

**Results**

In [125]:
compare_models=[('Decision Tree',r2_score(y_test,y_pred1),mean_squared_error(y_test,y_pred1,squared=True),mean_squared_error(y_test,y_pred1,squared=False),mean_absolute_error(y_test,y_pred1,)),
('Random Forest',r2_score(y_test,y_pred2),mean_squared_error(y_test,y_pred2,squared=True),mean_squared_error(y_test,y_pred2,squared=False),mean_absolute_error(y_test,y_pred2,)),
('Extra Tree Regression',r2_score(y_test,y_pred3),mean_squared_error(y_test,y_pred3,squared=True),mean_squared_error(y_test,y_pred3,squared=False),mean_absolute_error(y_test,y_pred3,)),]

In [126]:
compare=pd.DataFrame(data=compare_models,columns=['models','R2','MSE','RMSE','MAE'])
compare

Unnamed: 0,models,R2,MSE,RMSE,MAE
0,Decision Tree,0.976467,12300460.0,3507.200641,1154.311001
1,Random Forest,0.986382,7117639.0,2667.890303,1092.517272
2,Extra Tree Regression,0.983405,8673641.0,2945.104627,1130.332966


In [127]:
fig = make_subplots(rows=1, cols=3, shared_yaxes=True)

fig.add_trace(go.Bar(x = compare['R2'], y = compare['models'], name='R2', marker_color='#cc0000', orientation='h', 
                     text = round(compare['R2'], 2), textposition = 'auto') , 1, 1)
fig.add_trace(go.Bar(x = compare['MSE'], y = compare['models'], name='MSE', marker_color='#80ff00', orientation='h', 
                     text = round(compare['MSE'], 2), textposition = 'auto') , 1, 2)
fig.add_trace(go.Bar(x = compare['MAE'], y = compare['models'], name='MAE', marker_color='#00b8e6', orientation='h', 
                     text = round(compare['MAE'], 2), textposition = 'auto') , 1, 3)

fig.update_layout(title_text='<b>Compare Models<b>')
fig.show()

In [128]:
test.head()

NameError: ignored

In [129]:
Test['Price1']=y_pred1
Test['Price2']=y_pred2
Test['Price3']=y_pred3

In [138]:
Test

Unnamed: 0,airline,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price,DTree,RandomForest,ExtraTree
27131,4,0,4,2,1,4,0,19.75,40,7366,7366.0,7286.705653,7233.66
266857,5,4,4,2,3,2,1,9.83,42,64831,72783.0,69026.175556,69289.61
141228,5,4,4,2,3,3,0,10.50,41,6195,6195.0,6200.067301,6195.00
288329,5,5,5,2,4,0,1,14.50,14,60160,60160.0,60396.334837,60245.45
97334,4,2,1,2,3,2,0,8.25,20,6578,6578.0,6517.966549,5666.59
...,...,...,...,...,...,...,...,...,...,...,...,...,...
141456,5,4,3,2,3,3,0,25.67,42,8111,8111.0,8140.802251,8111.00
104068,0,2,2,2,5,4,0,13.33,12,5176,5176.0,5214.139310,5275.89
90396,4,2,0,2,5,0,0,15.33,32,5906,5906.0,6252.054005,5847.39
254815,5,2,4,2,5,4,1,8.33,32,60508,51817.0,57013.625139,51817.00


In [137]:
Test.rename(columns={'Price1':'DTree','Price2':'RandomForest','Price3':'ExtraTree'},inplace=True)

In [None]:
Test

In [139]:
Test.to_csv('final_results.csv',index=False)