# Flight Fare Prediction

**Objective :**<br>
Predict flight ticket prices from the ticket details. This is a regression problem because the target variable (Price) is continuous.<br>

[**Click here to download the data**](https://www.kaggle.com/nikhilmittal/flight-fare-prediction-mh)



**Data Pre-processing Steps involved**
1. Dealing with missing data
2. Dealing with datetime features
3. Dealing with categorical features
4. Dealing with imputation
5. Feature Selection
6. Modelling

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsRegressor
import pickle

# for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# for table display
sns.set_theme(style='whitegrid')
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 200)
from IPython.core import display as ICD
import warnings
warnings.filterwarnings('ignore')


In [4]:
# Appending all functions in this block

def convert_to_date(df,col,**kwargs):
    """
    Convert a column to datetime, in place

    df     : DataFrame holding the column; the column is overwritten
    col    : name of the column to convert
    kwargs : optional keyword arguments forwarded unchanged to
             pd.to_datetime (e.g. format='%d/%m/%Y' or dayfirst=True),
             so callers can pin the parsing rule instead of relying on
             format inference. With no kwargs the call behaves exactly
             like pd.to_datetime(df[col]).

    Returns None.
    """
    df[col] = pd.to_datetime(df[col], **kwargs)
    

def convert_and_drop(df,col,suffix):
    """
    Extract one datetime component into a new column

    The new column is named col + suffix and holds the matching `.dt`
    attribute of df[col]. Supported suffixes are '_hour', '_minute',
    '_day' and '_month'; any other suffix leaves df untouched.
    The source column itself is not removed here.
    """
    if suffix in ('_hour', '_minute', '_day', '_month'):
        # the .dt attribute name is the suffix without its underscore
        df[col + suffix] = getattr(df[col].dt, suffix[1:])

def transform_duration(x):
    """
    Normalise a duration string to the two-token form '<H>h <M>m'

    A string that already splits into exactly two whitespace-separated
    tokens is returned unchanged. Otherwise ' 0m' is appended when the
    string contains 'h', and '0h ' is prepended when it does not.
    """
    if len(x.split()) == 2:
        return x
    return f'{x} 0m' if 'h' in x else f'0h {x}'

def plot_dist(data,col):
    """
    Plot the distribution of a continuous variable

    data : DataFrame containing the column
    col  : name of the column to plot

    Draws two stacked axes on one 10x10 figure: a density histogram
    with a KDE overlay (titled with the column's skewness, rounded to
    2 decimals) and a horizontal box plot of the same column.
    """
    fig,(ax1,ax2)=plt.subplots(2,1,figsize=(10,10))
    ax1.set_title(f'Distribution of {col} with Skewness = {round(data[col].skew(),2)}',fontsize=16)
    # sns.distplot is deprecated; histplot with a density-scaled KDE overlay
    # is the replacement recommended by seaborn for the same picture
    sns.histplot(data[col],kde=True,stat='density',kde_kws={'cut':3},ax=ax1)
    sns.boxplot(data=data,x=col,ax=ax2)

def prediction(model,dump_filename=None):
    """
    Fit a model, report its metrics and optionally persist it

    model         : estimator exposing fit / score / predict
    dump_filename : optional file name (without extension); when given,
                    the fitted model is pickled to
                    '../Model/<dump_filename>.pkl'

    Relies on the notebook-level X_train, X_test, y_train and y_test.
    Prints the train/test scores, R2, MAE, MSE and RMSE on the test
    split, then plots the distribution of the residuals
    (y_test - y_pred) with plot_dist.
    """
    model.fit(X_train,y_train)
    print("Training Score : ",round(model.score(X_train,y_train),2))
    print("Test Score : ",round(model.score(X_test,y_test),2))

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test,y_pred)
    print("R2 Score : ",round(r2_score(y_test,y_pred),2))
    print("MAE : ",round(mean_absolute_error(y_test,y_pred),2))
    print("MSE : ",round(mse,2))
    print("RMSE : ",round(np.sqrt(mse),2))

    variance = pd.DataFrame(np.array(y_test)-y_pred,columns=['y_test-y_pred'])
    plot_dist(variance,'y_test-y_pred')

    if dump_filename is not None:
        # pickle needs a binary, writable handle; the context manager
        # also guarantees the file is flushed and closed
        with open(f'../Model/{dump_filename}.pkl','wb') as file:
            pickle.dump(model,file)
        
    

In [5]:
# Load the training data; the path is relative to the current working directory.
# NOTE(review): the recorded run below raised FileNotFoundError for this path while
# `pwd` reported the project folder itself - confirm where Data_Train.xlsx lives.
data = pd.read_excel('../Data/Data_Train.xlsx')

FileNotFoundError: [Errno 2] No such file or directory: '../Data/Data_Train.xlsx'

In [6]:
# IPython magic: show the current working directory (used to debug the relative data path)
pwd

'/Users/eashwar/Documents/Learning/Flight_Fare_Prediction'

### 1. Dealing with Missing Data

In [None]:
# Heatmap of the null mask: one row per record, one column per feature
fig, ax = plt.subplots(figsize=(15,5))
sns.heatmap(data.isnull(), ax=ax);

In [None]:
# Null count per column, followed by the (rows, columns) shape of the frame
null_counts = data.isnull().sum()
print(null_counts)
print(data.shape)

In [None]:
# Drop every row that has at least one missing value
df1 = data.dropna(axis=0, how='any')
df1.shape

### 2. Dealing with Datetime features

In [None]:
# Column dtypes and non-null counts after the rows with missing values were dropped
df1.info()

`Date_of_Journey` ---> `datetime`   _(extract day and month)_ <br>
`Dep_Time` ---> `datetime`   _(extract hour and minute)_ <br>
`Arrival_Time` ---> `datetime`   _(extract hour and minute)_ <br>
`Duration` ---> `numerical`  _(extract hour and minute)_<br>

In [None]:
# Original frame (missing rows dropped, no feature transformation applied yet)
df1.head()

In [None]:
# Work on a copy so df1 keeps the untransformed columns
df2 = df1.copy()

# Datetime-like columns: the journey date yields day/month, the two clock
# times yield hour/minute.
# NOTE(review): convert_to_date parses with pandas' default settings; if
# Date_of_Journey is day-first (dd/mm/yyyy), ambiguous dates may need an
# explicit format or dayfirst=True - confirm against the data.
date_cols = ['Date_of_Journey','Dep_Time','Arrival_Time']
for date_col in date_cols:
    convert_to_date(df2, date_col)
    parts = ('_day', '_month') if date_col == 'Date_of_Journey' else ('_hour', '_minute')
    for part in parts:
        convert_and_drop(df2, date_col, part)

# Normalise Duration to '<H>h <M>m', then strip the trailing unit letter
# from each token to get integer hours and minutes
normalised_duration = df2['Duration'].apply(transform_duration)
df2['Duration_hour'] = normalised_duration.apply(lambda d: int(d.split(' ')[0][:-1]))
df2['Duration_minute'] = normalised_duration.apply(lambda d: int(d.split(' ')[1][:-1]))

# The raw datetime and duration columns are no longer needed
df2 = df2.drop(columns=date_cols + ['Duration'])

df2.head()

In [None]:
# Check the dtypes after the datetime and duration columns were expanded
df2.info()

In [None]:
# Collect every non-object column into its own dataframe

num_cols = [name for name, dtype in df2.dtypes.items() if dtype != 'O']
print("Numerical columns : ",num_cols)
numerical = df2.loc[:, num_cols].copy()

### 3. Dealing with Categorical Features

In [None]:
# Object-typed columns are treated as the categorical features
cat_cols = [name for name, dtype in df2.dtypes.items() if dtype == 'O']
df_cat = df2.loc[:, cat_cols].copy()
print(cat_cols)
# number of distinct values per categorical feature
print(df_cat.nunique())

In [None]:
# Airline vs Price Distribution

print(df2['Airline'].value_counts())

# Rows ordered from the most to the least expensive ticket
by_price_desc = df2.sort_values(by='Price', ascending=False)

fig, ax = plt.subplots(figsize=(20,8))
ax.set_title('Airline vs Price Distribution', fontsize=16)
ax.set_ylabel('Price', fontsize=16)
ax.set_xlabel('Airline', fontsize=16)

sns.boxplot(data=by_price_desc, x='Airline', y='Price', ax=ax);

_Jet Airways tickets were priced higher, with negligible fluctuation.<br>
Other airlines priced their tickets within almost the same range._

In [None]:
# Source vs Price Distribution

print(df2['Source'].value_counts())

# Rows ordered from the most to the least expensive ticket
by_price_desc = df2.sort_values(by='Price', ascending=False)

fig, ax = plt.subplots(figsize=(20,8))
ax.set_title('Source vs Price Distribution', fontsize=16)
ax.set_ylabel('Price', fontsize=16)
ax.set_xlabel('Source', fontsize=16)

sns.boxplot(data=by_price_desc, x='Source', y='Price', ax=ax);

In [None]:
# Destination vs Price Distribution

print(df2['Destination'].value_counts())

# Rows ordered from the most to the least expensive ticket
by_price_desc = df2.sort_values(by='Price', ascending=False)

fig, ax = plt.subplots(figsize=(20,8))
ax.set_title('Destination vs Price Distribution', fontsize=16)
ax.set_ylabel('Price', fontsize=16)
ax.set_xlabel('Destination', fontsize=16)

sns.boxplot(data=by_price_desc, x='Destination', y='Price', ax=ax);

In [None]:
# Total Stops vs Price Distribution

print(df2['Total_Stops'].value_counts())

# Rows ordered from the most to the least expensive ticket
by_price_desc = df2.sort_values(by='Price', ascending=False)

fig, ax = plt.subplots(figsize=(20,8))
ax.set_title('Total Stops vs Price Distribution', fontsize=16)
ax.set_ylabel('Price', fontsize=16)
ax.set_xlabel('Total Stops', fontsize=16)

sns.boxplot(data=by_price_desc, x='Total_Stops', y='Price', ax=ax);

In [None]:
# DEALING WITH 'Route' COLUMN

print(df2['Route'].head(5))
print('\n')
print("Distinct number of routes : ",df2['Route'].apply(lambda x: len(x.split('→'))).unique())
print("Maximum number of routes → 6-1=5")

# Split every route once and spread its first 5 entries over separate columns;
# positions a shorter route does not have come back as NaN
route_parts = df_cat['Route'].str.split('→')
for position in range(5):
    df_cat[f'Route_{position + 1}'] = route_parts.str[position]

# Replace empty routes with 'None'.
# Assign the result back: `df_cat[col].fillna(..., inplace=True)` works on a
# temporary Series and does not update df_cat under pandas Copy-on-Write.
for col in ['Route_3','Route_4','Route_5']:
    df_cat[col] = df_cat[col].fillna('None')

# drop original column
df_cat.drop(columns='Route',inplace=True)
df_cat.head()

In [None]:
# Cardinality of each categorical column (including the Route_1..Route_5 columns), used to choose an encoding
print("No. of distinct values in each categorical feature")
df_cat.nunique()

**Encoding Approach**<br>
- One-hot encoding needed for ['Airline','Source','Destination'].
- 'Total_Stops' can be converted to number of stops. (non-stop→0, 1 stop→1, 2 stops→2, ...)
- 'Additional_Info' can be dropped.
- Label encoding needed for the 5 Routes.

In [None]:
# One hot encoding (first level of each feature dropped as the baseline)
# NOTE(review): the dummy columns carry no prefix, so a value present in more
# than one of these features would yield duplicate column names after the
# merge - confirm against the data.

Airline, Source, Destination = (
    pd.get_dummies(df_cat[feature], drop_first=True)
    for feature in ('Airline', 'Source', 'Destination')
)

for label, dummies in (('Airline', Airline), ('Source', Source), ('Destination', Destination)):
    ICD.display(label, dummies.head())


In [None]:
# Label Encoding

label_enc = LabelEncoder()
route_cols = ['Route_1', 'Route_2','Route_3', 'Route_4', 'Route_5']

# The single encoder is re-fitted on each route column in turn, so the integer
# codes are independent per column and label_enc ends up fitted on Route_5
Routes = pd.DataFrame(
    {col: label_enc.fit_transform(df_cat[col]) for col in route_cols},
    index=df_cat.index,
)

ICD.display('Routes',Routes.head())

In [None]:
# Custom encoding on 'Total_Stops': map each label to its number of stops

stops_raw = df_cat['Total_Stops']
ICD.display(stops_raw.head())
print("Distinct Total_Stops : ",stops_raw.unique())

# labels missing from this mapping would become NaN after .map()
dict_stops = {
    'non-stop': 0,
    '1 stop': 1,
    '2 stops': 2,
    '3 stops': 3,
    '4 stops': 4,
}
Total_Stops = stops_raw.map(dict_stops)
ICD.display('Total_Stops',Total_Stops.head())

In [None]:
# Put the encoded categorical pieces side by side, then append the numerical features
encoded_parts = [Airline, Source, Destination, Routes, Total_Stops]
categorical = pd.concat(encoded_parts, axis=1)
df3 = pd.concat([categorical, numerical], axis=1)
df3.head()

### 4. Dealing with Imputation

In [None]:
# Work on a copy so df3 stays unchanged, then inspect the Price distribution before any adjustment
df4=df3.copy()
plot_dist(df4,'Price')

Prices above 40000 are treated as outliers and replaced with the median price.

In [None]:
# Replace prices above 40000 with the median of the current Price column
price_median = df4['Price'].median()
is_outlier = df4['Price'] > 40000
df4['Price'] = np.where(is_outlier, price_median, df4['Price'])
plot_dist(df4,'Price')

In [None]:
# Separate the target (Price) from the feature matrix
target_col = 'Price'
y = df4[target_col]
X = df4.drop(columns=target_col)
print(X.shape,y.shape)

In [None]:
# Preview the first rows of the feature matrix and of the target
ICD.display('Features',X.head())
ICD.display('Target',y.head())

### 5. Feature Selection

In [None]:
# Correlation of every column with Price, ranked by absolute value;
# the first 5 rows are the most strongly correlated (Price itself included)
corr_price = (
    df4.corr()[['Price']]
    .assign(abs_Price=lambda frame: frame['Price'].abs())
    .sort_values(by='abs_Price', ascending=False)
)
corr_price.head(5)

In [None]:
# Bottom 5 correlated features with Price (smallest absolute correlation, since corr_price is sorted by abs_Price descending)
corr_price.tail(5)

In [None]:
# Sorting features based on importance (mutual information with Price)
# Price is a continuous target, so the regression estimator is the right one;
# mutual_info_classif would treat every distinct price as a class label.
from sklearn.feature_selection import mutual_info_regression

col_dep = pd.DataFrame(mutual_info_regression(X,y),index=X.columns,columns=['Importance'])
col_dep.sort_values(inplace=True,by='Importance',ascending=False)
col_dep.head(5)

In [None]:
# Five features with the lowest importance score (col_dep is sorted by Importance descending)
col_dep.tail(5)

Based on these two observations, the feature 'Jet Airways Business' can be dropped because it shows little relevance to the target variable, Price.

### 6. Modelling

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=27
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
# Random forest with default hyper-parameters; prediction() fits it on the
# training split, prints the metrics and plots the residual distribution
rf = RandomForestRegressor()
prediction(rf)

In [None]:
# Linear regression, evaluated the same way as the random forest
# (LinearRegression is also imported in the first cell; the re-import is harmless)
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
prediction(lr)

In [None]:
# Skewness of the target values in the test split
y_test.skew()