In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler,RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
df=pd.read_excel(r'E:\FlightFarePrediction\notebooks\data\Data_Train.xlsx')

In [4]:
df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [5]:
df.shape

(10683, 11)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB


In [7]:
df.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
dtype: int64

In [8]:
df['Route'].value_counts()

Route
DEL → BOM → COK          2376
BLR → DEL                1552
CCU → BOM → BLR           979
CCU → BLR                 724
BOM → HYD                 621
                         ... 
CCU → VTZ → BLR             1
CCU → IXZ → MAA → BLR       1
BOM → COK → MAA → HYD       1
BOM → CCU → HYD             1
BOM → BBI → HYD             1
Name: count, Length: 128, dtype: int64

In [9]:
df['Day_of_Journey']=pd.to_datetime(df['Date_of_Journey']).dt.day
df['Month_of_Journey']=pd.to_datetime(df['Date_of_Journey']).dt.month
df['Year_of_Journey']=pd.to_datetime(df['Date_of_Journey']).dt.year
df['Dep_Hour']=pd.to_datetime(df['Dep_Time']).dt.hour
df['Dep_Minute']=pd.to_datetime(df['Dep_Time']).dt.minute

In [10]:
route=df[['Route']]
df['Total_Stops'].value_counts()

Total_Stops
1 stop      5625
non-stop    3491
2 stops     1520
3 stops       45
4 stops        1
Name: count, dtype: int64

In [11]:
route['Route_1'] = route['Route'].str.split('→').str[0]
route['Route_2'] = route['Route'].str.split('→').str[1]
route['Route_3'] = route['Route'].str.split('→').str[2]
route['Route_4'] = route['Route'].str.split('→').str[3]
route['Route_5'] = route['Route'].str.split('→').str[4]
route.head()

Unnamed: 0,Route,Route_1,Route_2,Route_3,Route_4,Route_5
0,BLR → DEL,BLR,DEL,,,
1,CCU → IXR → BBI → BLR,CCU,IXR,BBI,BLR,
2,DEL → LKO → BOM → COK,DEL,LKO,BOM,COK,
3,CCU → NAG → BLR,CCU,NAG,BLR,,
4,BLR → NAG → DEL,BLR,NAG,DEL,,


In [12]:
route.fillna('None',inplace=True)
route.head()

Unnamed: 0,Route,Route_1,Route_2,Route_3,Route_4,Route_5
0,BLR → DEL,BLR,DEL,,,
1,CCU → IXR → BBI → BLR,CCU,IXR,BBI,BLR,
2,DEL → LKO → BOM → COK,DEL,LKO,BOM,COK,
3,CCU → NAG → BLR,CCU,NAG,BLR,,
4,BLR → NAG → DEL,BLR,NAG,DEL,,


In [13]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

for i in range(1,6):
    col ='Route_' + str(i)
    route[col] = le.fit_transform(route[col])

route.head()


Unnamed: 0,Route,Route_1,Route_2,Route_3,Route_4,Route_5
0,BLR → DEL,0,13,29,13,5
1,CCU → IXR → BBI → BLR,2,25,1,3,5
2,DEL → LKO → BOM → COK,3,32,4,5,5
3,CCU → NAG → BLR,2,34,3,13,5
4,BLR → NAG → DEL,0,34,8,13,5


In [14]:
route.drop(['Route'],axis=1,inplace=True)

In [15]:
data=pd.concat([df,route],axis=1)

In [16]:
data['Airline_encoded']=le.fit_transform(data['Airline'])

In [17]:
data.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,...,Month_of_Journey,Year_of_Journey,Dep_Hour,Dep_Minute,Route_1,Route_2,Route_3,Route_4,Route_5,Airline_encoded
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,...,3,2019,22,20,0,13,29,13,5,3
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,...,5,2019,5,50,2,25,1,3,5,1
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,...,6,2019,9,25,3,32,4,5,5,4
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,...,5,2019,18,5,2,34,3,13,5,3
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,...,3,2019,16,50,0,34,8,13,5,3


In [18]:
data.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,...,Month_of_Journey,Year_of_Journey,Dep_Hour,Dep_Minute,Route_1,Route_2,Route_3,Route_4,Route_5,Airline_encoded
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,...,3,2019,22,20,0,13,29,13,5,3
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,...,5,2019,5,50,2,25,1,3,5,1
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,...,6,2019,9,25,3,32,4,5,5,4
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,...,5,2019,18,5,2,34,3,13,5,3
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,...,3,2019,16,50,0,34,8,13,5,3


In [19]:
data['Total_Stops'].value_counts()

Total_Stops
1 stop      5625
non-stop    3491
2 stops     1520
3 stops       45
4 stops        1
Name: count, dtype: int64

In [20]:
data[['Source',	'Destination']].value_counts()

Source    Destination
Delhi     Cochin         4537
Kolkata   Banglore       2871
Banglore  Delhi          1265
          New Delhi       932
Mumbai    Hyderabad       697
Chennai   Kolkata         381
Name: count, dtype: int64

In [21]:
data['Total_Stops'].isnull().sum()

1

In [22]:
data['Total_Stops']=data['Total_Stops'].map({
    'non-stop':0,
    '1 stop':1,
    '2 stops':2,
    '3 stops':3,
    '4 stops':4
}
)

In [23]:
data['Source']=le.fit_transform(data['Source'])
data['Destination']=le.fit_transform(data['Destination'])
data['Arrival_hour']=pd.to_datetime(data['Arrival_Time']).dt.hour
data['Arrival_minute']=pd.to_datetime(data['Arrival_Time']).dt.minute

In [24]:
def convert_duration(Duration):
        if len(Duration.split()) == 2:
            hours = int(Duration.split()[0][: -1])
            minutes = int(Duration.split()[1][: -1])
            return hours * 60 + minutes
        else:
            return int(Duration[: -1]) * 60

In [25]:
data['Duration'] = data['Duration'].apply(convert_duration)
data.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,...,Dep_Hour,Dep_Minute,Route_1,Route_2,Route_3,Route_4,Route_5,Airline_encoded,Arrival_hour,Arrival_minute
0,IndiGo,24/03/2019,0,5,BLR → DEL,22:20,01:10 22 Mar,170,0.0,No info,...,22,20,0,13,29,13,5,3,1,10
1,Air India,1/05/2019,3,0,CCU → IXR → BBI → BLR,05:50,13:15,445,2.0,No info,...,5,50,2,25,1,3,5,1,13,15
2,Jet Airways,9/06/2019,2,1,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,1140,2.0,No info,...,9,25,3,32,4,5,5,4,4,25
3,IndiGo,12/05/2019,3,0,CCU → NAG → BLR,18:05,23:30,325,1.0,No info,...,18,5,2,34,3,13,5,3,23,30
4,IndiGo,01/03/2019,0,5,BLR → NAG → DEL,16:50,21:35,285,1.0,No info,...,16,50,0,34,8,13,5,3,21,35


In [26]:
data.head(1)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,...,Dep_Hour,Dep_Minute,Route_1,Route_2,Route_3,Route_4,Route_5,Airline_encoded,Arrival_hour,Arrival_minute
0,IndiGo,24/03/2019,0,5,BLR → DEL,22:20,01:10 22 Mar,170,0.0,No info,...,22,20,0,13,29,13,5,3,1,10


In [27]:
data.drop(['Route','Date_of_Journey','Airline','Additional_Info','Dep_Time','Arrival_Time'],axis=1,inplace=True)

In [28]:
data.head()

Unnamed: 0,Source,Destination,Duration,Total_Stops,Price,Day_of_Journey,Month_of_Journey,Year_of_Journey,Dep_Hour,Dep_Minute,Route_1,Route_2,Route_3,Route_4,Route_5,Airline_encoded,Arrival_hour,Arrival_minute
0,0,5,170,0.0,3897,24,3,2019,22,20,0,13,29,13,5,3,1,10
1,3,0,445,2.0,7662,1,5,2019,5,50,2,25,1,3,5,1,13,15
2,2,1,1140,2.0,13882,9,6,2019,9,25,3,32,4,5,5,4,4,25
3,3,0,325,1.0,6218,12,5,2019,18,5,2,34,3,13,5,3,23,30
4,0,5,285,1.0,13302,1,3,2019,16,50,0,34,8,13,5,3,21,35


In [29]:
data.dtypes

Source                int32
Destination           int32
Duration              int64
Total_Stops         float64
Price                 int64
Day_of_Journey        int32
Month_of_Journey      int32
Year_of_Journey       int32
Dep_Hour              int32
Dep_Minute            int32
Route_1               int32
Route_2               int32
Route_3               int32
Route_4               int32
Route_5               int32
Airline_encoded       int32
Arrival_hour          int32
Arrival_minute        int32
dtype: object

In [30]:
num_col=data.columns

In [31]:
scaler=StandardScaler()

In [32]:
X=data.drop(['Price'],axis=1)
Y=data['Price']

In [33]:
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [35]:
num_pipeline=Pipeline(
    steps=[
        
            ('imputer',SimpleImputer(strategy='median')),
            ('scaler',StandardScaler())
            
    ]
)

cat_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('Labelencoder',LabelEncoder()),
        ('scale',StandardScaler())
        
    ]
)


In [36]:
preprocessor=ColumnTransformer(transformers=[
    ('num_pipeline',num_pipeline,numerical_cols),
    ('cat_pipeline',cat_pipeline,categorical_cols),
])

In [37]:
preprocessor

In [38]:
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.20,random_state=42)

In [39]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [40]:
X_train.head()

Unnamed: 0,num_pipeline__Source,num_pipeline__Destination,num_pipeline__Duration,num_pipeline__Total_Stops,num_pipeline__Day_of_Journey,num_pipeline__Month_of_Journey,num_pipeline__Year_of_Journey,num_pipeline__Dep_Hour,num_pipeline__Dep_Minute,num_pipeline__Route_1,num_pipeline__Route_2,num_pipeline__Route_3,num_pipeline__Route_4,num_pipeline__Route_5,num_pipeline__Airline_encoded,num_pipeline__Arrival_hour,num_pipeline__Arrival_minute
0,1.736591,1.050856,-0.070151,1.744328,-0.187208,-1.46907,0.0,-1.130391,0.298051,-0.836963,3.870429,-0.375346,-0.953172,0.065965,0.013812,0.38532,0.626263
1,0.04339,-0.297686,1.706142,0.262221,-0.541111,0.250513,0.0,-0.259527,0.298051,0.817411,-0.616675,-0.640738,0.407581,0.065965,0.013812,-0.19727,0.626263
2,0.04339,-0.297686,-0.514224,0.262221,1.228405,-0.609279,0.0,0.437165,1.098872,0.817411,2.624011,-0.640738,0.407581,0.065965,1.712135,1.259207,-1.193541
3,0.04339,-0.297686,0.245635,0.262221,0.874502,-1.46907,0.0,-0.085354,1.365812,0.817411,-0.616675,-0.640738,0.407581,0.065965,0.862973,-1.799395,0.626263
4,0.04339,-0.297686,1.814694,1.744328,1.228405,1.110304,0.0,0.78551,-0.50277,0.817411,1.003668,-0.817667,-2.313924,0.065965,-1.259931,0.822264,-0.58694


In [41]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

In [42]:
models = {     
    "Linear Regression": LinearRegression(),
    'Ridge Regression':Ridge(),
    'Lasso Regression':Lasso(),
    'Elasticnet Regression':ElasticNet(),
    
    
    # "Random Forest": RandomForestRegressor(),
    # "Decision Tree": DecisionTreeRegressor(),
    # "XGBRegressor": XGBRegressor(),
            }

In [43]:
params={
    "Linear Regression":{},
    
    'Ridge Regression':{},
    
    'Lasso Regression':{},
    'Elasticnet Regression':{},
    
    
#     "Decision Tree": {
#         # 'splitter':['best','random'],
#         'max_features':['sqrt','log2'],
#     },
#     "Random Forest":{
#         'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],

#         'max_features':['sqrt','log2',None],
#         'n_estimators': [8,16,32,64,128,256]
#     },
    
#     "XGBRegressor":{
#         'learning_rate':[.1,.01,.05,.001],
#         'n_estimators': [8,16,32,64,128,256]
#     } 
}

In [44]:
from sklearn.model_selection import GridSearchCV
def get_best_model(X_train,y_train,X_test,y_test,model_dict,params_,base_accuracy=0.70):
    models=model_dict.keys()
    for model in models:
        print(f'intial base accuracy is {base_accuracy}')
        print(20*'*','Estimator: ',model,20*'*')
        gs=GridSearchCV(estimator=model_dict[model],param_grid=param_[model],cv=3)
        gs.fit(X_train,y_train)
        print(f'Best parameters are : {gs.best_params_} With R2 Score: {gs.best_score_}')
        
        regressor=model_dict[model].set_params(**gs.best_params_)
        regressor.fit(X_train,y_train)
        
        y_train_pred=regressor.predict(X_train)      
        y_test_pred=regressor.predict(X_test) 
        
        train_model_score=r2_score(y_train,y_train_pred)
        test_model_score=r2_score(y_test,y_test_pred)
        
        if test_model_score >=base_accuracy:
            print('Acceptable model found!')
            base_accuracy=test_model_score
            best_model= {model:regressor} 
            
        else:
            print('Model Rejected\n',30*'__')
            
            
    print(f'Best model is {list(best_model.keys())} with R2 score: ')    
        
        
        

SyntaxError: unterminated string literal (detected at line 29) (3091704076.py, line 29)