In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

In [4]:
train=pd.read_csv(r"Data_Train.csv")
test=pd.read_csv(r"Test_set.csv")

In [5]:
train.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR ? DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL ? LKO ? BOM ? COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU ? NAG ? BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR ? NAG ? DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [6]:
test.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info
0,Jet Airways,6/06/2019,Delhi,Cochin,DEL ? BOM ? COK,17:30,04:25 07 Jun,10h 55m,1 stop,No info
1,IndiGo,12/05/2019,Kolkata,Banglore,CCU ? MAA ? BLR,06:20,10:20,4h,1 stop,No info
2,Jet Airways,21/05/2019,Delhi,Cochin,DEL ? BOM ? COK,19:15,19:00 22 May,23h 45m,1 stop,In-flight meal not included
3,Multiple carriers,21/05/2019,Delhi,Cochin,DEL ? BOM ? COK,08:00,21:00,13h,1 stop,No info
4,Air Asia,24/06/2019,Banglore,Delhi,BLR ? DEL,23:55,02:45 25 Jun,2h 50m,non-stop,No info


In [7]:
total=pd.concat([train,test],axis=0)

In [8]:
total.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR ? DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897.0
1,Air India,1/05/2019,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,05:50,13:15,7h 25m,2 stops,No info,7662.0
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL ? LKO ? BOM ? COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882.0
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU ? NAG ? BLR,18:05,23:30,5h 25m,1 stop,No info,6218.0
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR ? NAG ? DEL,16:50,21:35,4h 45m,1 stop,No info,13302.0


In [9]:
print(train.shape)
print(test.shape)
print(total.shape)

(10683, 11)
(2671, 10)
(13354, 11)


In [10]:
total.isnull().sum()

Airline               0
Date_of_Journey       0
Source                0
Destination           0
Route                 1
Dep_Time              0
Arrival_Time          0
Duration              0
Total_Stops           1
Additional_Info       0
Price              2671
dtype: int64

In [11]:
total["Route"]=total["Route"].fillna(method='ffill')
total["Total_Stops"]=total["Total_Stops"].fillna(method='ffill')

In [12]:
#total.duplicated(keep=False).value_counts()

In [13]:
#total.drop_duplicates(inplace=True)

In [14]:
total['Total_Stops']=total['Total_Stops'].replace({"1 stop":1,"non-stop":0,"2 stops":2,"3 stops":3,"4 stops":4})

In [15]:
from datetime import datetime
total["Date_of_Journey"]=total["Date_of_Journey"].apply(lambda x:datetime.strptime(x,"%d/%m/%Y"))

In [16]:
total=total.drop("Additional_Info",axis=1)

In [17]:
total.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Price
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR ? DEL,22:20,01:10 22 Mar,2h 50m,0,3897.0
1,Air India,2019-05-01,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,05:50,13:15,7h 25m,2,7662.0
2,Jet Airways,2019-06-09,Delhi,Cochin,DEL ? LKO ? BOM ? COK,09:25,04:25 10 Jun,19h,2,13882.0
3,IndiGo,2019-05-12,Kolkata,Banglore,CCU ? NAG ? BLR,18:05,23:30,5h 25m,1,6218.0
4,IndiGo,2019-03-01,Banglore,New Delhi,BLR ? NAG ? DEL,16:50,21:35,4h 45m,1,13302.0


In [18]:
Airline=pd.get_dummies(total["Airline"],prefix="Airline")
total=pd.concat([total,Airline],axis=1)
total.drop("Airline",axis=1,inplace=True)

In [18]:
total["Arrival_Time_hrs"]=total["Arrival_Time"].apply(lambda x:x.split(" ")[0].split(":")[0])
total["Arrival_Time_min"]=total["Arrival_Time"].apply(lambda x:x.split(" ")[0].split(":")[1])
total.drop("Arrival_Time",axis=1,inplace=True)

In [19]:
total["Dep_Time_hrs"]=total["Dep_Time"].apply(lambda x:x.split(":")[0])
total["Dep_Time_min"]=total["Dep_Time"].apply(lambda x:x.split(":")[1])
total.drop("Dep_Time",axis=1,inplace=True)

In [21]:
source=pd.get_dummies(total["Source"],prefix="Source")
total=pd.concat([total,source],axis=1)
total=total.drop("Source",axis=1)

In [20]:
total["Duration_hrs"]=total["Duration"].apply(lambda x:x.split("h")[0])
total["Duration_hrs"]=total["Duration_hrs"].replace({"5m":5})
total.drop("Duration",axis=1,inplace=True)
#total["Duration_min"]=total["Duration"].apply(lambda x:x.split("m")[0].split(" ")[-1])
#total["Duration_min"].replace({"19h":19,"23h":23,"22h":22,"12h":12,"3h":3,"5h":5,"10h":10,"18h":18,"24h":24,"15h":15,"16h":16,"8h":8,"14h":14,"20h":20,"13h":13,"11h":11,"9h":9,"27h":27,"26h":26,"4h":4,"7h":7,"30h":30,"21h":21,"28h":28,"47h":47,"6h":6,"25h":25,"38h":38,"34h":34})

In [21]:
total["Route"]=total["Route"].apply(lambda x:len(x.split("?")))

In [22]:
total["Destination"]=total["Destination"].replace({'New Delhi':'Delhi'})
total["Destination"].unique()

array(['Delhi', 'Banglore', 'Cochin', 'Kolkata', 'Hyderabad'],
      dtype=object)

In [25]:
Destination=pd.get_dummies(total["Destination"],prefix="Destiny")
total=pd.concat([total,Destination],axis=1)
total=total.drop("Destination",axis=1)

In [23]:
total_col=["Duration_hrs","Arrival_Time_min","Dep_Time_hrs","Dep_Time_min","Arrival_Time_hrs"]
total[total_col]=total[total_col].astype(np.int64)

In [24]:
total.drop("Date_of_Journey",axis=1,inplace=True)

In [24]:
train=total.iloc[:10683,:]
test=total.iloc[10683:,:]

In [29]:
x_train=train.drop("Price",axis=1)
y_train=train["Price"]

In [30]:
x_test=test.drop("Price",axis=1)
y_test=test["Price"]

In [31]:
total.head()

Unnamed: 0,Route,Total_Stops,Price,Airline_Air Asia,Airline_Air India,Airline_GoAir,Airline_IndiGo,Airline_Jet Airways,Airline_Jet Airways Business,Airline_Multiple carriers,...,Source_Chennai,Source_Delhi,Source_Kolkata,Source_Mumbai,Duration_hrs,Destiny_Banglore,Destiny_Cochin,Destiny_Delhi,Destiny_Hyderabad,Destiny_Kolkata
0,2,0,3897.0,0,0,0,1,0,0,0,...,0,0,0,0,2,0,0,1,0,0
1,4,2,7662.0,0,1,0,0,0,0,0,...,0,0,1,0,7,1,0,0,0,0
2,4,2,13882.0,0,0,0,0,1,0,0,...,0,1,0,0,19,0,1,0,0,0
3,3,1,6218.0,0,0,0,1,0,0,0,...,0,0,1,0,5,1,0,0,0,0
4,3,1,13302.0,0,0,0,1,0,0,0,...,0,0,0,0,4,0,0,1,0,0


In [32]:
from sklearn.tree import ExtraTreeRegressor
reg=ExtraTreeRegressor()
reg.fit(x_train,y_train)

ExtraTreeRegressor()

In [33]:
n=pd.DataFrame({"columns":x_train.columns,"importances":reg.feature_importances_}).sort_values("importances",axis=0,ascending=False)
n

Unnamed: 0,columns,importances
1,Total_Stops,0.491679
6,Airline_Jet Airways,0.089646
7,Airline_Jet Airways Business,0.080702
17,Dep_Time_min,0.047623
15,Arrival_Time_min,0.039318
0,Route,0.039271
16,Dep_Time_hrs,0.038547
8,Airline_Multiple carriers,0.034957
14,Arrival_Time_hrs,0.028032
23,Duration_hrs,0.027136


In [34]:
m=list(n["columns"].head(20))

In [35]:
model={
       "LinearRegression":LinearRegression(),
                  "Ridge":Ridge(),
                  "Lasso":Lasso(),
           "XGBRegressor":XGBRegressor()
      }

In [36]:
from sklearn.metrics import mean_squared_error,r2_score
res=[]
for name,func in model.items():
    mod=func.fit(x_train,y_train)
    pred=mod.predict(x_train)
    mse=mean_squared_error(y_train,pred)
    res.append({"model":name,"mse":mse})

In [37]:
res=pd.DataFrame(res)
print(res)

              model           mse
0  LinearRegression  9.114539e+06
1             Ridge  9.138192e+06
2             Lasso  9.120214e+06
3      XGBRegressor  5.031655e+06


In [38]:
model1={
       "LinearRegression":{
                            "model":LinearRegression(),
                            "param":{
                                    "fit_intercept":[True,False]
                                    }
                          },
                  "Ridge":{
                             "model":Ridge(),
                             "param":{
                                      "alpha":[0,5,10,25,30,45,50]
                                     }
                          },
                  "Lasso":{
                             "model":Lasso(),
                             "param":{
                                     "alpha":[0,0.8,5,10,15,20,30]
                                     }
                          },
                   
                 
          "XGBRegressor":{
                            "model":XGBRegressor(),
                            "param":{
                                   "n_estimators":[10,5]
                                    }
                         }

      }

In [39]:
res1=[]
for it,name in model1.items():
    clf=GridSearchCV(name["model"],name["param"],cv=5)
    clf.fit(x_train,y_train)
    pred_n=clf.predict(x_train)
    mse=mean_squared_error(y_train,pred_n)
    r2_score1=r2_score(y_train,pred_n)
    res1.append({"GRID_mse":mse,"GRID_r2_score1":r2_score1})
    
res1=pd.DataFrame(res1)

In [40]:
res1  

Unnamed: 0,GRID_mse,GRID_r2_score1
0,9114539.0,0.571336
1,9354517.0,0.560049
2,9114539.0,0.571336
3,6858093.0,0.677458


In [41]:
pd.concat([res,res1],axis=1)

Unnamed: 0,model,mse,GRID_mse,GRID_r2_score1
0,LinearRegression,9114539.0,9114539.0,0.571336
1,Ridge,9138192.0,9354517.0,0.560049
2,Lasso,9120214.0,9114539.0,0.571336
3,XGBRegressor,5031655.0,6858093.0,0.677458


In [42]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
pipe=Pipeline([("linear",LinearRegression())])

In [43]:
pipe1=pipe.fit(x_train,y_train)

In [44]:
import pickle
file=open("flight22.pkl","wb")
pickle.dump(pipe1,file)

In [None]:
ld.predict(x_train)

In [44]:
pipe.score(x_train,y_train)

0.5713355330706611

In [45]:
cross_val_score(pipe,x_train,y_train,cv=5).mean()

-2.8467691379816915e+17

array([ 5389.61736224, 11826.1246612 , 14586.59615071, ...,
        9016.36412611,  6764.3752299 , 11630.07349639])

In [51]:
x_train.columns

Index(['Route', 'Total_Stops', 'Airline_Air Asia', 'Airline_Air India',
       'Airline_GoAir', 'Airline_IndiGo', 'Airline_Jet Airways',
       'Airline_Jet Airways Business', 'Airline_Multiple carriers',
       'Airline_Multiple carriers Premium economy', 'Airline_SpiceJet',
       'Airline_Trujet', 'Airline_Vistara', 'Airline_Vistara Premium economy',
       'Arrival_Time_hrs', 'Arrival_Time_min', 'Dep_Time_hrs', 'Dep_Time_min',
       'Source_Banglore', 'Source_Chennai', 'Source_Delhi', 'Source_Kolkata',
       'Source_Mumbai', 'Duration_hrs', 'Destiny_Banglore', 'Destiny_Cochin',
       'Destiny_Hyderabad', 'Destiny_Kolkata', 'Destiny_New Delhi'],
      dtype='object')

In [20]:
total["Destination"].unique()

array(['New Delhi', 'Banglore', 'Cochin', 'Kolkata', 'Hyderabad'],
      dtype=object)

In [25]:
train.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Total_Stops,Price,Arrival_Time_hrs,Arrival_Time_min,Dep_Time_hrs,Dep_Time_min,Duration_hrs
0,IndiGo,2019-03-24,Banglore,Delhi,2,0,3897.0,1,10,22,20,2
1,Air India,2019-05-01,Kolkata,Banglore,4,2,7662.0,13,15,5,50,7
2,Jet Airways,2019-06-09,Delhi,Cochin,4,2,13882.0,4,25,9,25,19
3,IndiGo,2019-05-12,Kolkata,Banglore,3,1,6218.0,23,30,18,5,5
4,IndiGo,2019-03-01,Banglore,Delhi,3,1,13302.0,21,35,16,50,4


In [39]:
px.bar(train,x="Airline",y="Route",animation_frame="Destination")