# Initialisation

In [None]:
#Importing all the necessary libraries
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import FeatureUnion, Pipeline
from datetime import datetime as dt
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split as TTS, KFold as K, cross_val_score as score  
from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.ensemble import RandomForestRegressor as RFR 
from xgboost import XGBRegressor as XGR
import warnings
warnings.filterwarnings('ignore')
from sklearn.compose import ColumnTransformer

import pickle



In [None]:
# Load the raw flight-price dataset and keep a handle on the target column
df = pd.read_csv(r'../Data/flight_price.csv')
target = df['Price']

In [None]:
from sklearn.base import TransformerMixin, BaseEstimator

class date_splitter(BaseEstimator,TransformerMixin):
    """Derive calendar features from the journey-date column.

    Parameters
    ----------
    Date_of_Journey : str
        Name of the column in ``X`` holding the journey date, either as a
        ``dd-mm-yy`` string (e.g. ``24-03-19``) or as a value whose ``str()``
        looks like ``YYYY-mm-dd HH:MM:SS``.
    """

    # Formats are tried in this order; the first one that parses every row wins.
    _DATE_FORMATS = ('%d-%m-%y', '%Y-%m-%d %H:%M:%S')
    _FEATURES = ['Day of month', 'Day of week', 'Month of year', 'Day of year']

    def __init__(self,Date_of_Journey):
        self.Date_of_Journey=Date_of_Journey

    def fit(self,X,y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def _parse_dates(self, column):
        """Parse ``column`` with the first format in ``_DATE_FORMATS`` that fits all rows.

        Raises the ``ValueError`` of the last format tried when none matches.
        """
        failure = None
        for fmt in self._DATE_FORMATS:
            try:
                parsed = column.apply(lambda value: dt.strptime(str(value), fmt))
            except ValueError as err:
                # Only a format mismatch triggers the fallback; anything else propagates.
                failure = err
                continue
            return pd.to_datetime(parsed)
        raise failure

    def transform(self,X,y=None):
        """Return an array with day of month, day of week, month and day of year.

        The four derived columns are also written onto ``X`` (as before), but
        the source date column is left untouched so the transform can be
        called again and later cells can still read it.
        """
        dates = self._parse_dates(X[self.Date_of_Journey])

        X['Day of month'] = dates.dt.day
        # Same convention as strftime('%w'): Sunday=0 ... Saturday=6
        X['Day of week'] = (dates.dt.dayofweek + 1) % 7
        X['Month of year'] = dates.dt.month
        X['Day of year'] = dates.dt.dayofyear

        return X[self._FEATURES].values


class route(BaseEstimator,TransformerMixin):
    """Encode the (source, destination) pair as a single integer route id.

    Parameters
    ----------
    Source : str
        Name of the column holding the departure city.
    Destination : str
        Name of the column holding the arrival city.
    """

    # Keys are the source and destination strings concatenated without a separator.
    # NOTE(review): confirm these spellings match the values in the dataset;
    # any pair that is not listed here is encoded as NaN.
    _ROUTE_IDS = {
        'BangaloreNew Delhi':1,
        'ChennaiKolkata':2,
        'New DelhiCochin':3,
        'KolkataBangalore':4,
        'MumbaiHyderabad':5}

    def __init__(self,Source,Destination):
        self.Source=Source
        self.Destination=Destination

    def fit(self,X,y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a one-column array with the route id of every row.

        The ``route`` column is also written onto ``X`` (as before). No input
        columns are dropped any more: removing them here made a second call
        fail and left nothing for the notebook's own clean-up cell to drop.
        """
        pair = X[self.Source] + X[self.Destination]
        X['route'] = pair.map(self._ROUTE_IDS)

        return X[['route']].values

class time_trier(BaseEstimator,TransformerMixin):
    """Split departure time and flight duration into integer hour/minute features.

    Parameters
    ----------
    Dep_Time : str
        Name of the column holding the departure time as ``HH:MM``.
    Duration : str
        Name of the column holding the duration as text such as ``2h 50m``,
        ``19h`` or ``5m``.
    """

    _FEATURES = ['Dep_hour','Dep_minutes','Duration_hours','Duration_mins']

    def __init__(self, Dep_Time, Duration):
        self.Dep_Time=Dep_Time
        self.Duration= Duration

    def fit(self,X,y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    @staticmethod
    def _duration_part(duration, unit):
        """Return the integer in front of ``unit`` ('h' or 'm'), or 0 when that unit is absent."""
        # A regex picks up the digits whatever their position, so a bare '5m'
        # yields 5 (slicing two characters back from the 'm' returned '' there).
        digits = duration.str.extract(r'(\d+)\s*' + unit, expand=False)
        return digits.fillna(0).astype(int)

    def transform(self, X, y=None):
        """Return an array with departure hour/minute and duration hours/minutes.

        Works on the frame that is passed in rather than on the notebook-level
        ``df``. The four derived columns are still written onto ``X`` because
        later cells read ``Duration_hours`` / ``Duration_mins`` from the frame;
        the source columns are no longer dropped, so the transform is repeatable.
        """
        clock = X[self.Dep_Time].str.partition(':')
        X['Dep_hour'] = clock[0].astype(int)
        X['Dep_minutes'] = clock[2].astype(int)

        duration = X[self.Duration]
        X['Duration_hours'] = self._duration_part(duration, 'h')
        X['Duration_mins'] = self._duration_part(duration, 'm')

        return X[self._FEATURES].values

class filters(BaseEstimator,TransformerMixin):
    """Keep only the non-stop flights.

    Parameters
    ----------
    Total_Stops : str
        Name of the column holding the stop count as text (``'non-stop'``,
        ``'1 stop'``, ...).
    """

    # 1 marks a non-stop flight (a missing stop count counts as non-stop), 0 anything else.
    _NON_STOP_FLAG = {'non-stop':1, np.nan:1, '2 stops':0, '1 stop':0, '3 stops':0,'4 stops':0}

    def __init__(self, Total_Stops):
        self.Total_Stops=Total_Stops

    def fit(self,X,y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return the values of the non-stop rows, with the stop column replaced by its 0/1 flag.

        Works on ``X`` and leaves it unmodified. The previous body assigned to
        the name ``df`` inside the method, which made it a local variable and
        raised ``UnboundLocalError`` on the first read.

        Note that this changes the number of rows, so it cannot sit beside
        row-preserving transformers in a ``FeatureUnion``.
        """
        # Missing stop counts are treated as non-stop, as in the flag table.
        stops = X[self.Total_Stops].fillna('non-stop')
        flags = stops.map(self._NON_STOP_FLAG)

        flagged = X.assign(**{self.Total_Stops: flags})
        return flagged[flags == 1].values




In [None]:
# One-hot encode the airline name, leaving out the
# 'Multiple carriers Premium economy' level; every other column is passed through.
airline=ColumnTransformer(
    transformers=[
        (
            'airline',
            OneHotEncoder(drop=['Multiple carriers Premium economy']),
            ['Airline'],
        ),
    ],
    remainder='passthrough',
)

In [None]:
# Run the four feature branches on the same frame and stack their outputs side by side.
f=FeatureUnion([
    ('airline', airline),
    ('date_spliiter', date_splitter(Date_of_Journey='Date_of_Journey')),
    ('route_identifier', route(Source='Source', Destination='Destination')),
    ('timer', time_trier(Dep_Time='Dep_Time', Duration='Duration')),
])

In [None]:
# Fit every branch of the feature union on the raw frame
f.fit(df)

In [None]:
# Transform the raw frame with the fitted union and wrap the result in a DataFrame for inspection
op=pd.DataFrame(f.transform(df))

In [None]:
# Preview the first rows of the transformed feature matrix
op.head()

In [None]:
# NOTE(review): `preprocess` is defined in the cell below this one, so on a fresh
# kernel that cell has to run first — TODO move the definition above this call.
X,y=preprocess(df)

In [None]:
def preprocess(df):
    """Reduce the engineered flight table to modelling rows and split off the target.

    Steps, in order: keep non-stop flights, one-hot encode the time of day of
    departure, drop prices above the upper IQR fence, keep rows flown by one
    of the six airlines listed in ``airlines``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Total_Stops``, ``Duration_mins``, ``Price``, one
        indicator column per airline in ``airlines``, and either
        ``TOD of departure`` or a numeric ``Dep_hour`` to derive it from.
        The frame is not modified; all work happens on a copy.

    Returns
    -------
    X : pandas.DataFrame
        Every remaining column except ``Price``.
    y : pandas.Series
        The ``Price`` column of the remaining rows.
    """
    airlines = ['IndiGo', 'Air India', 'Air Asia', 'SpiceJet', 'Vistara', 'GoAir']
    # Fixed, alphabetically ordered levels so the five Dep_* columns always exist.
    tod_labels = ['early morning', 'evening', 'morning', 'night', 'noon']

    def stops(value):
        """Number of stops: the first of the digits 1-5 found in the text, else 0."""
        text = str(value)
        for digit in '12345':
            if digit in text:
                return int(digit)
        return 0

    def tod(hour):
        """Bucket a departure hour into a named part of the day."""
        if 0 < hour <= 6:
            return 'early morning'
        if 6 < hour <= 12:
            return 'morning'
        if 12 < hour <= 16:
            return 'noon'
        if 16 < hour <= 20:
            return 'evening'
        return 'night'

    df = df.copy()  # never touch the caller's frame
    df['Duration_mins'] = df['Duration_mins'].replace({'': '0'})

    # 1. Non-stop flights only
    df['N_stops'] = df['Total_Stops'].apply(stops).astype(int)
    df = df[df['N_stops'] == 0].copy()

    # 2. Time of day of departure: derive it when the column is not there yet
    #    (it used to be read without ever being created), then one-hot encode.
    if 'TOD of departure' not in df.columns:
        df['TOD of departure'] = df['Dep_hour'].apply(tod)
    tod_levels = df['TOD of departure'].astype(pd.CategoricalDtype(categories=tod_labels))
    dummies = pd.get_dummies(tod_levels, prefix='Dep')
    for column in dummies.columns:
        df[column] = dummies[column]

    # 3. Remove price outliers above Q3 + 1.5 * IQR
    q1, q3 = np.quantile(df['Price'], [0.25, 0.75])
    thresh = q3 + 1.5 * (q3 - q1)
    df = df[df['Price'] < thresh]

    # 4. Keep rows where at least one of the six airline indicators is set
    df = df[df[airlines].ne(0).any(axis=1)]

    X = df.drop(['Price'], axis=1)
    y = df['Price']
    return X, y

In [None]:
# Preview the first rows of the feature frame returned by preprocess
X.head()

# Preprocessing trial

Tasks:

1. Airline -> OneHotEncoder() `DONE`
2. Date_of_Journey -> Month of year, Day of month, Day of week, Day of year  `DONE`
3. Source and Destination -> Route  `DONE`  
4. Drop Route, Arrival_Time, Total_Stops
5. Dep_Time -> Extract hour and min `DONE`
6. Dep hour -> Time of day
7. Duration -> Duration hour, duration minutes `DONE`

In [None]:
# Drop the columns that are not used as features. errors='ignore' keeps the cell
# re-runnable: it no longer raises KeyError when the columns are already gone.
df.drop(['Additional_Info','Arrival_Time','Route'],axis=1,inplace=True,errors='ignore')

In [None]:
# NOTE(review): `p2` is not defined in the visible part of this notebook, so this
# cell raises NameError on a fresh kernel — TODO confirm where it comes from or remove the cell.
p2.fit(df)

In [None]:
# NOTE(review): `pp` is not defined in the visible part of this notebook, so this
# cell raises NameError on a fresh kernel — TODO confirm where it comes from or remove the cell.
pp.transform(df)

In [None]:
# Inspect the departure-time column
df.Dep_Time

In [None]:
# Check the Python type of the journey-date value stored under index label 0
type(df.Date_of_Journey[0])


In [None]:
# x.   <- unfinished scratch expression (a SyntaxError as written); commented out so
#         the notebook can run from top to bottom. TODO: complete it or delete this cell.

In [None]:
# Box plot of the target values to eyeball remaining outliers
# NOTE(review): `y` is passed positionally; which parameter it binds to (and so the
# orientation of the plot) may depend on the seaborn version — TODO pass it by keyword.
sns.boxplot(y)

# Model

In [None]:
def models(model):
    """Cross-validate ``model`` and summarise its RMSE as ``"mean ± std"``.

    Uses the notebook-level ``X`` and ``y`` and a shuffled 5-fold split with a
    fixed seed, so every model is scored on the same folds.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible regressor.

    Returns
    -------
    str
        Mean and standard deviation of the per-fold RMSE, rounded to 2 decimals.
    """
    folds=K(n_splits=5, shuffle=True, random_state=4)
    neg_mse=score(model,X,y,scoring='neg_mean_squared_error',cv=folds)

    # The scorer returns the *negated* MSE, so flip the sign before the square
    # root: sqrt of the raw (negative) values is NaN. Taking the root per fold
    # also gives mean and spread in the same unit as the target.
    rmse=np.sqrt(-neg_mse)
    scores = str(round(rmse.mean(),2))+" ± "+str(round(rmse.std(),2))
    return scores

In [None]:
# Candidate regressors. The tree is seeded as well so the comparison is reproducible.
d=DTR(max_depth=14, random_state=123456)    #Median best max_depth after 100 iterations of cross validation
r=RFR(max_depth=16, random_state=123456)
x=XGR(max_depth=5,eta=0.5, subsample=0.92)

# models() derives its figure from the cross-validated mean squared error, i.e. it
# reports an RMSE, not an absolute error, so label it as such.
print("RMSE for Decision tree regressor",models(d))
print("RMSE for Random forest regressor",models(r))
print("RMSE for XGB Regressor",models(x))

In [None]:
# Fit the random forest on the full X/y; this is the object pickled in the Deployment section
r.fit(X,y)

In [None]:
# Fit a single depth-16 tree on all the data and show its feature importances
# (in percent), largest first, shaded in red.
rr=DTR(max_depth=16).fit(X,y)
(
    pd.DataFrame(rr.feature_importances_*100,index=X.columns)
    .sort_values(by=0,ascending=False)
    .style
    .background_gradient(cmap='Reds')
)

# Deployment

In [None]:

# Mean flight duration (hours and minutes) per airline and route, restricted to
# the six airlines listed below.
hours_calc = (
    df.loc[
        df.Airline.isin(['IndiGo','Air India','SpiceJet','Air Asia','GoAir','Vistara']),
        ['Airline','Source','Destination','Duration_hours','Duration_mins'],
    ]
    .groupby(['Airline','Source','Destination'])[['Duration_hours','Duration_mins']]
    .mean()
    .reset_index()
)

In [None]:
# Write the duration lookup table into the deployment folder; index=False leaves the row index out of the file
hours_calc.to_csv(r"../deployment/hour_calculation.csv", index=False)

In [None]:
# Persist the fitted random forest for the deployment app.
# Forward slashes match the CSV export path in this section and work on every OS;
# the previous backslash literal contained the invalid escapes '\d' and '\m'.
# The handle is not called `f`, so the FeatureUnion bound to `f` is not overwritten.
with open('../deployment/model','wb') as model_file:
    pickle.dump(r,model_file)