In [16]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,accuracy_score,mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder



In [2]:
train=pd.read_excel('D:/Inuron/Internships/Flight_Fare_Prediction/notebooks/data/Data_Train.xlsx')
test=pd.read_excel('D:/Inuron/Internships/Flight_Fare_Prediction/notebooks/data/Test_set.xlsx')


In [3]:
train.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [4]:
X=train.drop(labels=['Price'],axis=1)
y=train[['Price']]

In [5]:
X.columns

Index(['Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route',
       'Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',
       'Additional_Info'],
      dtype='object')

In [15]:
duration_columns=X['Duration']
Journey_time_column=X['Date_of_Journey']
dep_time=X[['Dep_Time','Arrival_Time']]
categorical_columns=X[['Airline','Source','Destination','Route']]
drop_columns=X[['Duration','Date_of_Journey','Dep_Time','Arrival_Time','Source','Destination','Route']]
mapping_columns=X[['Total_Stops']]
numerical_columns=X.select_dtypes(exclude='object').columns

Index([], dtype='object')

In [22]:
#for duration pipeline

def trans_duration(X,column):
    X[column]=pd.to_datetime(X[column])
    X['Journey_day']=X[column].dt.day
    X['Journey_year']=X[column].dt.year
    X['Journey_month']=X[column].dt.month

    return X


In [24]:
##Date time Pipeline
def transform_date_time(X,column):

    duration_train=list(X[column])
    duration_hours=[]
    duration_mins=[]

    for i in range(len(duration_train)):
        if len(duration_train[i].split()) !=2:
            if "h" in duration_train[i]:
                duration_train[i]=duration_train[i].strip() + ' 0m'
            else:
                duration_train[i]='0h '+duration_train[i]
        
        hours=int(duration_train[i].split(sep='h')[0])
        mins=int(duration_train[i].split(sep='m')[0].splt()[-1])

        duration_hours.append(hours)
        duration_mins.append(mins)

        X['Duration_hrs']=duration_hours
        X['Duration_min']=duration_mins

        return X
        


In [25]:
#departure Pipeline
def transform_departure(X):
    X['Dep_hr']=pd.to_datetime(X['Dep_Time']).dt.hour
    X['Dep_min']=pd.to_datetime(X['Dep_Time']).dt.minute
    X['Arrival_hr']=pd.to_datetime(X['Arrival_Time']).dt.hour
    X['Arrival_min']=pd.to_datetime(X['Arrival_Time']).dt.minute

    return X


In [26]:
##Mapping Pipeline
def Map(X):
    X=X.replace({"non-stop":0,"1 stop":1,"2 stops":2,"3 stops":3,"4 stops":4})
    return X


In [28]:
def drop_columns(X,column):
    
    X=X.drop(columns=[column])

    return X

In [36]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer

Journey_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('Transforming_to_int',FunctionTransformer(trans_duration,kw_args={'column':'Duration'})),
        ('scaler',StandardScaler())
    ]
)

date_time_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('tansformin_to_int',FunctionTransformer(transform_date_time,kw_args={'column':'trans_duration'})),

        ('scaler',StandardScaler())

    ]
)

hour_minute_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('Transforming_to_int',FunctionTransformer(transform_departure)),
        ('scaler',StandardScaler())
    ]
)

categorical_pipeline=Pipeline(
    steps=(
        [('imputer',SimpleImputer(strategy='median')),
         ('one_hot',OneHotEncoder()),
         ('scaler',StandardScaler())]
    )
)

mapping_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('encoding',FunctionTransformer(Map))
    ]
)
    
numerical_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('scaler',StandardScaler())

    ]
    
)

drop_pipeline=Pipeline(
    steps=[
        ('drop',FunctionTransformer(drop_columns,kw_args={'column':'Duration'}))
    ]
)

preprocessor=ColumnTransformer([
    ('Journey_pipeline',Journey_pipeline,duration_columns),
    ('date_time_pipeline',date_time_pipeline,Journey_time_column),
    ('hour_minute_pipeline',hour_minute_pipeline,dep_time),
    ('categorical_pipeline',categorical_pipeline,categorical_columns),
    ('mapping pipeline',mapping_pipeline,mapping_columns),
    #('numerical_pipeline',numerical_pipeline,numerical_columns),
    ('drop_columns',drop_pipeline,['Duration', 'Date_of_Journey', 'Dep_Time', 'Arrival_Time', 'Source', 'Destination', 'Route'])
])

In [37]:
train=pd.read_excel(preprocessor.fit_transform(train),columns=preprocessor.get_feature_names_out())

ValueError: A given column is not a column of the dataframe