### 1. Import Libraries

In [125]:
import numpy as np

import pandas as pd

import sklearn

from sklearn.metrics.pairwise import cosine_similarity,rbf_kernel

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.ensemble import RandomForestRegressor

from sklearn.pipeline import Pipeline,FeatureUnion

from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer

from feature_engine.encoding import RareLabelEncoder, MeanEncoder,CountFrequencyEncoder

from feature_engine.selection import SelectBySingleFeaturePerformance

from feature_engine.datetime import DatetimeFeatures

from feature_engine.outliers import Winsorizer

from sklearn.preprocessing import (
    OneHotEncoder,
    MinMaxScaler,
    PowerTransformer,
    FunctionTransformer,
    StandardScaler,
    OrdinalEncoder
    
)

import warnings

In [15]:
!pip install feature_engine

Collecting feature_engine
  Downloading feature_engine-1.7.0-py2.py3-none-any.whl.metadata (9.8 kB)
Downloading feature_engine-1.7.0-py2.py3-none-any.whl (344 kB)
   ---------------------------------------- 0.0/344.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/344.3 kB ? eta -:--:--
   - -------------------------------------- 10.2/344.3 kB ? eta -:--:--
   --- ----------------------------------- 30.7/344.3 kB 325.1 kB/s eta 0:00:01
   ------ -------------------------------- 61.4/344.3 kB 544.7 kB/s eta 0:00:01
   ------------------ ------------------- 163.8/344.3 kB 984.6 kB/s eta 0:00:01
   ----------------------------- ---------- 256.0/344.3 kB 1.2 MB/s eta 0:00:01
   ---------------------------------------- 344.3/344.3 kB 1.4 MB/s eta 0:00:00
Installing collected packages: feature_engine
Successfully installed feature_engine-1.7.0


### 2. Display Settings

In [2]:
pd.set_option('display.max_columns',None)

In [3]:
sklearn.set_config(transform_output = 'pandas')

In [6]:
warnings.filterwarnings('ignore')

In [8]:
path = r'C:\Users\harsh\OneDrive\Desktop\Flight-Price-Prediction\data\train.csv'

train = pd.read_csv(path)

train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-06-12,Kolkata,Banglore,18:55:00,18:15:00,1400,1.0,In-flight meal not included,8529
1,Indigo,2019-03-18,Mumbai,Hyderabad,06:20:00,07:45:00,85,0.0,No info,3342
2,Jet Airways,2019-06-09,Kolkata,Banglore,09:35:00,22:05:00,750,1.0,In-flight meal not included,13994
3,Jet Airways,2019-06-06,Kolkata,Banglore,20:25:00,09:45:00,800,1.0,In-flight meal not included,10539
4,Indigo,2019-03-24,Banglore,New Delhi,08:30:00,11:30:00,180,0.0,No info,7303
...,...,...,...,...,...,...,...,...,...,...
635,Multiple Carriers,2019-03-06,Delhi,Cochin,11:40:00,19:15:00,455,1.0,No info,15353
636,Indigo,2019-03-03,Delhi,Cochin,10:45:00,01:35:00,890,1.0,No info,14871
637,Jet Airways,2019-06-15,Delhi,Cochin,19:15:00,19:00:00,1425,1.0,In-flight meal not included,10262
638,Air Asia,2019-04-21,Delhi,Cochin,07:55:00,13:25:00,330,1.0,No info,7452


In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 640 entries, 0 to 639
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          640 non-null    object 
 1   date_of_journey  640 non-null    object 
 2   source           640 non-null    object 
 3   destination      640 non-null    object 
 4   dep_time         640 non-null    object 
 5   arrival_time     640 non-null    object 
 6   duration         640 non-null    int64  
 7   total_stops      640 non-null    float64
 8   additional_info  640 non-null    object 
 9   price            640 non-null    int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 50.1+ KB


In [10]:
X_train = train.drop(columns = 'price')

Y_train = train.price

### 4. Transformation Operations

In [12]:
X_train.columns.to_list()

['airline',
 'date_of_journey',
 'source',
 'destination',
 'dep_time',
 'arrival_time',
 'duration',
 'total_stops',
 'additional_info']

### 4.1 Airline

In [13]:
X_train.airline

0            Jet Airways
1                 Indigo
2            Jet Airways
3            Jet Airways
4                 Indigo
             ...        
635    Multiple Carriers
636               Indigo
637          Jet Airways
638             Air Asia
639               Indigo
Name: airline, Length: 640, dtype: object

In [18]:
air_transformer = Pipeline(steps = [('imputer', SimpleImputer(strategy = 'most_frequent')),
                          ('grouper' , RareLabelEncoder(tol = 0.1, replace_with = 'Other',n_categories = 2)),
                           ('encoder', OneHotEncoder(sparse_output = False, handle_unknown = 'ignore'))
                          
                          
                                   ])


air_transformer.fit_transform(X_train.loc[:,['airline']])

Unnamed: 0,airline_Air India,airline_Indigo,airline_Jet Airways,airline_Multiple Carriers,airline_Other
0,0.0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...
635,0.0,0.0,0.0,1.0,0.0
636,0.0,1.0,0.0,0.0,0.0
637,0.0,0.0,1.0,0.0,0.0
638,0.0,0.0,0.0,0.0,1.0


In [32]:
feature_to_extract = ['month','week', 'day_of_week','day_of_year']


doj_transformer = Pipeline(steps = [('dt', DatetimeFeatures(features_to_extract = feature_to_extract,yearfirst = True, format = 'mixed')),
                                    ('scaler', MinMaxScaler())
                                   ])

doj_transformer.fit_transform(X_train.loc[:,['date_of_journey']])

Unnamed: 0,date_of_journey_month,date_of_journey_week,date_of_journey_day_of_week,date_of_journey_day_of_year
0,1.000000,0.882353,0.333333,0.872881
1,0.000000,0.176471,0.000000,0.144068
2,1.000000,0.823529,1.000000,0.847458
3,1.000000,0.823529,0.500000,0.822034
4,0.000000,0.176471,1.000000,0.194915
...,...,...,...,...
635,0.000000,0.058824,0.333333,0.042373
636,0.000000,0.000000,1.000000,0.016949
637,1.000000,0.882353,0.833333,0.898305
638,0.333333,0.411765,1.000000,0.432203


### 4.3 Source & Destination

In [34]:
X_train.source

0       Kolkata
1        Mumbai
2       Kolkata
3       Kolkata
4      Banglore
         ...   
635       Delhi
636       Delhi
637       Delhi
638       Delhi
639     Kolkata
Name: source, Length: 640, dtype: object

In [35]:
X_train.destination

0       Banglore
1      Hyderabad
2       Banglore
3       Banglore
4      New Delhi
         ...    
635       Cochin
636       Cochin
637       Cochin
638       Cochin
639     Banglore
Name: destination, Length: 640, dtype: object

In [38]:
location_subset = X_train.loc[:,['source','destination']]

location_subset

Unnamed: 0,source,destination
0,Kolkata,Banglore
1,Mumbai,Hyderabad
2,Kolkata,Banglore
3,Kolkata,Banglore
4,Banglore,New Delhi
...,...,...
635,Delhi,Cochin
636,Delhi,Cochin
637,Delhi,Cochin
638,Delhi,Cochin


In [43]:
location_pipe1 = Pipeline(steps = [('grouper',RareLabelEncoder(tol = 0.1,replace_with = 'Other',n_categories = 2)),
                                   ('encoder',MeanEncoder()),
                                   ('scaler', PowerTransformer())
                                  
                                  
                                  ])

location_pipe1.fit_transform(location_subset,Y_train)

Unnamed: 0,source,destination
0,0.009170,-0.020576
1,-1.645431,-1.073908
2,0.009170,-0.020576
3,0.009170,-0.020576
4,-1.202656,-1.073908
...,...,...
635,0.985309,0.985775
636,0.985309,0.985775
637,0.985309,0.985775
638,0.985309,0.985775


In [48]:
def is_north(X):
    columns = X.columns.to_list()
    north_cities = ['Delhi','Kolkata','Mumbai','New Delhi']
    return (
        X
        .assign(**{
            f"{col}_is_north":X.loc[:,col].isin(north_cities).astype(int)
            for col in columns
        }
            
        )
        .drop(columns = columns)
    )
FunctionTransformer(func = is_north).fit_transform(location_subset)

Unnamed: 0,source_is_north,destination_is_north
0,1,0
1,1,0
2,1,0
3,1,0
4,0,1
...,...,...
635,1,0
636,1,0
637,1,0
638,1,0


In [52]:
location_transformer = FeatureUnion(transformer_list = [
    ('part1',location_pipe1),
    ('part2', FunctionTransformer(func = is_north))
])

location_transformer.fit_transform(location_subset,Y_train)

Unnamed: 0,source,destination,source_is_north,destination_is_north
0,0.009170,-0.020576,1,0
1,-1.645431,-1.073908,1,0
2,0.009170,-0.020576,1,0
3,0.009170,-0.020576,1,0
4,-1.202656,-1.073908,0,1
...,...,...,...,...
635,0.985309,0.985775,1,0
636,0.985309,0.985775,1,0
637,0.985309,0.985775,1,0
638,0.985309,0.985775,1,0


### 4.4 Dep_time and Arrival_time

In [55]:
X_train.dep_time

0      18:55:00
1      06:20:00
2      09:35:00
3      20:25:00
4      08:30:00
         ...   
635    11:40:00
636    10:45:00
637    19:15:00
638    07:55:00
639    04:40:00
Name: dep_time, Length: 640, dtype: object

In [57]:
time_subset = X_train.loc[:,['dep_time','arrival_time']]
time_subset

Unnamed: 0,dep_time,arrival_time
0,18:55:00,18:15:00
1,06:20:00,07:45:00
2,09:35:00,22:05:00
3,20:25:00,09:45:00
4,08:30:00,11:30:00
...,...,...
635,11:40:00,19:15:00
636,10:45:00,01:35:00
637,19:15:00,19:00:00
638,07:55:00,13:25:00


In [59]:
time_pipe1 = Pipeline(steps = [
    ('dt',DatetimeFeatures(features_to_extract = ['hour','minute'])),
    ('scaler',MinMaxScaler())
])

time_pipe1.fit_transform(time_subset)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute
0,0.782609,1.000000,0.782609,0.272727
1,0.260870,0.363636,0.304348,0.818182
2,0.391304,0.636364,0.956522,0.090909
3,0.869565,0.454545,0.391304,0.818182
4,0.347826,0.545455,0.478261,0.545455
...,...,...,...,...
635,0.478261,0.727273,0.826087,0.272727
636,0.434783,0.818182,0.043478,0.636364
637,0.826087,0.272727,0.826087,0.000000
638,0.304348,1.000000,0.565217,0.454545


In [66]:
def part_of_day(X,morning = 4, noon=12, eve=16,night=20):
    columns = X.columns
    X_temp = X.assign(**{
        col : pd.to_datetime(X.loc[:,col]).dt.hour
        for col in columns
                            
    })

    return (
        X_temp
        .assign(**{
            f"{col}_part_of_day":np.select(
                [X_temp.loc[:,col].between(morning,noon,inclusive = 'left'),
                X_temp.loc[:,col].between(noon,eve,inclusive = 'left'),
                X_temp.loc[:,col].between(eve,night,inclusive = 'left')],
                ['morning','afternoon','night'],
                default = 'night'
            )
            for col in columns
        })
        .drop(columns = columns)
    )
FunctionTransformer(func = part_of_day).fit_transform(time_subset)

Unnamed: 0,dep_time_part_of_day,arrival_time_part_of_day
0,night,night
1,morning,morning
2,morning,night
3,night,morning
4,morning,morning
...,...,...
635,morning,night
636,morning,night
637,night,night
638,morning,afternoon


In [70]:
time_pipe2 = Pipeline(steps = [
    ('part',FunctionTransformer(func = part_of_day)),
    ('encoder',CountFrequencyEncoder()),
    ('scaler',MinMaxScaler())
])

time_pipe2.fit_transform(time_subset)

Unnamed: 0,dep_time_part_of_day,arrival_time_part_of_day
0,0.808081,1.000000
1,1.000000,0.289377
2,1.000000,1.000000
3,0.808081,0.289377
4,1.000000,0.289377
...,...,...
635,1.000000,1.000000
636,1.000000,1.000000
637,0.808081,1.000000
638,1.000000,0.000000


In [71]:
time_transformer = FeatureUnion(transformer_list = [
    ('part1',time_pipe1),
    ('part2',time_pipe2)
])

time_transformer.fit_transform(time_subset)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute,dep_time_part_of_day,arrival_time_part_of_day
0,0.782609,1.000000,0.782609,0.272727,0.808081,1.000000
1,0.260870,0.363636,0.304348,0.818182,1.000000,0.289377
2,0.391304,0.636364,0.956522,0.090909,1.000000,1.000000
3,0.869565,0.454545,0.391304,0.818182,0.808081,0.289377
4,0.347826,0.545455,0.478261,0.545455,1.000000,0.289377
...,...,...,...,...,...,...
635,0.478261,0.727273,0.826087,0.272727,1.000000,1.000000
636,0.434783,0.818182,0.043478,0.636364,1.000000,1.000000
637,0.826087,0.272727,0.826087,0.000000,0.808081,1.000000
638,0.304348,1.000000,0.565217,0.454545,1.000000,0.000000


In [87]:
(
    X_train
    .duration
    .quantile([0.25,0.5,0.75])
    .values
    .reshape(-1,1)
)

array([[175.  ],
       [552.5 ],
       [911.25]])

In [99]:


class RBFPercentileSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, variables=None, percentiles=[0.25, 0.5, 0.75], gamma=0.1):
        self.variables = variables
        self.percentiles = percentiles
        self.gamma = gamma

    def fit(self, X, Y=None):
        # If variables are not specified, select all numerical columns
        if not self.variables:
            self.variables = X.select_dtypes(include='number').columns.tolist()

        # Calculate reference values for percentiles
        self.reference_values_ = {
            col: (
                X
                .loc[:, col]
                .quantile(self.percentiles)
                .values
                .reshape(-1, 1)
            )
            for col in self.variables
        }

        return self

    def transform(self, X):
        objects = []

        # Iterate over specified variables
        for col in self.variables:
            # Generate column names based on percentiles
            columns = [f"{col}_rbf_{int(percentile * 100)}" for percentile in self.percentiles]

            # Calculate RBF kernel similarity between X and reference values
            obj = pd.DataFrame(
                data=rbf_kernel(X.loc[:, [col]], Y=self.reference_values_[col], gamma=self.gamma),
                columns=columns
            )
            objects.append(obj)

        # Concatenate the generated DataFrames horizontally
        return pd.concat(objects, axis=1)


In [100]:
RBFPercentileSimilarity().fit_transform(X_train.loc[:,['duration']])

Unnamed: 0,duration_rbf_25,duration_rbf_50,duration_rbf_75
0,0.000000e+00,0.0,0.000000e+00
1,0.000000e+00,0.0,0.000000e+00
2,0.000000e+00,0.0,0.000000e+00
3,0.000000e+00,0.0,0.000000e+00
4,8.208500e-02,0.0,0.000000e+00
...,...,...,...
635,0.000000e+00,0.0,0.000000e+00
636,0.000000e+00,0.0,2.448442e-20
637,0.000000e+00,0.0,0.000000e+00
638,0.000000e+00,0.0,0.000000e+00


In [111]:
def duration_category(X, short = 180, med = 400):
    return (
        X
        .assign(duration_cat = np.select([ X.duration.lt(short),
                                          X.duration.between(short,med,inclusive = 'left')],
                                         ['short','medium'],
                                         default = 'long'))
    .drop(columns = 'duration'))

In [112]:
def is_over(X,value = 1000):
    return (
        X
        .assign(**{
            f"duration_over_{value}":X.duration.ge(value).astype(int)
        }
            
        ).drop(columns = 'duration')
    )

In [114]:
duration_pipe1 = Pipeline(steps = [
    ('rbf',RBFPercentileSimilarity()),
    ('scaler',PowerTransformer())
])

duration_pipe2 = Pipeline(steps = [
    ('cat', FunctionTransformer(func = duration_category )),
    ('encoder',OrdinalEncoder(categories = [['short','medium','long']]))
])

duration_union = FeatureUnion(transformer_list = [
    ('part1', duration_pipe1),
    ('part2',duration_pipe2),
    ('part3',FunctionTransformer(func = is_over)),   
    ('part4',StandardScaler())
])

duration_transformer = Pipeline(steps = [
    ("outliers",Winsorizer(capping_method = 'iqr', fold = 1.5)),
    ('imputer', SimpleImputer(strategy = 'median')),
    ('union',duration_union)
])

duration_transformer.fit_transform(X_train.loc[:,['duration']])

Unnamed: 0,duration_rbf_25,duration_rbf_50,duration_rbf_75,duration_cat,duration_over_1000,duration
0,-0.352652,-0.088185,-0.088768,2.0,1,1.564661
1,-0.352652,-0.088185,-0.088768,0.0,0,-1.127081
2,-0.352652,-0.088185,-0.088768,2.0,0,0.234142
3,-0.352652,-0.088185,-0.088768,2.0,0,0.336490
4,2.687230,-0.088185,-0.088768,1.0,0,-0.932621
...,...,...,...,...,...,...
635,-0.352652,-0.088185,-0.088768,2.0,0,-0.369709
636,-0.352652,-0.088185,-0.088768,2.0,0,0.520715
637,-0.352652,-0.088185,-0.088768,2.0,1,1.615835
638,-0.352652,-0.088185,-0.088768,1.0,0,-0.625578


### 4.6 total_stops

In [115]:
X_train.total_stops

0      1.0
1      0.0
2      1.0
3      1.0
4      0.0
      ... 
635    1.0
636    1.0
637    1.0
638    1.0
639    0.0
Name: total_stops, Length: 640, dtype: float64

In [117]:
def is_direct(X):
    return X.assign(is_direct_flight = X.total_stops.eq(0).astype(int))


total_stops_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ("",FunctionTransformer(func = is_direct))
    
])

total_stops_transformer.fit_transform(X_train.loc[:,['total_stops']])

Unnamed: 0,total_stops,is_direct_flight
0,1.0,0
1,0.0,1
2,1.0,0
3,1.0,0
4,0.0,1
...,...,...
635,1.0,0
636,1.0,0
637,1.0,0
638,1.0,0


### 4.7 Additional_Info

In [118]:
X_train.additional_info

0      In-flight meal not included
1                          No info
2      In-flight meal not included
3      In-flight meal not included
4                          No info
                  ...             
635                        No info
636                        No info
637    In-flight meal not included
638                        No info
639                        No info
Name: additional_info, Length: 640, dtype: object

In [119]:
info_pipe1 = Pipeline(steps = [
    ('group',RareLabelEncoder(tol = 0.1,n_categories = 2, replace_with = 'Other')),
    ('encoder', OneHotEncoder(handle_unknown = 'ignore', sparse_output = False))
])

info_pipe1.fit_transform(X_train.loc[:,['additional_info']])

Unnamed: 0,additional_info_In-flight meal not included,additional_info_No info,additional_info_Other
0,1.0,0.0,0.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,1.0,0.0
...,...,...,...
635,0.0,1.0,0.0
636,0.0,1.0,0.0
637,1.0,0.0,0.0
638,0.0,1.0,0.0


In [120]:
def have_info(X):
    return X.assign(additional_info = X.additional_info.ne('No Info').astype(int))

In [121]:
info_union = FeatureUnion(transformer_list = [
    ('part1',info_pipe1),
    ('part2',FunctionTransformer(func = have_info))
])

In [122]:
info_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'constant', fill_value = 'unknown')),
    ('union',info_union)
])

info_transformer.fit_transform(X_train.loc[:,['additional_info']])

Unnamed: 0,additional_info_In-flight meal not included,additional_info_No info,additional_info_Other,additional_info
0,1.0,0.0,0.0,1
1,0.0,1.0,0.0,1
2,1.0,0.0,0.0,1
3,1.0,0.0,0.0,1
4,0.0,1.0,0.0,1
...,...,...,...,...
635,0.0,1.0,0.0,1
636,0.0,1.0,0.0,1
637,1.0,0.0,0.0,1
638,0.0,1.0,0.0,1


### 5. Column Transformer

In [123]:
column_transformer = ColumnTransformer(transformers = [
    ('air',air_transformer,['airline']),
    ('doj',doj_transformer,['date_of_journey']),
    ('location',location_transformer,['source','destination']),
    ('time',time_transformer,['dep_time','arrival_time']),
    ('dur',duration_transformer,['duration']),
    ('stops',total_stops_transformer,['total_stops']),
    ('info',info_transformer,['additional_info'])
    
],remainder = 'passthrough')

column_transformer.fit_transform(X_train,Y_train)

Unnamed: 0,air__airline_Air India,air__airline_Indigo,air__airline_Jet Airways,air__airline_Multiple Carriers,air__airline_Other,doj__date_of_journey_month,doj__date_of_journey_week,doj__date_of_journey_day_of_week,doj__date_of_journey_day_of_year,location__source,location__destination,location__source_is_north,location__destination_is_north,time__dep_time_hour,time__dep_time_minute,time__arrival_time_hour,time__arrival_time_minute,time__dep_time_part_of_day,time__arrival_time_part_of_day,dur__duration_rbf_25,dur__duration_rbf_50,dur__duration_rbf_75,dur__duration_cat,dur__duration_over_1000,dur__duration,stops__total_stops,stops__is_direct_flight,info__additional_info_In-flight meal not included,info__additional_info_No info,info__additional_info_Other,info__additional_info
0,0.0,0.0,1.0,0.0,0.0,1.000000,0.882353,0.333333,0.872881,0.009170,-0.020576,1,0,0.782609,1.000000,0.782609,0.272727,0.808081,1.000000,-0.352652,-0.088185,-0.088768,2.0,1,1.564661,1.0,0,1.0,0.0,0.0,1
1,0.0,1.0,0.0,0.0,0.0,0.000000,0.176471,0.000000,0.144068,-1.645431,-1.073908,1,0,0.260870,0.363636,0.304348,0.818182,1.000000,0.289377,-0.352652,-0.088185,-0.088768,0.0,0,-1.127081,0.0,1,0.0,1.0,0.0,1
2,0.0,0.0,1.0,0.0,0.0,1.000000,0.823529,1.000000,0.847458,0.009170,-0.020576,1,0,0.391304,0.636364,0.956522,0.090909,1.000000,1.000000,-0.352652,-0.088185,-0.088768,2.0,0,0.234142,1.0,0,1.0,0.0,0.0,1
3,0.0,0.0,1.0,0.0,0.0,1.000000,0.823529,0.500000,0.822034,0.009170,-0.020576,1,0,0.869565,0.454545,0.391304,0.818182,0.808081,0.289377,-0.352652,-0.088185,-0.088768,2.0,0,0.336490,1.0,0,1.0,0.0,0.0,1
4,0.0,1.0,0.0,0.0,0.0,0.000000,0.176471,1.000000,0.194915,-1.202656,-1.073908,0,1,0.347826,0.545455,0.478261,0.545455,1.000000,0.289377,2.687230,-0.088185,-0.088768,1.0,0,-0.932621,0.0,1,0.0,1.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,0.0,0.0,0.0,1.0,0.0,0.000000,0.058824,0.333333,0.042373,0.985309,0.985775,1,0,0.478261,0.727273,0.826087,0.272727,1.000000,1.000000,-0.352652,-0.088185,-0.088768,2.0,0,-0.369709,1.0,0,0.0,1.0,0.0,1
636,0.0,1.0,0.0,0.0,0.0,0.000000,0.000000,1.000000,0.016949,0.985309,0.985775,1,0,0.434783,0.818182,0.043478,0.636364,1.000000,1.000000,-0.352652,-0.088185,-0.088768,2.0,0,0.520715,1.0,0,0.0,1.0,0.0,1
637,0.0,0.0,1.0,0.0,0.0,1.000000,0.882353,0.833333,0.898305,0.985309,0.985775,1,0,0.826087,0.272727,0.826087,0.000000,0.808081,1.000000,-0.352652,-0.088185,-0.088768,2.0,1,1.615835,1.0,0,1.0,0.0,0.0,1
638,0.0,0.0,0.0,0.0,1.0,0.333333,0.411765,1.000000,0.432203,0.985309,0.985775,1,0,0.304348,1.000000,0.565217,0.454545,1.000000,0.000000,-0.352652,-0.088185,-0.088768,1.0,0,-0.625578,1.0,0,0.0,1.0,0.0,1


### 5. Feature Selection

In [128]:
estimator = RandomForestRegressor(n_estimators = 10, max_depth = 3, random_state = 42)


selector = SelectBySingleFeaturePerformance(
    estimator = estimator,
    scoring = 'r2',
    threshold = 0.1
)

### 6. Putting it all Together

In [129]:
preprocessor = Pipeline(steps = [
    ('ct', column_transformer),
    ('selector', selector)
])

preprocessor.fit_transform(X_train,Y_train)

Unnamed: 0,air__airline_Indigo,doj__date_of_journey_day_of_year,location__source,location__destination,dur__duration_cat,dur__duration,stops__total_stops,stops__is_direct_flight
0,0.0,0.872881,0.009170,-0.020576,2.0,1.564661,1.0,0
1,1.0,0.144068,-1.645431,-1.073908,0.0,-1.127081,0.0,1
2,0.0,0.847458,0.009170,-0.020576,2.0,0.234142,1.0,0
3,0.0,0.822034,0.009170,-0.020576,2.0,0.336490,1.0,0
4,1.0,0.194915,-1.202656,-1.073908,1.0,-0.932621,0.0,1
...,...,...,...,...,...,...,...,...
635,0.0,0.042373,0.985309,0.985775,2.0,-0.369709,1.0,0
636,1.0,0.016949,0.985309,0.985775,2.0,0.520715,1.0,0
637,0.0,0.898305,0.985309,0.985775,2.0,1.615835,1.0,0
638,0.0,0.432203,0.985309,0.985775,1.0,-0.625578,1.0,0


### 7. Visualizations

In [130]:
preprocessor