# Import libraries

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from feature_engine.selection import SelectBySingleFeaturePerformance


from sklearn.preprocessing import(
    OneHotEncoder,
    MinMaxScaler,
    PowerTransformer,
    FunctionTransformer,
    StandardScaler,
    OrdinalEncoder
    
)
from feature_engine.outliers import Winsorizer
from feature_engine.encoding import(
    RareLabelEncoder,
    MeanEncoder,
    CountFrequencyEncoder,

)

from feature_engine.datetime import DatetimeFeatures
import warnings

# Display settings 

In [2]:
pd.set_option("display.max_columns",None)

In [3]:
sklearn.set_config(transform_output="pandas")

In [4]:
warnings.filterwarnings("ignore")

# Read the data 

In [5]:
path = r"C:\Users\Atul kishore\AWS Flight Price Prediction\data\train.csv"
train = pd.read_csv(path)
train


Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-06-12,Kolkata,Banglore,16:30:00,12:00:00,1170,1.0,In-flight meal not included,8529
1,Jet Airways,2019-05-18,Banglore,Delhi,11:10:00,14:05:00,175,0.0,In-flight meal not included,5198
2,Multiple Carriers,2019-06-12,Delhi,Cochin,08:45:00,19:00:00,615,1.0,No Info,11789
3,Multiple Carriers,2019-04-24,Delhi,Cochin,07:10:00,16:10:00,540,1.0,In-flight meal not included,6093
4,Jet Airways,2019-03-27,Delhi,Cochin,15:05:00,04:25:00,800,1.0,No Info,12242
...,...,...,...,...,...,...,...,...,...,...
1704,Jet Airways,2019-06-03,Delhi,Cochin,18:15:00,19:00:00,1485,1.0,In-flight meal not included,10262
1705,Multiple Carriers,2019-05-09,Delhi,Cochin,07:30:00,19:15:00,705,1.0,No Info,8266
1706,Jet Airways,2019-06-09,Delhi,Cochin,20:55:00,04:25:00,450,1.0,In-flight meal not included,10577
1707,Jet Airways,2019-06-12,Kolkata,Banglore,06:30:00,16:20:00,590,1.0,In-flight meal not included,9899


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1709 entries, 0 to 1708
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          1709 non-null   object 
 1   date_of_journey  1709 non-null   object 
 2   source           1709 non-null   object 
 3   destination      1709 non-null   object 
 4   dep_time         1709 non-null   object 
 5   arrival_time     1709 non-null   object 
 6   duration         1709 non-null   int64  
 7   total_stops      1709 non-null   float64
 8   additional_info  1709 non-null   object 
 9   price            1709 non-null   int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 133.6+ KB


In [7]:
X_train = train.drop(columns="price")
y_train = train.price.copy()

# transformation operations

In [8]:
X_train.columns.to_list()

['airline',
 'date_of_journey',
 'source',
 'destination',
 'dep_time',
 'arrival_time',
 'duration',
 'total_stops',
 'additional_info']

In [9]:
X_train.airline

0             Jet Airways
1             Jet Airways
2       Multiple Carriers
3       Multiple Carriers
4             Jet Airways
              ...        
1704          Jet Airways
1705    Multiple Carriers
1706          Jet Airways
1707          Jet Airways
1708               Indigo
Name: airline, Length: 1709, dtype: object

In [10]:
air_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("grouper", RareLabelEncoder(tol=0.1,replace_with="others",n_categories=2)),
    ("encoder", OneHotEncoder(sparse_output=False,handle_unknown="ignore"))
])
air_transformer.fit_transform(X_train.loc[:,['airline']])

Unnamed: 0,airline_Air India,airline_Indigo,airline_Jet Airways,airline_Multiple Carriers,airline_others
0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...
1704,0.0,0.0,1.0,0.0,0.0
1705,0.0,0.0,0.0,1.0,0.0
1706,0.0,0.0,1.0,0.0,0.0
1707,0.0,0.0,1.0,0.0,0.0


In [11]:
X_train.date_of_journey

0       2019-06-12
1       2019-05-18
2       2019-06-12
3       2019-04-24
4       2019-03-27
           ...    
1704    2019-06-03
1705    2019-05-09
1706    2019-06-09
1707    2019-06-12
1708    2019-05-15
Name: date_of_journey, Length: 1709, dtype: object

In [12]:
feature_to_extract = ["month","week","day_of_week","day_of_year"]
doj_transformer = Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract = feature_to_extract, yearfirst=True, format="mixed")),
    ("scaler",MinMaxScaler())
    
])

doj_transformer.fit_transform(X_train.loc[:,["date_of_journey"]])

Unnamed: 0,date_of_journey_month,date_of_journey_week,date_of_journey_day_of_week,date_of_journey_day_of_year
0,1.000000,0.882353,0.333333,0.872881
1,0.666667,0.647059,0.833333,0.661017
2,1.000000,0.882353,0.333333,0.872881
3,0.333333,0.470588,0.333333,0.457627
4,0.000000,0.235294,0.333333,0.220339
...,...,...,...,...
1704,1.000000,0.823529,0.000000,0.796610
1705,0.666667,0.588235,0.500000,0.584746
1706,1.000000,0.823529,1.000000,0.847458
1707,1.000000,0.882353,0.333333,0.872881


# source&destination

In [13]:
X_train.source

0        Kolkata
1       Banglore
2          Delhi
3          Delhi
4          Delhi
          ...   
1704       Delhi
1705       Delhi
1706       Delhi
1707     Kolkata
1708       Delhi
Name: source, Length: 1709, dtype: object

In [14]:
X_train.destination

0       Banglore
1          Delhi
2         Cochin
3         Cochin
4         Cochin
          ...   
1704      Cochin
1705      Cochin
1706      Cochin
1707    Banglore
1708      Cochin
Name: destination, Length: 1709, dtype: object

In [15]:
location_subset = X_train.loc[:,["source","destination"]]
location_subset

Unnamed: 0,source,destination
0,Kolkata,Banglore
1,Banglore,Delhi
2,Delhi,Cochin
3,Delhi,Cochin
4,Delhi,Cochin
...,...,...
1704,Delhi,Cochin
1705,Delhi,Cochin
1706,Delhi,Cochin
1707,Kolkata,Banglore


In [16]:
location_pipe1 = Pipeline(steps=[
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="other", n_categories=2)),
    ("encoder", MeanEncoder()),
    ("scaler", PowerTransformer())
])
location_pipe1.fit_transform(location_subset, y_train)

Unnamed: 0,source,destination
0,-0.186011,-0.152980
1,-0.999315,-1.812625
2,0.990136,0.992114
3,0.990136,0.992114
4,0.990136,0.992114
...,...,...
1704,0.990136,0.992114
1705,0.990136,0.992114
1706,0.990136,0.992114
1707,-0.186011,-0.152980


In [17]:
def is_north(X):
     columns = X.columns.to_list()
   
     north_cities= ['Delhi','Kolkata','Mumbai','New Delhi']
     return (
         X
         .assign(**{
             f"{col}_is_north": X.loc[:, col].isin(north_cities).astype(int)
             for col in columns
         })
         .drop(columns=columns)
     )
     
FunctionTransformer(func = is_north).fit_transform(location_subset)

Unnamed: 0,source_is_north,destination_is_north
0,1,0
1,0,1
2,1,0
3,1,0
4,1,0
...,...,...
1704,1,0
1705,1,0
1706,1,0
1707,1,0


In [18]:
location_transformer =  FeatureUnion(transformer_list=[
    ("part1",location_pipe1),
    ("part2",FunctionTransformer(func=is_north))
])
location_transformer.fit_transform(location_subset, y_train)

Unnamed: 0,source,destination,source_is_north,destination_is_north
0,-0.186011,-0.152980,1,0
1,-0.999315,-1.812625,0,1
2,0.990136,0.992114,1,0
3,0.990136,0.992114,1,0
4,0.990136,0.992114,1,0
...,...,...,...,...
1704,0.990136,0.992114,1,0
1705,0.990136,0.992114,1,0
1706,0.990136,0.992114,1,0
1707,-0.186011,-0.152980,1,0


# dep&arrival time

In [19]:
time_subset = X_train.loc[:,["dep_time","arrival_time"]]
time_subset

Unnamed: 0,dep_time,arrival_time
0,16:30:00,12:00:00
1,11:10:00,14:05:00
2,08:45:00,19:00:00
3,07:10:00,16:10:00
4,15:05:00,04:25:00
...,...,...
1704,18:15:00,19:00:00
1705,07:30:00,19:15:00
1706,20:55:00,04:25:00
1707,06:30:00,16:20:00


In [20]:
time_pipe1 = Pipeline(steps=[
    ("dt",DatetimeFeatures(features_to_extract=["hour","minute"])),
    ("scaler",MinMaxScaler())
])
time_pipe1.fit_transform(time_subset)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute
0,0.695652,0.545455,0.521739,0.000000
1,0.478261,0.181818,0.608696,0.090909
2,0.347826,0.818182,0.826087,0.000000
3,0.304348,0.181818,0.695652,0.181818
4,0.652174,0.090909,0.173913,0.454545
...,...,...,...,...
1704,0.782609,0.272727,0.826087,0.000000
1705,0.304348,0.545455,0.826087,0.272727
1706,0.869565,1.000000,0.173913,0.454545
1707,0.260870,0.545455,0.695652,0.363636


In [21]:
def part_of_day(X, morning=4, noon=12, eve=16, night=20):
    columns = X.columns.to_list()
    X_temp = X.assign(**{
        col: pd.to_datetime(X.loc[:, col]).dt.hour
        for col in columns
    })
    
    return (
        X_temp
        .assign(**{
            f"{col}_part_of_day": np.select(
                [X_temp.loc[:, col].between(morning, noon, inclusive="left"),
                 X_temp.loc[:, col].between(noon, eve, inclusive="left"),
                 X_temp.loc[:, col].between(eve, night, inclusive="left")],
                ["morning","afternoon","evening"],
                default = "night"
            )
            for col in columns
        
        })
        .drop(columns=columns)
    )
FunctionTransformer(func=part_of_day).fit_transform(time_subset)

Unnamed: 0,dep_time_part_of_day,arrival_time_part_of_day
0,evening,afternoon
1,morning,afternoon
2,morning,evening
3,morning,evening
4,afternoon,morning
...,...,...
1704,evening,evening
1705,morning,evening
1706,night,morning
1707,morning,evening


In [22]:
time_pipe2 = Pipeline(steps=[
    ("part", FunctionTransformer(func=part_of_day)),
    ("encoder", CountFrequencyEncoder()),
    ("scaler",MinMaxScaler())
])
time_pipe2.fit_transform(time_subset)

Unnamed: 0,dep_time_part_of_day,arrival_time_part_of_day
0,0.285464,0.000000
1,1.000000,0.000000
2,1.000000,0.590226
3,1.000000,0.590226
4,0.000000,0.924812
...,...,...
1704,0.285464,0.590226
1705,1.000000,0.590226
1706,0.201401,0.924812
1707,1.000000,0.590226


In [23]:
time_transformer = FeatureUnion(transformer_list=[
    ("part1",time_pipe1),
    ("part2",time_pipe2)
])
time_transformer.fit_transform(time_subset)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute,dep_time_part_of_day,arrival_time_part_of_day
0,0.695652,0.545455,0.521739,0.000000,0.285464,0.000000
1,0.478261,0.181818,0.608696,0.090909,1.000000,0.000000
2,0.347826,0.818182,0.826087,0.000000,1.000000,0.590226
3,0.304348,0.181818,0.695652,0.181818,1.000000,0.590226
4,0.652174,0.090909,0.173913,0.454545,0.000000,0.924812
...,...,...,...,...,...,...
1704,0.782609,0.272727,0.826087,0.000000,0.285464,0.590226
1705,0.304348,0.545455,0.826087,0.272727,1.000000,0.590226
1706,0.869565,1.000000,0.173913,0.454545,0.201401,0.924812
1707,0.260870,0.545455,0.695652,0.363636,1.000000,0.590226


# duration

In [24]:
X_train.duration

0       1170
1        175
2        615
3        540
4        800
        ... 
1704    1485
1705     705
1706     450
1707     590
1708     300
Name: duration, Length: 1709, dtype: int64

In [25]:
(
	X_train
	.duration
	.quantile([0.25, 0.5, 0.75])
	.values
	.reshape(-1, 1)
	# .shape
)

array([[175.],
       [500.],
       [950.]])

In [26]:
class RBFPercentileSimilarity(BaseEstimator, TransformerMixin):
	def __init__(self, variables=None, percentiles=[0.25, 0.5, 0.75], gamma=0.1):
		self.variables = variables
		self.percentiles = percentiles
		self.gamma = gamma


	def fit(self, X, y=None):
		if not self.variables:
			self.variables = X.select_dtypes(include="number").columns.to_list()

		self.reference_values_ = {
			col: (
				X
				.loc[:, col]
				.quantile(self.percentiles)
				.values
				.reshape(-1, 1)
			)
			for col in self.variables
		}

		return self


	def transform(self, X):
		objects = []
		for col in self.variables:
			columns = [f"{col}_rbf_{int(percentile * 100)}" for percentile in self.percentiles]
			obj = pd.DataFrame(
				data=rbf_kernel(X.loc[:, [col]], Y=self.reference_values_[col], gamma=self.gamma),
				columns=columns
			)
			objects.append(obj)
		return pd.concat(objects, axis=1)

In [27]:
RBFPercentileSimilarity(percentiles=[0.4, 0.8]).fit_transform(X_train)


Unnamed: 0,duration_rbf_40,duration_rbf_80,total_stops_rbf_40,total_stops_rbf_80
0,0.000000e+00,0.40657,1.000000,1.000000
1,0.000000e+00,0.00000,0.904837,0.904837
2,0.000000e+00,0.00000,1.000000,1.000000
3,0.000000e+00,0.00000,1.000000,1.000000
4,0.000000e+00,0.00000,1.000000,1.000000
...,...,...,...,...
1704,0.000000e+00,0.00000,1.000000,1.000000
1705,0.000000e+00,0.00000,1.000000,1.000000
1706,0.000000e+00,0.00000,1.000000,1.000000
1707,0.000000e+00,0.00000,1.000000,1.000000


In [28]:
def duration_category(X, short=180, med=400):
	return (
		X
		.assign(duration_cat=np.select([X.duration.lt(short),
									    X.duration.between(short, med, inclusive="left")],
									   ["short", "medium"],
									   default="long"))
		.drop(columns="duration")
	)

In [29]:
def is_over(X, value=1000):
	return (
		X
		.assign(**{
			f"duration_over_{value}": X.duration.ge(value).astype(int)
		})
		.drop(columns="duration")
	)

In [30]:
duration_pipe1 = Pipeline(steps=[
	("rbf", RBFPercentileSimilarity()),
	("scaler", PowerTransformer())
])

duration_pipe2 = Pipeline(steps=[
	("cat", FunctionTransformer(func=duration_category)),
	("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]]))
])

duration_union = FeatureUnion(transformer_list=[
	("part1", duration_pipe1),
	("part2", duration_pipe2),
	("part3", FunctionTransformer(func=is_over)),
	("part4", StandardScaler())
])

duration_transformer = Pipeline(steps=[
	("outliers", Winsorizer(capping_method="iqr", fold=1.5)),
	("imputer", SimpleImputer(strategy="median")),
	("union", duration_union)
])

duration_transformer.fit_transform(X_train.loc[:, ["duration"]])

Unnamed: 0,duration_rbf_25,duration_rbf_50,duration_rbf_75,duration_cat,duration_over_1000,duration
0,-0.335581,-0.101367,-0.055714,2.0,1,1.011302
1,3.159617,-0.101367,-0.055714,0.0,0,-0.916232
2,-0.335581,-0.101367,-0.055714,2.0,0,-0.063855
3,-0.335581,-0.101367,-0.055714,2.0,0,-0.209147
4,-0.335581,-0.101367,-0.055714,2.0,0,0.294531
...,...,...,...,...,...,...
1704,-0.335581,-0.101367,-0.055714,2.0,1,1.621526
1705,-0.335581,-0.101367,-0.055714,2.0,0,0.110495
1706,-0.335581,-0.101367,-0.055714,2.0,0,-0.383496
1707,-0.335581,-0.101367,-0.055714,2.0,0,-0.112286


# total_stops

In [31]:
X_train.total_stops

0       1.0
1       0.0
2       1.0
3       1.0
4       1.0
       ... 
1704    1.0
1705    1.0
1706    1.0
1707    1.0
1708    1.0
Name: total_stops, Length: 1709, dtype: float64

In [32]:
def is_direct(X):
    return X.assign(is_direct_flight=X.total_stops.eq(0).astype(int))

total_stops_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("", FunctionTransformer(func=is_direct))
])

total_stops_transformer.fit_transform(X_train.loc[:, ["total_stops"]])

Unnamed: 0,total_stops,is_direct_flight
0,1.0,0
1,0.0,1
2,1.0,0
3,1.0,0
4,1.0,0
...,...,...
1704,1.0,0
1705,1.0,0
1706,1.0,0
1707,1.0,0


# additional_info

In [33]:
X_train.additional_info

0       In-flight meal not included
1       In-flight meal not included
2                           No Info
3       In-flight meal not included
4                           No Info
                   ...             
1704    In-flight meal not included
1705                        No Info
1706    In-flight meal not included
1707    In-flight meal not included
1708                        No Info
Name: additional_info, Length: 1709, dtype: object

In [34]:
info_pipe1 = Pipeline(steps=[
    ("group", RareLabelEncoder(tol=0.1, n_categories=2, replace_with="other")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])
info_pipe1.fit_transform(X_train.loc[:,["additional_info"]])

Unnamed: 0,additional_info_In-flight meal not included,additional_info_No Info,additional_info_other
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,1.0,0.0,0.0
4,0.0,1.0,0.0
...,...,...,...
1704,1.0,0.0,0.0
1705,0.0,1.0,0.0
1706,1.0,0.0,0.0
1707,1.0,0.0,0.0


In [35]:
def have_info(X):
    return X.assign(additional_info=X.additional_info.ne("no info").astype(int))

In [36]:
info_union = FeatureUnion(transformer_list=[
    ("part1", info_pipe1),
    ("part2", FunctionTransformer(func=have_info))
])

In [37]:
info_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
    ("union", info_union)
    
])
info_transformer.fit_transform(X_train.loc[:,["additional_info"]])

Unnamed: 0,additional_info_In-flight meal not included,additional_info_No Info,additional_info_other,additional_info
0,1.0,0.0,0.0,1
1,1.0,0.0,0.0,1
2,0.0,1.0,0.0,1
3,1.0,0.0,0.0,1
4,0.0,1.0,0.0,1
...,...,...,...,...
1704,1.0,0.0,0.0,1
1705,0.0,1.0,0.0,1
1706,1.0,0.0,0.0,1
1707,1.0,0.0,0.0,1


# column transformer

In [38]:
column_transformer = ColumnTransformer(transformers=[
    ("air",air_transformer, ["airline"]),
    ("doj",doj_transformer,["date_of_journey"]),
    ("location",location_transformer,["source","destination"]),
    ("time",time_transformer,["dep_time","arrival_time"]),
    ("dur", duration_transformer, ["duration"]),
    ("stops", total_stops_transformer,["total_stops"]),
    ("info", info_transformer, ["additional_info"])
], remainder = "passthrough")

column_transformer.fit_transform(X_train,y_train)

Unnamed: 0,air__airline_Air India,air__airline_Indigo,air__airline_Jet Airways,air__airline_Multiple Carriers,air__airline_others,doj__date_of_journey_month,doj__date_of_journey_week,doj__date_of_journey_day_of_week,doj__date_of_journey_day_of_year,location__source,location__destination,location__source_is_north,location__destination_is_north,time__dep_time_hour,time__dep_time_minute,time__arrival_time_hour,time__arrival_time_minute,time__dep_time_part_of_day,time__arrival_time_part_of_day,dur__duration_rbf_25,dur__duration_rbf_50,dur__duration_rbf_75,dur__duration_cat,dur__duration_over_1000,dur__duration,stops__total_stops,stops__is_direct_flight,info__additional_info_In-flight meal not included,info__additional_info_No Info,info__additional_info_other,info__additional_info
0,0.0,0.0,1.0,0.0,0.0,1.000000,0.882353,0.333333,0.872881,-0.186011,-0.152980,1,0,0.695652,0.545455,0.521739,0.000000,0.285464,0.000000,-0.335581,-0.101367,-0.055714,2.0,1,1.011302,1.0,0,1.0,0.0,0.0,1
1,0.0,0.0,1.0,0.0,0.0,0.666667,0.647059,0.833333,0.661017,-0.999315,-1.812625,0,1,0.478261,0.181818,0.608696,0.090909,1.000000,0.000000,3.159617,-0.101367,-0.055714,0.0,0,-0.916232,0.0,1,1.0,0.0,0.0,1
2,0.0,0.0,0.0,1.0,0.0,1.000000,0.882353,0.333333,0.872881,0.990136,0.992114,1,0,0.347826,0.818182,0.826087,0.000000,1.000000,0.590226,-0.335581,-0.101367,-0.055714,2.0,0,-0.063855,1.0,0,0.0,1.0,0.0,1
3,0.0,0.0,0.0,1.0,0.0,0.333333,0.470588,0.333333,0.457627,0.990136,0.992114,1,0,0.304348,0.181818,0.695652,0.181818,1.000000,0.590226,-0.335581,-0.101367,-0.055714,2.0,0,-0.209147,1.0,0,1.0,0.0,0.0,1
4,0.0,0.0,1.0,0.0,0.0,0.000000,0.235294,0.333333,0.220339,0.990136,0.992114,1,0,0.652174,0.090909,0.173913,0.454545,0.000000,0.924812,-0.335581,-0.101367,-0.055714,2.0,0,0.294531,1.0,0,0.0,1.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1704,0.0,0.0,1.0,0.0,0.0,1.000000,0.823529,0.000000,0.796610,0.990136,0.992114,1,0,0.782609,0.272727,0.826087,0.000000,0.285464,0.590226,-0.335581,-0.101367,-0.055714,2.0,1,1.621526,1.0,0,1.0,0.0,0.0,1
1705,0.0,0.0,0.0,1.0,0.0,0.666667,0.588235,0.500000,0.584746,0.990136,0.992114,1,0,0.304348,0.545455,0.826087,0.272727,1.000000,0.590226,-0.335581,-0.101367,-0.055714,2.0,0,0.110495,1.0,0,0.0,1.0,0.0,1
1706,0.0,0.0,1.0,0.0,0.0,1.000000,0.823529,1.000000,0.847458,0.990136,0.992114,1,0,0.869565,1.000000,0.173913,0.454545,0.201401,0.924812,-0.335581,-0.101367,-0.055714,2.0,0,-0.383496,1.0,0,1.0,0.0,0.0,1
1707,0.0,0.0,1.0,0.0,0.0,1.000000,0.882353,0.333333,0.872881,-0.186011,-0.152980,1,0,0.260870,0.545455,0.695652,0.363636,1.000000,0.590226,-0.335581,-0.101367,-0.055714,2.0,0,-0.112286,1.0,0,1.0,0.0,0.0,1


# feature_selection

In [39]:
estimator = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)
selector = SelectBySingleFeaturePerformance(
    estimator = estimator,
    scoring = "r2",
    threshold = 0.1
)

# putting all together

In [40]:
preprocessor = Pipeline(steps=[
	("ct", column_transformer),
	("selector", selector)
])

preprocessor.fit_transform(X_train, y_train)

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,air__airline_others,doj__date_of_journey_week,doj__date_of_journey_day_of_year,location__source,location__destination,dur__duration_cat,dur__duration_over_1000,dur__duration,stops__total_stops,stops__is_direct_flight
0,0.0,1.0,0.0,0.882353,0.872881,-0.186011,-0.152980,2.0,1,1.011302,1.0,0
1,0.0,1.0,0.0,0.647059,0.661017,-0.999315,-1.812625,0.0,0,-0.916232,0.0,1
2,0.0,0.0,0.0,0.882353,0.872881,0.990136,0.992114,2.0,0,-0.063855,1.0,0
3,0.0,0.0,0.0,0.470588,0.457627,0.990136,0.992114,2.0,0,-0.209147,1.0,0
4,0.0,1.0,0.0,0.235294,0.220339,0.990136,0.992114,2.0,0,0.294531,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1704,0.0,1.0,0.0,0.823529,0.796610,0.990136,0.992114,2.0,1,1.621526,1.0,0
1705,0.0,0.0,0.0,0.588235,0.584746,0.990136,0.992114,2.0,0,0.110495,1.0,0
1706,0.0,1.0,0.0,0.823529,0.847458,0.990136,0.992114,2.0,0,-0.383496,1.0,0
1707,0.0,1.0,0.0,0.882353,0.872881,-0.186011,-0.152980,2.0,0,-0.112286,1.0,0


# viz

In [41]:
preprocessor

In [42]:
preprocessor.named_steps["selector"]

In [43]:
feature_performances = preprocessor.named_steps["selector"].feature_performance_
feature_performances

{'air__airline_Air India': 0.00387141461115866,
 'air__airline_Indigo': 0.1264636075214642,
 'air__airline_Jet Airways': 0.1370818659822268,
 'air__airline_Multiple Carriers': 0.030069666148454743,
 'air__airline_others': 0.10833533658964367,
 'doj__date_of_journey_month': 0.06566379379520822,
 'doj__date_of_journey_week': 0.13850405629277374,
 'doj__date_of_journey_day_of_week': 0.0001868848225368517,
 'doj__date_of_journey_day_of_year': 0.18522071329059644,
 'location__source': 0.14332025583589403,
 'location__destination': 0.11733143105009473,
 'location__source_is_north': 0.030415340650087847,
 'location__destination_is_north': 0.030415340650087847,
 'time__dep_time_hour': 0.0030841648420105727,
 'time__dep_time_minute': 0.02886722052028216,
 'time__arrival_time_hour': 0.07830852691079349,
 'time__arrival_time_minute': 0.04278838357774443,
 'time__dep_time_part_of_day': -0.0019238559347177036,
 'time__arrival_time_part_of_day': 0.03166053012014961,
 'dur__duration_rbf_25': 0.081377

In [44]:
sorted_feat_imp = dict(sorted(feature_performances.items(), key=lambda val: val[1]))
sorted_feat_imp

{'dur__duration_rbf_75': -0.004874338003034051,
 'info__additional_info_In-flight meal not included': -0.002942737924904071,
 'dur__duration_rbf_50': -0.002690783598547797,
 'info__additional_info': -0.0020045617329748446,
 'time__dep_time_part_of_day': -0.0019238559347177036,
 'info__additional_info_No Info': 0.00013879173274689016,
 'doj__date_of_journey_day_of_week': 0.0001868848225368517,
 'time__dep_time_hour': 0.0030841648420105727,
 'air__airline_Air India': 0.00387141461115866,
 'info__additional_info_other': 0.014567452228773573,
 'time__dep_time_minute': 0.02886722052028216,
 'air__airline_Multiple Carriers': 0.030069666148454743,
 'location__source_is_north': 0.030415340650087847,
 'location__destination_is_north': 0.030415340650087847,
 'time__arrival_time_part_of_day': 0.03166053012014961,
 'time__arrival_time_minute': 0.04278838357774443,
 'doj__date_of_journey_month': 0.06566379379520822,
 'time__arrival_time_hour': 0.07830852691079349,
 'dur__duration_rbf_25': 0.0813776