## 1. Import Libraries

In [63]:
import os
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
	OneHotEncoder,
	OrdinalEncoder,
	StandardScaler,
	MinMaxScaler,
	PowerTransformer,
	FunctionTransformer
)

from feature_engine.outliers import Winsorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.encoding import (
	RareLabelEncoder,
	MeanEncoder,
	CountFrequencyEncoder
)

## 2. Display Settings

In [39]:
# Render every column of a DataFrame instead of eliding the middle of wide frames.
pd.options.display.max_columns = None

In [40]:
# Make scikit-learn transformers return pandas DataFrames (with column names)
# rather than bare NumPy arrays, so transformed output stays inspectable.
sklearn.set_config(transform_output="pandas")

In [41]:
warnings.filterwarnings("ignore")

## 3. Read Data

**Note:** feature engineering is performed on the training dataset, and the same
transformations are then applied to the test dataset.

In [42]:
# Directory holding the CSV files. Set the FLIGHT_DATA_DIR environment variable to
# point the notebook at the data on another machine; the original local location
# is kept as the default so existing runs behave exactly as before.
DATA_DIR = os.environ.get(
    "FLIGHT_DATA_DIR",
    r"C:/Users/Abhinay/Desktop/flight-prediction-sageMaker/data",
)

# `path` stays a plain string so any later cell that reuses it keeps working.
path = os.path.join(DATA_DIR, "train.csv")
train = pd.read_csv(path)
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Air India,2019-04-09,Delhi,Cochin,05:55:00,19:15:00,2240,2,No info,7711
1,Air India,2019-06-03,Delhi,Cochin,07:10:00,07:40:00,1470,2,No info,12698
2,Air India,2019-05-15,Kolkata,Banglore,16:45:00,21:05:00,1700,1,No info,7452
3,Vistara,2019-06-15,Chennai,Kolkata,07:05:00,09:20:00,135,0,No info,3687
4,Jet Airways,2019-04-09,Delhi,Cochin,23:05:00,19:00:00,1195,2,No info,9483
...,...,...,...,...,...,...,...,...,...,...
635,Multiple Carriers,2019-03-06,Delhi,Cochin,11:30:00,19:15:00,465,1,No info,15077
636,Indigo,2019-04-21,Delhi,Cochin,02:00:00,07:45:00,345,1,No info,6258
637,Indigo,2019-06-09,Kolkata,Banglore,22:15:00,00:50:00,155,0,No info,5224
638,Indigo,2019-05-18,Delhi,Cochin,14:20:00,22:30:00,490,1,No info,7640


In [43]:
# Print the frame summary: index range, per-column non-null counts, dtypes and memory use.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 640 entries, 0 to 639
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   airline          640 non-null    object
 1   date_of_journey  640 non-null    object
 2   source           640 non-null    object
 3   destination      640 non-null    object
 4   dep_time         640 non-null    object
 5   arrival_time     640 non-null    object
 6   duration         640 non-null    int64 
 7   total_stops      640 non-null    int64 
 8   additional_info  640 non-null    object
 9   price            640 non-null    int64 
dtypes: int64(3), object(7)
memory usage: 50.1+ KB


In [44]:
# Separate the predictors from the target: everything except `price` is a feature.
X_train = train.drop("price", axis="columns")
# Copy the target so it is an independent Series rather than a view onto `train`.
y_train = train["price"].copy()

## 4. Transformation operations

### 4.1 `airline`

In [45]:
# Peek at the raw airline labels before any transformation.
train["airline"]

0              Air India
1              Air India
2              Air India
3                Vistara
4            Jet Airways
             ...        
635    Multiple Carriers
636               Indigo
637               Indigo
638               Indigo
639            Air India
Name: airline, Length: 640, dtype: object

In [None]:
# Exploratory version of the airline pipeline: fill missing values with the most
# frequent airline, then collapse airlines whose share is below `tol` into "Other".
# One-hot encoding is deliberately left out here so the grouped category labels
# can be inspected directly.
airline_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("grouper", RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other")),
])

# Frequency of each airline label after rare-label grouping.
(
    airline_transformer
    .fit_transform(X_train[["airline"]])
    ["airline"]
    .value_counts()
)

airline
Jet Airways          214
Indigo               127
Air India            117
Other                114
Multiple Carriers     68
Name: count, dtype: int64

In [58]:
# Final airline pipeline: impute with the most frequent airline, group rare
# airlines into "Other", then one-hot encode the remaining categories.
# `handle_unknown="ignore"` lets categories unseen during fit encode as all zeros.
airline_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("grouper", RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other")),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

airline_transformer.fit_transform(X_train[["airline"]])

Unnamed: 0,airline_Air India,airline_Indigo,airline_Jet Airways,airline_Multiple Carriers,airline_Other
0,1.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...
635,0.0,0.0,0.0,1.0,0.0
636,0.0,1.0,0.0,0.0,0.0
637,0.0,1.0,0.0,0.0,0.0
638,0.0,1.0,0.0,0.0,0.0


### 4.2 `date_of_journey`


In [60]:
# Peek at the raw journey dates (stored as strings, dtype object).
train["date_of_journey"]

0      2019-04-09
1      2019-06-03
2      2019-05-15
3      2019-06-15
4      2019-04-09
          ...    
635    2019-03-06
636    2019-04-21
637    2019-06-09
638    2019-05-18
639    2019-03-24
Name: date_of_journey, Length: 640, dtype: object

In [64]:
# Calendar components to derive from the journey date.
feature_to_extract = ["month", "week", "day_of_week", "day_of_year"]

# Expand the date column into the components above, then min-max scale each one
# so all date features share a common range.
doj_transformer = Pipeline(
    steps=[
        (
            "dt",
            DatetimeFeatures(
                features_to_extract=feature_to_extract,
                yearfirst=True,
                format="mixed",
            ),
        ),
        ("scaler", MinMaxScaler()),
    ]
)

doj_transformer.fit_transform(X_train[["date_of_journey"]])

Unnamed: 0,date_of_journey_month,date_of_journey_week,date_of_journey_day_of_week,date_of_journey_day_of_year
0,0.333333,0.352941,0.166667,0.330508
1,1.000000,0.823529,0.000000,0.796610
2,0.666667,0.647059,0.333333,0.635593
3,1.000000,0.882353,0.833333,0.898305
4,0.333333,0.352941,0.166667,0.330508
...,...,...,...,...
635,0.000000,0.058824,0.333333,0.042373
636,0.333333,0.411765,1.000000,0.432203
637,1.000000,0.823529,1.000000,0.847458
638,0.666667,0.647059,0.833333,0.661017


### 4.3 `source` and `destination`

In [69]:
# Show the two location columns side by side (all rows, label-based selection).
train.loc[:, ['source', 'destination']]

Unnamed: 0,source,destination
0,Delhi,Cochin
1,Delhi,Cochin
2,Kolkata,Banglore
3,Chennai,Kolkata
4,Delhi,Cochin
...,...,...
635,Delhi,Cochin
636,Delhi,Cochin
637,Kolkata,Banglore
638,Delhi,Cochin


In [70]:
# Same two location columns selected with plain bracket indexing; the displayed
# result matches the `.loc[:, [...]]` form of this selection.
train[['source', 'destination']]

Unnamed: 0,source,destination
0,Delhi,Cochin
1,Delhi,Cochin
2,Kolkata,Banglore
3,Chennai,Kolkata
4,Delhi,Cochin
...,...,...
635,Delhi,Cochin
636,Delhi,Cochin
637,Kolkata,Banglore
638,Delhi,Cochin


In [None]:
# List all column names of the training frame (features plus the `price` target).
list(train.columns)

['airline',
 'date_of_journey',
 'source',
 'destination',
 'dep_time',
 'arrival_time',
 'duration',
 'total_stops',
 'additional_info',
 'price']

## 5. Column Transformer

In [65]:
# Route each engineered column through its dedicated pipeline. Columns that do
# not have a pipeline yet are passed through unchanged (prefixed `remainder__`
# in the output).
column_transformer = ColumnTransformer(
    [
        ("airline", airline_transformer, ["airline"]),
        ("date_of_journey", doj_transformer, ["date_of_journey"]),
    ],
    remainder="passthrough",
)

column_transformer.fit_transform(X_train)

Unnamed: 0,airline__airline_Air India,airline__airline_Indigo,airline__airline_Jet Airways,airline__airline_Multiple Carriers,airline__airline_Other,date_of_journey__date_of_journey_month,date_of_journey__date_of_journey_week,date_of_journey__date_of_journey_day_of_week,date_of_journey__date_of_journey_day_of_year,remainder__source,remainder__destination,remainder__dep_time,remainder__arrival_time,remainder__duration,remainder__total_stops,remainder__additional_info
0,1.0,0.0,0.0,0.0,0.0,0.333333,0.352941,0.166667,0.330508,Delhi,Cochin,05:55:00,19:15:00,2240,2,No info
1,1.0,0.0,0.0,0.0,0.0,1.000000,0.823529,0.000000,0.796610,Delhi,Cochin,07:10:00,07:40:00,1470,2,No info
2,1.0,0.0,0.0,0.0,0.0,0.666667,0.647059,0.333333,0.635593,Kolkata,Banglore,16:45:00,21:05:00,1700,1,No info
3,0.0,0.0,0.0,0.0,1.0,1.000000,0.882353,0.833333,0.898305,Chennai,Kolkata,07:05:00,09:20:00,135,0,No info
4,0.0,0.0,1.0,0.0,0.0,0.333333,0.352941,0.166667,0.330508,Delhi,Cochin,23:05:00,19:00:00,1195,2,No info
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,0.0,0.0,0.0,1.0,0.0,0.000000,0.058824,0.333333,0.042373,Delhi,Cochin,11:30:00,19:15:00,465,1,No info
636,0.0,1.0,0.0,0.0,0.0,0.333333,0.411765,1.000000,0.432203,Delhi,Cochin,02:00:00,07:45:00,345,1,No info
637,0.0,1.0,0.0,0.0,0.0,1.000000,0.823529,1.000000,0.847458,Kolkata,Banglore,22:15:00,00:50:00,155,0,No info
638,0.0,1.0,0.0,0.0,0.0,0.666667,0.647059,0.833333,0.661017,Delhi,Cochin,14:20:00,22:30:00,490,1,No info
