In [1]:
import numpy as np
import pandas as pd

import sklearn
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder,
    MinMaxScaler
)
from feature_engine.encoding import RareLabelEncoder
from feature_engine.datetime import DatetimeFeatures

import warnings

<h1>Display Settings</h1>

In [2]:
pd.set_option("display.max_columns", None)                  #forces pandas to display all columns

In [3]:
sklearn.set_config(transform_output="pandas")               #transformers give numpy arrays as output, this makes it so that it gives df as output

In [4]:
warnings.filterwarnings("ignore")                           #warnings not displayed

<h1>Loading the Data (Feature Engineering should only be done on training data)</h1>

In [5]:
file_path = r"D:\ML_Project\Data\train.csv"
train = pd.read_csv(file_path)

In [6]:
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Air India,2019-06-09,Kolkata,Banglore,16:50:00,05:35:00,765,2.0,No Info,11642
1,Goair,2019-06-15,Banglore,Delhi,20:55:00,23:40:00,165,0.0,No Info,3898
2,Multiple Carriers,2019-03-21,Delhi,Cochin,10:00:00,21:00:00,660,1.0,In-flight meal not included,8085
3,Jet Airways,2019-05-09,Delhi,Cochin,05:30:00,04:25:00,1375,2.0,In-flight meal not included,13029
4,Indigo,2019-04-18,Kolkata,Banglore,21:25:00,00:05:00,160,0.0,No Info,4174
...,...,...,...,...,...,...,...,...,...,...
635,Air India,2019-05-15,Delhi,Cochin,13:00:00,19:15:00,375,1.0,No Info,8372
636,Multiple Carriers,2019-06-06,Delhi,Cochin,08:00:00,19:15:00,675,1.0,No Info,14848
637,Multiple Carriers,2019-05-01,Delhi,Cochin,09:00:00,19:15:00,615,1.0,No Info,15373
638,Vistara,2019-06-06,Banglore,Delhi,19:30:00,22:15:00,165,0.0,No Info,5088


In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 640 entries, 0 to 639
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          640 non-null    object 
 1   date_of_journey  640 non-null    object 
 2   source           640 non-null    object 
 3   destination      640 non-null    object 
 4   dep_time         640 non-null    object 
 5   arrival_time     640 non-null    object 
 6   duration         640 non-null    int64  
 7   total_stops      639 non-null    float64
 8   additional_info  640 non-null    object 
 9   price            640 non-null    int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 50.1+ KB


In [8]:
X_train=train.drop(columns="price")
y_train=train.price.copy()

In [9]:
X_train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info
0,Air India,2019-06-09,Kolkata,Banglore,16:50:00,05:35:00,765,2.0,No Info
1,Goair,2019-06-15,Banglore,Delhi,20:55:00,23:40:00,165,0.0,No Info
2,Multiple Carriers,2019-03-21,Delhi,Cochin,10:00:00,21:00:00,660,1.0,In-flight meal not included
3,Jet Airways,2019-05-09,Delhi,Cochin,05:30:00,04:25:00,1375,2.0,In-flight meal not included
4,Indigo,2019-04-18,Kolkata,Banglore,21:25:00,00:05:00,160,0.0,No Info
...,...,...,...,...,...,...,...,...,...
635,Air India,2019-05-15,Delhi,Cochin,13:00:00,19:15:00,375,1.0,No Info
636,Multiple Carriers,2019-06-06,Delhi,Cochin,08:00:00,19:15:00,675,1.0,No Info
637,Multiple Carriers,2019-05-01,Delhi,Cochin,09:00:00,19:15:00,615,1.0,No Info
638,Vistara,2019-06-06,Banglore,Delhi,19:30:00,22:15:00,165,0.0,No Info


In [10]:
y_train

0      11642
1       3898
2       8085
3      13029
4       4174
       ...  
635     8372
636    14848
637    15373
638     5088
639    13344
Name: price, Length: 640, dtype: int64

<h1>Transformation Operations</h1>

<h3>Airline Column</h3>

In [11]:
X_train.columns.to_list()

['airline',
 'date_of_journey',
 'source',
 'destination',
 'dep_time',
 'arrival_time',
 'duration',
 'total_stops',
 'additional_info']

<p style="font-size:20px">Pieline performs all the operations on a given input</p>

In [12]:
air_transformer=Pipeline(steps=[
    ("imputer",SimpleImputer(strategy="most_frequent")),
    ("grouper",RareLabelEncoder(tol=0.1,replace_with="Other",n_categories=2)),  #all categories that occur less than 10% is grouped into one, min 2 cats
    ("encoder",OneHotEncoder(sparse_output=False,handle_unknown="ignore"))      #stored as sparse matrix by default #ignore makes new column unseen as 0  
])

In [13]:
air_transformer.fit_transform(X_train.loc[:,["airline"]])

Unnamed: 0,airline_Air India,airline_Indigo,airline_Jet Airways,airline_Multiple Carriers,airline_Other
0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...
635,1.0,0.0,0.0,0.0,0.0
636,0.0,0.0,0.0,1.0,0.0
637,0.0,0.0,0.0,1.0,0.0
638,0.0,0.0,0.0,0.0,1.0


In [14]:
X_train.loc[:,"airline"]                        #gives series

0              Air India
1                  Goair
2      Multiple Carriers
3            Jet Airways
4                 Indigo
             ...        
635            Air India
636    Multiple Carriers
637    Multiple Carriers
638              Vistara
639          Jet Airways
Name: airline, Length: 640, dtype: object

In [15]:
X_train.loc[:,["airline"]]                       #stays as df

Unnamed: 0,airline
0,Air India
1,Goair
2,Multiple Carriers
3,Jet Airways
4,Indigo
...,...
635,Air India
636,Multiple Carriers
637,Multiple Carriers
638,Vistara


<h3>Date of Journey</h3>

In [16]:
X_train.date_of_journey

0      2019-06-09
1      2019-06-15
2      2019-03-21
3      2019-05-09
4      2019-04-18
          ...    
635    2019-05-15
636    2019-06-06
637    2019-05-01
638    2019-06-06
639    2019-06-09
Name: date_of_journey, Length: 640, dtype: object

In [17]:
features_to_extract=["month","week","day_of_week","day_of_year"]

In [18]:
pd.to_datetime(X_train.date_of_journey).dt.year.unique()         #hence no need to extract year

array([2019], dtype=int32)

In [19]:
doj_transformer=Pipeline(steps=[
    ("dt",DatetimeFeatures(features_to_extract=features_to_extract,yearfirst=True,format="mixed")),
    ("scaler",MinMaxScaler())
]
)

In [20]:
doj_transformer.fit_transform(X_train.loc[:,["date_of_journey"]])

Unnamed: 0,date_of_journey_month,date_of_journey_week,date_of_journey_day_of_week,date_of_journey_day_of_year
0,1.000000,0.823529,1.000000,0.847458
1,1.000000,0.882353,0.833333,0.898305
2,0.000000,0.176471,0.500000,0.169492
3,0.666667,0.588235,0.500000,0.584746
4,0.333333,0.411765,0.500000,0.406780
...,...,...,...,...
635,0.666667,0.647059,0.333333,0.635593
636,1.000000,0.823529,0.500000,0.822034
637,0.666667,0.529412,0.333333,0.516949
638,1.000000,0.823529,0.500000,0.822034


<h3>Source & Destination</h3>

<h1>Column Transformer</h1>

In [21]:
column_transformer=ColumnTransformer(transformers=[
    ("air",air_transformer,["airline"]),         #transformer only looks at variable passed inside the tupple
    ("doj",doj_transformer,["date_of_journey"])
],remainder="passthrough")                      #remainder makes sure that the remaining columns are shown too

column_transformer.fit_transform(train)

Unnamed: 0,air__airline_Air India,air__airline_Indigo,air__airline_Jet Airways,air__airline_Multiple Carriers,air__airline_Other,doj__date_of_journey_month,doj__date_of_journey_week,doj__date_of_journey_day_of_week,doj__date_of_journey_day_of_year,remainder__source,remainder__destination,remainder__dep_time,remainder__arrival_time,remainder__duration,remainder__total_stops,remainder__additional_info,remainder__price
0,1.0,0.0,0.0,0.0,0.0,1.000000,0.823529,1.000000,0.847458,Kolkata,Banglore,16:50:00,05:35:00,765,2.0,No Info,11642
1,0.0,0.0,0.0,0.0,1.0,1.000000,0.882353,0.833333,0.898305,Banglore,Delhi,20:55:00,23:40:00,165,0.0,No Info,3898
2,0.0,0.0,0.0,1.0,0.0,0.000000,0.176471,0.500000,0.169492,Delhi,Cochin,10:00:00,21:00:00,660,1.0,In-flight meal not included,8085
3,0.0,0.0,1.0,0.0,0.0,0.666667,0.588235,0.500000,0.584746,Delhi,Cochin,05:30:00,04:25:00,1375,2.0,In-flight meal not included,13029
4,0.0,1.0,0.0,0.0,0.0,0.333333,0.411765,0.500000,0.406780,Kolkata,Banglore,21:25:00,00:05:00,160,0.0,No Info,4174
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,1.0,0.0,0.0,0.0,0.0,0.666667,0.647059,0.333333,0.635593,Delhi,Cochin,13:00:00,19:15:00,375,1.0,No Info,8372
636,0.0,0.0,0.0,1.0,0.0,1.000000,0.823529,0.500000,0.822034,Delhi,Cochin,08:00:00,19:15:00,675,1.0,No Info,14848
637,0.0,0.0,0.0,1.0,0.0,0.666667,0.529412,0.333333,0.516949,Delhi,Cochin,09:00:00,19:15:00,615,1.0,No Info,15373
638,0.0,0.0,0.0,0.0,1.0,1.000000,0.823529,0.500000,0.822034,Banglore,Delhi,19:30:00,22:15:00,165,0.0,No Info,5088
