## 1. Import Libraries

In [1]:
!pip install feature_engine



In [48]:
import pandas as pd

import numpy as np

import sklearn

import feature_engine
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder,
    MinMaxScaler,
    PowerTransformer
)

from feature_engine.encoding import (
    RareLabelEncoder,
    MeanEncoder
)
from feature_engine.datetime import DatetimeFeatures


import warnings

## 2. Display Settings

In [3]:
pd.set_option("display.max_columns", None) #display all columns

In [4]:
sklearn.set_config(transform_output= "pandas") #sklearn transforms otherwise return np arrays

In [5]:
warnings.filterwarnings("ignore")

## 3. Read the Data

In [6]:
path=r"C:\Users\jinni\Desktop\flights-sagemaker-project\data\train.csv"

train= pd.read_csv(path)

In [7]:
train.head(5)

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-03-27,Delhi,Cochin,11:30:00,18:50:00,440,1.0,No Info,12242
1,Jet Airways,2019-06-12,Delhi,Cochin,02:15:00,19:00:00,1005,1.0,No Info,14714
2,Jet Airways,2019-05-18,Kolkata,Banglore,08:25:00,22:35:00,850,1.0,In-flight meal not included,10844
3,Jet Airways,2019-05-09,Kolkata,Banglore,06:30:00,16:20:00,590,1.0,In-flight meal not included,8586
4,Indigo,2019-03-27,Delhi,Cochin,06:40:00,16:10:00,570,1.0,No Info,6442


In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 640 entries, 0 to 639
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          640 non-null    object 
 1   date_of_journey  640 non-null    object 
 2   source           640 non-null    object 
 3   destination      640 non-null    object 
 4   dep_time         640 non-null    object 
 5   arrival_time     640 non-null    object 
 6   duration         640 non-null    int64  
 7   total_stops      640 non-null    float64
 8   additional_info  640 non-null    object 
 9   price            640 non-null    int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 50.1+ KB


In [9]:
X_train= train.drop(columns= "price")
y_train= train.price.copy()

## 4. Transformation Operations

In [10]:
X_train.columns.to_list(
)

['airline',
 'date_of_journey',
 'source',
 'destination',
 'dep_time',
 'arrival_time',
 'duration',
 'total_stops',
 'additional_info']

### 4.1 Airline

In [11]:
X_train.airline

0      Jet Airways
1      Jet Airways
2      Jet Airways
3      Jet Airways
4           Indigo
          ...     
635      Air India
636    Jet Airways
637      Air India
638       Air Asia
639         Indigo
Name: airline, Length: 640, dtype: object

In [12]:
X_train.airline.unique()

array(['Jet Airways', 'Indigo', 'Spicejet', 'Vistara', 'Air India',
       'Multiple Carriers', 'Air Asia', 'Goair', 'Trujet'], dtype=object)

### Note: Transformation operations for categorical column- Airline
1. Imputation
2. Group Rare Labels
3. One-hot encoding

In [15]:
air_transformer= Pipeline(steps=[ #transformation steps to be applied to this column- is a list of tuples
    ("imputer", SimpleImputer(strategy="most_frequent")), #missing values imputed
    ("grouper", RareLabelEncoder(tol=0.1, replace_with= "Other", n_categories=2)), #tolerance- group all categeories less than 10%
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown= "ignore"))
])

air_transformer.fit_transform(X_train.loc[:, ["airline"]])

#pipeline used to perform transfrmations one after the other on given input- airline

Unnamed: 0,airline_Air India,airline_Indigo,airline_Jet Airways,airline_Multiple Carriers,airline_Other
0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...
635,1.0,0.0,0.0,0.0,0.0
636,0.0,0.0,1.0,0.0,0.0
637,1.0,0.0,0.0,0.0,0.0
638,0.0,0.0,0.0,0.0,1.0


### 4.2 date_of_journey

In [25]:
X_train.date_of_journey

0      2019-03-27
1      2019-06-12
2      2019-05-18
3      2019-05-09
4      2019-03-27
          ...    
635    2019-05-06
636    2019-03-06
637    2019-05-18
638    2019-04-21
639    2019-03-03
Name: date_of_journey, Length: 640, dtype: object

### Note: Transforms on date-time column date_of_journey
1. Date-time feature extraction
2. Min max scaling- convert all values between 0 to 1.

In [29]:
#all are year 2019, extract other features of datetime
feature_to_extract= ["month", "week", "day_of_week", "day_of_year"]

doj_transformer= Pipeline(steps= [
    ("dt", DatetimeFeatures(features_to_extract= feature_to_extract, yearfirst=True, format="mixed" ))
])

doj_transformer.fit_transform(X_train.loc[:, ["date_of_journey"]])
#feature engine library works on pandas dataframe

Unnamed: 0,date_of_journey_month,date_of_journey_week,date_of_journey_day_of_week,date_of_journey_day_of_year
0,3,13,2,86
1,6,24,2,163
2,5,20,5,138
3,5,19,3,129
4,3,13,2,86
...,...,...,...,...
635,5,19,0,126
636,3,10,2,65
637,5,20,5,138
638,4,16,6,111


In [31]:
#these values go from single digit to three digit- not an issue with tree based model
# for the sake of completeness-

feature_to_extract= ["month", "week", "day_of_week", "day_of_year"]

doj_transformer= Pipeline(steps= [
    ("dt", DatetimeFeatures(features_to_extract= feature_to_extract, yearfirst=True, format="mixed" )),
    ("scaler", MinMaxScaler()) #all values now between 0-1
])

doj_transformer.fit_transform(X_train.loc[:, ["date_of_journey"]])

Unnamed: 0,date_of_journey_month,date_of_journey_week,date_of_journey_day_of_week,date_of_journey_day_of_year
0,0.000000,0.235294,0.333333,0.220339
1,1.000000,0.882353,0.333333,0.872881
2,0.666667,0.647059,0.833333,0.661017
3,0.666667,0.588235,0.500000,0.584746
4,0.000000,0.235294,0.333333,0.220339
...,...,...,...,...
635,0.666667,0.588235,0.000000,0.559322
636,0.000000,0.058824,0.333333,0.042373
637,0.666667,0.647059,0.833333,0.661017
638,0.333333,0.411765,1.000000,0.432203


### 4.3 source and destination
1. *group rare labels*
2. *mean encoding* - replaces the categories by the mean value of the `target` for each category. Eg. categorical var colour- RGB and mean of the numerical target for RBG is 0.5, 0.8, 0.1-> R replaced by 0.5, G by 0.8, B by 0.1
3. *power transformer* - for scaling on numerical variable, such that the numerical var becomes as symmetric as possible. Find lambda. offered by sklearn. Performs transformation plus has `standardize= True` which hence performs scaling.

In [36]:
X_train.source

0         Delhi
1         Delhi
2       Kolkata
3       Kolkata
4         Delhi
         ...   
635     Kolkata
636    Banglore
637     Kolkata
638     Kolkata
639       Delhi
Name: source, Length: 640, dtype: object

In [37]:
X_train.destination

0         Cochin
1         Cochin
2       Banglore
3       Banglore
4         Cochin
         ...    
635     Banglore
636    New Delhi
637     Banglore
638     Banglore
639       Cochin
Name: destination, Length: 640, dtype: object

In [39]:
location_subset= X_train.loc[:, ["source", "destination"]]
location_subset #columns we want to work with-same type

Unnamed: 0,source,destination
0,Delhi,Cochin
1,Delhi,Cochin
2,Kolkata,Banglore
3,Kolkata,Banglore
4,Delhi,Cochin
...,...,...
635,Kolkata,Banglore
636,Banglore,New Delhi
637,Kolkata,Banglore
638,Kolkata,Banglore


In [49]:
location_pipe1= Pipeline(steps=[
    ("grouper", RareLabelEncoder(tol=0.1, replace_with= "Other", n_categories=2)),
    ("encoder", MeanEncoder()) #needs target column as well
    ("scaler", PowerTransformer())
])

location_pipe1.fit_transform(location_subset, y_train)

TypeError: 'tuple' object is not callable

## 5. Column Transformer

In [35]:
# input is list of tuples 
# each tuple has 1. a given name "air", 2. transformers to be applied 3. input column




column_transformer= ColumnTransformer(transformers= [
    ("air", air_transformer, ["airline"])
    
])
column_transformer.fit_transform(X_train)




#gave full dataset but output is only airline transformations- due to columnTransformer


Unnamed: 0,air__airline_Air India,air__airline_Indigo,air__airline_Jet Airways,air__airline_Multiple Carriers,air__airline_Other
0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...
635,1.0,0.0,0.0,0.0,0.0
636,0.0,0.0,1.0,0.0,0.0
637,1.0,0.0,0.0,0.0,0.0
638,0.0,0.0,0.0,0.0,1.0


In [34]:
# to show other columns- remainder passthrough

column_transformer= ColumnTransformer(transformers= [
    ("air", air_transformer, ["airline"]),
     ("doj", doj_transformer, ["date_of_journey"])
    
], remainder= "passthrough")
                                      
column_transformer.fit_transform(X_train)

#gives 13 columns instead of only 5 of airline
# increased to 16 after doj (date of journey) transformers

Unnamed: 0,air__airline_Air India,air__airline_Indigo,air__airline_Jet Airways,air__airline_Multiple Carriers,air__airline_Other,doj__date_of_journey_month,doj__date_of_journey_week,doj__date_of_journey_day_of_week,doj__date_of_journey_day_of_year,remainder__source,remainder__destination,remainder__dep_time,remainder__arrival_time,remainder__duration,remainder__total_stops,remainder__additional_info
0,0.0,0.0,1.0,0.0,0.0,0.000000,0.235294,0.333333,0.220339,Delhi,Cochin,11:30:00,18:50:00,440,1.0,No Info
1,0.0,0.0,1.0,0.0,0.0,1.000000,0.882353,0.333333,0.872881,Delhi,Cochin,02:15:00,19:00:00,1005,1.0,No Info
2,0.0,0.0,1.0,0.0,0.0,0.666667,0.647059,0.833333,0.661017,Kolkata,Banglore,08:25:00,22:35:00,850,1.0,In-flight meal not included
3,0.0,0.0,1.0,0.0,0.0,0.666667,0.588235,0.500000,0.584746,Kolkata,Banglore,06:30:00,16:20:00,590,1.0,In-flight meal not included
4,0.0,1.0,0.0,0.0,0.0,0.000000,0.235294,0.333333,0.220339,Delhi,Cochin,06:40:00,16:10:00,570,1.0,No Info
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,1.0,0.0,0.0,0.0,0.0,0.666667,0.588235,0.000000,0.559322,Kolkata,Banglore,09:25:00,18:30:00,545,1.0,No Info
636,0.0,0.0,1.0,0.0,0.0,0.000000,0.058824,0.333333,0.042373,Banglore,New Delhi,14:05:00,09:30:00,1165,1.0,No Info
637,1.0,0.0,0.0,0.0,0.0,0.666667,0.647059,0.833333,0.661017,Kolkata,Banglore,09:50:00,23:15:00,805,2.0,No Info
638,0.0,0.0,0.0,0.0,1.0,0.333333,0.411765,1.000000,0.432203,Kolkata,Banglore,10:20:00,12:55:00,155,0.0,No Info


In [23]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 640 entries, 0 to 639
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          640 non-null    object 
 1   date_of_journey  640 non-null    object 
 2   source           640 non-null    object 
 3   destination      640 non-null    object 
 4   dep_time         640 non-null    object 
 5   arrival_time     640 non-null    object 
 6   duration         640 non-null    int64  
 7   total_stops      640 non-null    float64
 8   additional_info  640 non-null    object 
dtypes: float64(1), int64(1), object(7)
memory usage: 45.1+ KB
