In [1]:
import pandas as pd 
import numpy as np 


In [2]:
# Load the raw flights table and preview its first ten rows.
flights = pd.read_csv(filepath_or_buffer='data/flights.csv')
flights.head(n=10)

Unnamed: 0,travelCode,userCode,from,to,flightType,price,time,distance,agency,date
0,0,0,Recife (PE),Florianopolis (SC),firstClass,1434.38,1.76,676.53,FlyingDrops,09/26/2019
1,0,0,Florianopolis (SC),Recife (PE),firstClass,1292.29,1.76,676.53,FlyingDrops,09/30/2019
2,1,0,Brasilia (DF),Florianopolis (SC),firstClass,1487.52,1.66,637.56,CloudFy,10/03/2019
3,1,0,Florianopolis (SC),Brasilia (DF),firstClass,1127.36,1.66,637.56,CloudFy,10/04/2019
4,2,0,Aracaju (SE),Salvador (BH),firstClass,1684.05,2.16,830.86,CloudFy,10/10/2019
5,2,0,Salvador (BH),Aracaju (SE),firstClass,1531.92,2.16,830.86,CloudFy,10/12/2019
6,3,0,Aracaju (SE),Campo Grande (MS),economic,743.54,1.69,650.1,Rainbow,10/17/2019
7,3,0,Campo Grande (MS),Aracaju (SE),economic,877.56,1.69,650.1,Rainbow,10/20/2019
8,4,0,Recife (PE),Florianopolis (SC),economic,803.39,1.76,676.53,Rainbow,10/24/2019
9,4,0,Florianopolis (SC),Recife (PE),economic,695.3,1.76,676.53,Rainbow,10/26/2019


In [3]:
# travelCode identifies one complete trip: in the preview above, the outbound
# and the return leg share the same code. It is an identifier rather than a
# predictive feature, so it is removed before building the model inputs.
#
# The frame is reassigned (no inplace=True) and errors='ignore' is used so that
# re-running this cell is a no-op instead of raising KeyError on the second run.
flights = flights.drop(columns=['travelCode'], errors='ignore')

In [9]:
# userCode is the identifier of the user who booked the flight; like
# travelCode it is not used as a model feature, so drop it.
#
# Reassigning with errors='ignore' keeps the cell idempotent: running it again
# after the column is already gone does nothing instead of raising KeyError.
flights = flights.drop(columns=['userCode'], errors='ignore')

In [10]:
# Preview the first five rows now that the identifier columns are gone.
flights.head(n=5)

Unnamed: 0,from,to,flightType,price,time,distance,agency,date
0,Recife (PE),Florianopolis (SC),firstClass,1434.38,1.76,676.53,FlyingDrops,09/26/2019
1,Florianopolis (SC),Recife (PE),firstClass,1292.29,1.76,676.53,FlyingDrops,09/30/2019
2,Brasilia (DF),Florianopolis (SC),firstClass,1487.52,1.66,637.56,CloudFy,10/03/2019
3,Florianopolis (SC),Brasilia (DF),firstClass,1127.36,1.66,637.56,CloudFy,10/04/2019
4,Aracaju (SE),Salvador (BH),firstClass,1684.05,2.16,830.86,CloudFy,10/10/2019


In [14]:
# One-hot encode the origin ("from") and destination ("to") columns.
from sklearn.preprocessing import OneHotEncoder

cols = ['from', 'to']

# sparse_output=False makes the encoder return a dense NumPy array.
from_to_encoder = OneHotEncoder(sparse_output=False)
from_to_encoder.fit(flights[cols])
encoded_arr = from_to_encoder.transform(flights[cols])

encoded_arr

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [18]:
# Categories learned by the encoder, one array per encoded column
# (first the "from" cities, then the "to" cities).
from_to_encoder.categories_

[array(['Aracaju (SE)', 'Brasilia (DF)', 'Campo Grande (MS)',
        'Florianopolis (SC)', 'Natal (RN)', 'Recife (PE)',
        'Rio de Janeiro (RJ)', 'Salvador (BH)', 'Sao Paulo (SP)'],
       dtype=object),
 array(['Aracaju (SE)', 'Brasilia (DF)', 'Campo Grande (MS)',
        'Florianopolis (SC)', 'Natal (RN)', 'Recife (PE)',
        'Rio de Janeiro (RJ)', 'Salvador (BH)', 'Sao Paulo (SP)'],
       dtype=object)]

In [15]:
# Dimensions (rows, one-hot columns) of the encoded from/to matrix.
np.shape(encoded_arr)

(271888, 18)

In [16]:
# Distinct flightType labels, in order of first appearance.
flights.loc[:, 'flightType'].unique()

array(['firstClass', 'economic', 'premium'], dtype=object)

In [20]:
# flightType has a natural ranking, so it is encoded ordinally.
# The explicit category list fixes the mapping by position:
# firstClass -> 0, premium -> 1, economic -> 2.
from sklearn.preprocessing import OrdinalEncoder

flightType_encoder = OrdinalEncoder(
    categories=[['firstClass', 'premium', 'economic']],
)
flightType_encoder.fit(flights[['flightType']])
encoded_flightType = flightType_encoder.transform(flights[['flightType']])

In [25]:
# Standardise the ordinal flightType codes with a dedicated scaler,
# which is kept so the same scaling can be re-applied later.
from sklearn.preprocessing import StandardScaler

flightType_scaler = StandardScaler().fit(encoded_flightType)
encoded_flightType_scaled = flightType_scaler.transform(encoded_flightType)


In [23]:
# The encoders write to separate arrays, so the frame itself is unchanged here.
flights.head(n=5)

Unnamed: 0,from,to,flightType,price,time,distance,agency,date
0,Recife (PE),Florianopolis (SC),firstClass,1434.38,1.76,676.53,FlyingDrops,09/26/2019
1,Florianopolis (SC),Recife (PE),firstClass,1292.29,1.76,676.53,FlyingDrops,09/30/2019
2,Brasilia (DF),Florianopolis (SC),firstClass,1487.52,1.66,637.56,CloudFy,10/03/2019
3,Florianopolis (SC),Brasilia (DF),firstClass,1127.36,1.66,637.56,CloudFy,10/04/2019
4,Aracaju (SE),Salvador (BH),firstClass,1684.05,2.16,830.86,CloudFy,10/10/2019


In [27]:
# Name of the target column; it is read later to build the y array.
target = 'price'

In [28]:
# Power-transform the two numeric features, flight time and distance,
# using PowerTransformer with its default settings.
from sklearn.preprocessing import PowerTransformer

time_dis_pt = PowerTransformer()
cols = ['time', 'distance']

time_distance_transformed = time_dis_pt.fit_transform(flights.loc[:, cols])
time_distance_transformed


array([[ 0.61272056,  0.61776595],
       [ 0.61272056,  0.61776595],
       [ 0.42214525,  0.42942277],
       ...,
       [ 0.76632167,  0.77676493],
       [-0.41456697, -0.3933792 ],
       [-0.41456697, -0.3933792 ]])

In [35]:
# One-hot encode the agency column (dense output).
# OneHotEncoder was imported in the from/to encoding cell.
agency_ohe = OneHotEncoder(sparse_output=False)
agency_ohe.fit(flights[['agency']])
agency_encoded = agency_ohe.transform(flights[['agency']])
agency_encoded

array([[0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [36]:
# Agency labels learned by the encoder; their order matches the
# one-hot output columns of agency_encoded.
agency_ohe.categories_

[array(['CloudFy', 'FlyingDrops', 'Rainbow'], dtype=object)]

In [39]:
# Parse the raw date strings into datetimes. The values are MM/DD/YYYY
# (e.g. 09/26/2019), so the format is passed explicitly instead of being
# inferred: parsing is unambiguous and a malformed value raises an error
# rather than being guessed.
flights['date'] = pd.to_datetime(flights['date'], format='%m/%d/%Y')

# Derive calendar features from the parsed date.
flights['year'] = flights['date'].dt.year
flights['month'] = flights['date'].dt.month
flights['day_of_week'] = flights['date'].dt.dayofweek  # Monday = 0, Sunday = 6
flights['day_of_month'] = flights['date'].dt.day

In [40]:
# Check the parsed date and the four derived calendar columns.
flights.head(n=5)

Unnamed: 0,from,to,flightType,price,time,distance,agency,date,year,month,day_of_week,day_of_month
0,Recife (PE),Florianopolis (SC),firstClass,1434.38,1.76,676.53,FlyingDrops,2019-09-26,2019,9,3,26
1,Florianopolis (SC),Recife (PE),firstClass,1292.29,1.76,676.53,FlyingDrops,2019-09-30,2019,9,0,30
2,Brasilia (DF),Florianopolis (SC),firstClass,1487.52,1.66,637.56,CloudFy,2019-10-03,2019,10,3,3
3,Florianopolis (SC),Brasilia (DF),firstClass,1127.36,1.66,637.56,CloudFy,2019-10-04,2019,10,4,4
4,Aracaju (SE),Salvador (BH),firstClass,1684.05,2.16,830.86,CloudFy,2019-10-10,2019,10,3,10


In [41]:
# Standardise the four calendar features with their own scaler.
# `cols` keeps its name because the artifact-saving cell stores it
# as the list of date columns.
cols = ['year', 'month', 'day_of_week', 'day_of_month']

date_scaler = StandardScaler().fit(flights[cols])
date_features_scaled = date_scaler.transform(flights[cols])
date_features_scaled

array([[-1.55368936,  0.66336123, -0.22562083,  1.15663377],
       [-1.55368936,  0.66336123, -2.03773229,  1.60979171],
       [-1.55368936,  0.94063034, -0.22562083, -1.44902438],
       ...,
       [-0.53344655,  0.10882299, -2.03773229,  0.47689686],
       [-0.53344655,  0.10882299, -0.22562083,  0.81676532],
       [-0.53344655,  0.10882299,  1.58649062,  1.15663377]])

In [42]:
# Stack every feature block side by side into one model-ready matrix.
# Column order: from/to one-hot, scaled flightType, transformed
# time/distance, agency one-hot, scaled date features.
final_matrix = np.hstack((
    encoded_arr,
    encoded_flightType_scaled,
    time_distance_transformed,
    agency_encoded,
    date_features_scaled,
))
final_matrix.shape

(271888, 28)

In [43]:
# Persist every fitted preprocessing object with pickle so the same
# transformations can be re-applied to new input later. Each entry is written
# to artifacts/<filename>.
import pickle
from pathlib import Path

# open(..., "wb") does not create missing directories, so make sure the output
# folder exists; otherwise a fresh checkout fails with FileNotFoundError.
artifacts_dir = Path("artifacts")
artifacts_dir.mkdir(parents=True, exist_ok=True)

objects_to_save = {
    "from_to_encoder.pkl" : from_to_encoder,
    "flightType_encoder.pkl": flightType_encoder,
    "flightType_scaler.pkl" : flightType_scaler,
    "time_dis_pt.pkl" : time_dis_pt,
    "agency_ohe.pkl" : agency_ohe,
    "date_scaler.pkl" : date_scaler ,
    # Names of the date features that must be derived from the user's date
    # and then scaled with date_scaler. They are read from the fitted scaler
    # rather than from the shared `cols` variable, which several cells
    # reassign: an out-of-order run could otherwise save the wrong list.
    "date_cols.pkl" : list(date_scaler.feature_names_in_)
}

for filename , obj in objects_to_save.items():
    with open(artifacts_dir / filename , "wb") as f:
        pickle.dump(obj , f)

print("All the encoders and scalers saved successfully")

All the encoders and scalers saved successfully


In [46]:
# Extract the target column as a NumPy array.
y = flights[target].to_numpy()

# Store the model-ready feature matrix (X) and the target array (y)
# together in a single .npz archive.
np.savez("data/flights_cleaned.npz", X=final_matrix, y=y)

Completed flights dataset preprocessing