##  1. Import Libraries

In [12]:
 ! pip install xgboost



In [13]:
 ! pip install sagemaker



In [14]:
import os

import pickle

import boto3

import xgboost as xgb

import numpy as np
import pandas as pd

import sklearn

from sklearn.metrics import r2_score

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
	OneHotEncoder,
	OrdinalEncoder,
	StandardScaler,
	MinMaxScaler,
	PowerTransformer,
	FunctionTransformer
)

from feature_engine.outliers import Winsorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.encoding import (
	RareLabelEncoder,
	MeanEncoder,
	CountFrequencyEncoder
)

import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    IntegerParameter,
    ContinuousParameter,
    HyperparameterTuner
)

import warnings

# needed from step 6 onward (repeats the SageMaker imports above)

import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    IntegerParameter,
    ContinuousParameter,
    HyperparameterTuner
)

## 2. Display Settings

In [15]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

In [16]:
# Ask scikit-learn transformers to return pandas DataFrames from transform();
# the custom functions below rely on receiving DataFrames (they use .columns/.assign).
sklearn.set_config(transform_output='pandas')

In [17]:
# Silence all Python warnings for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')

# 3. Read Datasets

In [18]:
# Load the training split and preview its first rows.
# NOTE(review): absolute, user-specific path — the notebook only runs on a machine with this layout.
train = pd.read_csv('/home/slumio/flight-price-prediction/data/train.csv')
train.head()

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-06-09,Kolkata,Banglore,,10:20:00,240,1.0,No Info,4226
1,Jet Airways,2019-05-09,Kolkata,Banglore,,,730,1.0,In-flight meal not included,9663
2,Indigo,2019-06-18,Delhi,Cochin,,08:50:00,195,0.0,No Info,5000
3,Jet Airways,2019-03-15,Banglore,New Delhi,,15:15:00,435,1.0,In-flight meal not included,9134
4,Vistara,2019-06-21,Delhi,Cochin,,09:10:00,190,0.0,No Info,6216


In [19]:
# Load the validation split and display it.
# NOTE(review): absolute, user-specific path — the notebook only runs on a machine with this layout.
val = pd.read_csv('/home/slumio/flight-price-prediction/data/valid.csv')

val

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-03-21,Banglore,New Delhi,,,840,1.0,In-flight meal not included,7832
1,Jet Airways,2019-06-27,Delhi,Cochin,,19:00:00,810,2.0,In-flight meal not included,11507
2,Indigo,2019-06-01,Mumbai,Hyderabad,,04:05:00,90,0.0,No Info,3175
3,Indigo,2019-04-15,Delhi,Cochin,,12:10:00,425,1.0,No Info,6287
4,Multiple Carriers,2019-05-21,Delhi,Cochin,,,625,2.0,No Info,16655
...,...,...,...,...,...,...,...,...,...,...
155,Jet Airways,2019-06-09,Delhi,Cochin,,,1425,1.0,In-flight meal not included,10577
156,Indigo,2019-06-21,Banglore,Delhi,,21:20:00,185,0.0,No Info,4990
157,Multiple Carriers,2019-05-09,Delhi,Cochin,,,625,2.0,No Info,18191
158,Jet Airways,2019-06-09,Kolkata,Banglore,,23:35:00,1025,1.0,No Info,14571


In [20]:
# Load the test split and display it.
# NOTE(review): absolute, user-specific path — the notebook only runs on a machine with this layout.
test = pd.read_csv('/home/slumio/flight-price-prediction/data/test.csv')
test

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Air India,2019-03-09,Delhi,Cochin,,,1275,2.0,No Info,14851
1,Multiple Carriers,2019-05-27,Delhi,Cochin,,,480,1.0,No Info,13377
2,Jet Airways,2019-05-21,Kolkata,Banglore,,17:35:00,435,1.0,No Info,8355
3,Jet Airways,2019-03-21,Mumbai,Hyderabad,,08:35:00,85,0.0,In-flight meal not included,4160
4,Air Asia,2019-05-27,Delhi,Cochin,,15:45:00,470,1.0,No Info,6451
...,...,...,...,...,...,...,...,...,...,...
195,Jet Airways,2019-06-27,Delhi,Cochin,,19:00:00,300,1.0,No Info,14714
196,Jet Airways,2019-05-09,Delhi,Cochin,,,1355,1.0,In-flight meal not included,12373
197,Jet Airways,2019-03-03,Delhi,Cochin,,18:50:00,535,1.0,No Info,18550
198,Indigo,2019-03-12,Chennai,Kolkata,,15:35:00,135,0.0,No Info,6297


# 4. Preprocessing Operations

In [21]:
# airline
# Steps, in order: fill missing airlines with the most frequent value, group
# infrequent airlines under the label 'Other' (RareLabelEncoder, tol=0.1),
# then one-hot encode; categories unseen at fit time are ignored.
air_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('grouper', RareLabelEncoder(tol=0.1, n_categories=2, replace_with='Other')),
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
     
])

# date of journey
# Calendar features requested from DatetimeFeatures for the journey date.
feature_to_extract = ['month', 'week', 'day_of_week', 'day_of_month', 'day_of_year']

# Extract the calendar features above, then rescale them with MinMaxScaler.
doj_transformer = Pipeline(steps=[
    ('dt', DatetimeFeatures(features_to_extract=feature_to_extract, yearfirst=True, format='mixed')),
    ('scaler', MinMaxScaler())
])

# source and destination
# Steps, in order: group infrequent cities under 'Other', encode each city with
# MeanEncoder (fitted against the target), then apply a PowerTransformer.
# The encoder step is named 'encoder' (it was misspelled 'encoer') so that it
# matches the step naming used by the other pipelines in this notebook.
location_pipe1 = Pipeline(steps=[
    ('grouper', RareLabelEncoder(tol=0.1, replace_with='Other', n_categories=2)),
    ('encoder', MeanEncoder()),
    ('scaler', PowerTransformer())
])

def is_north(X):
    """Replace every column of ``X`` with a 0/1 ``<col>_is_north`` flag.

    A value is flagged 1 when it is one of the cities listed in
    ``north_cities`` and 0 otherwise. The original columns are dropped, so the
    result contains only the flag columns, in the same order and on the same index.
    """
    north_cities = ['Delhi', 'New Delhi', 'Kolkata', 'Mumbai']
    columns = X.columns.to_list()

    flags = {}
    for col in columns:
        flags[f"{col}_is_north"] = X.loc[:, col].isin(north_cities).astype(int)

    return X.assign(**flags).drop(columns=columns)

# Combine the encoded city features (part1) with the is_north flags (part2).
location_transformer=FeatureUnion(transformer_list=[
    ('part1', location_pipe1),
    ('part2', FunctionTransformer(func=is_north))
])

# dep_time & arrival_time

# Extract hour and minute from each time column, then rescale with MinMaxScaler.
# NOTE(review): there is no imputer ahead of DatetimeFeatures, and the frames
# printed above show empty dep_time / arrival_time cells — presumably this is
# what triggers the NaN ValueError recorded when the preprocessor is fitted; confirm.
time_pipe1 = Pipeline(steps=[
    ('dt', DatetimeFeatures(features_to_extract=['hour','minute'])),
    ('scaler', MinMaxScaler())
])

def part_of_day(X, morning=4, noon=12, evening=16, night=20):
    """Replace every time column of ``X`` with a ``<col>_part_of_day`` label.

    Each column is parsed with ``pd.to_datetime`` and reduced to its hour.
    Hours in [morning, noon) become 'morning', [noon, evening) 'afternoon',
    [evening, night) 'evening'; every other value (including hours that fail
    all three tests) gets the default label 'night'. The original columns are
    dropped from the result.
    """
    columns = X.columns.to_list()

    # Overwrite each time column with its hour-of-day.
    hours = X.assign(**{
        col: pd.to_datetime(X.loc[:, col]).dt.hour for col in columns
    })

    labels = ['morning', 'afternoon', 'evening']
    parts = {}
    for col in columns:
        hour = hours.loc[:, col]
        conditions = [
            hour.between(morning, noon, inclusive='left'),
            hour.between(noon, evening, inclusive='left'),
            hour.between(evening, night, inclusive='left'),
        ]
        parts[f"{col}_part_of_day"] = np.select(conditions, labels, default='night')

    return hours.assign(**parts).drop(columns=columns)

# Label each time with its part of day, encode the labels with
# CountFrequencyEncoder, then rescale with MinMaxScaler.
time_pipe2 = Pipeline(steps=[
    ('part', FunctionTransformer(func=part_of_day)),
    ('encoder', CountFrequencyEncoder()),
    ('scaler' , MinMaxScaler())
])


# Combine the hour/minute features (part1) with the part-of-day features (part2).
time_transformer = FeatureUnion(transformer_list=[
    ('part1', time_pipe1),
    ('part2', time_pipe2)
])

# duration

def duration_category(X, short=180, med=400):
    """Replace ``duration`` with a categorical ``duration_cat`` column.

    Durations below ``short`` are labelled "short", durations in
    [short, med) are labelled "medium", and everything else "long".
    The ``duration`` column itself is dropped from the result.
    """
    duration = X.duration
    conditions = [
        duration.lt(short),
        duration.between(short, med, inclusive="left"),
    ]
    category = np.select(conditions, ["short", "medium"], default="long")
    return X.assign(duration_cat=category).drop(columns="duration")

class RBFPercentileSimilarity(BaseEstimator, TransformerMixin):
    """RBF-kernel similarity of each variable to its own fitted percentiles.

    For every selected column, ``fit`` stores the column's values at the
    requested percentiles; ``transform`` returns one column per
    (variable, percentile) pair, named ``<col>_rbf_<percentile*100>``, holding
    ``rbf_kernel(value, reference, gamma)``.

    Parameters
    ----------
    variables : list of str or None
        Columns to transform. When empty/None, all numeric columns seen during
        ``fit`` are used.
    percentiles : list of float
        Quantiles (in [0, 1]) used as reference points. The default list is
        only read, never mutated.
    gamma : float
        ``gamma`` passed to ``rbf_kernel``.

    Attributes
    ----------
    variables_ : list
        Columns actually used, resolved during ``fit``.
    reference_values_ : dict
        Maps each column to a (n_percentiles, 1) array of reference values.
    """

    def __init__(self, variables=None, percentiles=[0.25, 0.5, 0.75], gamma=0.1):
        self.variables = variables
        self.percentiles = percentiles
        self.gamma = gamma

    def fit(self, X, y=None):
        """Resolve the columns to use and store their percentile values."""
        # Keep the constructor parameter untouched (scikit-learn convention) and
        # store the resolved columns in a fitted attribute, so re-fitting or
        # cloning on a frame with different columns behaves correctly.
        if self.variables:
            self.variables_ = list(self.variables)
        else:
            self.variables_ = X.select_dtypes(include="number").columns.to_list()

        self.reference_values_ = {
            col: (
                X
                .loc[:, col]
                .quantile(self.percentiles)
                .values
                .reshape(-1, 1)
            )
            for col in self.variables_
        }

        return self

    def transform(self, X):
        """Return the RBF similarities as a DataFrame aligned with ``X``'s index."""
        objects = []
        for col in self.variables_:
            # round() rather than int(): int(0.29 * 100) truncates to 28.
            columns = [
                f"{col}_rbf_{int(round(percentile * 100))}"
                for percentile in self.percentiles
            ]
            obj = pd.DataFrame(
                data=rbf_kernel(X.loc[:, [col]], Y=self.reference_values_[col], gamma=self.gamma),
                columns=columns,
                # Keep X's index: a fresh RangeIndex would misalign the rows when
                # this output is concatenated with other transformers' frames.
                index=X.index
            )
            objects.append(obj)
        return pd.concat(objects, axis=1)
   

def is_over(X, value=1000):
    """Replace ``duration`` with a 0/1 flag column ``duration_over_<value>``.

    The flag is 1 where ``duration`` is greater than or equal to ``value``.
    """
    flag_name = f"duration_over_{value}"
    flag = X.duration.ge(value).astype(int)
    return X.assign(**{flag_name: flag}).drop(columns='duration')

# RBF similarity of duration to its percentiles, followed by a PowerTransformer.
duration_pipe1 = Pipeline(steps=[
	("rbf", RBFPercentileSimilarity()),
	("scaler", PowerTransformer())
])

# Bucket duration into short/medium/long and encode the buckets in that order.
duration_pipe2 = Pipeline(steps=[
	("cat", FunctionTransformer(func=duration_category)),
	("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]]))
])

# Four parallel views of duration: RBF similarities, ordinal bucket,
# is_over flag, and the standardized raw value.
duration_union = FeatureUnion(transformer_list=[
	("part1", duration_pipe1),
	("part2", duration_pipe2),
	("part3", FunctionTransformer(func=is_over)),
	("part4", StandardScaler())
])

# Cap outliers (Winsorizer, IQR method), impute missing values with the median,
# then build the feature union above.
duration_transformer = Pipeline(steps=[
	("outliers", Winsorizer(capping_method="iqr", fold=1.5)),
	("imputer", SimpleImputer(strategy="median")),
	("union", duration_union)
])


# total stops

def is_direct(X):
    """Return ``X`` with an extra 0/1 ``is_direct_flight`` column.

    The flag is 1 where ``total_stops`` equals 0; ``total_stops`` itself is kept.
    """
    direct_flag = X.total_stops.eq(0).astype(int)
    return X.assign(is_direct_flight=direct_flag)

# Fill missing stop counts with the most frequent value, then add the
# is_direct_flight flag. Note the second step's name is an empty string.
total_stops_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ("", FunctionTransformer(func=is_direct))
])

# additional information

# Group infrequent info values under 'Other', then one-hot encode;
# categories unseen at fit time are ignored.
info_pipe1 = Pipeline(steps=[
    ('grouper', RareLabelEncoder(tol=0.1, n_categories=2, replace_with='Other')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

def have_info(X):
    """Overwrite ``additional_info`` with a 0/1 flag.

    The flag is 1 for every value other than the literal 'No Info', 0 otherwise.
    """
    has_info = X.additional_info.ne('No Info').astype(int)
    return X.assign(additional_info=has_info)

# Combine the one-hot encoded info (part1) with the has-info flag (part2).
info_union = FeatureUnion(transformer_list=[
    ('part1', info_pipe1),
    ('part2', FunctionTransformer(func=have_info))
])

# Missing values are filled with the constant "unknown" before the union; since
# have_info only tests for inequality with 'No Info', those rows get flag 1.
info_transformer = Pipeline(steps=[
	("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
	("union", info_union)
])

# column transformer


# Route each raw column (or column group) to its dedicated transformer.
# Columns not listed here are passed through unchanged (remainder="passthrough").
column_transformer = ColumnTransformer(transformers=[
	("air", air_transformer, ["airline"]),
	("doj", doj_transformer, ["date_of_journey"]),
	("location", location_transformer, ["source", 'destination']),
	("time", time_transformer, ["dep_time", "arrival_time"]),
	("dur", duration_transformer, ["duration"]),
	("stops", total_stops_transformer, ["total_stops"]),
	("info", info_transformer, ["additional_info"])
], remainder="passthrough")

# feature selector

# Small random forest used by the selector to score features; seeded for reproducibility.
estimator = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)

# Feature selection by single-feature performance, scored with r2 and a 0.06 threshold.
selector = SelectBySingleFeaturePerformance(
    estimator = estimator,
    scoring = 'r2',
    threshold=0.06
)

# preprocessor

# Full preprocessing: per-column transformations followed by feature selection.
preprocessor = Pipeline(steps=[
    ('ct', column_transformer),
    ('selector', selector)
])

In [22]:
# Fit the preprocessing pipeline on the training features, with price as the target.
# NOTE(review): the recorded run of this cell failed with a ValueError about NaN
# values. The frames printed above show empty dep_time / arrival_time cells and
# the time pipelines have no imputer — presumably that is the source; confirm,
# then clean or impute those columns before fitting.
preprocessor.fit(train.drop(columns='price'), train.price.copy())

ValueError: Some of the variables in the dataset contain NaN. Check and remove those before using this transformer.

In [None]:
# Preview the transformed training features (requires the fit above to have succeeded).
preprocessor.transform(train.drop(columns='price'))

#  5. Preprocess Data and Upload to Bucket

In [None]:
# S3 bucket that receives the preprocessed data and the model output.
BUCKET_NAME = 'fligh-prices-bucket'
# Key prefix under which the preprocessed splits are stored in the bucket.
DATA_PREFIX = 'data'

In [None]:
def get_file_name(name):
    """Return the local file name of the preprocessed CSV for split ``name``."""
    file_name = f'{name}-pre.csv'
    return file_name

In [None]:
# The data is exported to a local CSV first and uploaded to the S3 bucket afterwards.
# SageMaker expects the target variable in the first column, followed by the inputs.

def export_data(data, name, pre):
    """Preprocess ``data`` and write it to the local CSV for split ``name``.

    Parameters
    ----------
    data : DataFrame containing a ``price`` column (the target) plus the raw features.
    name : split name; the output file name comes from ``get_file_name(name)``.
    pre  : fitted preprocessor exposing ``transform``.

    The file is written without index and without header, target column first.
    """
    # Separate the target from the features.
    target = data.price.copy()
    features = data.drop(columns='price')

    # Apply the fitted preprocessing.
    features_pre = pre.transform(features)

    # Target first, then the transformed features (joined on the index).
    combined = target.to_frame().join(features_pre)

    # index=False, header=False: the layout SageMaker expects for CSV input.
    combined.to_csv(get_file_name(name), index=False, header=False)

In [None]:
# boto3 is how Python talks to AWS services.

def upload_to_bucket(name):
    """Upload the local preprocessed CSV of split ``name`` to the S3 bucket.

    The local file is ``get_file_name(name)``; it is stored in ``BUCKET_NAME``
    under the key ``<DATA_PREFIX>/<name>/<name>.csv``.
    """
    file_name = get_file_name(name)

    # S3 keys always use '/' as separator. Build the key explicitly instead of
    # with os.path.join, which would insert '\\' on Windows and no longer match
    # the 's3://.../<DATA_PREFIX>/<name>' path used for the training channels.
    object_key = f'{DATA_PREFIX}/{name}/{name}.csv'

    (
        boto3
        .Session()  # uses the credentials/region of the current environment
        .resource('s3')
        .Bucket(BUCKET_NAME)
        .Object(object_key)
        .upload_file(file_name)
    )

In [None]:
def export_and_upload_to_bucket(data, name, pre):
    """Preprocess and export split ``name`` locally, then upload the file to S3."""
    export_data(data=data, name=name, pre=pre)
    upload_to_bucket(name=name)

In [None]:
# Export and upload the training split.
export_and_upload_to_bucket(train, 'train', preprocessor)

In [None]:
# Export and upload the test split.
export_and_upload_to_bucket(test, 'test', preprocessor)

In [None]:
# Export and upload the validation split.
export_and_upload_to_bucket(val , 'val' , preprocessor)

#  6. Model and Hyperparameter Tuning Set-up

In [None]:
# SageMaker session for the current environment, and the AWS region it is bound to.
session = sagemaker.Session()
region_name = session.boto_region_name

In [None]:
# S3 location where the trained model artifacts are saved.
output_path = f's3://{BUCKET_NAME}/model/output'

In [None]:
# Look up the XGBoost container image (version "1.2-1") for this region.
xgboost_container = sagemaker.image_uris.retrieve('xgboost', region_name, "1.2-1")
# Training job definition: one ml.m4.xlarge spot instance, artifacts written to output_path.
# NOTE(review): max_run / max_wait / volume_size units are not visible here —
# presumably seconds and GB per the SageMaker SDK; confirm the limits are sufficient.
model = Estimator(
    image_uri = xgboost_container,
    role = sagemaker.get_execution_role(),
    instance_count=1,
    instance_type='ml.m4.xlarge',
    volume_size=5,
    output_path = output_path,
    sagemaker_session=session,
    max_run = 300,
    max_wait= 600,
    use_spot_instances=True
)

In [None]:
# Static XGBoost hyperparameters for the training job. eta, alpha and max_depth
# are also listed in hyperparameter_ranges, where the tuner searches over them.
model.set_hyperparameters(
    # Squared-error regression. 'reg:linear' is the deprecated alias of this
    # objective in XGBoost; the current name avoids the deprecation path.
    objective='reg:squarederror',
    num_round=10,
    eta=0.1,
    max_depth=5,
    subsample=0.7,
    colsample_bytree=0.7,
    alpha=0.1
)

In [None]:
# Search space explored by the tuner: two continuous and one integer hyperparameter.
hyperparameter_ranges = {
    "eta": ContinuousParameter(0.05, 0.2),
    "alpha": ContinuousParameter(0, 1),
    "max_depth": IntegerParameter(3, 5)
}

In [None]:
# Bayesian hyperparameter search that minimizes RMSE on the validation channel.
# NOTE(review): max_jobs / max_parallel_jobs are not passed, so the SDK defaults
# apply — confirm they allow more than a single training job.
tuner = HyperparameterTuner(
    estimator=model,
    objective_metric_name="validation:rmse",
    hyperparameter_ranges=hyperparameter_ranges,
    strategy="Bayesian",
    objective_type="Minimize"
)

# 7. Data Channels

In [None]:
# A channel shows the model where to find its data.

In [None]:
def get_data_channel(name):
    """Return the CSV ``TrainingInput`` pointing at split ``name`` in the data bucket."""
    return TrainingInput(
        f's3://{BUCKET_NAME}/{DATA_PREFIX}/{name}',
        content_type='csv'
    )

In [None]:
# Input channel for the training split.
train_data_channel = get_data_channel('train')

In [None]:
# Input channel for the validation split.
val_data_channel = get_data_channel('val')

In [None]:
# Channel mapping handed to the tuner: 'train' and 'validation' inputs.
data_channel = {
    'train':train_data_channel,
    'validation':val_data_channel
}

# 8. Train and Tune the Model

In [None]:
# Launch the hyperparameter tuning job on the train/validation channels.
tuner.fit(data_channel)

In [None]:
# Retrieve the estimator of the best training job found by the tuner.
tuner.best_estimator()

In [None]:
# To deploy the model on SageMaker:
# tuner.best_estimator().deploy()

# 9. Model Evaluation

In [None]:
# Load the trained model from the local file 'xgboost-model'.
# NOTE(review): no cell in this notebook creates that file — presumably it is the
# model artifact downloaded and extracted from the S3 output path; confirm and
# document that step so the notebook can be re-run from a fresh kernel.
# SECURITY: pickle.load can execute arbitrary code; only load artifacts you trust.
with open('xgboost-model', 'rb') as f:
    best_model = pickle.load(f)
    
best_model

In [None]:
# The exported file has no header row (it was written with header=False), so
# read it with header=None; otherwise the first sample is taken as column names.
pd.read_csv('train-pre.csv', header=None)

In [None]:
def evaluate_model(name):
    """Return the R² score of ``best_model`` on the preprocessed split ``name``.

    Reads the local CSV produced for the split (target in the first column,
    features in the remaining columns), predicts with the module-level
    ``best_model`` and scores the predictions against the target.
    """
    file_name = get_file_name(name)
    # The file was written with header=False. Without header=None the first
    # sample would be consumed as column names and dropped from the evaluation.
    data = pd.read_csv(file_name, header=None)

    X = xgb.DMatrix(data.iloc[:, 1:])   # XGBoost's input container for the features
    y = data.iloc[:, 0].copy()          # first column holds the target

    pred = best_model.predict(X)

    return r2_score(y, pred)

In [None]:
# R² on the training split.
evaluate_model('train')

In [None]:
# R² on the validation split.
evaluate_model('val')

In [None]:
# R² on the held-out test split.
evaluate_model('test')

- In our case, we need SageMaker.