## 1. Import Libraries

In [1]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.0-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-2.1.0-py3-none-manylinux2014_x86_64.whl (4.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.1.0


In [2]:
!pip install feature-engine

Collecting feature-engine
  Downloading feature_engine-1.8.0-py2.py3-none-any.whl.metadata (9.8 kB)
Downloading feature_engine-1.8.0-py2.py3-none-any.whl (357 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m357.1/357.1 kB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: feature-engine
Successfully installed feature-engine-1.8.0


In [3]:
import os

import boto3

import pickle

import warnings

import numpy as np

import pandas as pd

import xgboost as xgb

import sklearn
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
	OneHotEncoder,
	OrdinalEncoder,
	StandardScaler,
	MinMaxScaler,
	PowerTransformer,
	FunctionTransformer
)

from feature_engine.outliers import Winsorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.encoding import (
	RareLabelEncoder,
	MeanEncoder,
	CountFrequencyEncoder
)

import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    IntegerParameter,
    ContinuousParameter,
    HyperparameterTuner
)

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


## 2. Display Settings

In [4]:
pd.set_option("display.max_columns", None)

In [5]:
sklearn.set_config(transform_output="pandas")

In [6]:
warnings.filterwarnings("ignore")

## 3. Read Datasets

In [7]:
train = pd.read_csv("train.csv")
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-03-27,Delhi,Cochin,11:30:00,18:50:00,440,1.0,No Info,12242
1,Jet Airways,2019-06-12,Delhi,Cochin,02:15:00,19:00:00,1005,1.0,No Info,14714
2,Jet Airways,2019-05-18,Kolkata,Banglore,08:25:00,22:35:00,850,1.0,In-flight meal not included,10844
3,Jet Airways,2019-05-09,Kolkata,Banglore,06:30:00,16:20:00,590,1.0,In-flight meal not included,8586
4,Indigo,2019-03-27,Delhi,Cochin,06:40:00,16:10:00,570,1.0,No Info,6442
...,...,...,...,...,...,...,...,...,...,...
635,Air India,2019-05-06,Kolkata,Banglore,09:25:00,18:30:00,545,1.0,No Info,8891
636,Jet Airways,2019-03-06,Banglore,New Delhi,14:05:00,09:30:00,1165,1.0,No Info,17261
637,Air India,2019-05-18,Kolkata,Banglore,09:50:00,23:15:00,805,2.0,No Info,15957
638,Air Asia,2019-04-21,Kolkata,Banglore,10:20:00,12:55:00,155,0.0,No Info,4409


In [8]:
val = pd.read_csv("val.csv")
val

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Vistara,2019-05-15,Kolkata,Banglore,07:10:00,22:40:00,930,1.0,No Info,8452
1,Jet Airways,2019-06-15,Delhi,Cochin,09:40:00,19:00:00,560,2.0,In-flight meal not included,10368
2,Jet Airways,2019-05-12,Kolkata,Banglore,06:30:00,12:00:00,330,1.0,No Info,13941
3,Vistara,2019-05-18,Chennai,Kolkata,17:45:00,20:05:00,140,0.0,No Info,11982
4,Jet Airways,2019-06-27,Delhi,Cochin,19:15:00,19:00:00,1425,1.0,In-flight meal not included,10262
...,...,...,...,...,...,...,...,...,...,...
155,Indigo,2019-04-01,Kolkata,Banglore,18:05:00,23:30:00,325,1.0,No Info,6284
156,Indigo,2019-05-18,Kolkata,Banglore,15:30:00,18:05:00,155,0.0,No Info,4804
157,Multiple Carriers,2019-03-03,Delhi,Cochin,09:45:00,16:10:00,385,1.0,No Info,11265
158,Jet Airways,2019-05-01,Kolkata,Banglore,16:30:00,23:35:00,425,1.0,No Info,14781


In [9]:
test = pd.read_csv("test.csv")
test

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Air India,2019-06-12,Kolkata,Banglore,14:15:00,16:45:00,150,0.0,No Info,5510
1,Multiple Carriers,2019-06-27,Delhi,Cochin,08:00:00,21:00:00,780,1.0,No Info,7408
2,Indigo,2019-05-24,Chennai,Kolkata,22:05:00,00:25:00,140,0.0,No Info,5277
3,Jet Airways,2019-05-06,Kolkata,Banglore,08:25:00,18:15:00,590,1.0,No Info,14781
4,Indigo,2019-06-03,Delhi,Cochin,10:35:00,01:30:00,895,1.0,No Info,5883
...,...,...,...,...,...,...,...,...,...,...
195,Spicejet,2019-03-06,Delhi,Cochin,15:55:00,21:55:00,360,1.0,No Info,9748
196,Air India,2019-05-24,Kolkata,Banglore,09:25:00,10:30:00,1505,2.0,No Info,12797
197,Indigo,2019-03-15,Chennai,Kolkata,14:40:00,17:05:00,145,0.0,No Info,6297
198,Jet Airways,2019-03-01,Banglore,New Delhi,08:55:00,21:20:00,745,1.0,1 Long layover,27992


## 4. Preprocessing Operations

In [10]:
# airline
air_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

#doj
feature_to_extract = ["month", "week", "day_of_week", "day_of_year"]

doj_transformer = Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract=feature_to_extract, yearfirst=True, format="mixed")),
    ("scaler", MinMaxScaler())
])

# source & destination
location_pipe1 = Pipeline(steps=[
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
    ("encoder", MeanEncoder()),
    ("scaler", PowerTransformer())
])

def is_north(X):
    columns = X.columns.to_list()
    north_cities = ["Delhi", "Kolkata", "Mumbai", "New Delhi"]
    return (
        X
        .assign(**{
            f"{col}_is_north": X.loc[:, col].isin(north_cities).astype(int)
            for col in columns
        })
        .drop(columns=columns)
    )

location_transformer = FeatureUnion(transformer_list=[
    ("part1", location_pipe1),
    ("part2", FunctionTransformer(func=is_north))
])

# dep_time & arrival_time
time_pipe1 = Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract=["hour", "minute"])),
    ("scaler", MinMaxScaler())
])

def part_of_day(X, morning=4, noon=12, eve=16, night=20):
    columns = X.columns.to_list()
    X_temp = X.assign(**{
        col: pd.to_datetime(X.loc[:, col]).dt.hour
        for col in columns
    })

    return (
        X_temp
        .assign(**{
            f"{col}_part_of_day": np.select(
                [X_temp.loc[:, col].between(morning, noon, inclusive="left"),
                 X_temp.loc[:, col].between(noon, eve, inclusive="left"),
                 X_temp.loc[:, col].between(eve, night, inclusive="left")],
                ["morning", "afternoon", "evening"],
                default="night"
            )
            for col in columns
        })
        .drop(columns=columns)
    )

time_pipe2 = Pipeline(steps=[
    ("part", FunctionTransformer(func=part_of_day)),
    ("encoder", CountFrequencyEncoder()),
    ("scaler", MinMaxScaler())
])

time_transformer = FeatureUnion(transformer_list=[
    ("part1", time_pipe1),
    ("part2", time_pipe2)
])

# duration
class RBFPercentileSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, variables=None, percentiles=[0.25, 0.5, 0.75], gamma=0.1):
        self.variables = variables
        self.percentiles = percentiles
        self.gamma = gamma


    def fit(self, X, y=None):
        if not self.variables:
            self.variables = X.select_dtypes(include="number").columns.to_list()

        self.reference_values_ = {
            col: (
                X
                .loc[:, col]
                .quantile(self.percentiles)
                .values
                .reshape(-1, 1)
            )
            for col in self.variables
        }

        return self


    def transform(self, X):
        objects = []
        for col in self.variables:
            columns = [f"{col}_rbf_{int(percentile * 100)}" for percentile in self.percentiles]
            obj = pd.DataFrame(
                data=rbf_kernel(X.loc[:, [col]], Y=self.reference_values_[col], gamma=self.gamma),
                columns=columns
            )
            objects.append(obj)
        return pd.concat(objects, axis=1)
    

def duration_category(X, short=180, med=400):
    return (
        X
        .assign(duration_cat=np.select([X.duration.lt(short),
                                    X.duration.between(short, med, inclusive="left")],
                                    ["short", "medium"],
                                    default="long"))
        .drop(columns="duration")
    )

def is_over(X, value=1000):
    return (
        X
        .assign(**{
            f"duration_over_{value}": X.duration.ge(value).astype(int)
        })
        .drop(columns="duration")
    )

duration_pipe1 = Pipeline(steps=[
    ("rbf", RBFPercentileSimilarity()),
    ("scaler", PowerTransformer())
])

duration_pipe2 = Pipeline(steps=[
    ("cat", FunctionTransformer(func=duration_category)),
    ("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]]))
])

duration_union = FeatureUnion(transformer_list=[
    ("part1", duration_pipe1),
    ("part2", duration_pipe2),
    ("part3", FunctionTransformer(func=is_over)),
    ("part4", StandardScaler())
])

duration_transformer = Pipeline(steps=[
    ("outliers", Winsorizer(capping_method="iqr", fold=1.5)),
    ("imputer", SimpleImputer(strategy="median")),
    ("union", duration_union)
])

# total_stops
def is_direct(X):
    return X.assign(is_direct_flight=X.total_stops.eq(0).astype(int))


total_stops_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("", FunctionTransformer(func=is_direct))
])

# additional_info
info_pipe1 = Pipeline(steps=[
    ("group", RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

def have_info(X):
    return X.assign(additional_info=X.additional_info.ne("No Info").astype(int))

info_union = FeatureUnion(transformer_list=[
("part1", info_pipe1),
("part2", FunctionTransformer(func=have_info))
])

info_transformer = Pipeline(steps=[
("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
("union", info_union)
])

# column transformer
column_transformer = ColumnTransformer(transformers=[
("air", air_transformer, ["airline"]),
("doj", doj_transformer, ["date_of_journey"]),
("location", location_transformer, ["source", 'destination']),
("time", time_transformer, ["dep_time", "arrival_time"]),
("dur", duration_transformer, ["duration"]),
("stops", total_stops_transformer, ["total_stops"]),
("info", info_transformer, ["additional_info"])
], remainder="passthrough")

# feature selector
estimator = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)

selector = SelectBySingleFeaturePerformance(
estimator=estimator,
scoring="r2",
threshold=0.1
) 

# preprocessor
preprocessor = Pipeline(steps=[
("ct", column_transformer),
("selector", selector)
])


In [11]:
preprocessor.fit(
    train.drop(columns="price"), # X component
    train.price.copy() #y component
)

In [12]:
preprocessor.transform(train.drop(columns="price"))

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,doj__date_of_journey_week,doj__date_of_journey_day_of_year,location__source,location__destination,dur__duration_cat,dur__duration_over_1000,dur__duration,stops__total_stops,stops__is_direct_flight
0,0.0,1.0,0.235294,0.220339,1.066560,0.511903,2.0,0,-0.326389,1.0,0
1,0.0,1.0,0.882353,0.872881,1.066560,0.511903,2.0,1,0.853332,1.0,0
2,0.0,1.0,0.647059,0.661017,-0.333656,-0.150936,2.0,0,0.529692,1.0,0
3,0.0,1.0,0.588235,0.584746,-0.333656,-0.150936,2.0,0,-0.013189,1.0,0
4,1.0,0.0,0.235294,0.220339,1.066560,0.511903,2.0,0,-0.054949,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...
635,0.0,0.0,0.588235,0.559322,-0.333656,-0.150936,2.0,0,-0.107149,1.0,0
636,0.0,1.0,0.058824,0.042373,-0.513470,1.781560,2.0,1,1.187412,1.0,0
637,0.0,0.0,0.647059,0.661017,-0.333656,-0.150936,2.0,0,0.435732,2.0,0
638,0.0,0.0,0.411765,0.432203,-0.333656,-0.150936,0.0,0,-0.921469,0.0,1


## 4. Preprocess data and Upload a bucket

In [19]:
BUCKET_NAME = "sagemaker-flightdata-bucket2" #first create bucket on S3

DATA_PREFIX = "data"

In [20]:
def get_file_name(name):
    return f"{name}-pre.csv"

In [21]:
def export_data(data, name, pre):
    # split data into X and y subsets
    X = data.drop(columns="price")
    y = data.price.copy()
    
    # transformation
    X_pre = pre.transform(X) #before exporting need to preprocess
    
    # exporting
    file_name = get_file_name(name) #eg train-pre
    (
        y
        .to_frame()
        .join(X_pre) #target var first in sagemaker
        .to_csv(file_name, index=False, header=False) #format for sagemaker
    )

In [22]:
# pushing data into s3 bucket
def upload_to_bucket(name):
    file_name = get_file_name(name)
    
    (
        boto3 #sdk for python interaction with aws
        .Session() #working in this session
        .resource("s3") #s3 bucket
        .Bucket(BUCKET_NAME)
        .Object(os.path.join(DATA_PREFIX, f"{name}/{name}.csv")) #where inside bucket user wants to store
        .upload_file(file_name) #file to be stored at location
    )

In [23]:
def export_and_upload_bucket(data, name, pre): #pre= preprocessor
    export_data(data, name, pre)
    upload_to_bucket(name)

In [24]:
export_and_upload_bucket(train, "train", preprocessor)
#train-pre.csv created in jupyter/ bucket
# this is the PREPROCESSED training data

In [25]:
export_and_upload_bucket(val, "val", preprocessor)

In [26]:
export_and_upload_bucket(test, "test", preprocessor)

## 5. Model and Hyperparameter Tuning Set-Up

In [27]:
session = sagemaker.Session() #current working session to be mentioned
region_name = session.boto_region_name #region name

In [28]:
output_path = f"s3://{BUCKET_NAME}/model/output" #to save trained model
# format to mention s3 bucket - s3://

In [29]:
model = Estimator(
    image_uri=sagemaker.image_uris.retrieve("xgboost", region_name, "1.2-1"),
    role=sagemaker.get_execution_role(), #iam role
    instance_count=1, #ec2 instantce
    instance_type="ml.m4.xlarge", #ec2 instance within notebook
    volume_size=5,#s3 size in gb 
    output_path=output_path, #save model here
    use_spot_instances=True, #this and following done to save costs
    max_run=300,
    max_wait=600,
    sagemaker_session=session #env we are working in
)

In [30]:
model.set_hyperparameters( #for xgboost
    objective="reg:linear", #MSE
    num_round=10, #base estimators
    eta=0.1, #learning rate - scaling o/p of each tree
    max_depth=5, #of tree
    subsample=0.8, #for each tree randomly sample 80% of the rows-not be trained on same dataset- prevent overfitting
    colsample_bytree=0.8, #80% columns as well
    alpha=0.1 #l2 regularisation
)

In [31]:
hyperparameter_ranges = { #dict 
    "eta": ContinuousParameter(0.05, 0.2), #learning rate- range given for continuous var
    "alpha": ContinuousParameter(0, 1), #l2 regualariser
    "max_depth": IntegerParameter(3, 5)# int- can be tuned to 3,4 or 5
}

In [32]:
tuner = HyperparameterTuner( #class called
    estimator=model,
    objective_metric_name="validation:rmse", #not test
    hyperparameter_ranges=hyperparameter_ranges,
    strategy="Bayesian", #randomly select hyperparameters and evaluate- will asign probability to each- iteration repeated with prob
    objective_type="Minimize"
)
#bayesian optimization

## 6. Data Channels
* show the train data to sagemaker algo
* show the model the path to s3 bucket (channel)

In [33]:
def get_data_channel(name):
    bucket_path = f"s3://{BUCKET_NAME}/{DATA_PREFIX}/{name}"
    return TrainingInput(bucket_path, content_type="csv") #return address

In [34]:
train_data_channel = get_data_channel("train")
train_data_channel

<sagemaker.inputs.TrainingInput at 0x7f461f7e0d90>

In [35]:
val_data_channel = get_data_channel("val")

In [36]:
data_channels = { #dict creation
    "train": train_data_channel,
    "validation": val_data_channel
}

## 7. Train and Tune the Model

In [37]:
tuner.fit(data_channels) #data channels instead of x_train and y_train

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


......................................!


## 8. Model Evaluation

In [41]:
with open("xgboost-model", "rb") as f: #download from bucket and upload on jupyter
    best_model = pickle.load(f)
    
best_model

<xgboost.core.Booster at 0x7f462619d0c0>

In [42]:
def evaluate_model(name):
    file_name = get_file_name(name)
    data = pd.read_csv(file_name)
    
    X = xgb.DMatrix(data.iloc[:, 1:]) #to get X
    y = data.iloc[:, 0].copy() #to get y -target var
    
    pred = best_model.predict(X)
    
    return r2_score(y, pred)

In [43]:
evaluate_model("train")

0.6491665840148926

In [44]:
evaluate_model("val")

0.5692120790481567

In [45]:
evaluate_model("test")

0.6287098526954651