## 1. Import Libraries

In [None]:
!pip install xgboost

In [None]:
!pip install feature-engine

In [None]:
import os
import pickle
import warnings
from pathlib import Path

import boto3
import matplotlib.pyplot as plt
import missingno as mn
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import xgboost as xgb
from scipy.stats import chi2_contingency
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import IsolationForest, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
    MinMaxScaler,
    FunctionTransformer,
    PowerTransformer,
    StandardScaler,
)
from feature_engine.datetime import DatetimeFeatures
from feature_engine.encoding import RareLabelEncoder
from feature_engine.outliers import Winsorizer
from feature_engine.selection import SelectBySingleFeaturePerformance

import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    IntegerParameter,
    ContinuousParameter,
    HyperparameterTuner,
)

'import sagemaker\nfrom sagemaker.estimator import Estimator\nfrom sagemaker.inputs import TrainingInput\nfrom sagemaker.tuner import (\n    IntegerParameter,\n    ContinuousParameter,\n    HyperparameterTuner\n)'

## 2. Display Settings

In [None]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)

In [None]:
# Ask scikit-learn transformers to return pandas DataFrames; the custom
# FunctionTransformer helpers below rely on DataFrame inputs (x.columns, x[col]).
sklearn.set_config(transform_output="pandas")

In [None]:
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

## 3. Read Datasets

In [None]:
# Load the training split from the working directory and display it.
train = pd.read_csv("train.csv")
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-06-21,Mumbai,Hyderabad,10:20:00,11:50:00,90,0.0,In-flight meal not included,4995
1,Air India,2019-05-18,Delhi,Cochin,09:00:00,07:40:00,1360,1.0,No Info,8372
2,Air India,2019-06-12,Kolkata,Banglore,09:10:00,11:05:00,1555,2.0,No Info,6117
3,Vistara,2019-04-01,Kolkata,Banglore,20:20:00,22:55:00,1595,1.0,No Info,7770
4,Vistara,2019-06-06,Kolkata,Banglore,17:00:00,10:45:00,1065,1.0,No Info,9187
...,...,...,...,...,...,...,...,...,...,...
635,Air Asia,2019-04-12,Banglore,Delhi,04:55:00,07:45:00,170,0.0,No Info,4282
636,Jet Airways,2019-05-09,Kolkata,Banglore,09:35:00,21:05:00,690,1.0,No Info,13067
637,Indigo,2019-05-15,Banglore,Delhi,06:05:00,08:50:00,165,0.0,No Info,4423
638,Multiple Carriers,2019-05-15,Delhi,Cochin,08:45:00,21:00:00,735,1.0,No Info,7670


In [None]:
# Load the validation split from the working directory and display it.
val = pd.read_csv("val.csv")
val

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Air India,2019-06-01,Delhi,Cochin,05:15:00,19:15:00,2280,3.0,No Info,10703
1,Air India,2019-03-03,Delhi,Cochin,20:00:00,19:15:00,1395,2.0,No Info,17266
2,Air India,2019-03-06,Mumbai,Hyderabad,05:55:00,22:00:00,965,1.0,No Info,10366
3,Goair,2019-06-09,Kolkata,Banglore,16:40:00,00:15:00,455,1.0,No Info,9495
4,Jet Airways,2019-06-03,Delhi,Cochin,20:55:00,19:00:00,1325,1.0,No Info,14714
...,...,...,...,...,...,...,...,...,...,...
155,Spicejet,2019-04-03,Delhi,Cochin,08:45:00,13:15:00,270,1.0,No check-in baggage included,4098
156,Spicejet,2019-03-01,Mumbai,Hyderabad,05:45:00,07:05:00,80,0.0,No Info,12475
157,Multiple Carriers,2019-05-18,Delhi,Cochin,03:50:00,12:35:00,525,1.0,No Info,10197
158,Jet Airways,2019-03-01,Banglore,New Delhi,22:50:00,00:45:00,1555,1.0,No Info,26890


In [None]:
# Load the test split from the working directory and display it.
test = pd.read_csv("test.csv")
test

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Multiple Carriers,2019-03-03,Delhi,Cochin,18:15:00,01:35:00,440,1.0,No Info,18916
1,Jet Airways,2019-06-01,Delhi,Cochin,20:55:00,12:35:00,940,1.0,In-flight meal not included,10262
2,Multiple Carriers,2019-06-06,Delhi,Cochin,07:05:00,16:10:00,545,1.0,No Info,9646
3,Air India,2019-03-24,Kolkata,Banglore,16:50:00,18:30:00,1540,1.0,No Info,14641
4,Multiple Carriers,2019-05-27,Delhi,Cochin,10:20:00,19:00:00,520,1.0,No Info,9794
...,...,...,...,...,...,...,...,...,...,...
195,Spicejet,2019-04-21,Kolkata,Banglore,06:55:00,09:30:00,155,0.0,No Info,4174
196,Multiple Carriers,2019-06-01,Delhi,Cochin,11:40:00,19:15:00,455,1.0,No Info,10261
197,Indigo,2019-03-21,Mumbai,Hyderabad,21:20:00,22:45:00,85,0.0,No Info,2227
198,Jet Airways,2019-06-03,Delhi,Cochin,16:00:00,12:35:00,1235,1.0,In-flight meal not included,10262


## 4. Preprocessing Operations

In [None]:
# airline: impute with the mode, group infrequent carriers under "Other",
# then one-hot encode the result.
airline_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        (
            "rarelabelencoder",
            RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other"),
        ),
        ("onehotencoder", OneHotEncoder(sparse_output=False)),
    ]
)

#doj
def missing_dt(x):
  """Fill missing date/time values and cast the result to ``datetime64[ns]``.

  Gaps are filled with the previous valid value (forward fill); a gap at the
  very start, which has no previous value, is filled from the next valid value
  (backward fill) so that it does not survive as ``NaT``.

  Parameters
  ----------
  x : pandas.DataFrame or pandas.Series
      Date or time values, typically strings.

  Returns
  -------
  Same type as ``x`` with dtype ``datetime64[ns]``.
  """
  # .ffill() replaces the deprecated fillna(method="ffill") spelling.
  return x.ffill().bfill().astype('datetime64[ns]')

# Calendar parts derived from date_of_journey, rescaled to [0, 1].
extract_features = ["month", "weekend", "day_of_month"]
doj_pipe = Pipeline(
    steps=[
        ("imputer", FunctionTransformer(func=missing_dt)),
        (
            "datetime",
            DatetimeFeatures(
                features_to_extract=extract_features,
                yearfirst=True,
                format="%Y-%m-%d",
            ),
        ),
        ("scaler", MinMaxScaler()),
    ]
)


# source & destination
#1st Transformation
# Group infrequent cities under "Other", then one-hot encode.
location_pipe1 = Pipeline(
    steps=[
        (
            "rarelabelencoder",
            RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other"),
        ),
        ("onehotencoder", OneHotEncoder(sparse_output=False)),
    ]
)

#2nd Transformation
def location_feature(x):
  """Replace each city column with a 0/1 ``<column>_is_north`` indicator.

  A value is flagged 1 when the city is one of Delhi, Kolkata or Mumbai.
  The original columns are dropped; the index is preserved.
  """
  north = ["Delhi", "Kolkata", "Mumbai"]
  columns = x.columns.tolist()

  flags = {}
  for col in columns:
    flags[f"{col}_is_north"] = x[col].isin(north).astype(int)

  return x.assign(**flags).drop(columns=columns)


# Two parallel views of source/destination: one-hot categories and the
# "is north" indicators.
location_union = FeatureUnion(
    transformer_list=[
        ("pipe1", location_pipe1),
        ("pipe2", FunctionTransformer(func=location_feature)),
    ]
)

# Impute first so both branches of the union receive complete data.
location_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("union", location_union),
    ]
)

# dep_time & arrival_time
# Hour and minute of dep_time / arrival_time, rescaled to [0, 1].
time_features = ["hour", "minute"]
time_pipe1 = Pipeline(
    steps=[
        (
            "datetime",
            DatetimeFeatures(
                features_to_extract=time_features,
                yearfirst=True,
                format='mixed',
            ),
        ),
        ("scaler", MinMaxScaler()),
    ]
)

def time_feature(x):
  """Replace each datetime column with a ``<column>_part_of_day`` label.

  Hours are bucketed with left-closed intervals: [4, 12) -> "Morning",
  [12, 16) -> "Afternoon", [16, 19) -> "Evening"; anything else -> "Night".
  The original columns are dropped.
  """
  columns = x.columns.tolist()
  labels = ["Morning", "Afternoon", "Evening"]

  part_of_day = {}
  for col in columns:
    hour = x[col].dt.hour
    conditions = [
        hour.between(4, 12, inclusive="left"),
        hour.between(12, 16, inclusive="left"),
        hour.between(16, 19, inclusive="left"),
    ]
    part_of_day[f"{col}_part_of_day"] = np.select(conditions, labels, default="Night")

  return x.assign(**part_of_day).drop(columns=columns)

# Part-of-day labels, one-hot encoded (unseen labels are ignored at transform).
time_pipe2 = Pipeline(
    steps=[
        ("feature", FunctionTransformer(func=time_feature)),
        (
            "onehotencoder",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
        ),
    ]
)

time_union = FeatureUnion(
    transformer_list=[
        ("pipe1", time_pipe1),
        ("pipe2", time_pipe2),
    ]
)

# Fill gaps and cast to datetime before either branch runs.
time_transformer = Pipeline(
    steps=[
        ("imputer", FunctionTransformer(func=missing_dt)),
        ("union", time_union),
    ]
)

# duration
def duration_feature1(x):
  """Replace each duration column with a ``<column>_range`` label.

  Buckets: below 300 -> "Short", [300, 800) -> "Medium", otherwise "Long".
  The "Short" bucket has no lower bound, so very short durations (below 75)
  are labelled "Short" rather than falling through to the "Long" default.
  Missing values match no condition and therefore receive the default "Long".

  Parameters
  ----------
  x : pandas.DataFrame
      Numeric duration columns.

  Returns
  -------
  pandas.DataFrame
      One ``<column>_range`` column per input column; originals are dropped.
  """
  columns=x.columns.tolist()
  return (
      x
      .assign(**{
          f"{col}_range":np.select(
              [x[col].lt(300),
               x[col].between(300,800,inclusive="left")],
              ["Short","Medium"],
              default="Long")
          for col in columns
      })
      .drop(columns=columns)
  )


# Duration bucket labels, encoded as ordinal integers.
duration_pipe1 = Pipeline(
    steps=[
        ("feature1", FunctionTransformer(func=duration_feature1)),
        ("ordinalencoder", OrdinalEncoder()),
    ]
)


def duration_feature2(df):
  """Build RBF-kernel similarity features against each column's quartiles.

  For every column, the 25th/50th/75th percentiles of that column are used as
  reference points and ``rbf_kernel`` is evaluated between each row value and
  each reference, giving columns ``<col>_25th``, ``<col>_50th``, ``<col>_75th``.

  NOTE(review): the percentiles are computed from the frame passed in, so they
  are recomputed on every call rather than learned once — confirm this is
  intended when the function is applied to validation/test data.
  """
  quantiles = [0.25, 0.5, 0.75]

  frames = []
  for name in df.columns.tolist():
    anchors = df[name].quantile(quantiles).values.reshape(-1, 1)
    labels = [f"{name}_{int(q*100)}th" for q in quantiles]
    similarity = rbf_kernel(X=df[[name]], Y=anchors)
    frames.append(pd.DataFrame(similarity, columns=labels, index=df.index))

  return pd.concat(frames, axis=1)

# Quartile-similarity features followed by a standardising power transform.
duration_pipe2 = Pipeline(
    steps=[
        ("feature2", FunctionTransformer(func=duration_feature2)),
        ("powertransformer", PowerTransformer(standardize=True)),
    ]
)

# Three parallel views of duration: bucket code, quartile similarities,
# and the standardised raw value.
duration_union = FeatureUnion(
    transformer_list=[
        ("pipe1", duration_pipe1),
        ("pipe2", duration_pipe2),
        ("scaler", StandardScaler()),
    ]
)


# Impute first, as every other transformer in this notebook does. With its
# default settings Winsorizer rejects input containing missing values, so an
# imputer placed after it could never handle a missing duration.
duration_transformer=Pipeline(steps=[
    ("imputer",SimpleImputer(strategy="median")),
    ("outlier",Winsorizer(capping_method="iqr",fold=1.5)),
    ("uniont",duration_union)

])

# total_stops
# Group infrequent stop counts under the label "3".
# NOTE(review): RareLabelEncoder is used with its default variable selection —
# presumably total_stops must be object/categorical for this step; confirm the
# column dtype that reaches it.
stops_pipe = Pipeline(
    steps=[
        (
            "rarelabelencoder",
            RareLabelEncoder(tol=0.1, replace_with="3", n_categories=2),
        ),
    ]
)

def stops_feature(x):
  """Replace the stops column(s) with a 0/1 ``is_direct_flight`` indicator.

  A flight is direct when it has zero stops. Values are converted with
  ``pd.to_numeric`` before the comparison so that numeric (``0.0``) and string
  (``"0"``) encodings are both recognised; values that cannot be parsed are
  treated as not direct.

  Parameters
  ----------
  x : pandas.DataFrame
      Frame holding the stop-count column(s).

  Returns
  -------
  pandas.DataFrame
      A single ``is_direct_flight`` column (if several input columns are
      given, the last one determines the value); input columns are dropped.
  """
  columns=x.columns.tolist()

  return(
    x
    .assign(**{
        "is_direct_flight":pd.to_numeric(x[col],errors="coerce").eq(0).astype(int)
        for col in columns
        })
    .drop(columns,axis=1)
  )


def num_datatype(x):
  """Return a copy of ``x`` with every column cast to ``int``."""
  as_int = {}
  for name in x.columns.tolist():
    as_int[name] = x[name].astype("int")

  return x.assign(**as_int)


# Rare-grouped stop count alongside the direct-flight indicator.
stops_union = FeatureUnion(
    transformer_list=[
        ("pipe1", stops_pipe),
        ("pipe2", FunctionTransformer(func=stops_feature)),
    ]
)

# Impute, build both features, then cast every resulting column to int.
stops_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("union", stops_union),
        ("dtype", FunctionTransformer(func=num_datatype)),
    ]
)

# additional_info
# Group infrequent additional_info values under "Other", then one-hot encode
# (labels unseen during fit are ignored at transform time).
info_pipe1 = Pipeline(
    steps=[
        (
            "rarelabelencoder",
            RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2),
        ),
        (
            "onehotencoder",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
        ),
    ]
)


def info_feature(x):
  """Replace the info column(s) with a 0/1 ``have_info`` indicator.

  ``have_info`` is 1 when the value is anything other than "No Info".
  If several columns are passed, the last one determines the value.
  The input columns are dropped.
  """
  columns = x.columns.tolist()

  flags = {}
  for col in columns:
    flags["have_info"] = (~x[col].isin(["No Info"])).astype(int)

  return x.assign(**flags).drop(columns, axis=1)


# One-hot categories alongside the "have_info" indicator.
info_union = FeatureUnion(
    transformer_list=[
        ("pipe1", info_pipe1),
        ("pipe2", FunctionTransformer(func=info_feature)),
    ]
)

# Impute first so both branches of the union receive complete data.
info_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("union", info_union),
    ]
)




## Feature Transformer

In [None]:
# Route each raw column (or pair of columns) to its dedicated transformer;
# columns not listed here are passed through unchanged.
Feature_Transformer = ColumnTransformer(
    transformers=[
        ("air", airline_pipe, ["airline"]),
        ("doj", doj_pipe, ["date_of_journey"]),
        ("loc", location_transformer, ["source", "destination"]),
        ("time", time_transformer, ["dep_time", "arrival_time"]),
        ("dur", duration_transformer, ["duration"]),
        ("stops", stops_transformer, ["total_stops"]),
        ("info", info_transformer, ["additional_info"]),
    ],
    remainder="passthrough",
)


## Pipeline

In [None]:
# Feature engineering followed by univariate feature selection: each feature
# is scored on its own with a small random forest (r2) against threshold 0.01.
preprocessor = Pipeline(
    steps=[
        ("transformer", Feature_Transformer),
        (
            "selector",
            SelectBySingleFeaturePerformance(
                estimator=RandomForestRegressor(
                    n_estimators=3, max_depth=3, random_state=42
                ),
                scoring="r2",
                threshold=0.01,
            ),
        ),
    ]
)

## 4. Preprocess Data and Upload to Bucket

In [None]:
# S3 bucket that receives the preprocessed data and the model output.
BUCKET_NAME = "sagemaker-flights-bucket"

# Key prefix under which the preprocessed CSV files are stored in the bucket.
DATA_PREFIX = "data"

In [None]:
def get_file_name(name):
    """Return the local CSV file name used for the preprocessed ``name`` split."""
    return "{}-pre.csv".format(name)

In [None]:
def export_data(data, name, pre):
    """Transform one data split and write it to a local header-less CSV.

    The ``price`` column is separated as the target, the remaining columns are
    passed through ``pre.transform``, and the result is written with the target
    as the first column, without index or header row.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw split containing a ``price`` column.
    name : str
        Split name; determines the output file via ``get_file_name``.
    pre : object
        Transformer exposing ``transform``.
    """
    # separate features and target
    features = data.drop(columns="price")
    target = data.price.copy()

    # apply the preprocessing
    features_pre = pre.transform(features)

    # target first, then the transformed features
    file_name = get_file_name(name)
    combined = target.to_frame().join(features_pre)
    combined.to_csv(file_name, index=False, header=False)

In [None]:
def upload_to_bucket(name):
    """Upload the preprocessed CSV of one split to the S3 bucket.

    The local file ``get_file_name(name)`` is uploaded to
    ``<DATA_PREFIX>/<name>/<name>.csv`` inside ``BUCKET_NAME``.

    Parameters
    ----------
    name : str
        Split name, e.g. "train", "val" or "test".
    """
    file_name = get_file_name(name)

    # S3 object keys always use "/" as the separator, so build the key
    # explicitly instead of relying on the local OS path separator.
    object_key = f"{DATA_PREFIX}/{name}/{name}.csv"

    (
        boto3
        .Session()
        .resource("s3")
        .Bucket(BUCKET_NAME)
        .Object(object_key)
        .upload_file(file_name)
    )

In [None]:
def export_and_upload_bucket(data, name, pre):
    """Write the preprocessed ``name`` split to a local CSV, then upload it to S3."""
    export_data(data, name, pre)
    upload_to_bucket(name)

In [None]:
# export_data() only calls pre.transform(), so the pipeline has to be fitted
# beforehand. Fit on the training split only, so validation and test data never
# influence the learned encodings, scalers or feature selection.
preprocessor.fit(train.drop(columns="price"), train.price.copy())
export_and_upload_bucket(train, "train", preprocessor)

In [None]:
# Transform the validation split with the same preprocessor, save it and upload it.
export_and_upload_bucket(val, "val", preprocessor)

In [None]:
# Transform the test split with the same preprocessor, save it and upload it.
export_and_upload_bucket(test, "test", preprocessor)

## 5. Model and Hyperparameter Tuning Set-up

In [None]:
# SageMaker session and its AWS region, reused by the estimator below.
session = sagemaker.Session()
region_name = session.boto_region_name

In [None]:
# S3 location passed to the estimator as its output path.
output_path = f"s3://{BUCKET_NAME}/model/output"

In [None]:
# Estimator for the XGBoost container image (version tag "1.2-1") in the
# session's region, on a single ml.m4.xlarge instance with spot capacity.
# NOTE(review): max_run / max_wait are presumably seconds (a 5-minute run cap
# and a 10-minute total wait) and volume_size presumably GB — confirm against
# the SageMaker SDK documentation.
model = Estimator(
    image_uri=sagemaker.image_uris.retrieve("xgboost", region_name, "1.2-1"),
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type="ml.m4.xlarge",
    volume_size=5,
    output_path=output_path,
    use_spot_instances=True,
    max_run=300,
    max_wait=600,
    sagemaker_session=session
)

In [None]:
# Baseline hyperparameters for the XGBoost regressor.
# "reg:squarederror" is the current name of the squared-error regression
# objective; "reg:linear" is its deprecated alias.
model.set_hyperparameters(
    objective="reg:squarederror",
    num_round=10,
    eta=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    alpha=0.1
)

In [None]:
# Search space handed to the tuner: two continuous ranges (eta, alpha) and one
# integer range (max_depth).
hyperparameter_ranges = {
    "eta": ContinuousParameter(0.05, 0.2),
    "alpha": ContinuousParameter(0, 1),
    "max_depth": IntegerParameter(3, 5)
}

In [None]:
# Bayesian hyperparameter search that minimises the "validation:rmse" metric.
# NOTE(review): max_jobs / max_parallel_jobs are not set, so the SDK defaults
# apply — presumably only a single training job is launched; confirm whether
# more trials were intended.
tuner = HyperparameterTuner(
    estimator=model,
    objective_metric_name="validation:rmse",
    hyperparameter_ranges=hyperparameter_ranges,
    strategy="Bayesian",
    objective_type="Minimize"
)

## 6. Data Channels

In [None]:
def get_data_channel(name):
    """Return a CSV ``TrainingInput`` pointing at the S3 folder of one split.

    The folder is ``s3://<BUCKET_NAME>/<DATA_PREFIX>/<name>``.
    """
    s3_uri = "s3://{}/{}/{}".format(BUCKET_NAME, DATA_PREFIX, name)
    return TrainingInput(s3_uri, content_type="csv")

In [None]:
# Input channel for the preprocessed training data.
train_data_channel = get_data_channel("train")
train_data_channel

<sagemaker.inputs.TrainingInput at 0x7fa7b7b29510>

In [None]:
# Input channel for the preprocessed validation data.
val_data_channel = get_data_channel("val")

In [None]:
# Channel-name -> input mapping passed to tuner.fit().
data_channels = {
    "train": train_data_channel,
    "validation": val_data_channel
}

## 7. Train and Tune the Model

In [None]:
# Launch the hyperparameter tuning job on the train/validation channels.
tuner.fit(data_channels)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


......................................!


## 8. Model Evaluation

In [None]:
# Load the trained booster from a local file named "xgboost-model".
# NOTE(review): nothing in this notebook downloads or extracts that file —
# presumably it comes from the tuning job's model artifact under output_path;
# confirm and add that step so a fresh Restart & Run All succeeds.
# NOTE(security): pickle.load can execute arbitrary code; only load artifacts
# produced by your own training jobs.
with open("xgboost-model", "rb") as f:
    best_model = pickle.load(f)

best_model

<xgboost.core.Booster at 0x7fa7b7870610>

In [None]:
def evaluate_model(name):
    """Return the R² score of ``best_model`` on one preprocessed split.

    Reads the local CSV produced for ``name`` (target in the first column,
    features in the remaining columns), predicts with ``best_model`` and
    compares the predictions with the target.

    Parameters
    ----------
    name : str
        Split name, e.g. "train", "val" or "test".

    Returns
    -------
    float
        ``r2_score`` of the predictions against the target.
    """
    file_name = get_file_name(name)
    # export_data() writes the file without a header row; header=None keeps the
    # first sample as data instead of consuming it as column names.
    data = pd.read_csv(file_name, header=None)

    X = xgb.DMatrix(data.iloc[:, 1:])
    y = data.iloc[:, 0].copy()

    pred = best_model.predict(X)

    return r2_score(y, pred)

In [None]:
# R² on the training split.
evaluate_model("train")

0.8945921659469604

In [None]:
# R² on the validation split.
evaluate_model("val")

0.785722553730011

In [None]:
# R² on the held-out test split.
evaluate_model("test")

0.8205366730690002