In [103]:
import os

# import boto3

import pickle

import warnings

import numpy as np

import pandas as pd

import xgboost as xgb

import sklearn
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV  #
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
	OneHotEncoder,
	OrdinalEncoder,
	StandardScaler,
	MinMaxScaler,
	PowerTransformer,
	FunctionTransformer
)

from feature_engine.outliers import Winsorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.encoding import (
	RareLabelEncoder,
	MeanEncoder,
	CountFrequencyEncoder
)

# import sagemaker
# from sagemaker.estimator import Estimator
# from sagemaker.inputs import TrainingInput
# from sagemaker.tuner import (
#     IntegerParameter,
#     ContinuousParameter,
#     HyperparameterTuner
# )

## 2. Display Settings

In [18]:
# Never elide columns when a DataFrame is rendered (None = no column limit).
pd.options.display.max_columns = None

In [19]:
# Make scikit-learn transformers return pandas DataFrames instead of arrays.
# The custom helpers defined later need this: e.g. duration_category runs right
# after a SimpleImputer step and reads X.duration by column name.
sklearn.set_config(transform_output="pandas")

In [20]:
warnings.filterwarnings("ignore")

## 3. Read the Data

In [21]:
# Load the training split through a path relative to the working directory
# instead of one user's absolute home-directory path.
# Assumes the kernel runs in <project>/notebooks (the folder the exported
# *-pre.csv files are read back from) with the raw data in <project>/data.
file_path = os.path.join("..", "data", "train.csv")
train = pd.read_csv(file_path)
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-05-03,Banglore,Delhi,10:10:00,13:00:00,170,0.0,No Info,3943
1,Jet Airways,2019-03-09,Banglore,New Delhi,08:00:00,10:25:00,1585,1.0,In-flight meal not included,11087
2,Air India,2019-05-09,Delhi,Cochin,09:45:00,23:00:00,795,1.0,No Info,7174
3,Jet Airways,2019-06-01,Delhi,Cochin,11:40:00,19:00:00,440,2.0,In-flight meal not included,15812
4,Indigo,2019-06-18,Banglore,Delhi,04:00:00,06:50:00,170,0.0,No Info,3943
...,...,...,...,...,...,...,...,...,...,...
635,Jet Airways,2019-05-27,Delhi,Cochin,11:30:00,19:00:00,450,1.0,In-flight meal not included,9732
636,Indigo,2019-03-21,Banglore,New Delhi,14:25:00,19:25:00,300,1.0,No Info,7708
637,Indigo,2019-03-15,Mumbai,Hyderabad,21:20:00,22:45:00,85,0.0,No Info,2754
638,Multiple Carriers,2019-03-27,Delhi,Cochin,12:50:00,19:15:00,385,1.0,No Info,6953


In [22]:
# Load the validation split through a path relative to the working directory
# instead of one user's absolute home-directory path.
# Assumes the kernel runs in <project>/notebooks (the folder the exported
# *-pre.csv files are read back from) with the raw data in <project>/data.
file_path = os.path.join("..", "data", "val.csv")
val = pd.read_csv(file_path)
val

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-03-03,Delhi,Cochin,08:35:00,16:10:00,455,1.0,No Info,7663
1,Vistara,2019-06-12,Kolkata,Banglore,17:00:00,23:25:00,385,1.0,No Info,8610
2,Multiple Carriers,2019-05-18,Delhi,Cochin,09:45:00,22:30:00,765,1.0,No Info,10348
3,Jet Airways,2019-03-21,Delhi,Cochin,19:10:00,18:50:00,1420,2.0,In-flight meal not included,6643
4,Goair,2019-04-01,Banglore,Delhi,07:45:00,10:40:00,175,0.0,No Info,4239
...,...,...,...,...,...,...,...,...,...,...
155,Vistara,2019-06-01,Chennai,Kolkata,07:05:00,09:20:00,135,0.0,No Info,3687
156,Air India,2019-03-21,Delhi,Cochin,17:15:00,19:15:00,1560,2.0,No Info,8770
157,Spicejet,2019-03-03,Kolkata,Banglore,06:55:00,09:35:00,160,0.0,No Info,4622
158,Multiple Carriers,2019-06-27,Delhi,Cochin,07:30:00,19:00:00,690,1.0,No Info,10877


In [23]:
# Load the held-out test split through a path relative to the working directory
# instead of one user's absolute home-directory path.
# Assumes the kernel runs in <project>/notebooks (the folder the exported
# *-pre.csv files are read back from) with the raw data in <project>/data.
file_path = os.path.join("..", "data", "test.csv")
test = pd.read_csv(file_path)
test

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-06-27,Delhi,Cochin,06:50:00,16:10:00,560,1.0,No Info,6442
1,Jet Airways,2019-04-01,Kolkata,Banglore,16:30:00,16:20:00,1430,1.0,In-flight meal not included,7064
2,Jet Airways,2019-03-09,Delhi,Cochin,20:50:00,04:25:00,455,1.0,No Info,15554
3,Multiple Carriers,2019-06-27,Delhi,Cochin,19:00:00,01:30:00,390,1.0,No Info,6795
4,Indigo,2019-06-03,Banglore,Delhi,00:25:00,03:15:00,170,0.0,No Info,3943
...,...,...,...,...,...,...,...,...,...,...
195,Jet Airways,2019-05-06,Kolkata,Banglore,09:35:00,21:05:00,690,1.0,No Info,12121
196,Air India,2019-05-18,Delhi,Cochin,13:20:00,07:40:00,1100,2.0,No Info,13801
197,Jet Airways,2019-03-06,Banglore,New Delhi,21:25:00,11:25:00,840,1.0,No Info,17261
198,Air India,2019-05-12,Kolkata,Banglore,12:00:00,13:15:00,1515,1.0,No Info,6612


## 4. Preprocessing Operations

In [24]:
# airline: impute the most frequent carrier, fold rare carriers (tol=0.1) into
# "Other", then one-hot encode into dense columns.
air_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("grouper", RareLabelEncoder(n_categories=2, tol=0.1, replace_with="Other")),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# date_of_journey: derive calendar features and rescale them with MinMaxScaler.
feature_to_extract = ["month", "week", "day_of_week", "day_of_year"]

doj_transformer = Pipeline(
    [
        (
            "dt",
            DatetimeFeatures(
                features_to_extract=feature_to_extract,
                format="mixed",
                yearfirst=True,
            ),
        ),
        ("scaler", MinMaxScaler()),
    ]
)

# source & destination (part 1): group rare cities, mean-encode, power-transform.
location_pipe1 = Pipeline(
    [
        ("grouper", RareLabelEncoder(n_categories=2, tol=0.1, replace_with="Other")),
        ("encoder", MeanEncoder()),
        ("scaler", PowerTransformer()),
    ]
)

def is_north(X):
    """Replace every column of ``X`` by a 0/1 "is a northern city" flag.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns hold city names (e.g. source / destination).

    Returns
    -------
    pandas.DataFrame
        One ``<column>_is_north`` integer column per input column, on the same
        index as ``X``. The input columns themselves are not returned.
    """
    north_cities = ["Delhi", "Kolkata", "Mumbai", "New Delhi"]
    flags = {}
    for name in X.columns.to_list():
        flags[f"{name}_is_north"] = X[name].isin(north_cities).astype(int)
    return pd.DataFrame(flags, index=X.index)

# source & destination: encoded cities (part1) side by side with the
# is_north flags (part2).
location_transformer = FeatureUnion(
    [
        ("part1", location_pipe1),
        ("part2", FunctionTransformer(func=is_north)),
    ]
)

# dep_time & arrival_time (part 1): hour and minute, rescaled with MinMaxScaler.
time_pipe1 = Pipeline(
    [
        ("dt", DatetimeFeatures(features_to_extract=["hour", "minute"])),
        ("scaler", MinMaxScaler()),
    ]
)

def part_of_day(X, morning=4, noon=12, eve=16, night=20):
    """Bucket every time column of ``X`` into a part-of-day label.

    Each column is parsed with ``pd.to_datetime`` and its hour is mapped to:
    ``morning`` for [morning, noon), ``afternoon`` for [noon, eve),
    ``evening`` for [eve, night) and ``night`` for everything else.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns are all parseable as datetimes/times.
    morning, noon, eve, night : int
        Hour boundaries of the buckets (left-inclusive).

    Returns
    -------
    pandas.DataFrame
        One ``<column>_part_of_day`` label column per input column, on the
        same index as ``X``; the input columns are not returned.
    """
    labels = ["morning", "afternoon", "evening"]
    bucketed = {}
    for name in X.columns.to_list():
        hour = pd.to_datetime(X.loc[:, name]).dt.hour
        conditions = [
            hour.between(morning, noon, inclusive="left"),
            hour.between(noon, eve, inclusive="left"),
            hour.between(eve, night, inclusive="left"),
        ]
        bucketed[f"{name}_part_of_day"] = np.select(conditions, labels, default="night")
    return pd.DataFrame(bucketed, index=X.index)

# dep_time & arrival_time (part 2): part-of-day label -> count encoding -> MinMaxScaler.
time_pipe2 = Pipeline(
    [
        ("part", FunctionTransformer(func=part_of_day)),
        ("encoder", CountFrequencyEncoder()),
        ("scaler", MinMaxScaler()),
    ]
)

# Both time views side by side.
time_transformer = FeatureUnion(
    [
        ("part1", time_pipe1),
        ("part2", time_pipe2),
    ]
)

# duration
class RBFPercentileSimilarity(BaseEstimator, TransformerMixin):
    """Encode numeric columns as RBF similarities to their own percentiles.

    ``fit`` records, for every selected column, the values found at
    ``percentiles``. ``transform`` then emits one feature per
    (column, percentile) pair containing
    ``rbf_kernel(value, reference_value, gamma)``.

    Parameters
    ----------
    variables : list of str or None, default=None
        Columns to encode. When falsy, every numeric column of the frame
        passed to ``fit`` is used.
    percentiles : list of float, default=[0.25, 0.5, 0.75]
        Quantiles (between 0 and 1) used as reference points. The default list
        is never mutated by this class.
    gamma : float, default=0.1
        ``gamma`` argument forwarded to ``rbf_kernel``.

    Attributes
    ----------
    variables_ : list
        Columns actually encoded, resolved during ``fit``.
    reference_values_ : dict
        Maps each column to an array of shape (n_percentiles, 1) holding its
        reference values.
    """

    def __init__(self, variables=None, percentiles=[0.25, 0.5, 0.75], gamma=0.1):
        self.variables = variables
        self.percentiles = percentiles
        self.gamma = gamma


    def fit(self, X, y=None):
        """Learn the reference value of every percentile for each column.

        The resolved column list is stored in ``variables_`` and the
        constructor parameter ``variables`` is left untouched, so refitting on
        another frame (or cloning the estimator) re-detects numeric columns
        instead of reusing a stale list.
        """
        if self.variables:
            self.variables_ = list(self.variables)
        else:
            self.variables_ = X.select_dtypes(include="number").columns.to_list()

        self.reference_values_ = {
            col: (
                X
                .loc[:, col]
                .quantile(self.percentiles)
                .values
                .reshape(-1, 1)
            )
            for col in self.variables_
        }

        return self


    def transform(self, X):
        """Return the ``<col>_rbf_<percentile>`` similarity features for ``X``.

        The result keeps ``X``'s index so it stays row-aligned when it is
        concatenated or joined with other frames.
        """
        objects = []
        for col in self.variables_:
            # round() rather than int(): 0.29 * 100 is 28.999999999999996, which
            # int() truncates to 28 and makes it collide with 0.28.
            columns = [
                f"{col}_rbf_{int(round(percentile * 100))}"
                for percentile in self.percentiles
            ]
            obj = pd.DataFrame(
                data=rbf_kernel(X.loc[:, [col]], Y=self.reference_values_[col], gamma=self.gamma),
                columns=columns,
                index=X.index
            )
            objects.append(obj)
        return pd.concat(objects, axis=1)
    

def duration_category(X, short=180, med=400):
    """Replace the ``duration`` column by a short / medium / long label.

    ``short`` when duration < ``short``, ``medium`` when it lies in
    [``short``, ``med``), ``long`` otherwise (including missing values, which
    match neither condition).

    Returns a new frame without ``duration`` and with ``duration_cat`` added.
    """
    conditions = [
        X.duration.lt(short),
        X.duration.between(short, med, inclusive="left"),
    ]
    labels = np.select(conditions, ["short", "medium"], default="long")
    return X.drop(columns="duration").assign(duration_cat=labels)

def is_over(X, value=1000):
    """Replace ``duration`` by a 0/1 flag telling whether it is >= ``value``.

    The flag column is named ``duration_over_<value>``; the original
    ``duration`` column is dropped from the returned frame.
    """
    flag = X.duration.ge(value).astype(int)
    flag_name = f"duration_over_{value}"
    return X.drop(columns="duration").assign(**{flag_name: flag})

# Part 1: RBF similarity of the duration to its percentiles, power-transformed.
duration_pipe1 = Pipeline(steps=[
    ("rbf", RBFPercentileSimilarity()),
    ("scaler", PowerTransformer())
])

# Part 2: short / medium / long bucket encoded as an ordered integer.
duration_pipe2 = Pipeline(steps=[
    ("cat", FunctionTransformer(func=duration_category)),
    ("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]]))
])

# Part 3 is the "over 1000" flag, part 4 the standardised raw duration.
duration_union = FeatureUnion(transformer_list=[
    ("part1", duration_pipe1),
    ("part2", duration_pipe2),
    ("part3", FunctionTransformer(func=is_over)),
    ("part4", StandardScaler())
])

# The imputer must run BEFORE the Winsorizer: Winsorizer is left at its default
# missing_values="raise", so a missing duration would raise inside it before
# the imputer ever got the chance to fill the value.
duration_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("outliers", Winsorizer(capping_method="iqr", fold=1.5)),
    ("union", duration_union)
])

# total_stops
def is_direct(X):
    """Return a copy of ``X`` with an ``is_direct_flight`` 0/1 column added.

    The flag is 1 where ``total_stops`` equals 0; ``total_stops`` is kept.
    """
    flagged = X.copy()
    flagged["is_direct_flight"] = (X["total_stops"] == 0).astype(int)
    return flagged


# total_stops: fill missing stop counts with the most frequent value, then
# append the is_direct_flight flag. Every step carries a non-empty name so it
# can be addressed through named_steps / nested parameters and is labelled in
# the pipeline diagram.
total_stops_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("direct", FunctionTransformer(func=is_direct))
])

# additional_info (part 1): fold rare labels (tol=0.1) into "Other", then
# one-hot encode into dense columns.
info_pipe1 = Pipeline(
    [
        ("group", RareLabelEncoder(replace_with="Other", tol=0.1, n_categories=2)),
        ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
    ]
)

def have_info(X):
    """Overwrite ``additional_info`` with a 0/1 flag: 1 unless it is "No Info".

    Returns a new frame; the input frame is left unchanged.
    """
    flagged = X.copy()
    flagged["additional_info"] = (X["additional_info"] != "No Info").astype(int)
    return flagged

# additional_info: one-hot view (part1) next to the has-info flag (part2),
# after missing values were filled with the constant "unknown".
info_union = FeatureUnion(
    [
        ("part1", info_pipe1),
        ("part2", FunctionTransformer(func=have_info)),
    ]
)

info_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
        ("union", info_union),
    ]
)

# column transformer: route every raw column to its dedicated transformer;
# any column not listed here is passed through unchanged.
column_transformer = ColumnTransformer(
    [
        ("air", air_transformer, ["airline"]),
        ("doj", doj_transformer, ["date_of_journey"]),
        ("location", location_transformer, ["source", "destination"]),
        ("time", time_transformer, ["dep_time", "arrival_time"]),
        ("dur", duration_transformer, ["duration"]),
        ("stops", total_stops_transformer, ["total_stops"]),
        ("info", info_transformer, ["additional_info"]),
    ],
    remainder="passthrough",
)

# feature selector: score each feature on its own with a small random forest
# and keep those reaching the R2 threshold of 0.1.
estimator = RandomForestRegressor(random_state=42, max_depth=3, n_estimators=10)

selector = SelectBySingleFeaturePerformance(
    estimator=estimator,
    threshold=0.1,
    scoring="r2",
)

# preprocessor: feature engineering followed by feature selection
preprocessor = Pipeline(
    [
        ("ct", column_transformer),
        ("selector", selector),
    ]
)

In [25]:
# Fit the preprocessing pipeline on the training split only; the target is
# passed along because fit() forwards it to the pipeline steps.
preprocessor.fit(
    X=train.drop(columns=["price"]),
    y=train["price"].copy(),
)

0,1,2
,steps,"[('ct', ...), ('selector', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('air', ...), ('doj', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,tol,0.1
,n_categories,2
,max_n_categories,
,replace_with,'Other'
,variables,
,missing_values,'raise'
,ignore_format,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,variables,
,features_to_extract,"['month', 'week', ...]"
,drop_original,True
,missing_values,'raise'
,dayfirst,False
,yearfirst,True
,utc,
,format,'mixed'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,tol,0.1
,n_categories,2
,max_n_categories,
,replace_with,'Other'
,variables,
,missing_values,'raise'
,ignore_format,False

0,1,2
,variables,
,missing_values,'raise'
,ignore_format,False
,unseen,'ignore'
,smoothing,0.0

0,1,2
,method,'yeo-johnson'
,standardize,True
,copy,True

0,1,2
,func,<function is_...t 0x13c28dbc0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,variables,
,features_to_extract,"['hour', 'minute']"
,drop_original,True
,missing_values,'raise'
,dayfirst,False
,yearfirst,False
,utc,
,format,

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,func,<function par...t 0x13c28da80>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,encoding_method,'count'
,variables,
,missing_values,'raise'
,ignore_format,False
,unseen,'ignore'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,capping_method,'iqr'
,tail,'right'
,fold,1.5
,add_indicators,False
,variables,
,missing_values,'raise'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformer_list,"[('part1', ...), ('part2', ...), ...]"
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True

0,1,2
,variables,
,percentiles,"[0.25, 0.5, ...]"
,gamma,0.1

0,1,2
,method,'yeo-johnson'
,standardize,True
,copy,True

0,1,2
,func,<function dur...t 0x13c28dc60>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,categories,"[['short', 'medium', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,func,<function is_...t 0x13c28df80>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function is_...t 0x13c28d940>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'unknown'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformer_list,"[('part1', ...), ('part2', ...)]"
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True

0,1,2
,tol,0.1
,n_categories,2
,max_n_categories,
,replace_with,'Other'
,variables,
,missing_values,'raise'
,ignore_format,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,func,<function hav...t 0x13c28d9e0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,estimator,RandomForestR...ndom_state=42)
,scoring,'r2'
,cv,3
,groups,
,threshold,0.1
,variables,
,confirm_variables,False

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,3
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [26]:
# Preview the engineered, selected features for the training split.
preprocessor.transform(X=train.drop(columns=["price"]))

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,doj__date_of_journey_week,doj__date_of_journey_day_of_year,location__source,location__destination,dur__duration_rbf_25,dur__duration_cat,dur__duration,stops__total_stops,stops__is_direct_flight
0,1.0,0.0,0.529412,0.533898,-1.220399,-1.746339,2.766544,0.0,-0.919235,0.0,1
1,0.0,1.0,0.058824,0.067797,-1.220399,-1.209244,-0.347330,2.0,2.069894,1.0,0
2,0.0,0.0,0.588235,0.584746,0.924395,0.925502,-0.347330,2.0,0.401052,1.0,0
3,0.0,1.0,0.764706,0.779661,0.924395,0.925502,-0.347330,2.0,-0.348871,2.0,0
4,1.0,0.0,0.941176,0.923729,-1.220399,-1.746339,2.766544,0.0,-0.919235,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...
635,0.0,1.0,0.764706,0.737288,0.924395,0.925502,-0.347330,2.0,-0.327747,1.0,0
636,1.0,0.0,0.176471,0.169492,-1.220399,-1.209244,-0.347330,1.0,-0.644616,1.0,0
637,1.0,0.0,0.117647,0.118644,-1.794731,-1.209244,-0.347330,0.0,-1.098794,0.0,1
638,0.0,0.0,0.235294,0.220339,0.924395,0.925502,-0.347330,1.0,-0.465057,1.0,0


In [79]:
def get_file_name(name):
    """Return the CSV file name used for the preprocessed split ``name``."""
    return "{}-pre.csv".format(name)

In [87]:
def export_data(data, name, pre):
    """Preprocess one split and write it, target included, to ``<name>-pre.csv``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw split containing the feature columns and a ``price`` column.
    name : str
        Split name; passed to ``get_file_name`` to build the output file name.
    pre : fitted transformer
        Object whose ``transform`` turns the raw features into a DataFrame.

    The file is written relative to the current working directory, without the
    index and with a header row.
    """
    # separate the target from the features
    target = data.price.copy()
    features = data.drop(columns="price")

    # transform the features, then re-attach the target (joined on the index)
    transformed = pre.transform(features)
    output = transformed.join(target)

    output.to_csv(get_file_name(name), index=False, header=True)

In [88]:
# Write the preprocessed training split to train-pre.csv
export_data(data=train, name="train", pre=preprocessor)


In [89]:
# Write the preprocessed validation split to val-pre.csv
export_data(data=val, name="val", pre=preprocessor)

In [90]:
# Write the preprocessed test split to test-pre.csv
export_data(data=test, name="test", pre=preprocessor)

In [91]:
# Read back exactly the file export_data wrote: it writes get_file_name("train")
# relative to the working directory, so reading the same name avoids loading a
# stale copy from a hard-coded absolute path.
file_path = get_file_name("train")
train_pre = pd.read_csv(file_path)
train_pre

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,doj__date_of_journey_week,doj__date_of_journey_day_of_year,location__source,location__destination,dur__duration_rbf_25,dur__duration_cat,dur__duration,stops__total_stops,stops__is_direct_flight,price
0,1.0,0.0,0.529412,0.533898,-1.220399,-1.746339,2.766544,0.0,-0.919235,0.0,1,3943
1,0.0,1.0,0.058824,0.067797,-1.220399,-1.209244,-0.347330,2.0,2.069894,1.0,0,11087
2,0.0,0.0,0.588235,0.584746,0.924395,0.925502,-0.347330,2.0,0.401052,1.0,0,7174
3,0.0,1.0,0.764706,0.779661,0.924395,0.925502,-0.347330,2.0,-0.348871,2.0,0,15812
4,1.0,0.0,0.941176,0.923729,-1.220399,-1.746339,2.766544,0.0,-0.919235,0.0,1,3943
...,...,...,...,...,...,...,...,...,...,...,...,...
635,0.0,1.0,0.764706,0.737288,0.924395,0.925502,-0.347330,2.0,-0.327747,1.0,0,9732
636,1.0,0.0,0.176471,0.169492,-1.220399,-1.209244,-0.347330,1.0,-0.644616,1.0,0,7708
637,1.0,0.0,0.117647,0.118644,-1.794731,-1.209244,-0.347330,0.0,-1.098794,0.0,1,2754
638,0.0,0.0,0.235294,0.220339,0.924395,0.925502,-0.347330,1.0,-0.465057,1.0,0,6953


In [92]:
# Read back exactly the file export_data wrote: it writes get_file_name("val")
# relative to the working directory, so reading the same name avoids loading a
# stale copy from a hard-coded absolute path.
file_path = get_file_name("val")
val_pre = pd.read_csv(file_path)
val_pre

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,doj__date_of_journey_week,doj__date_of_journey_day_of_year,location__source,location__destination,dur__duration_rbf_25,dur__duration_cat,dur__duration,stops__total_stops,stops__is_direct_flight,price
0,1.0,0.0,0.000000,0.016949,0.924395,0.925502,-0.347330,2.0,-0.317184,1.0,0,7663
1,0.0,0.0,0.882353,0.872881,0.080413,0.079822,-0.347330,1.0,-0.465057,1.0,0,8610
2,0.0,0.0,0.647059,0.661017,0.924395,0.925502,-0.347330,2.0,0.337678,1.0,0,10348
3,0.0,1.0,0.176471,0.169492,0.924395,0.925502,-0.347330,2.0,1.721338,2.0,0,6643
4,0.0,0.0,0.294118,0.262712,-1.220399,-1.746339,3.115327,0.0,-0.908673,0.0,1,4239
...,...,...,...,...,...,...,...,...,...,...,...,...
155,0.0,0.0,0.764706,0.779661,-1.794731,-1.209244,-0.347330,0.0,-0.993171,0.0,1,3687
156,0.0,0.0,0.176471,0.169492,0.924395,0.925502,-0.347330,2.0,2.017083,2.0,0,8770
157,0.0,0.0,0.000000,0.016949,0.080413,0.079822,-0.347330,0.0,-0.940360,0.0,1,4622
158,0.0,0.0,1.000000,1.000000,0.924395,0.925502,-0.347330,2.0,0.179243,1.0,0,10877


In [106]:
# Read back exactly the file export_data wrote: it writes get_file_name("test")
# relative to the working directory, so reading the same name avoids loading a
# stale copy from a hard-coded absolute path.
file_path = get_file_name("test")
test_pre = pd.read_csv(file_path)
test_pre

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,doj__date_of_journey_week,doj__date_of_journey_day_of_year,location__source,location__destination,dur__duration_rbf_25,dur__duration_cat,dur__duration,stops__total_stops,stops__is_direct_flight,price
0,1.0,0.0,1.000000,1.000000,0.924395,0.925502,-0.347330,2.0,-0.095376,1.0,0,6442
1,0.0,1.0,0.294118,0.262712,0.080413,0.079822,-0.347330,2.0,1.742463,1.0,0,7064
2,0.0,1.0,0.058824,0.067797,0.924395,0.925502,-0.347330,2.0,-0.317184,1.0,0,15554
3,0.0,0.0,1.000000,1.000000,0.924395,0.925502,-0.347330,1.0,-0.454494,1.0,0,6795
4,1.0,0.0,0.823529,0.796610,-1.220399,-1.746339,2.766544,0.0,-0.919235,0.0,1,3943
...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.0,1.0,0.588235,0.559322,0.080413,0.079822,-0.347330,2.0,0.179243,1.0,0,12121
196,0.0,0.0,0.647059,0.661017,0.924395,0.925502,-0.347330,2.0,1.045352,2.0,0,13801
197,0.0,1.0,0.058824,0.042373,-1.220399,-1.209244,-0.347330,2.0,0.496112,1.0,0,17261
198,0.0,0.0,0.588235,0.610169,0.080413,0.079822,-0.347330,2.0,1.922022,1.0,0,6612


In [93]:
# Take the features by dropping the target by name (as export_data does) rather
# than by position, so the split stays correct if the column order changes.
X_train = train_pre.drop(columns="price")
X_train

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,doj__date_of_journey_week,doj__date_of_journey_day_of_year,location__source,location__destination,dur__duration_rbf_25,dur__duration_cat,dur__duration,stops__total_stops,stops__is_direct_flight
0,1.0,0.0,0.529412,0.533898,-1.220399,-1.746339,2.766544,0.0,-0.919235,0.0,1
1,0.0,1.0,0.058824,0.067797,-1.220399,-1.209244,-0.347330,2.0,2.069894,1.0,0
2,0.0,0.0,0.588235,0.584746,0.924395,0.925502,-0.347330,2.0,0.401052,1.0,0
3,0.0,1.0,0.764706,0.779661,0.924395,0.925502,-0.347330,2.0,-0.348871,2.0,0
4,1.0,0.0,0.941176,0.923729,-1.220399,-1.746339,2.766544,0.0,-0.919235,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...
635,0.0,1.0,0.764706,0.737288,0.924395,0.925502,-0.347330,2.0,-0.327747,1.0,0
636,1.0,0.0,0.176471,0.169492,-1.220399,-1.209244,-0.347330,1.0,-0.644616,1.0,0
637,1.0,0.0,0.117647,0.118644,-1.794731,-1.209244,-0.347330,0.0,-1.098794,0.0,1
638,0.0,0.0,0.235294,0.220339,0.924395,0.925502,-0.347330,1.0,-0.465057,1.0,0


In [94]:
# Take the features by dropping the target by name (as export_data does) rather
# than by position, so the split stays correct if the column order changes.
X_val = val_pre.drop(columns="price")
X_val

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,doj__date_of_journey_week,doj__date_of_journey_day_of_year,location__source,location__destination,dur__duration_rbf_25,dur__duration_cat,dur__duration,stops__total_stops,stops__is_direct_flight
0,1.0,0.0,0.000000,0.016949,0.924395,0.925502,-0.347330,2.0,-0.317184,1.0,0
1,0.0,0.0,0.882353,0.872881,0.080413,0.079822,-0.347330,1.0,-0.465057,1.0,0
2,0.0,0.0,0.647059,0.661017,0.924395,0.925502,-0.347330,2.0,0.337678,1.0,0
3,0.0,1.0,0.176471,0.169492,0.924395,0.925502,-0.347330,2.0,1.721338,2.0,0
4,0.0,0.0,0.294118,0.262712,-1.220399,-1.746339,3.115327,0.0,-0.908673,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...
155,0.0,0.0,0.764706,0.779661,-1.794731,-1.209244,-0.347330,0.0,-0.993171,0.0,1
156,0.0,0.0,0.176471,0.169492,0.924395,0.925502,-0.347330,2.0,2.017083,2.0,0
157,0.0,0.0,0.000000,0.016949,0.080413,0.079822,-0.347330,0.0,-0.940360,0.0,1
158,0.0,0.0,1.000000,1.000000,0.924395,0.925502,-0.347330,2.0,0.179243,1.0,0


In [107]:
# Take the features by dropping the target by name (as export_data does) rather
# than by position, so the split stays correct if the column order changes.
X_test = test_pre.drop(columns="price")
X_test

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,doj__date_of_journey_week,doj__date_of_journey_day_of_year,location__source,location__destination,dur__duration_rbf_25,dur__duration_cat,dur__duration,stops__total_stops,stops__is_direct_flight
0,1.0,0.0,1.000000,1.000000,0.924395,0.925502,-0.347330,2.0,-0.095376,1.0,0
1,0.0,1.0,0.294118,0.262712,0.080413,0.079822,-0.347330,2.0,1.742463,1.0,0
2,0.0,1.0,0.058824,0.067797,0.924395,0.925502,-0.347330,2.0,-0.317184,1.0,0
3,0.0,0.0,1.000000,1.000000,0.924395,0.925502,-0.347330,1.0,-0.454494,1.0,0
4,1.0,0.0,0.823529,0.796610,-1.220399,-1.746339,2.766544,0.0,-0.919235,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...
195,0.0,1.0,0.588235,0.559322,0.080413,0.079822,-0.347330,2.0,0.179243,1.0,0
196,0.0,0.0,0.647059,0.661017,0.924395,0.925502,-0.347330,2.0,1.045352,2.0,0
197,0.0,1.0,0.058824,0.042373,-1.220399,-1.209244,-0.347330,2.0,0.496112,1.0,0
198,0.0,0.0,0.588235,0.610169,0.080413,0.079822,-0.347330,2.0,1.922022,1.0,0


In [95]:
# Select the target by name instead of assuming it is the last column.
y_train = train_pre["price"]
y_train

0       3943
1      11087
2       7174
3      15812
4       3943
       ...  
635     9732
636     7708
637     2754
638     6953
639     3100
Name: price, Length: 640, dtype: int64

In [96]:
# Select the target by name instead of assuming it is the last column.
y_val = val_pre["price"]
y_val

0       7663
1       8610
2      10348
3       6643
4       4239
       ...  
155     3687
156     8770
157     4622
158    10877
159     6493
Name: price, Length: 160, dtype: int64

In [108]:
# Select the target by name instead of assuming it is the last column.
y_test = test_pre["price"]
y_test

0       6442
1       7064
2      15554
3       6795
4       3943
       ...  
195    12121
196    13801
197    17261
198     6612
199     8586
Name: price, Length: 200, dtype: int64

## 5. Model Training and Evaluation Function

In [97]:
def evaluate_model(model, X, y, dataset_name="Validation"):
    """Score a fitted regressor on ``(X, y)``, print the metrics and return them.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X, y : features and true target values of the split being scored
    dataset_name : str
        Label inserted in the printed summary line.

    Returns
    -------
    tuple
        ``(r2, mae, rmse)`` where rmse is the square root of the mean squared
        error.
    """
    predictions = model.predict(X)
    scores = (
        r2_score(y, predictions),
        mean_absolute_error(y, predictions),
        np.sqrt(mean_squared_error(y, predictions)),
    )
    r2, mae, rmse = scores
    print(f"{model.__class__.__name__} {dataset_name} - R2: {r2:.4f}, MAE: {mae:.2f}, RMSE: {rmse:.2f}")
    return scores

## 6. Train Random Forest Regressor

In [98]:
# Baseline: random forest with default hyperparameters and a fixed seed.
rf = RandomForestRegressor(random_state=42)
rf.fit(X=X_train, y=y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [101]:
# Baseline score on the validation split
evaluate_model(rf, X_val, y_val, dataset_name="Validation")

RandomForestRegressor Validation - R2: 0.6792, MAE: 1697.76, RMSE: 2431.91


(0.679216515324194, 1697.7613158234126, np.float64(2431.909358524926))

## 7. Hyperparameter Tuning for Random Forest

In [104]:
# Exhaustive 3x3x3 grid searched with 3-fold cross-validation, ranked by R2.
rf_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)
rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=rf_param_grid,
    scoring='r2',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
rf_grid.fit(X_train, y_train)

# Keep the estimator GridSearchCV exposes for the winning combination.
best_rf = rf_grid.best_estimator_
print(f"Best Random Forest Params: {rf_grid.best_params_}")

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best Random Forest Params: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}


In [109]:
# Validation score of the tuned random forest (only R2 is kept, for model selection)
best_rf_r2_val, _, _ = evaluate_model(
    best_rf, X_val, y_val, dataset_name="Validation (Tuned)"
)

# 8. Train XGBoost Regressor (baseline, default hyperparameters)
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_model.fit(X_train, y_train)

# Baseline XGBoost score on validation
evaluate_model(xgb_model, X_val, y_val, dataset_name="Validation")

# 9. Hyperparameter Tuning for XGBoost: 3x3x3 grid, 3-fold CV, ranked by R2
xgb_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 6, 9],
    learning_rate=[0.01, 0.1, 0.3],
)
xgb_grid = GridSearchCV(
    estimator=xgb_model,
    param_grid=xgb_param_grid,
    scoring='r2',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
xgb_grid.fit(X_train, y_train)

best_xgb = xgb_grid.best_estimator_
print(f"Best XGBoost Params: {xgb_grid.best_params_}")

# Validation score of the tuned XGBoost model
best_xgb_r2_val, _, _ = evaluate_model(
    best_xgb, X_val, y_val, dataset_name="Validation (Tuned)"
)

# 10. Select Best Model based on Validation R2
# Each entry maps a display name to (fitted model, validation R2).
models = {
    'Random Forest': (best_rf, best_rf_r2_val),
    'XGBoost': (best_xgb, best_xgb_r2_val),
}

best_model_name = max(models, key=lambda candidate: models[candidate][1])
best_model = models[best_model_name][0]

print(f"Best Model Selected: {best_model_name} with Validation R2: {models[best_model_name][1]:.4f}")

# 11. Final Evaluation on Test Set (the test split is scored only here, after selection)
evaluate_model(best_model, X_test, y_test, dataset_name="Test")

# 12. Save Best Model
# NOTE(review): only the regressor is pickled; the fitted `preprocessor` is not
# saved, so raw flight data cannot be scored from this file alone.
with open('best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

print("Best model saved as 'best_model.pkl'")

RandomForestRegressor Validation (Tuned) - R2: 0.7462, MAE: 1635.98, RMSE: 2163.30
XGBRegressor Validation - R2: 0.6565, MAE: 1771.39, RMSE: 2516.42
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best XGBoost Params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
XGBRegressor Validation (Tuned) - R2: 0.7538, MAE: 1632.95, RMSE: 2130.54
Best Model Selected: XGBoost with Validation R2: 0.7538
XGBRegressor Test - R2: 0.7376, MAE: 1635.09, RMSE: 2141.30
Best model saved as 'best_model.pkl'
