In [123]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import pickle as pkl
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBRegressor, XGBRFRegressor
from sklearn.preprocessing import Normalizer
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
import warnings
warnings.filterwarnings('ignore')

# Loading Data

In [2]:
df = pd.read_csv('refined.csv')
df = df.iloc[:,1:]
df

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Price,dep_month,dep_day,Dep_hour,Dep_minutes,Arr_hour,Arr_minutes
0,3,0,5,170,0,3897,3,24,22,20,1,10
1,1,3,0,445,2,7662,1,5,5,50,13,15
2,4,2,1,1140,2,13882,9,6,9,25,4,25
3,3,3,0,325,1,6218,12,5,18,5,23,30
4,3,0,5,285,1,13302,1,3,16,50,21,35
...,...,...,...,...,...,...,...,...,...,...,...,...
10677,0,3,0,150,0,4107,9,4,19,55,22,25
10678,1,3,0,155,0,4145,4,27,20,45,23,20
10679,4,0,2,180,0,7229,4,27,8,20,11,20
10680,10,0,5,160,0,12648,1,3,11,30,14,10


# Data Split

In [3]:
x = df.drop(columns='Price')
y = df['Price']

In [4]:
x_train, x_test, y_train, y_test = train_test_split(
                                        x,
                                        y,
                                        random_state=1234,test_size = 0.20,
                                        shuffle=True
                                        )
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(8545, 11)
(2137, 11)
(8545,)
(2137,)


# Decision Tree Regresor

In [218]:
dt = DecisionTreeRegressor()
dt.fit(x_train, y_train)
print('Train Score:',df.score(x_train,y_train))
print('Test Score:',df.score(x_test,y_test))

Train Score: 0.9717818150899008
Test Score: 0.7241279904089539


In [219]:
y_pred = dt.predict(x_test)
print('MAE:',mean_absolute_error(y_pred,y_test))
print('R2_score',r2_score(y_test,y_pred))

MAE: 1347.4930510060833
R2_score 0.7225208591337251


##### Tuning

In [114]:
param_grid = {
    'max_depth': [None, 5, 10],
    'min_samples_split': [3, 4, 5],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['auto', 'sqrt', 'log2']
}

dt_reg = DecisionTreeRegressor(random_state=0)
fitmodel = GridSearchCV(estimator=dt_reg, param_grid=param_grid, cv=5, n_jobs=-1,verbose = 1)
fitmodel.fit(x_train, y_train)
print(fitmodel.best_estimator_, fitmodel.best_params_, fitmodel.best_score_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
DecisionTreeRegressor(max_depth=10, max_features='auto', min_samples_leaf=2,
                      min_samples_split=5, random_state=0) {'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 5} 0.7287079853517497


In [212]:
dt = DecisionTreeRegressor(max_depth=10, max_features='auto', min_samples_leaf=2,
                      min_samples_split=5)
dt.fit(x_train, y_train)
print('Train Score:',dt.score(x_train,y_train))
print('Test Score:',dt.score(x_test,y_test))

Train Score: 0.8521273258128419
Test Score: 0.7820190228194326


In [213]:
y_pred = dt.predict(x_test)
print('MAE:',mean_absolute_error(y_pred,y_test))
print('R2_score',r2_score(y_test,y_pred))

MAE: 1324.9109534144775
R2_score 0.7820190228194326


# Random Forest Regresor

In [214]:
rf = RandomForestRegressor()
rf.fit(x_train, y_train)
print('Train Score:',rf.score(x_train,y_train))
print('Test Score:',rf.score(x_test,y_test))

Train Score: 0.9513186575229488
Test Score: 0.8165708335268179


In [215]:
y_pred = rf.predict(x_test)
print('MAE:',mean_absolute_error(y_pred,y_test))
print('R2_score',r2_score(y_test,y_pred))

MAE: 1177.8445946111292
R2_score 0.8165708335268179


##### Tuning

In [53]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['auto', 'sqrt']
}

rf_reg = RandomForestRegressor()
fitmodel = GridSearchCV(estimator=rf_reg, param_grid=param_grid, cv=5, n_jobs=-1,verbose = 1)
fitmodel.fit(x_train, y_train)
print(fitmodel.best_estimator_, fitmodel.best_params_, fitmodel.best_score_)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Parameters:  ExtraTreesRegressor(max_features='auto', min_samples_split=5)
Best parameters:  {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Best score:  0.8029804440896511


In [221]:
rf = RandomForestRegressor(max_features=5, min_samples_split=6,min_samples_leaf= 1, n_estimators= 200)
rf.fit(x_train, y_train)
print('Train Score:',rf.score(x_train,y_train))
print('Test Score:',rf.score(x_test,y_test))

Train Score: 0.9215320860950634
Test Score: 0.8351723710261117


In [222]:
y_pred = rf.predict(x_test)
print('MAE:',mean_absolute_error(y_pred,y_test))
print('R2_score',r2_score(y_test,y_pred))

MAE: 1149.2500555811164
R2_score 0.8351723710261117


# XGBoost

In [223]:
xgb = XGBRegressor()
xgb.fit(x_train, y_train)
print('Train Score:',xgb.score(x_train,y_train))
print('Test Score:',xgb.score(x_test,y_test))

Train Score: 0.9388032504450972
Test Score: 0.8450509462070098


In [224]:
y_pred = xgb.predict(x_test)
print('MAE:',mean_absolute_error(y_pred,y_test))
print('R2_score',r2_score(y_test,y_pred))

MAE: 1166.465082178397
R2_score 0.8450509462070098


##### Tuning

In [161]:
params = {
        'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [50, 100, 200],
    'colsample_bytree': [0.5, 0.7, 1],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.5, 0.7, 1]
        }
xgb = XGBRegressor()
fitmodel = GridSearchCV(xgb, param_grid=params, cv=5, refit=True, scoring="r2", n_jobs=-1, verbose=5)
fitmodel.fit(x_train, y_train)
print(fitmodel.best_estimator_, fitmodel.best_params_, fitmodel.best_score_)

# {'booster': 'gbtree', 'colsample_bytree': 0.4, 'gamma': 0, 'grow_policy': 'depthwise', 'max_depth': 6, 'min_child_weight': 2} 0.8316847387210963

Fitting 5 folds for each of 972 candidates, totalling 4860 fits
XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.5,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=7, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=200, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...) {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 1} 0.8318252838550564


In [225]:
xgb = XGBRegressor(learning_rate=0.1,colsample_bytree= 0.5, max_depth= 7,n_estimators=200, subsample=1)
xgb.fit(x_train, y_train)
print('Train Score:',xgb.score(x_train,y_train))
print('Test Score:',xgb.score(x_test,y_test))

Train Score: 0.9340167298355434
Test Score: 0.8374383618451964


In [226]:
y_pred = fitmodel.best_estimator_.predict(x_test)
print('MAE:',mean_absolute_error(y_pred,y_test))
print('R2_score',r2_score(y_test,y_pred))

MAE: 1172.7396297927878
R2_score 0.8374383618451964


# KNN

In [227]:
knn = KNeighborsRegressor()
knn.fit(x_train,y_train)
print('Train Score:',knn.score(x_train,y_train))
print('Test Score:',knn.score(x_test,y_test))
print()

Train Score: 0.7053820365092627
Test Score: 0.5675141565846251



In [229]:
y_pred = knn.predict(x_test)
print('MAE:',mean_absolute_error(y_pred,y_test))
print('R2_score',r2_score(y_test,y_pred))

MAE: 1828.4948058025268
R2_score 0.5675141565846251


##### Tuning

In [41]:
knn = KNeighborsRegressor()

params = {
    #'algorithm': 'auto',
 'leaf_size': [30,40,50],
 #'metric': 'minkowski',
 #'metric_params': None,
 'n_neighbors': [10,15,20],
 'weights': ['uniform','distance']
         }

fitmodel = GridSearchCV(knn, param_grid=params, cv=5, refit=True,scoring='r2')
fitmodel.fit(x_train, y_train)
print(fitmodel.best_estimator_, fitmodel.best_params_, fitmodel.best_score_)

KNeighborsRegressor(n_neighbors=10, weights='distance') {'leaf_size': 30, 'n_neighbors': 10, 'weights': 'distance'} 0.5024129906377326


In [230]:
knn = KNeighborsRegressor(leaf_size= 30, n_neighbors= 10, weights= 'distance')
knn.fit(x_train,y_train)
print('Train Score:',knn.score(x_train,y_train))
print('Test Score:',knn.score(x_test,y_test))
print()

Train Score: 0.9717818150899008
Test Score: 0.5431800314779938



In [231]:
y_pred = knn.predict(x_test)
print('MAE:',mean_absolute_error(y_pred,y_test))
print('R2_score',r2_score(y_test,y_pred))

MAE: 1936.8484857762048
R2_score 0.5431800314779938


# ExtraTreesRegressor

In [207]:
etr = ExtraTreesRegressor()
etr.fit(x_train, y_train)
print('Train Score:',etr.score(x_train,y_train))
print('Test Score:',etr.score(x_test,y_test))

Train Score: 0.9717808035798382
Test Score: 0.7691292550589371


In [208]:
y_pred = etr.predict(x_test)
print('MAE:',mean_absolute_error(y_pred,y_test))
print('R2_score',r2_score(y_test,y_pred))

MAE: 1279.390108329434
R2_score 0.7691292550589371


##### Tuning

In [44]:
etr = ExtraTreesRegressor()

params = {
 'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['auto', 'sqrt']
         }

fitmodel = GridSearchCV(etr, param_grid=params, cv=5, refit=True,scoring='r2',verbose=1,n_jobs=-1)
fitmodel.fit(x_train, y_train)
print(fitmodel.best_estimator_, fitmodel.best_params_, fitmodel.best_score_)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


  warn(


ExtraTreesRegressor(max_features='auto', min_samples_split=5) {'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100} 0.8189921193436038


In [154]:
etr = ExtraTreesRegressor(max_features='auto', min_samples_split=5)
etr.fit(x_train, y_train)
print('Train Score:',etr.score(x_train,y_train))
print('Test Score:',etr.score(x_test,y_test))

Train Score: 0.954357398294944
Test Score: 0.8169597049115184


In [205]:
y_pred = etr.predict(x_test)
print('MAE:',mean_absolute_error(y_pred,y_test))
print('R2_score',r2_score(y_test,y_pred))

MAE: 1168.805208157854
R2_score 0.8169597049115184


# AdaBoost Regressor

In [201]:
adb = AdaBoostRegressor()
adb.fit(x_train, y_train)
print('Train Score:',adb.score(x_train,y_train))
print('Test Score:',adb.score(x_test,y_test))

Train Score: 0.5236350154893887
Test Score: 0.470446722344


In [202]:
y_pred = adb.predict(x_test)
print('MAE:',mean_absolute_error(y_pred,y_test))
print('R2_score',r2_score(y_test,y_pred))

MAE: 2723.8665586101283
R2_score 0.470446722344


##### Tuning

In [73]:
# Define the parameter grid to search
param_grid = {
    'base_estimator': [DecisionTreeRegressor(max_features=15), 
                       DecisionTreeRegressor(max_depth=10)],
    'n_estimators': [200, 300],
    'learning_rate': [0.1, 0.3]
}

# Create an instance of the AdaBoostRegressor
ada_reg = AdaBoostRegressor()

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=ada_reg, param_grid=param_grid, cv=5, n_jobs=-1, verbose = 1)

# Fit the GridSearchCV object to the data
grid_search.fit(x_train, y_train)

# Print the best parameters and best score
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best parameters:  {'base_estimator': DecisionTreeRegressor(max_features=15), 'learning_rate': 0.3, 'n_estimators': 300}
Best score:  0.7939170031657092


In [203]:
adb = AdaBoostRegressor(base_estimator= DecisionTreeRegressor(max_depth=15), learning_rate= 0.3, n_estimators= 300)
adb.fit(x_train, y_train)
print('Train Score:',adb.score(x_train,y_train))
print('Test Score:',adb.score(x_test,y_test))

Train Score: 0.9438070312670521
Test Score: 0.8342911023961097


In [204]:
y_pred = adb.predict(x_test)
print('MAE:',mean_absolute_error(y_pred,y_test))
print('R2_score',r2_score(y_test,y_pred))

MAE: 1191.572356606128
R2_score 0.8342911023961097
