In [1]:
import pandas as pd
import numpy as np
import datetime as dt


In [2]:
df_full = pd.read_excel('Data_Train.xlsx')

In [3]:
df_full['Route'].fillna('None', inplace = True)
df_full['Total_Stops'].fillna('0 stop', inplace = True)


In [4]:
df_full['Additional_Info'].value_counts()

# Here 'NO Info ' is spelled wrongly

No info                         8345
In-flight meal not included     1982
No check-in baggage included     320
1 Long layover                    19
Change airports                    7
Business class                     4
No Info                            3
1 Short layover                    1
2 Long layover                     1
Red-eye flight                     1
Name: Additional_Info, dtype: int64

In [5]:
df_full["Additional_Info"] = df_full["Additional_Info"].replace({'No Info': 'No info'})

In [6]:
df_full['Additional_Info'].value_counts()

No info                         8348
In-flight meal not included     1982
No check-in baggage included     320
1 Long layover                    19
Change airports                    7
Business class                     4
1 Short layover                    1
2 Long layover                     1
Red-eye flight                     1
Name: Additional_Info, dtype: int64

In [7]:
df_full['Total_Stops'].value_counts()

1 stop      5625
non-stop    3491
2 stops     1520
3 stops       45
4 stops        1
0 stop         1
Name: Total_Stops, dtype: int64

In [8]:

df_full["isWeekend"] = ((pd.to_datetime(df_full["Date_of_Journey"], format = '%d/%m/%Y').dt.dayofweek) // 5 == 1).astype(int)
df_full["Day_of_Week"] = pd.to_datetime(df_full["Date_of_Journey"], format = '%d/%m/%Y').dt.day_name()
df_full["Day_Of_Journey"] = pd.to_datetime(df_full["Date_of_Journey"], format = '%d/%m/%Y').dt.day
df_full["Month_of_Journey"] = pd.to_datetime(df_full["Date_of_Journey"], format = '%d/%m/%Y').dt.month

df_full.drop('Date_of_Journey', axis = 1, inplace = True)

In [9]:
# converting the 'Dep_time' to datetime and splitting into columns
df_full['Depart_Time_Hour'] = pd.to_datetime(df_full.Dep_Time).dt.hour
df_full['Depart_Time_Minutes'] = pd.to_datetime(df_full.Dep_Time).dt.minute
df_full.drop('Dep_Time',axis =1,inplace = True)


In [10]:
# converting the 'Arrival_time' to datetime and splitting into columns
df_full['Arrival_Time_Hour'] = pd.to_datetime(df_full.Arrival_Time).dt.hour
df_full['Arrival_Time_Minutes'] = pd.to_datetime(df_full.Arrival_Time).dt.minute
df_full.drop("Arrival_Time", axis=1, inplace = True)

In [11]:
# Converting the Duration into minutes
df_full['Duration']=  df_full['Duration'].str.replace("h", '*60').str.replace(' ','+').str.replace('m','*1').apply(eval)

df_full['Price'] = df_full['Price'].fillna('0')

In [12]:
df_full.Total_Stops.replace(['non-stop', '0 stop','1 stop', '2 stops', '3 stops', '4 stops'],[0,1,2,3,4,5], inplace = True)

In [13]:

mid = df_full['Price']
df_full.drop(labels=['Price'], axis=1, inplace = True)
df_full.insert(15, 'Price', mid)
df_full.head()

Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,isWeekend,Day_of_Week,Day_Of_Journey,Month_of_Journey,Depart_Time_Hour,Depart_Time_Minutes,Arrival_Time_Hour,Arrival_Time_Minutes,Price
0,IndiGo,Banglore,New Delhi,BLR → DEL,170,0,No info,1,Sunday,24,3,22,20,1,10,3897
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,445,3,No info,0,Wednesday,1,5,5,50,13,15,7662
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,1140,3,No info,1,Sunday,9,6,9,25,4,25,13882
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,325,2,No info,1,Sunday,12,5,18,5,23,30,6218
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,285,2,No info,0,Friday,1,3,16,50,21,35,13302


In [14]:
# Encoding the Total stops column

# Import label encoder 
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()


df_full['Route'] = label_encoder.fit_transform(df_full['Route'])
df_full['Additional_Info'] = label_encoder.fit_transform(df_full['Additional_Info'])
df_full['Day_of_Week'] = label_encoder.fit_transform(df_full['Day_of_Week'])


from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers = [('encoder', OneHotEncoder(), [0,1,2])],remainder = 'passthrough')
df_full = ct.fit_transform(df_full)


X = df_full[:, 0:35]
y = df_full[:, -1:]


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)



In [15]:
X_train


array([[ 0.,  0.,  0., ...,  0., 21.,  0.],
       [ 0.,  0.,  0., ..., 20., 10., 35.],
       [ 0.,  0.,  0., ..., 55., 19., 50.],
       ...,
       [ 0.,  0.,  0., ..., 50.,  1., 30.],
       [ 0.,  0.,  0., ..., 40.,  3., 25.],
       [ 0.,  0.,  0., ...,  0.,  4., 25.]])

In [16]:
from sklearn.preprocessing import MinMaxScaler
min_max = MinMaxScaler(feature_range = (0,1))
X_train[:, 23:] = min_max.fit_transform(X_train[:, 23:])
X_test[:, 23:] = min_max.transform(X_test[:, 23:])


In [17]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [18]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import RandomizedSearchCV

rf1 = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf1, param_distributions = random_grid, n_iter = 100, 
                               cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 19.4min finished
  self.best_estimator_.fit(X, y, **fit_params)


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                              

In [19]:
rf_random.best_params_



{'n_estimators': 800,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 100,
 'bootstrap': True}

In [20]:
rf_random.best_score_

0.8816287956547392

In [35]:

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
y_pred1 = rf.predict(X_test)
print('Random forest r2_score',r2_score(y_test, y_pred1))
print('mean_square_error', mean_squared_error(y_test, y_pred1))

  


Random forest r2_score 0.8830100798758679
mean_square_error 2389796.962753655


In [36]:
from sklearn.ensemble import ExtraTreesRegressor
rf2 = ExtraTreesRegressor()
rf2.fit(X_train, y_train)
y_pred = rf2.predict(X_test)

  This is separate from the ipykernel package so we can avoid doing imports until


In [37]:
print(r2_score(y_test, y_pred))
print('mean_square_error', mean_squared_error(y_test, y_pred))

0.9155357415155391
mean_square_error 1725383.0772192073


In [38]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
estimator = ExtraTreesRegressor()
cv = ShuffleSplit(n_splits=5, random_state=42, test_size=0.3)
estimator = ExtraTreesRegressor(random_state=42, n_estimators=900, max_depth=18)
# Create the random grid
param_grid = { 
'min_samples_split':range(10,100,5)
}
n_jobs=8
regressor = GridSearchCV(estimator=estimator, cv=cv, param_grid=param_grid, n_jobs=n_jobs)
regressor.fit(X_train, y_train)
y_pred = regressor.best_estimator_.predict(X_test)

regressor.best_score_

  self.best_estimator_.fit(X, y, **fit_params)


0.8952624979325237

In [39]:
regressor.best_score_


0.8952624979325237

In [40]:
import pickle 
  
# Save the trained model as a pickle string. 
saved_model = pickle.dumps(rf2) 
  
# Load the pickled model 
extra_from_pickle = pickle.loads(saved_model) 
  
# Use the loaded pickled model to make predictions 
predicted=extra_from_pickle.predict(X_test)


In [41]:
predicted

array([10851.27,  5113.1 , 14777.07, ...,  3724.54,  5832.59, 14031.44])

In [42]:
import pandas as pd 
pd.DataFrame(predicted).to_csv("pred.csv")

In [43]:
# i tried linear regression, xg boost and gradient boost hyper parameter tuning but none of them crosse above 90% accuracy
# only extra tree regrssor acheived 92%