In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from pprint import pprint

In [0]:
cen = pd.read_csv('fulfilment_center_info.csv')
mel = pd.read_csv('meal_info.csv')
main = pd.read_csv('train.csv')

inner1 =  pd.merge(left=main ,right=cen, left_on='center_id', right_on='center_id')
mark0 = pd.merge(left=inner1 ,right=mel, left_on='meal_id', right_on='meal_id')
mark0.shape

(456548, 15)

In [0]:
mark1 = mark0

mark1.drop(['id','center_id','meal_id','base_price','city_code','region_code','week'], axis=1,inplace=True)

mark1['emailer_for_promotion'] = mark1['emailer_for_promotion'].replace([0,1],['email_no','email_yes'])
mark1['homepage_featured'] = mark1['homepage_featured'].replace([0,1],['home_no','home_yes'])

hot = mark1[['emailer_for_promotion','homepage_featured','center_type','category','cuisine']]
dum = pd.get_dummies(hot,drop_first=True)

mark1.drop(['emailer_for_promotion','homepage_featured','center_type','category','cuisine'],axis=1,inplace=True)

mark1 = pd.concat([mark1, dum], axis=1)
mark1.head(1)

Unnamed: 0,checkout_price,num_orders,op_area,emailer_for_promotion_email_yes,homepage_featured_home_yes,center_type_TYPE_B,center_type_TYPE_C,category_Biryani,category_Desert,category_Extras,category_Fish,category_Other Snacks,category_Pasta,category_Pizza,category_Rice Bowl,category_Salad,category_Sandwich,category_Seafood,category_Soup,category_Starters,cuisine_Indian,cuisine_Italian,cuisine_Thai
0,136.83,177,2.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [0]:
foxtrot = mark1

X_train, X_test, y_train, y_test = train_test_split(foxtrot.drop('num_orders',axis=1), 
                                                    foxtrot['num_orders'], test_size=0.30, 
                                                    random_state=101)

In [0]:
rf = RandomForestRegressor(random_state = 42)
from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf.get_params())

Parameters currently in use:

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 'warn',
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [0]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 500, num = 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 70, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 16, 22, 28, 34, 40, 46, 52, 58, 64, 70, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [100, 200, 300, 400, 500]}


In [0]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


In [0]:
rf_random.best_params_

In [0]:
# Predictions
predictions = rf_random.predict(X_test)
# Residual Plot
plt.figure(figsize=(6,6))
plt.scatter(y_test,predictions)
plt.title('Residual Plot')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.show()
print("\n")
# Residual Histogram
plt.figure(figsize=(6,6))
sns.distplot((y_test-predictions),bins=25)
plt.title('Residual Histogram')
plt.show()
print("\n")
# Error Values
print('Regression Evaluation Metrices')
print('Train Score:', rf_random.score(X_train,y_train))
print('Test Score:', rf_random.score(X_test,y_test))
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))