In [0]:
pip install tensorflow==2.0.0



In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import MaxAbsScaler
from tensorflow.keras.regularizers import l2

In [0]:
tf.__version__

'2.0.0'

In [0]:
root_path = '/content/drive/My Drive/Fast Food Hackathon/'

In [0]:
import pandas as pd
import numpy as np

In [0]:
cd /content/drive/My Drive/Fast Food Hackathon/

/content/drive/My Drive/Fast Food Hackathon


In [0]:
df = pd.read_csv('csvTrain.csv')
test = pd.read_csv('csvTest.csv')

In [0]:
print(df.describe())
print(test.describe())

       Restaurant  ... Delivery_Time
count       11094  ...         11094
unique       7480  ...             7
top       ID_7184  ...    30 minutes
freq           22  ...          7406

[4 rows x 9 columns]
       Restaurant                   Location      Cuisines  ... Rating Votes Reviews
count        2774                       2774          2774  ...   2774  2774    2774
unique       2401                         35           881  ...     30   580     392
top       ID_1209  D-Block, Sector 63, Noida  North Indian  ...      -     -       -
freq            8                        221           226  ...    305   542     593

[4 rows x 8 columns]


#Step 1: Exploring the data

In [0]:
# To count unique features                        ---not necessary for the test set
def count_unique_features(column_frame):
  value_row = list(column_frame) 
  all_values = list()
  # For each row split the string by ',' and iterate through array
  for c in value_row:
    c_list = c.split(', ')
    for _c in c_list:
      if _c not in all_values:
        all_values.append(_c)
  print("Number of unique features in " + column_frame.name + " is " + str(len(all_values)))
  return all_values

In [0]:
print("In the training set")
# For the dataframe
locations = count_unique_features(df.Location)
cuisines = count_unique_features(df.Cuisines)

print("\nIn the test set")
# For the test set
locations_test = count_unique_features(test.Location)
cuisines_test = count_unique_features(test.Cuisines)

In the training set
Number of unique features in Location is 63
Number of unique features in Cuisines is 101

In the test set
Number of unique features in Location is 63
Number of unique features in Cuisines is 86


In [0]:
# Cleaning the cost and converting it to float
def clean_costs(column_values):                   
  val_arr = list()
  # Remove unnecessary symbols
  for _ind, _val in enumerate(column_values):
    _val = _val.replace('₹', '')
    _val = _val.replace(',', '')
    try:
      _val = float(_val)
    except:
      _val = np.nan
    val_arr.append(_val)
  # Obtain median from array
  col_med = np.nanmedian(val_arr)
  print("Median cost is ", col_med)
  # Obtain indices with nan values
  inds = np.where(np.isnan(val_arr))
  # If there are nan values then replace with their median
  if len(inds[0]) > 0:
    for _index in inds:
      val_arr[_index[0]] = col_med
  
  return pd.Series(val_arr)

In [0]:
print("In the training set")
# On the dataframe
df.Average_Cost = clean_costs(df.Average_Cost.values)
df.Minimum_Order = clean_costs(df.Minimum_Order.values)

print("\nIn the test set")
# On the test set
test.Average_Cost = clean_costs(test.Average_Cost.values)
test.Minimum_Order = clean_costs(test.Minimum_Order.values)

In the training set
Median cost is  200.0
Median cost is  50.0

In the test set
Median cost is  200.0
Median cost is  50.0


In [0]:
print("Average cost of training data")
print("Maximum average cost\n", df.Average_Cost.max())
print("Minimum average cost\n", df.Average_Cost.min())

print("\nAverage cost of test data")
print("Maximum average cost\n", test.Average_Cost.max())
print("Minimum average cost\n", test.Average_Cost.min())

Average cost of training data
Maximum average cost
 2050.0
Minimum average cost
 50.0

Average cost of test data
Maximum average cost
 1200.0
Minimum average cost
 50.0


In [0]:
def clean_ratings(column_values):         
  val_arr = list()
  # Remove unknown values with 0
  for _ind, _val in enumerate(column_values):
    try:
      _val = float(_val)
    except:
      _val = np.nan
    val_arr.append(_val)
  val_arr = np.nan_to_num(val_arr)
  
  return pd.Series(val_arr)

In [0]:
print("For ratings of training data")
df.Rating = clean_ratings(df.Rating.values)
print("Maximum rating\n", df.Rating.max())
print("Minimum rating\n", df.Rating.min())

print("\nFor ratings of test data")
test.Rating = clean_ratings(test.Rating.values)
print("Maximum rating\n", test.Rating.max())
print("Minimum rating\n", test.Rating.min())

For ratings of training data
Maximum rating
 4.9
Minimum rating
 0.0

For ratings of test data
Maximum rating
 4.8
Minimum rating
 0.0


In [0]:
def clean_votes_and_reviews(column_values):       
  val_arr = list()
  # Replace unknown values with 0
  for _ind, _val in enumerate(column_values):
    try:
      _val = float(_val)
    except:
      _val = np.nan
    val_arr.append(_val)
  val_arr = np.nan_to_num(val_arr)
  
  return pd.Series(val_arr)

df.Votes = clean_votes_and_reviews(df.Votes.values)
df.Reviews = clean_votes_and_reviews(df.Reviews.values)
test.Votes = clean_votes_and_reviews(test.Votes.values)
test.Reviews = clean_votes_and_reviews(test.Reviews.values)

In [0]:
print("Training data")
print("For votes")
print("Maximum number of votes\n", df.Votes.max())
print("Minimum number of votes\n", df.Votes.min())

print("\nFor reviews")
print("Maximum number of reviews\n", df.Reviews.max())
print("Minimum number of reviews\n", df.Reviews.min())

Training data
For votes
Maximum number of votes
 9054.0
Minimum number of votes
 0.0

For reviews
Maximum number of reviews
 6504.0
Minimum number of reviews
 0.0


In [0]:
print("Test data")
print("\nFor votes")
print("Maximum number of votes\n", test.Votes.max())
print("Minimum number of votes\n", test.Votes.min())

print("\nFor reviews")
print("Maximum number of reviews\n", test.Reviews.max())
print("Minimum number of reviews\n", test.Reviews.min())

Test data

For votes
Maximum number of votes
 7811.0
Minimum number of votes
 0.0

For reviews
Maximum number of reviews
 3863.0
Minimum number of reviews
 0.0


In [0]:
def clean_time(column_values):
  val_arr = list()
  # Remove minutes
  for _ind, _val in enumerate(column_values):
    _val = _val.replace(' minutes', '')
    try:
      _val = int(_val)
    except:
      _val = np.nan
    val_arr.append(_val)
  val_arr = np.nan_to_num(val_arr)
  
  return pd.Series(val_arr)

df.Delivery_Time = clean_time(df.Delivery_Time.values)
print("For time\n")
print("Maximum time taken\n", df.Delivery_Time.max())
print("Minimum time taken\n", df.Delivery_Time.min())

For time

Maximum time taken
 120
Minimum time taken
 10


In [0]:
print(df.head())
print(test.head())

  Restaurant                             Location  ... Reviews  Delivery_Time
0    ID_6321  FTI College, Law College Road, Pune  ...     4.0             30
1    ID_2882                 Sector 3, Marathalli  ...     4.0             30
2    ID_1595                       Mumbai Central  ...    30.0             65
3    ID_5929                      Sector 1, Noida  ...    95.0             30
4    ID_6123   Rmz Centennial, I Gate, Whitefield  ...   235.0             65

[5 rows x 9 columns]
  Restaurant                                    Location  ...  Votes  Reviews
0    ID_2842  Mico Layout, Stage 2, BTM Layout,Bangalore  ...  361.0    225.0
1     ID_730  Mico Layout, Stage 2, BTM Layout,Bangalore  ...    0.0      0.0
2    ID_4620                             Sector 1, Noida  ...   36.0     16.0
3    ID_5470                  Babarpur, New Delhi, Delhi  ...   66.0     33.0
4    ID_3249                             Sector 1, Noida  ...   38.0     14.0

[5 rows x 8 columns]


# Step 2: Preprocessing

In [0]:
# Encoding the cuisines
from sklearn.preprocessing import MultiLabelBinarizer
mb_cuisines = MultiLabelBinarizer(classes=cuisines)

c_values = [_c.split(', ') for _c in df.Cuisines.values]
y_df = mb_cuisines.fit_transform(c_values)
y_df = pd.DataFrame(y_df)

new_df = pd.concat([y_df, df], axis=1)
new_df = new_df.drop(['Cuisines', 'Restaurant'], axis=1, index=None)
print(new_df.head())

   0  1  2  3  4  ...  Minimum_Order  Rating  Votes  Reviews  Delivery_Time
0  1  1  1  1  1  ...           50.0     3.5   12.0      4.0             30
1  0  0  0  0  0  ...           50.0     3.5   11.0      4.0             30
2  1  0  0  0  0  ...           50.0     3.6   99.0     30.0             65
3  0  0  0  0  0  ...           99.0     3.7  176.0     95.0             30
4  0  0  0  0  0  ...           99.0     3.2  521.0    235.0             65

[5 rows x 108 columns]


In [0]:
# Encoding the locations
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()

new_df.Location = lb.fit_transform(new_df.Location)
final_df = new_df

In [0]:
# Operations on test set
test = test.drop(['Restaurant'], axis = 1, index = None)
test.head()

Unnamed: 0,Location,Cuisines,Average_Cost,Minimum_Order,Rating,Votes,Reviews
0,"Mico Layout, Stage 2, BTM Layout,Bangalore","North Indian, Chinese, Assamese",350.0,50.0,4.2,361.0,225.0
1,"Mico Layout, Stage 2, BTM Layout,Bangalore","Biryani, Kebab",100.0,50.0,0.0,0.0,0.0
2,"Sector 1, Noida",Fast Food,100.0,50.0,3.6,36.0,16.0
3,"Babarpur, New Delhi, Delhi","Mithai, North Indian, Chinese, Fast Food, Sout...",200.0,50.0,3.6,66.0,33.0
4,"Sector 1, Noida","Chinese, Fast Food",150.0,50.0,2.9,38.0,14.0


In [0]:
c_values = [_c.split(', ') for _c in test.Cuisines.values]
y_df = mb_cuisines.fit_transform(c_values)
y_df = pd.DataFrame(y_df)

new_df_test = pd.concat([y_df, test], axis=1)
new_df_test.head()
new_df_test = new_df_test.drop(['Cuisines'], axis=1, index=None)

In [0]:
new_df_test.Location = lb.transform(new_df_test.Location)
final_df_test = new_df_test
final_df_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,Location,Average_Cost,Minimum_Order,Rating,Votes,Reviews
0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,17,350.0,50.0,4.2,361.0,225.0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,17,100.0,50.0,0.0,0.0,0.0
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,28,100.0,50.0,3.6,36.0,16.0
3,1,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,200.0,50.0,3.6,66.0,33.0
4,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,28,150.0,50.0,2.9,38.0,14.0


In [0]:
print("The classes recognized by the Label Encoder")
print(lb.classes_)

The classes recognized by the Label Encoder
['BTM Layout 1, Electronic City' 'Babarpur, New Delhi, Delhi'
 'Chandni Chowk, Kolkata' 'Chatta Bazaar, Malakpet, Hyderabad'
 'D-Block, Sector 63, Noida' 'Delhi Administration Flats, Timarpur'
 'Delhi Cantt.' 'Delhi High Court, India Gate'
 'Delhi University-GTB Nagar' 'Dockyard Road, Mumbai CST Area'
 'FTI College, Law College Road, Pune'
 'Gora Bazar, Rajbari, North Dumdum, Kolkata'
 'Hyderabad Public School, Begumpet' 'Jaya Nagar, Saidabad, Hyderabad'
 'Laxman Vihar Industrial Area, Sector 3A, Gurgoan' 'MG Road, Pune'
 'Majestic' 'Mico Layout, Stage 2, BTM Layout,Bangalore'
 'Moulali, Kolkata' 'Mumbai Central' 'Musi Nagar, Malakpet, Hyderabad'
 'Nathan Road, Mangaldas Road, Pune'
 'Noorkhan Bazaar, Malakpet, Hyderabad'
 'Panjetan Colony, Malakpet, Hyderabad' 'Pune University'
 'Raja Bazar, Kolkata' 'Rmz Centennial, I Gate, Whitefield'
 'Sandhurst Road, Mumbai CST Area' 'Sector 1, Noida' 'Sector 14, Noida'
 'Sector 3, Marathalli' 'Sector 63

In [0]:
np.random.seed(0)
final_df = final_df.sample(frac=1)

In [0]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
final_df.Average_Cost = scaler.fit_transform(final_df.Average_Cost.values.reshape(-1, 1))
final_df.Minimum_Order = scaler.fit_transform(final_df.Minimum_Order.values.reshape(-1, 1))
final_df.Rating = scaler.fit_transform(final_df.Rating.values.reshape(-1, 1))
final_df.Votes = scaler.fit_transform(final_df.Votes.values.reshape(-1, 1))
final_df.Reviews = scaler.fit_transform(final_df.Reviews.values.reshape(-1, 1))

In [0]:
print(final_df.Delivery_Time.value_counts())

30     7406
45     2665
65      923
120      62
20       20
80       14
10        4
Name: Delivery_Time, dtype: int64


In [0]:
print(final_df_test.head())

   0  1  2  3  4  ...  Average_Cost  Minimum_Order  Rating  Votes  Reviews
0  0  0  0  0  0  ...         350.0           50.0     4.2  361.0    225.0
1  0  0  0  0  0  ...         100.0           50.0     0.0    0.0      0.0
2  1  0  0  0  0  ...         100.0           50.0     3.6   36.0     16.0
3  1  0  0  0  0  ...         200.0           50.0     3.6   66.0     33.0
4  1  0  0  0  0  ...         150.0           50.0     2.9   38.0     14.0

[5 rows x 107 columns]


In [0]:
X = final_df.iloc[:, :-1]
y = final_df.iloc[:, -1]

In [0]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,Location,Average_Cost,Minimum_Order,Rating,Votes,Reviews
9131,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,28,0.075,0.1,0.816327,0.037663,0.038130
1035,1,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0.125,0.1,0.734694,0.032472,0.008918
6291,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,0.025,0.1,0.857143,0.083720,0.044588
7425,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,31,0.025,0.1,0.000000,0.000000,0.000000
5783,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,28,0.050,0.1,0.755102,0.002540,0.001538
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4859,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,29,0.025,0.1,0.816327,0.012702,0.003383
3264,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,0.050,0.1,0.000000,0.000000,0.000000
9845,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0.075,0.1,0.000000,0.000000,0.000000
10799,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,27,0.075,0.1,0.693878,0.007400,0.005689


In [0]:
print(X.head())
print(y.head())

      0  1  2  3  4  ...  Average_Cost  Minimum_Order    Rating     Votes   Reviews
9131  0  0  0  0  0  ...         0.075            0.1  0.816327  0.037663  0.038130
1035  1  0  0  0  0  ...         0.125            0.1  0.734694  0.032472  0.008918
6291  1  0  0  0  0  ...         0.025            0.1  0.857143  0.083720  0.044588
7425  0  0  0  0  0  ...         0.025            0.1  0.000000  0.000000  0.000000
5783  0  1  0  0  0  ...         0.050            0.1  0.755102  0.002540  0.001538

[5 rows x 107 columns]
9131    30
1035    45
6291    45
7425    30
5783    30
Name: Delivery_Time, dtype: int64


In [0]:
X_test = final_df_test

In [0]:
print(X.info())
print(X_test.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11094 entries, 9131 to 2732
Columns: 107 entries, 0 to Reviews
dtypes: float64(5), int64(102)
memory usage: 9.1 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2774 entries, 0 to 2773
Columns: 107 entries, 0 to Reviews
dtypes: float64(5), int64(102)
memory usage: 2.3 MB
None


In [0]:
#encode the y values
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
encoder = LabelEncoder()
y = encoder.fit_transform(y)
print(np.unique(y))
#hotencoding using keras
y = np_utils.to_categorical(y)

[0 1 2 3 4 5 6]


In [0]:
print(X.shape, y.shape, X_test.shape)

(11094, 107) (11094, 7) (2774, 107)


In [0]:
classes = np.unique([np.argmax(x) for x in y])
classes

array([0, 1, 2, 3, 4, 5, 6])

In [0]:
np.unique([np.argmax(x) for x in y], return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6]),
 array([   4,   20, 7406, 2665,  923,   14,   62]))

In [0]:
from sklearn.utils.class_weight import compute_class_weight
c_weights = compute_class_weight('balanced', classes, [np.argmax(x) for x in y])
c_dict = dict()
for i, val in enumerate(c_weights):
  c_dict[i] = val
c_dict

{0: 396.2142857142857,
 1: 79.24285714285715,
 2: 0.21399637359669765,
 3: 0.5946931117662825,
 4: 1.7170716607336325,
 5: 113.20408163265306,
 6: 25.56221198156682}

In [0]:
bias_weights = tf.keras.initializers.Constant(c_weights)
bias_weights.value

array([3.96214286e+02, 7.92428571e+01, 2.13996374e-01, 5.94693112e-01,
       1.71707166e+00, 1.13204082e+02, 2.55622120e+01])

In [0]:
X.shape[1]

107

Classification using XGBClassifier

In [0]:
from xgboost import XGBClassifier
xgb = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softmax', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1, num_class=y_train.unique())
xgb.fit(X_train, y_train, sample_weight=sample_weights)
xgb_preds = predict_values(xgb, y_test)
np.unique(xgb_preds, return_counts=True)

In [0]:
result_preds = pd.DataFrame(xgb_preds, columns=['Delivery_Time'], index=None)
result_preds['Delivery_Time'] = result_preds['Delivery_Time'].astype(str) + ' minutes'
result_preds.to_excel('xgb_pred.xlsx')

Classification using RandomForestClassifier


In [0]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=250, criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1, 
                             min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, 
                             min_impurity_split=None, bootstrap=True, oob_score=False, 
                             n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=c_w_dict)
rfc.fit(train_X, train_y)
rfc_preds = predict_values(rfc, y_test)
np.unique(rfc_preds, return_counts=True)

In [0]:
result_preds = pd.DataFrame(rfc_preds, columns=['Delivery_Time'], index=None)
result_preds['Delivery_Time'] = result_preds['Delivery_Time'].astype(str) + ' minutes'
result_preds.to_excel('rfc.xlsx')