In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')


In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test_QoiMO9B.csv')
sub = pd.read_csv('sample_submission_hSlSoT6.csv')
meal = pd.read_csv('meal_info.csv')
fulfil = pd.read_csv('fulfilment_center_info.csv')

train.shape, test.shape

((456548, 9), (32573, 8))

In [3]:
meal.head()

Unnamed: 0,meal_id,category,cuisine
0,1885,Beverages,Thai
1,1993,Beverages,Thai
2,2539,Beverages,Thai
3,1248,Beverages,Indian
4,2631,Beverages,Indian


In [4]:
train.head(3)

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders
0,1379560,1,55,1885,136.83,152.29,0,0,177
1,1466964,1,55,1993,136.83,135.83,0,0,270
2,1346989,1,55,2539,134.86,135.86,0,0,189


In [5]:
fulfil.head(2)

Unnamed: 0,center_id,city_code,region_code,center_type,op_area
0,11,679,56,TYPE_A,3.7
1,13,590,56,TYPE_B,6.7


In [6]:
# join data --> meal and train then meal and test
train_meal = pd.merge(train, meal, how='left', on='meal_id')
test_meal = pd.merge(test, meal, how='left', on='meal_id')

print(train.shape, test.shape)

(456548, 9) (32573, 8)


In [7]:
train_mf = pd.merge(train_meal, fulfil, how='left', on='center_id')
test_mf = pd.merge(test_meal, fulfil, how='left', on='center_id')


In [8]:
print(train_mf.shape, test_mf.shape)

(456548, 15) (32573, 14)


In [9]:
train_mf.head()

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders,category,cuisine,city_code,region_code,center_type,op_area
0,1379560,1,55,1885,136.83,152.29,0,0,177,Beverages,Thai,647,56,TYPE_C,2.0
1,1466964,1,55,1993,136.83,135.83,0,0,270,Beverages,Thai,647,56,TYPE_C,2.0
2,1346989,1,55,2539,134.86,135.86,0,0,189,Beverages,Thai,647,56,TYPE_C,2.0
3,1338232,1,55,2139,339.5,437.53,0,0,54,Beverages,Indian,647,56,TYPE_C,2.0
4,1448490,1,55,2631,243.5,242.5,0,0,40,Beverages,Indian,647,56,TYPE_C,2.0


In [10]:
df = pd.concat([train_mf, test_mf], ignore_index=True, sort=False)
df.shape

(489121, 15)

In [11]:
df.nunique()

id                       489121
week                        155
center_id                    77
meal_id                      51
checkout_price             1995
base_price                 1931
emailer_for_promotion         2
homepage_featured             2
num_orders                 1250
category                     14
cuisine                       4
city_code                    51
region_code                   8
center_type                   3
op_area                      30
dtype: int64

${\textbf{Feature Engineering}}$

In [12]:
##################################### checkout_price ##############################################
##################################################################################################

df['week_checkout_price_mean'] = df.groupby('week')['checkout_price'].transform('mean')
df['week_checkout_price_std'] = df.groupby('week')['checkout_price'].transform('std')

df['center_id_checkout_price_mean'] = df.groupby('center_id')['checkout_price'].transform('mean')
df['center_id_checkout_price_std'] = df.groupby('center_id')['checkout_price'].transform('std')

df['meal_id_checkout_price_mean'] = df.groupby('meal_id')['checkout_price'].transform('mean')
df['meal_id_checkout_price_std'] = df.groupby('meal_id')['checkout_price'].transform('std')

df['category_checkout_price_mean'] = df.groupby('category')['checkout_price'].transform('mean')
df['category_checkout_price_std'] = df.groupby('category')['checkout_price'].transform('std')

df['cuisine_checkout_price_mean'] = df.groupby('cuisine')['checkout_price'].transform('mean')
df['cuisine_checkout_price_std'] = df.groupby('cuisine')['checkout_price'].transform('std')

df['city_code_checkout_price_mean'] = df.groupby('city_code')['checkout_price'].transform('mean')
df['city_code_checkout_price_std'] = df.groupby('city_code')['checkout_price'].transform('std')

df['region_code_checkout_price_mean'] = df.groupby('region_code')['checkout_price'].transform('mean')
df['region_code_checkout_price_std'] = df.groupby('region_code')['checkout_price'].transform('std')

df['center_type_checkout_price_mean'] = df.groupby('center_type')['checkout_price'].transform('mean')
df['center_type_checkout_price_std'] = df.groupby('center_type')['checkout_price'].transform('std')

df['op_area_checkout_price_mean'] = df.groupby('op_area')['checkout_price'].transform('mean')
df['op_area_checkout_price_std'] = df.groupby('op_area')['checkout_price'].transform('std')


##################################### base_price ##############################################
##################################################################################################

df['week_base_price_mean'] = df.groupby('week')['base_price'].transform('mean')
df['week_base_price_std'] = df.groupby('week')['base_price'].transform('std')

df['center_id_base_price_mean'] = df.groupby('center_id')['base_price'].transform('mean')
df['center_id_base_price_std'] = df.groupby('center_id')['base_price'].transform('std')

df['meal_id_base_price_mean'] = df.groupby('meal_id')['base_price'].transform('mean')
df['meal_id_base_price_std'] = df.groupby('meal_id')['base_price'].transform('std')

df['category_base_price_mean'] = df.groupby('category')['base_price'].transform('mean')
df['category_base_price_std'] = df.groupby('category')['base_price'].transform('std')

df['cuisine_base_price_mean'] = df.groupby('cuisine')['base_price'].transform('mean')
df['cuisine_base_price_std'] = df.groupby('cuisine')['base_price'].transform('std')

df['city_code_base_price_mean'] = df.groupby('city_code')['base_price'].transform('mean')
df['city_code_base_price_std'] = df.groupby('city_code')['base_price'].transform('std')

df['region_code_base_price_mean'] = df.groupby('region_code')['base_price'].transform('mean')
df['region_code_base_price_std'] = df.groupby('region_code')['base_price'].transform('std')

df['center_type_base_price_mean'] = df.groupby('center_type')['base_price'].transform('mean')
df['center_type_base_price_std'] = df.groupby('center_type')['base_price'].transform('std')

df['op_area_base_price_mean'] = df.groupby('op_area')['base_price'].transform('mean')
df['op_area_base_price_std'] = df.groupby('op_area')['base_price'].transform('std')



In [13]:
cat_feature_index = np.where(df.dtypes == object)[0]
cat_feature_index

array([ 9, 10, 13], dtype=int64)

In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# ct =ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [9,10,13])], remainder='passthrough')
cols_to_dum = ['category', 'cuisine', 'center_type']
df = pd.get_dummies(columns=cols_to_dum, data=df)
df.shape

(489121, 69)

In [16]:
col_2_drop = ['id', 'week', 'center_id', 'meal_id']
df = df.drop(columns=col_2_drop, axis=1)
df.shape

(489121, 65)

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
train_df = df[df['num_orders'].isna() == False]
test_df = df[df['num_orders'].isna() == True]
test_df.drop('num_orders', axis=1, inplace=True)


In [24]:
X = train_df.drop('num_orders', axis=1)
y = train_df['num_orders']


In [30]:
def rmsle(true, pred):
    from sklearn.metrics import mean_squared_error
    return np.sqrt(mean_squared_error(true, pred))

In [31]:
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold, StratifiedKFold
err = []
y_pred_totcb = []
kfold = KFold(n_splits=5, random_state=101)
for train_index, test_index in kfold.split(X,y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]    
    
    m1 = CatBoostRegressor(iterations=5000, learning_rate=0.001, eval_metric='RMSE', random_state=110)
    m1.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)],
           early_stopping_rounds=300, verbose=500)
    preds = m1.predict(X_test)
    print('err:', rmsle(y_test, preds))
    err.append(rmsle(y_test, preds))
    p2 = m1.predict(tes)
    y_pred_totcb.append(p2)
np.mean(err)

0:	learn: 382.3927807	test: 382.3927807	test1: 445.1405089	best: 445.1405089 (0)	total: 129ms	remaining: 10m 44s
500:	learn: 314.6526349	test: 314.6526349	test1: 381.3842680	best: 381.3842680 (500)	total: 1m 10s	remaining: 10m 30s
1000:	learn: 276.0918943	test: 276.0918943	test1: 341.3661099	best: 341.3661099 (1000)	total: 2m 19s	remaining: 9m 16s
1500:	learn: 252.7852070	test: 252.7852070	test1: 315.1890756	best: 315.1890756 (1500)	total: 3m 16s	remaining: 7m 38s
2000:	learn: 238.8248731	test: 238.8248731	test1: 300.3772519	best: 300.3772519 (2000)	total: 4m 12s	remaining: 6m 18s
2500:	learn: 229.3530159	test: 229.3530159	test1: 291.1094776	best: 291.1094776 (2500)	total: 5m 3s	remaining: 5m 3s
3000:	learn: 222.2559081	test: 222.2559081	test1: 283.9358854	best: 283.9358854 (3000)	total: 5m 54s	remaining: 3m 56s
3500:	learn: 217.2074234	test: 217.2074234	test1: 279.7593138	best: 279.7593138 (3500)	total: 6m 45s	remaining: 2m 53s
4000:	learn: 213.1658279	test: 213.1658279	test1: 276.804

227.3632012324374

In [33]:
predict1 = np.mean(y_pred_totcb, 0)

a = {'num_orders': predict1}

sub1 = pd.DataFrame(data=a)

In [34]:
sub['num_orders'] = sub1
sub.to_csv('sub1.csv', index=False)

In [35]:
sub.head()

Unnamed: 0,id,num_orders
0,1028232,207.283272
1,1127204,207.283272
2,1212707,207.283272
3,1082698,203.665146
4,1400926,203.665146
