In [1]:
import numpy as np
import pandas as pd
import math

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder, scale

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
train  = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
submit = pd.read_csv('Sample_submission.csv')

In [4]:
train['Supermarket _Size'] = train['Supermarket _Size'].fillna('High')

In [5]:
test['Supermarket _Size'] = test['Supermarket _Size'].fillna('High')

In [6]:
a = train['Product_Weight'].mean()

train['Product_Weight'] = train['Product_Weight'].fillna(a)

In [7]:
b = train['Product_Weight'].mean()

test['Product_Weight'] = test['Product_Weight'].fillna(b)

In [8]:
train.drop(['Product_Identifier', 'Supermarket_Identifier', 'Product_Supermarket_Identifier',], axis=1, inplace=True)
test.drop(['Product_Identifier', 'Supermarket_Identifier', 'Product_Supermarket_Identifier'], axis=1, inplace=True)

In [9]:
train['PPW'] = train['Product_Weight'] * train['Product_Price']
test['PPW'] = test['Product_Weight'] * test['Product_Price']

In [10]:
train['APP'] = train['Average_Price_per_ProductType'] / train['Product_Price']
test['APP'] = test['Average_Price_per_ProductType'] / test['Product_Price']

In [11]:
train['PSV'] = train['Product_Shelf_Visibility'] * 100
test['PSV'] = test['Product_Shelf_Visibility'] * 100

In [12]:
def ordinal_encode(e_train, e_test):
    ord_enc = OrdinalEncoder()
    ord_enc.fit(e_train)
    train_encode = ord_enc.transform(e_train)
    test_encode = ord_enc.transform(e_test)
    return train_encode, test_encode

In [13]:
col_encode = ['Product_Fat_Content', 'Product_Type', 'Supermarket _Size', 'Supermarket_Location_Type', 'Supermarket_Type']

In [14]:
train_cate = train[col_encode]

In [15]:
test_cate = test[col_encode]

In [16]:
train_cate_encode, test_cate_encode = ordinal_encode(train_cate, test_cate)

In [17]:
train[col_encode] = pd.DataFrame(train_cate_encode)
test[col_encode] = pd.DataFrame(test_cate_encode)

In [18]:
train.head()

Unnamed: 0,Product_Weight,Product_Fat_Content,Product_Shelf_Visibility,Product_Type,Product_Price,Supermarket_Opening_Year,Supermarket _Size,Supermarket_Location_Type,Supermarket_Type,Average_Price_per_ProductType,Product_Supermarket_Sales,PPW,APP,PSV
0,10.5,0.0,0.027276,9.0,428.28,2004,2.0,0.0,1.0,399.999418,8983.31,4496.94,0.933967,2.727641
1,10.0,0.0,0.045928,5.0,345.3,2006,1.0,0.0,1.0,388.071094,4893.63,3453.0,1.123866,4.592823
2,8.235,0.0,0.014489,8.0,459.49,2009,0.0,1.0,1.0,367.143029,7323.8,3783.90015,0.799023,1.4489
3,18.85,0.0,0.016108,8.0,324.41,2004,2.0,0.0,1.0,367.143029,7541.85,6115.1285,1.131725,1.610755
4,12.6,1.0,0.07407,4.0,635.59,2009,0.0,1.0,1.0,409.504343,11445.1,8008.434,0.64429,7.407035


In [19]:
test.head()

Unnamed: 0,Product_Weight,Product_Fat_Content,Product_Shelf_Visibility,Product_Type,Product_Price,Supermarket_Opening_Year,Supermarket _Size,Supermarket_Location_Type,Supermarket_Type,Average_Price_per_ProductType,PPW,APP,PSV
0,20.2,1.0,0.121153,13.0,243.44,1994,0.0,2.0,1.0,395.35219,4917.488,1.624023,12.115333
1,17.6,0.0,0.049268,13.0,282.54,2016,1.0,2.0,2.0,395.35219,4972.704,1.399279,4.926776
2,17.1,0.0,0.032761,5.0,372.1,2016,1.0,2.0,2.0,388.071094,6362.91,1.042922,3.276102
3,17.75,0.0,0.0,11.0,271.73,2016,1.0,2.0,2.0,388.6077,4823.2075,1.430124,0.0
4,19.6,0.0,0.023951,9.0,264.24,2004,2.0,0.0,1.0,399.999418,5179.104,1.513773,2.395115


In [16]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2994 entries, 0 to 2993
Data columns (total 14 columns):
Product_Weight                   2994 non-null float64
Product_Fat_Content              2994 non-null object
Product_Shelf_Visibility         2994 non-null float64
Product_Type                     2994 non-null object
Product_Price                    2994 non-null float64
Supermarket_Opening_Year         2994 non-null int64
Supermarket _Size                2994 non-null object
Supermarket_Location_Type        2994 non-null object
Supermarket_Type                 2994 non-null object
Average_Price_per_ProductType    2994 non-null float64
Product_Supermarket_Sales        2994 non-null float64
PPW                              2994 non-null float64
APP                              2994 non-null float64
PSV                              2994 non-null float64
dtypes: float64(8), int64(1), object(5)
memory usage: 327.5+ KB


In [17]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1996 entries, 0 to 1995
Data columns (total 13 columns):
Product_Weight                   1996 non-null float64
Product_Fat_Content              1996 non-null object
Product_Shelf_Visibility         1996 non-null float64
Product_Type                     1996 non-null object
Product_Price                    1996 non-null float64
Supermarket_Opening_Year         1996 non-null int64
Supermarket _Size                1996 non-null object
Supermarket_Location_Type        1996 non-null object
Supermarket_Type                 1996 non-null object
Average_Price_per_ProductType    1996 non-null float64
PPW                              1996 non-null float64
APP                              1996 non-null float64
PSV                              1996 non-null float64
dtypes: float64(7), int64(1), object(5)
memory usage: 202.8+ KB


In [20]:
X = train.drop('Product_Supermarket_Sales', axis=1)
y = train['Product_Supermarket_Sales']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=101)

In [22]:
from sklearn.ensemble import RandomForestRegressor

In [23]:
rfc_model = RandomForestRegressor(n_estimators=1000, max_depth=None, min_samples_split=10, max_leaf_nodes=50)

In [24]:
rfc_model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=50,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=10,
                      min_weight_fraction_leaf=0.0, n_estimators=1000,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [25]:
rfc_pred = rfc_model.predict(X_test)

In [26]:
rfc_score = mean_squared_error(y_test, rfc_pred)

In [27]:
math.sqrt(rfc_score)

3111.879557932649

In [28]:
rfc_final = rfc_model.predict(test)

In [29]:
submit.head()

Unnamed: 0,Product_Supermarket_Identifier,Product_Supermarket_Sales
0,FDY57_CHUKWUDI013,0
1,FDY10_CHUKWUDI018,0
2,FDO40_CHUKWUDI018,0
3,NCQ43_CHUKWUDI018,0
4,NCC07_CHUKWUDI046,0


In [30]:
submit['Product_Supermarket_Sales'] = rfc_final

In [31]:
submit.head()

Unnamed: 0,Product_Supermarket_Identifier,Product_Supermarket_Sales
0,FDY57_CHUKWUDI013,3854.703533
1,FDY10_CHUKWUDI018,4523.149135
2,FDO40_CHUKWUDI018,5930.809756
3,NCQ43_CHUKWUDI018,4085.107423
4,NCC07_CHUKWUDI046,4022.213649


In [32]:
submit.to_csv('rfc_6-7.csv', index=None)

In [79]:
from sklearn.svm import SVR

In [80]:
svr_model = SVR(kernel='poly', degree=3, gamma='auto')

In [None]:
svr_model.fit(X_train, y_train)

In [None]:
svr_pred = svr_model.predict(X_test)

In [None]:
svr_score = mean_squared_error(y_test, svr_pred)

In [None]:
math.sqrt(svr_score)

In [71]:
parameters = {'depth': [6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 'learning_rate': [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1], 'iterations': [30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]}

In [72]:
rand_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=parameters, cv=2, n_jobs=-1)

In [73]:
rand_search.fit(X_train, y_train)

Parameters: { depth, iterations, n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




RandomizedSearchCV(cv=2, error_score='raise-deprecating',
                   estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                          colsample_bylevel=1,
                                          colsample_bynode=1,
                                          colsample_bytree=1, gamma=0,
                                          gpu_id=-1, importance_type='gain',
                                          interaction_constraints='',
                                          learning_rate=0.300000012,
                                          max_delta_step=0, max_depth=4,
                                          min_child_weight=1, missing=nan,
                                          monotone_constraints='()',
                                          n...
                                          validate_parameters=1,
                                          verbosity=None),
                   iid='warn', n_iter=10, n_jobs=-1,
             

In [74]:
print('Result:')
print('\n the best estimator across all searched parmas: ', rand_search.best_estimator_)
print('\n the best score across all searched params: ', rand_search.best_score_)
print('\n the best parameters acress al best searched params: ', rand_search.best_params_)

Result:

 the best estimator across all searched parmas:  XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, depth=6, gamma=0,
             gpu_id=-1, importance_type='gain', interaction_constraints='',
             iterations=100, learning_rate=0.04, max_delta_step=0, max_depth=4,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimator=5000, n_estimators=100, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=10, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

 the best score across all searched params:  0.5265744420365911

 the best parameters acress al best searched params:  {'learning_rate': 0.04, 'iterations': 100, 'depth': 6}


In [62]:
X_train.head()

Unnamed: 0,Product_Weight,Product_Fat_Content,Product_Shelf_Visibility,Product_Type,Product_Price,Supermarket_Opening_Year,Supermarket _Size,Supermarket_Location_Type,Supermarket_Type,Average_Price_per_ProductType,PPW,APP,PSV
1089,-2.450385e-14,0.0,-0.762128,13.0,2.053726,1992,1.0,2.0,3.0,0.30716,1.346677,-0.888431,-0.762128
1868,0.2260184,1.0,-0.021138,6.0,-1.673998,1994,0.0,2.0,1.0,0.545963,-1.036903,1.560446,-0.021138
896,0.7053168,0.0,0.257391,3.0,-2.443725,1994,0.0,2.0,1.0,0.007448,-1.496005,4.678238,0.257391
2645,1.815886,1.0,-0.336405,6.0,0.597935,2011,2.0,1.0,1.0,0.545963,1.933961,-0.486542,-0.336405
1028,1.605463,0.0,-1.267584,7.0,-0.652445,2006,1.0,0.0,1.0,-1.239278,0.486055,0.085516,-1.267584


In [43]:
features = ['Product_Weight', 'Product_Fat_Content', 'Product_Shelf_Visibility',
       'Product_Type', 'Product_Price', 'Supermarket_Opening_Year',
       'Supermarket _Size', 'Supermarket_Location_Type', 'Supermarket_Type',
       'Average_Price_per_ProductType', 'PPW', 'APP', 'PSV']

In [20]:
from lightgbm import LGBMRegressor

In [30]:
lgbm_model = LGBMRegressor(boosting_type='rf', objective='regression')

In [45]:
lgbm_model.fit(X_train, y_train, feature_name=col_encode, categorical_feature=col_encode, early_stopping_rounds=500)

New categorical_feature is ['Product_Fat_Content', 'Product_Type', 'Supermarket _Size', 'Supermarket_Location_Type', 'Supermarket_Type']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in the following fields: Product_Fat_Content, Product_Type, Supermarket _Size, Supermarket_Location_Type, Supermarket_Type

In [33]:
from xgboost import XGBRegressor

In [35]:
xgb_model = XGBRegressor(n_estimator=5000, max_depth=4, reg_lambda=10, random_state=1)

In [36]:
xgb_model.fit(X_train, y_train, eval_metric='rmse', eval_set=[(X_train, y_train), (X_test, y_test)], early_stopping_rounds=300)

Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-rmse:5778.62451	validation_1-rmse:5953.45801
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 300 rounds.
[1]	validation_0-rmse:4600.46582	validation_1-rmse:4759.96533
[2]	validation_0-rmse:3864.01514	validation_1-rmse:4022.04004
[3]	validation_0-rmse:3403.43066	validation_1-rmse:3572.10913
[4]	validation_0-rmse:3136.68018	validation_1-rmse:3325.74731
[5]	validation_0-rmse:2978.95044	validation_1-rmse:3185.53540
[6]	validation_0-rmse:2885.71753	validation_1-rmse:3122.32764
[7]	validation_0-rmse:2826.41064	validation_1-rmse:3084.44653
[8]	validation_0-rmse:2782.15479	validation_1-rmse:3078.3

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=4,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimator=5000, n_estimators=100, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=1, reg_alpha=0,
             reg_lambda=10, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [37]:
xgb_pred = xgb_model.predict(X_test)

In [38]:
xgb_score = mean_squared_error(y_test, xgb_pred)

In [39]:
math.sqrt(xgb_score)

3078.39722387781

In [None]:
{'learning_rate': 0.04, 'iterations': 100, 'depth': 6}

In [40]:
from catboost import CatBoostRegressor

In [42]:
cat_model = CatBoostRegressor(learning_rate=0.09, iterations=65, depth=6)

In [43]:
cat_model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], early_stopping_rounds=500, use_best_model=True)

0:	learn: 4256.1478548	test: 4256.1478548	test1: 4395.6232992	best: 4395.6232992 (0)	total: 57.4ms	remaining: 3.67s
1:	learn: 4083.1986534	test: 4083.1986534	test1: 4227.8238785	best: 4227.8238785 (1)	total: 60.6ms	remaining: 1.91s
2:	learn: 3920.1649050	test: 3920.1649050	test1: 4069.2514032	best: 4069.2514032 (2)	total: 63ms	remaining: 1.3s
3:	learn: 3785.6395816	test: 3785.6395816	test1: 3936.3887168	best: 3936.3887168 (3)	total: 66.2ms	remaining: 1.01s
4:	learn: 3656.6716658	test: 3656.6716658	test1: 3807.8552369	best: 3807.8552369 (4)	total: 69.9ms	remaining: 838ms
5:	learn: 3551.6331536	test: 3551.6331536	test1: 3706.1059704	best: 3706.1059704 (5)	total: 72.9ms	remaining: 717ms
6:	learn: 3461.4243256	test: 3461.4243256	test1: 3619.7843732	best: 3619.7843732 (6)	total: 76ms	remaining: 630ms
7:	learn: 3377.0833181	test: 3377.0833181	test1: 3536.7906524	best: 3536.7906524 (7)	total: 79.2ms	remaining: 564ms
8:	learn: 3307.3056466	test: 3307.3056466	test1: 3467.6935879	best: 3467.6935

<catboost.core.CatBoostRegressor at 0x19b059b9588>

In [44]:
from sklearn.model_selection import KFold

In [48]:
xgb = XGBRegressor(n_estimators=5000, max_depth=4, reg_lambda=5, random_state=101)
NFOLDS = 7
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=30)

In [52]:
scores = []
test_pred = np.zeros(len(test))

for fold, (tr_in, te_in) in enumerate(kf.split(X)):
    print(f'============fold{fold}=============')
    X_train, X_test = X.iloc[tr_in], X.iloc[te_in]
    y_train, y_test = y.iloc[tr_in], y.iloc[te_in]
    xgb.fit(X_train, y_train, eval_metric='rmse', eval_set=[(X_train, y_train), (X_test, y_test)], early_stopping_rounds=500)
    scores.append(rmse(y_test, xgb.predict(X_test)))
    test_scores += xgb.predict(test)
    oof_test_kf[:,fold] += xgb.predict(test)

[0]	validation_0-rmse:5786.62158	validation_1-rmse:5789.64258
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 500 rounds.
[1]	validation_0-rmse:4586.20557	validation_1-rmse:4616.29053
[2]	validation_0-rmse:3845.13599	validation_1-rmse:3894.47461
[3]	validation_0-rmse:3410.55957	validation_1-rmse:3478.31104
[4]	validation_0-rmse:3156.17627	validation_1-rmse:3216.18604
[5]	validation_0-rmse:3010.62573	validation_1-rmse:3078.95459
[6]	validation_0-rmse:2923.22827	validation_1-rmse:3023.21289
[7]	validation_0-rmse:2867.47730	validation_1-rmse:2996.55249
[8]	validation_0-rmse:2832.18774	validation_1-rmse:2994.03857
[9]	validation_0-rmse:2805.82739	validation_1-rmse:2980.36108
[10]	validation_0-rmse:2783.31177	validation_1-rmse:2984.70654
[11]	validation_0-rmse:2763.24634	validation_1-rmse:2993.95776
[12]	validation_0-rmse:2750.93066	validation_1-rmse:2999.59595
[13]	validation_0-rmse:2724.972

[127]	validation_0-rmse:1779.45435	validation_1-rmse:3214.79272
[128]	validation_0-rmse:1773.22534	validation_1-rmse:3212.55322
[129]	validation_0-rmse:1766.37756	validation_1-rmse:3211.34424
[130]	validation_0-rmse:1763.95105	validation_1-rmse:3212.04443
[131]	validation_0-rmse:1756.03235	validation_1-rmse:3210.65015
[132]	validation_0-rmse:1754.03406	validation_1-rmse:3210.12793
[133]	validation_0-rmse:1750.82617	validation_1-rmse:3213.06641
[134]	validation_0-rmse:1747.51160	validation_1-rmse:3213.29053
[135]	validation_0-rmse:1742.40002	validation_1-rmse:3217.90430
[136]	validation_0-rmse:1739.80750	validation_1-rmse:3214.89697
[137]	validation_0-rmse:1737.50879	validation_1-rmse:3210.73120
[138]	validation_0-rmse:1732.30408	validation_1-rmse:3210.11182
[139]	validation_0-rmse:1730.36877	validation_1-rmse:3211.39087
[140]	validation_0-rmse:1725.69043	validation_1-rmse:3212.72681
[141]	validation_0-rmse:1722.12744	validation_1-rmse:3214.71167
[142]	validation_0-rmse:1715.93127	valid

[256]	validation_0-rmse:1318.05115	validation_1-rmse:3299.78760
[257]	validation_0-rmse:1315.64539	validation_1-rmse:3300.70801
[258]	validation_0-rmse:1313.89392	validation_1-rmse:3302.13696
[259]	validation_0-rmse:1311.58716	validation_1-rmse:3300.85010
[260]	validation_0-rmse:1307.58594	validation_1-rmse:3297.84253
[261]	validation_0-rmse:1302.98767	validation_1-rmse:3298.72290
[262]	validation_0-rmse:1300.77356	validation_1-rmse:3300.72607
[263]	validation_0-rmse:1297.03174	validation_1-rmse:3302.37354
[264]	validation_0-rmse:1292.47278	validation_1-rmse:3300.30322
[265]	validation_0-rmse:1289.74707	validation_1-rmse:3302.99731
[266]	validation_0-rmse:1289.08069	validation_1-rmse:3303.14526
[267]	validation_0-rmse:1284.85596	validation_1-rmse:3305.27832
[268]	validation_0-rmse:1282.77832	validation_1-rmse:3303.19214
[269]	validation_0-rmse:1279.95923	validation_1-rmse:3301.66553
[270]	validation_0-rmse:1278.87134	validation_1-rmse:3301.60669
[271]	validation_0-rmse:1277.03528	valid

[385]	validation_0-rmse:979.82178	validation_1-rmse:3365.64160
[386]	validation_0-rmse:978.42456	validation_1-rmse:3365.70703
[387]	validation_0-rmse:977.07776	validation_1-rmse:3364.48438
[388]	validation_0-rmse:975.29364	validation_1-rmse:3365.07349
[389]	validation_0-rmse:974.13513	validation_1-rmse:3364.43237
[390]	validation_0-rmse:970.49902	validation_1-rmse:3363.54883
[391]	validation_0-rmse:968.13019	validation_1-rmse:3364.48218
[392]	validation_0-rmse:967.48090	validation_1-rmse:3365.03149
[393]	validation_0-rmse:965.90912	validation_1-rmse:3364.25659
[394]	validation_0-rmse:963.53906	validation_1-rmse:3364.36572
[395]	validation_0-rmse:961.54321	validation_1-rmse:3363.04346
[396]	validation_0-rmse:958.57947	validation_1-rmse:3362.59814
[397]	validation_0-rmse:957.09668	validation_1-rmse:3362.27490
[398]	validation_0-rmse:954.69605	validation_1-rmse:3362.31689
[399]	validation_0-rmse:952.17499	validation_1-rmse:3362.50513
[400]	validation_0-rmse:949.61481	validation_1-rmse:336

NameError: name 'rmse' is not defined

In [53]:
scores

[]