In [53]:
import numpy as np
import pandas as pd
import math

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder, scale

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
train  = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
submit = pd.read_csv('Sample_submission.csv')

In [28]:
train.head()

Unnamed: 0,Product_Weight,Product_Fat_Content,Product_Shelf_Visibility,Product_Type,Product_Price,Supermarket_Opening_Year,Supermarket _Size,Supermarket_Location_Type,Supermarket_Type,Average_Price_per_ProductType,Product_Supermarket_Sales,PPW,APP,PSV
0,-0.545535,0.0,-0.73763,9.0,0.292861,2004,2.0,0.0,1.0,0.690378,8983.31,-0.245713,-0.356266,-0.73763
1,-0.662437,0.0,-0.375244,5.0,-0.398619,2006,1.0,0.0,1.0,-0.293253,4893.63,-0.693654,-0.036294,-0.375244
2,-1.075101,0.0,-0.986077,8.0,0.552937,2009,0.0,1.0,1.0,-2.019018,7323.8,-0.551669,-0.583641,-0.986077
3,1.406729,0.0,-0.95463,8.0,-0.572697,2004,2.0,0.0,1.0,-2.019018,7541.85,0.448631,-0.023052,-0.95463
4,-0.054547,1.0,0.17153,4.0,2.020394,2009,0.0,1.0,1.0,1.474172,11445.1,1.261024,-0.844359,0.17153


In [29]:
test.head()

Unnamed: 0,Product_Weight,Product_Fat_Content,Product_Shelf_Visibility,Product_Type,Product_Price,Supermarket_Opening_Year,Supermarket _Size,Supermarket_Location_Type,Supermarket_Type,Average_Price_per_ProductType,PPW,APP,PSV
0,1.655901,1.0,0.936231,13.0,-1.236451,1994,0.0,2.0,1.0,0.263208,-0.061707,0.8262,0.936231
1,1.058617,0.0,-0.364899,13.0,-0.906147,2016,1.0,2.0,2.0,0.263208,-0.03774,0.434145,-0.364899
2,0.943755,0.0,-0.663671,5.0,-0.149574,2016,1.0,2.0,2.0,-0.320419,0.565695,-0.187503,-0.663671
3,1.093076,0.0,-1.256646,11.0,-0.997466,2016,1.0,2.0,2.0,-0.277406,-0.102631,0.487953,-1.256646
4,1.518066,0.0,-0.82313,9.0,-1.060739,2004,2.0,0.0,1.0,0.635713,0.05185,0.633875,-0.82313


In [6]:
train['Supermarket _Size'] = train['Supermarket _Size'].fillna('High')

In [7]:
test['Supermarket _Size'] = test['Supermarket _Size'].fillna('High')

In [8]:
a = train['Product_Weight'].mean()

train['Product_Weight'] = train['Product_Weight'].fillna(a)

In [9]:
b = train['Product_Weight'].mean()

test['Product_Weight'] = test['Product_Weight'].fillna(b)

In [10]:
train.drop(['Product_Identifier', 'Supermarket_Identifier', 'Product_Supermarket_Identifier',], axis=1, inplace=True)
test.drop(['Product_Identifier', 'Supermarket_Identifier', 'Product_Supermarket_Identifier'], axis=1, inplace=True)

In [11]:
train['PPW'] = train['Product_Weight'] * train['Product_Price']
test['PPW'] = test['Product_Weight'] * test['Product_Price']

In [12]:
train['APP'] = train['Average_Price_per_ProductType'] / train['Product_Price']
test['APP'] = test['Average_Price_per_ProductType'] / test['Product_Price']

In [13]:
train['PSV'] = train['Product_Shelf_Visibility'] * 100
test['PSV'] = test['Product_Shelf_Visibility'] * 100

In [15]:
train.isna().sum()

Product_Weight                   0
Product_Fat_Content              0
Product_Shelf_Visibility         0
Product_Type                     0
Product_Price                    0
Supermarket_Opening_Year         0
Supermarket _Size                0
Supermarket_Location_Type        0
Supermarket_Type                 0
Average_Price_per_ProductType    0
Product_Supermarket_Sales        0
PPW                              0
APP                              0
PSV                              0
dtype: int64

In [16]:
test.isna().sum()

Product_Weight                   0
Product_Fat_Content              0
Product_Shelf_Visibility         0
Product_Type                     0
Product_Price                    0
Supermarket_Opening_Year         0
Supermarket _Size                0
Supermarket_Location_Type        0
Supermarket_Type                 0
Average_Price_per_ProductType    0
PPW                              0
APP                              0
PSV                              0
dtype: int64

In [17]:
def ordinal_encode(e_train, e_test):
    ord_enc = OrdinalEncoder()
    ord_enc.fit(e_train)
    train_encode = ord_enc.transform(e_train)
    test_encode = ord_enc.transform(e_test)
    return train_encode, test_encode

In [18]:
col_encode = ['Product_Fat_Content', 'Product_Type', 'Supermarket _Size', 'Supermarket_Location_Type', 'Supermarket_Type']

In [19]:
train_cate = train[col_encode]

In [20]:
test_cate = test[col_encode]

In [21]:
train_cate_encode, test_cate_encode = ordinal_encode(train_cate, test_cate)

In [22]:
train[col_encode] = pd.DataFrame(train_cate_encode)
test[col_encode] = pd.DataFrame(test_cate_encode)

In [23]:
col_scale = ['Product_Weight', 'Product_Shelf_Visibility', 'Product_Price', 'Average_Price_per_ProductType', 'PPW', 'APP', 'PSV']

In [24]:
train_scale = scale(train[col_scale])
test_scale = scale(test[col_scale])

In [25]:
train[col_scale] = train_scale
test[col_scale] = test_scale

In [26]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2994 entries, 0 to 2993
Data columns (total 14 columns):
Product_Weight                   2994 non-null float64
Product_Fat_Content              2994 non-null float64
Product_Shelf_Visibility         2994 non-null float64
Product_Type                     2994 non-null float64
Product_Price                    2994 non-null float64
Supermarket_Opening_Year         2994 non-null int64
Supermarket _Size                2994 non-null float64
Supermarket_Location_Type        2994 non-null float64
Supermarket_Type                 2994 non-null float64
Average_Price_per_ProductType    2994 non-null float64
Product_Supermarket_Sales        2994 non-null float64
PPW                              2994 non-null float64
APP                              2994 non-null float64
PSV                              2994 non-null float64
dtypes: float64(13), int64(1)
memory usage: 327.5 KB


In [27]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1996 entries, 0 to 1995
Data columns (total 13 columns):
Product_Weight                   1996 non-null float64
Product_Fat_Content              1996 non-null float64
Product_Shelf_Visibility         1996 non-null float64
Product_Type                     1996 non-null float64
Product_Price                    1996 non-null float64
Supermarket_Opening_Year         1996 non-null int64
Supermarket _Size                1996 non-null float64
Supermarket_Location_Type        1996 non-null float64
Supermarket_Type                 1996 non-null float64
Average_Price_per_ProductType    1996 non-null float64
PPW                              1996 non-null float64
APP                              1996 non-null float64
PSV                              1996 non-null float64
dtypes: float64(12), int64(1)
memory usage: 202.8 KB


In [30]:
X = train.drop('Product_Supermarket_Sales', axis=1)
y = train['Product_Supermarket_Sales']

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=101)

In [32]:
submit.head()

Unnamed: 0,Product_Supermarket_Identifier,Product_Supermarket_Sales
0,FDY57_CHUKWUDI013,0
1,FDY10_CHUKWUDI018,0
2,FDO40_CHUKWUDI018,0
3,NCQ43_CHUKWUDI018,0
4,NCC07_CHUKWUDI046,0


In [33]:
from sklearn.linear_model import LinearRegression

In [43]:
linreg = LinearRegression(n_jobs=2)

In [44]:
linreg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=2, normalize=False)

In [45]:
linreg_pred = linreg.predict(X_test)

In [46]:
linreg_score = mean_squared_error(y_test, linreg_pred)

In [47]:
math.sqrt(linreg_score)

3285.266303393114

In [39]:
linreg_final = linreg.predict(test)

In [40]:
submit['Product_Supermarket_Sales'] = linreg_final

In [41]:
submit.to_csv('linreg_sub.csv', index=None)

In [57]:
parameters = {'depth': [6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 'learning_rate': [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1], 'iterations': [30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]}

In [58]:
rand_search = RandomizedSearchCV(estimator=cat_model, param_distributions=parameters, cv=2, n_jobs=-1)

In [59]:
rand_search.fit(X_train, y_train)

0:	learn: 4256.1478548	total: 58.2ms	remaining: 3.72s
1:	learn: 4083.1986534	total: 61.6ms	remaining: 1.94s
2:	learn: 3920.1649050	total: 64.3ms	remaining: 1.33s
3:	learn: 3785.6395816	total: 67.8ms	remaining: 1.03s
4:	learn: 3656.6716658	total: 71ms	remaining: 852ms
5:	learn: 3551.6331536	total: 74.4ms	remaining: 732ms
6:	learn: 3461.4243256	total: 77.9ms	remaining: 645ms
7:	learn: 3377.0833181	total: 81.3ms	remaining: 579ms
8:	learn: 3307.3056466	total: 84.6ms	remaining: 526ms
9:	learn: 3244.4874893	total: 87.8ms	remaining: 483ms
10:	learn: 3192.5235707	total: 91ms	remaining: 447ms
11:	learn: 3151.5848583	total: 94.2ms	remaining: 416ms
12:	learn: 3114.1003093	total: 97.6ms	remaining: 390ms
13:	learn: 3078.2455347	total: 101ms	remaining: 370ms
14:	learn: 3051.0874058	total: 105ms	remaining: 349ms
15:	learn: 3028.5869159	total: 109ms	remaining: 333ms
16:	learn: 3004.9786605	total: 112ms	remaining: 317ms
17:	learn: 2989.2514610	total: 115ms	remaining: 301ms
18:	learn: 2971.2802857	total

RandomizedSearchCV(cv=2, error_score='raise-deprecating',
                   estimator=<catboost.core.CatBoostRegressor object at 0x00000265A53E3C50>,
                   iid='warn', n_iter=10, n_jobs=-1,
                   param_distributions={'depth': [6, 7, 8, 9, 10, 11, 12, 13,
                                                  14, 15],
                                        'iterations': [30, 35, 40, 45, 50, 55,
                                                       60, 65, 70, 75, 80, 85,
                                                       90, 95, 100],
                                        'learning_rate': [0.01, 0.02, 0.03,
                                                          0.04, 0.05, 0.06,
                                                          0.07, 0.08, 0.09,
                                                          0.1]},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring=None, verbo

In [60]:
print('Result:')
print('\n the best estimator across all searched parmas: ', rand_search.best_estimator_)
print('\n the best score across all searched params: ', rand_search.best_score_)
print('\n the best parameters acress al best searched params: ', rand_search.best_params_)

Result:

 the best estimator across all searched parmas:  <catboost.core.CatBoostRegressor object at 0x00000265A6B7BA90>

 the best score across all searched params:  0.5521158749980276

 the best parameters acress al best searched params:  {'learning_rate': 0.09, 'iterations': 65, 'depth': 6}


In [48]:
from catboost import CatBoostRegressor

In [61]:
cat_model = CatBoostRegressor(learning_rate=0.09, iterations=65, depth=6)

In [62]:
cat_model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], early_stopping_rounds=500, use_best_model=True)

0:	learn: 4256.1478548	test: 4256.1478548	test1: 4395.6232992	best: 4395.6232992 (0)	total: 4.23ms	remaining: 271ms
1:	learn: 4083.1986534	test: 4083.1986534	test1: 4227.8238785	best: 4227.8238785 (1)	total: 7.39ms	remaining: 233ms
2:	learn: 3920.1649050	test: 3920.1649050	test1: 4069.2514032	best: 4069.2514032 (2)	total: 9.69ms	remaining: 200ms
3:	learn: 3785.6395816	test: 3785.6395816	test1: 3936.3887168	best: 3936.3887168 (3)	total: 12.8ms	remaining: 195ms
4:	learn: 3656.6716658	test: 3656.6716658	test1: 3807.8552369	best: 3807.8552369 (4)	total: 15.8ms	remaining: 190ms
5:	learn: 3551.6331536	test: 3551.6331536	test1: 3706.1059704	best: 3706.1059704 (5)	total: 18.9ms	remaining: 186ms
6:	learn: 3461.4243256	test: 3461.4243256	test1: 3619.7843732	best: 3619.7843732 (6)	total: 22.3ms	remaining: 185ms
7:	learn: 3377.0833181	test: 3377.0833181	test1: 3536.7906524	best: 3536.7906524 (7)	total: 25.5ms	remaining: 182ms
8:	learn: 3307.3056466	test: 3307.3056466	test1: 3467.7321444	best: 3467

<catboost.core.CatBoostRegressor at 0x265a6b7bb00>

In [63]:
cat_pred = cat_model.predict(test)

In [64]:
submit['Product_Supermarket_Sales'] = cat_pred

In [65]:
submit.to_csv('catboost_sub.csv', index=None)