In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.model_selection import cross_val_score, KFold, RepeatedKFold
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor

In [3]:
# Load the training set, the test set and the sample submission, in that order.
train, test, sol = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'SampleSubmission.csv')
)

In [4]:
# (rows, columns) of the raw training and test frames, one per line.
print(train.shape, test.shape, sep='\n')

(4990, 13)
(3532, 12)


In [5]:
# Keep the row identifiers aside for the submission file; the identifier
# columns are dropped from `test` in a later cell.
submit = test.loc[:, ['Item_Store_ID']]

In [6]:
# Sanity check: a single identifier column with one row per test record.
submit.shape

(3532, 1)

In [7]:
# Remove the identifier columns from both frames before feature engineering.
id_columns = ['Item_Store_ID', 'Item_ID']
train = train.drop(columns=id_columns)
test = test.drop(columns=id_columns)

In [8]:
# train['Item_Store_Returns'].hist()

### Replacing Missing values

In [9]:
# Fill missing item weights with the mean weight of the training set.
# - Assign the result back instead of `df[col].fillna(..., inplace=True)`:
#   that form is chained assignment, which pandas warns about and which does
#   not modify the parent frame once Copy-on-Write is enabled.
# - The test set reuses the training statistic so both frames are imputed
#   with the same value.
item_weight_mean = train['Item_Weight'].mean()
train['Item_Weight'] = train['Item_Weight'].fillna(item_weight_mean)
test['Item_Weight'] = test['Item_Weight'].fillna(item_weight_mean)

In [10]:
# Fill missing store sizes with the most frequent size in the training set.
# - Assign the result back instead of `df[col].fillna(..., inplace=True)`:
#   that form is chained assignment, which pandas warns about and which does
#   not modify the parent frame once Copy-on-Write is enabled.
# - The test set reuses the training mode so both frames are imputed with the
#   same category.
store_size_mode = train['Store_Size'].mode()[0]
train['Store_Size'] = train['Store_Size'].fillna(store_size_mode)
test['Store_Size'] = test['Store_Size'].fillna(store_size_mode)

In [11]:
from datetime import datetime as dt

In [12]:
# Replace the opening year with the number of years since opening.
# NOTE: the column keeps its original name although it now holds an age, and
# the value depends on the calendar year in which the notebook is executed.
# Re-running this cell applies the subtraction a second time.
current_year = dt.today().year
for frame in (train, test):
    frame['Store_Start_Year'] = (current_year - frame['Store_Start_Year']).astype('float')

In [13]:
# Add the same three engineered columns to both frames:
#   sqrt_Item_Price              - square root of the price
#   cross_Item_weight            - price x weight
#   Cross_Item_visibility_weight - price x weight x visibility
for frame in (train, test):
    frame['sqrt_Item_Price'] = np.sqrt(frame['Item_Price'])
    frame['cross_Item_weight'] = frame['Item_Price'] * frame['Item_Weight']
    frame['Cross_Item_visibility_weight'] = (
        frame['Item_Price'] * frame['Item_Weight'] * frame['Item_Visibility']
    )

In [14]:
# Split the training frame into the regression target and the feature matrix.
target = train['Item_Store_Returns']
data = train.drop(columns='Item_Store_Returns')

In [15]:
# Shapes of the feature matrix and of the target vector, one per line.
print(data.shape, target.shape, sep='\n')

(4990, 13)
(4990,)


In [16]:
from sklearn.preprocessing import OrdinalEncoder
# Encoder instance shared by the ordinal-encoding cell that follows.
oe = OrdinalEncoder()

In [17]:
# Ordinal-encode the two ordered categoricals.
# The encoder is fitted on the training data only and the learned mapping is
# reused for the test data. Re-fitting on the test set (as before) can assign
# different integer codes to the same label when the two sets do not contain
# exactly the same categories. A label that appears only in the test set now
# raises a ValueError instead of being silently mis-coded.
for column in ['Item_Sugar_Content', 'Store_Size']:
    data[column] = oe.fit_transform(data[[column]]).ravel()
    test[column] = oe.transform(test[[column]]).ravel()

In [18]:
# One-hot encode Store_ID.
# Both frames are first cast to the category levels observed in the training
# data, so they receive identical dummy columns in identical order even if a
# level is absent from the test set. A level seen only in the test set becomes
# an all-zero row rather than an extra column the models were never fitted on.
store_id_levels = pd.CategoricalDtype(sorted(data['Store_ID'].dropna().unique()))
dummy1 = pd.get_dummies(data['Store_ID'].astype(store_id_levels), prefix='Store_ID')
dummy2 = pd.get_dummies(test['Store_ID'].astype(store_id_levels), prefix='Store_ID')

# Swap the raw column for its dummy columns (appended at the end).
data = pd.concat([data.drop(columns='Store_ID'), dummy1], axis=1)
test = pd.concat([test.drop(columns='Store_ID'), dummy2], axis=1)

In [19]:
# One-hot encode Item_Type.
# Both frames are first cast to the category levels observed in the training
# data, so they receive identical dummy columns in identical order even if a
# level is absent from the test set. A level seen only in the test set becomes
# an all-zero row rather than an extra column the models were never fitted on.
item_type_levels = pd.CategoricalDtype(sorted(data['Item_Type'].dropna().unique()))
dummy1 = pd.get_dummies(data['Item_Type'].astype(item_type_levels), prefix='Item_Type')
dummy2 = pd.get_dummies(test['Item_Type'].astype(item_type_levels), prefix='Item_Type')

# Swap the raw column for its dummy columns (appended at the end).
data = pd.concat([data.drop(columns='Item_Type'), dummy1], axis=1)
test = pd.concat([test.drop(columns='Item_Type'), dummy2], axis=1)

In [20]:
# One-hot encode Store_Location_Type (dummy columns are prefixed 'Loc_Type').
# Both frames are first cast to the category levels observed in the training
# data, so they receive identical dummy columns in identical order even if a
# level is absent from the test set. A level seen only in the test set becomes
# an all-zero row rather than an extra column the models were never fitted on.
location_levels = pd.CategoricalDtype(sorted(data['Store_Location_Type'].dropna().unique()))
dummy1 = pd.get_dummies(data['Store_Location_Type'].astype(location_levels), prefix='Loc_Type')
dummy2 = pd.get_dummies(test['Store_Location_Type'].astype(location_levels), prefix='Loc_Type')

# Swap the raw column for its dummy columns (appended at the end).
data = pd.concat([data.drop(columns='Store_Location_Type'), dummy1], axis=1)
test = pd.concat([test.drop(columns='Store_Location_Type'), dummy2], axis=1)

In [21]:
# One-hot encode Store_Type.
# Both frames are first cast to the category levels observed in the training
# data, so they receive identical dummy columns in identical order even if a
# level is absent from the test set. A level seen only in the test set becomes
# an all-zero row rather than an extra column the models were never fitted on.
store_type_levels = pd.CategoricalDtype(sorted(data['Store_Type'].dropna().unique()))
dummy1 = pd.get_dummies(data['Store_Type'].astype(store_type_levels), prefix='Store_Type')
dummy2 = pd.get_dummies(test['Store_Type'].astype(store_type_levels), prefix='Store_Type')

# Swap the raw column for its dummy columns (appended at the end).
data = pd.concat([data.drop(columns='Store_Type'), dummy1], axis=1)
test = pd.concat([test.drop(columns='Store_Type'), dummy2], axis=1)

In [22]:
# After encoding, both frames should report the same number of columns.
print(data.shape, test.shape, sep='\n')

(4990, 42)
(3532, 42)


In [23]:
# get a list of models to evaluate
# def get_models():
#     models = dict()
#     models['knn'] = KNeighborsRegressor()
#     models['cart'] = DecisionTreeRegressor()
#     models['svm'] = SVR()
#     models['rfr'] = RandomForestRegressor()
#     return models

In [35]:
def get_models():
    """Build the candidate regressors to compare with cross-validation.

    Returns:
        dict: short model name -> unfitted estimator, in insertion order
        ('lgb', 'xgb', 'cat', 'rfr').
    """
    return {
        'lgb': LGBMRegressor(
            num_leaves=200,
            min_data_in_leaf=3,
            objective='regression',
            max_depth=-1,
            learning_rate=0.05,
            boosting_type='gbdt',
            feature_fraction=0.60,
            lambda_l1=1,
            lambda_l2=1,
            metric='rmse',
            num_iterations=4000,
        ),
        'xgb': XGBRegressor(
            n_estimators=5000,
            max_depth=30,
            reg_lambda=80,
            random_state=30,
            learning_rate=0.1,
            gamma=1.5,
        ),
        'cat': CatBoostRegressor(
            iterations=50,
            depth=3,
            learning_rate=0.05,
            loss_function='RMSE',
        ),
        # Seeded (same seed as the XGBoost model) so that repeated runs of the
        # comparison produce the same forest and therefore the same scores.
        'rfr': RandomForestRegressor(random_state=30),
    }

In [36]:
# evaluate a given model using cross-validation
def evaluate_model(model):
    """Cross-validate `model` on the notebook-level `data` / `target`.

    Uses 10-fold cross-validation repeated 3 times with a fixed seed, and
    re-raises any error that occurs while fitting or scoring.

    Returns:
        numpy.ndarray: one mean-squared-error value per fold (sign flipped
        back to positive from scikit-learn's negated score).
    """
    folds = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    neg_mse = cross_val_score(
        model,
        data,
        target,
        scoring='neg_mean_squared_error',
        cv=folds,
        n_jobs=-1,
        error_score='raise',
    )
    return np.abs(neg_mse)

In [None]:
# Cross-validate every candidate model, keeping the per-fold scores and the
# matching names in two parallel lists.
models = get_models()
results, names = [], []
for name in models:
    scores = evaluate_model(models[name])
    names.append(name)
    results.append(scores)
    # mean MSE across folds, with the standard deviation in parentheses
    print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))


>lgb 10609805.767 (940614.090)


In [None]:
# Module-level estimator instances; `get_stacking` reads these by name.

# LightGBM regressor
lgb = LGBMRegressor(
    num_leaves=200,
    min_data_in_leaf=3,
    objective='regression',
    max_depth=-1,
    learning_rate=0.05,
    boosting_type='gbdt',
    feature_fraction=0.60,
    lambda_l1=1,
    lambda_l2=1,
    metric='rmse',
    num_iterations=4000,
)

# XGBoost regressor
xgb = XGBRegressor(
    n_estimators=5000,
    max_depth=30,
    reg_lambda=80,
    random_state=30,
    learning_rate=0.1,
    gamma=1.5,
)

# CatBoost regressor
cat = CatBoostRegressor(
    iterations=50,
    depth=3,
    learning_rate=0.05,
    loss_function='RMSE',
)

In [28]:
# def get_stacking():
#     # define the base models
#     level0 = list()
#     level0.append(('knn', KNeighborsRegressor()))
#     level0.append(('cart', DecisionTreeRegressor()))
#     level0.append(('svm', SVR()))
#     level0.append(('rfr', RandomForestRegressor()))
    
#     # define meta learner model
#     level1 = LinearRegression()
#     # define the stacking ensemble
#     model = StackingRegressor(estimators=level0, final_estimator=level1, cv=10)
#     return model

In [None]:
def get_stacking():
    """Assemble the stacking ensemble.

    Base learners are the module-level `lgb` and `xgb` estimators plus a
    random forest; the module-level `cat` estimator is the meta-learner.
    Those three globals must be defined before this function is called.

    Returns:
        StackingRegressor: unfitted ensemble using 10-fold internal CV.
    """
    # define the base models
    level0 = [
        ('lgb', lgb),
        ('xgb', xgb),
        # Seeded (same seed as the XGBoost model) so the ensemble is
        # reproducible from run to run.
        ('rfr', RandomForestRegressor(random_state=30)),
    ]
    # define the stacking ensemble with `cat` as meta learner
    return StackingRegressor(estimators=level0, final_estimator=cat, cv=10)

In [29]:
# # get a list of models to evaluate
# def get_models():
#     models = dict()
#     models['knn'] = KNeighborsRegressor()
#     models['cart'] = DecisionTreeRegressor()
#     models['svm'] = SVR()
#     models['rfr'] = RandomForestRegressor()
#     models['stacking'] = get_stacking()
    
#     return models

In [None]:
# get a list of models to evaluate
def get_models():
    """Build the regressors to compare, including the stacking ensemble.

    NOTE: this redefines (and therefore replaces) the `get_models` declared
    in an earlier cell.

    Returns:
        dict: short model name -> unfitted estimator, in insertion order
        ('lgb', 'xgb', 'rfr', 'stacking').
    """
    return {
        'lgb': LGBMRegressor(
            num_leaves=200,
            min_data_in_leaf=3,
            objective='regression',
            max_depth=-1,
            learning_rate=0.05,
            boosting_type='gbdt',
            feature_fraction=0.60,
            lambda_l1=1,
            lambda_l2=1,
            metric='rmse',
            num_iterations=4000,
        ),
        'xgb': XGBRegressor(
            n_estimators=5000,
            max_depth=30,
            reg_lambda=80,
            random_state=30,
            learning_rate=0.1,
            gamma=1.5,
        ),
        # The standalone CatBoost entry is intentionally disabled here:
        # 'cat': CatBoostRegressor(iterations = 50, depth = 3, learning_rate = 0.05, loss_function = 'RMSE')
        # Seeded (same seed as the XGBoost model) so that repeated runs of the
        # comparison produce the same forest and therefore the same scores.
        'rfr': RandomForestRegressor(random_state=30),
        'stacking': get_stacking(),
    }

In [34]:
# # get the models to evaluate
# models = get_models()
# # evaluate the models and store results
# results, names = list(), list()
# for name, model in models.items():
#     scores = evaluate_model(model)
#     results.append(scores)
#     names.append(name)
#     print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))

>knn 15996386.148 (1380667.192)
>cart 17494565.609 (1241709.494)
>svm 19679744.937 (2095108.408)
>rfr 9477643.305 (857116.117)
>stacking 9356602.056 (895245.333)


In [None]:
# Cross-validate every model (stacking ensemble included), keeping the
# per-fold scores and the matching names in two parallel lists.
models = get_models()
results, names = [], []
for name in models:
    scores = evaluate_model(models[name])
    names.append(name)
    results.append(scores)
    # mean MSE across folds, with the standard deviation in parentheses
    print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))

In [None]:
# Baseline stacking ensemble built from simple learners.
# define the base models
level0 = [
    ('knn', KNeighborsRegressor()),
    ('cart', DecisionTreeRegressor()),
    ('svm', SVR()),
]
# define meta learner model
level1 = LinearRegression()
# define the stacking ensemble
model = StackingRegressor(estimators=level0, final_estimator=level1, cv=5)
# Fit on the full training features and target. The previous `model.fit(X, y)`
# referenced names that are never defined in this notebook (NameError).
model.fit(data, target)
# Predict every row of the encoded test set (not a single example).
model.predict(test)

In [None]:
# xgb_scores = []
# xgb_test_pred = np.zeros(len(test))
# xgb_train_pred = np.zeros(len(train))

# for fold,(tr_in,te_in) in enumerate(kf.split(data)):
    
#     print(f"==================================Fold{fold}=============================================")
#     X_train,X_test = data.iloc[tr_in],data.iloc[te_in]
#     y_train,y_test = target.iloc[tr_in],target.iloc[te_in]
    
#     xgb.fit(X_train,y_train,eval_set = [(X_train,y_train),(X_test,y_test)],
#             early_stopping_rounds = 500)
    
#     xgb_scores.append(np.sqrt(mse(y_test,xgb.predict(X_test))))
    
#     xgb_train_pred += xgb.predict(data)
#     xgb_test_pred += xgb.predict(test)

In [None]:
# np.mean(xgb_scores)

In [None]:
# feat_import = pd.Series(xgb.feature_importances_, index = data.columns)
# feat_import

In [None]:
# feat_import = pd.Series(xgb.feature_importances_, index = data.columns)
# feat_import.sort_values(ascending = False).nlargest(20).plot(kind = 'barh')

In [None]:
# data2 = data.drop(['Item_ID', 'Item_Visibility'], axis = 1)

In [None]:
# test2 = test.drop(['Item_ID', 'Item_Visibility'], axis = 1)

In [None]:
# Average the accumulated test predictions.
# NOTE(review): `xgb_test_pred` is only assigned in a cell that is currently
# commented out, so this line raises NameError on a fresh Restart & Run All.
# The divisor 30 presumably equals the number of CV folds the predictions
# were summed over (10 splits x 3 repeats) — confirm before re-enabling.
final = xgb_test_pred/30

In [None]:
# Wrap the predictions in a single-column frame named after the target.
sub2 = pd.DataFrame(data=final, columns=['Item_Store_Returns'])

In [None]:
# Preview the predictions before rounding.
sub2.head()

In [None]:
# Round each prediction to the nearest whole number and store it as an integer.
sub2['Item_Store_Returns'] = sub2['Item_Store_Returns'].round().astype('int')

In [None]:
# Preview the predictions after rounding.
sub2.head()

In [None]:
# Put the saved identifiers and the predictions side by side (matched on index).
submission = pd.concat(objs=[submit, sub2], axis='columns')

In [None]:
# Preview the final submission table.
submission.head()

In [None]:
# Sanity check: expect one row per test record and two columns (ID, prediction).
submission.shape

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv(path_or_buf='My_submission2c2.csv', index=False)