In [171]:
## Importing required libraries
import pandas as pd #for data preprocessing

#Data Visualization Libraries
import seaborn as sns
import matplotlib.pyplot as plt

#Linear Algebra
import numpy as np
 
#Import Datetime module
from datetime import datetime

from sklearn.model_selection import KFold,StratifiedKFold,GridSearchCV,RandomizedSearchCV, train_test_split #For splitting

#Evaluation Metrics
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, mean_squared_error

#To ignore unnecessary warnings
import warnings

from sklearn.preprocessing import LabelEncoder,OneHotEncoder # for encoding categorical variables
from sklearn.linear_model import LogisticRegression

In [245]:
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
ss = pd.read_csv('SampleSubmission.csv')
date = pd.read_csv('dates.csv')
store = pd.read_csv('stores.csv')

In [246]:
print(f'The Size of the train set is: {train.shape}, and the size of the test set is {test.shape}')

The Size of the train set is: (2248884, 6), and the size of the test set is (99792, 4)


In [247]:
print(train.shape)
print(test.shape)
print(ss.shape)
print(date.shape)

(2248884, 6)
(99792, 4)
(14256, 2)
(1320, 15)


In [248]:
train.columns

Index(['date', 'store_id', 'category_id', 'target', 'onpromotion',
       'nbr_of_transactions'],
      dtype='object')

In [249]:
test.columns

Index(['date', 'store_id', 'category_id', 'onpromotion'], dtype='object')

In [250]:
train.head()

Unnamed: 0,date,store_id,category_id,target,onpromotion,nbr_of_transactions
0,365,store_1,category_24,0.0,0,0.0
1,365,store_1,category_21,0.0,0,0.0
2,365,store_1,category_32,0.0,0,0.0
3,365,store_1,category_18,0.0,0,0.0
4,365,store_1,category_26,0.0,0,0.0


In [251]:
test.head(2)

Unnamed: 0,date,store_id,category_id,onpromotion
0,1627,store_1,category_24,0
1,1627,store_1,category_21,0


In [252]:
ss.head()

Unnamed: 0,ID,target
0,year_week_425_store_1_category_0,0
1,year_week_426_store_1_category_0,0
2,year_week_427_store_1_category_0,1
3,year_week_428_store_1_category_0,3
4,year_week_429_store_1_category_0,2


In [253]:
train = pd.merge(train, date, on = 'date', how = 'left')
test = pd.merge(test, date, on = 'date', how = 'left')

### Data Processing

In [254]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2248884 entries, 0 to 2248883
Data columns (total 20 columns):
 #   Column               Dtype  
---  ------               -----  
 0   date                 int64  
 1   store_id             object 
 2   category_id          object 
 3   target               float64
 4   onpromotion          int64  
 5   nbr_of_transactions  float64
 6   year                 int64  
 7   month                int64  
 8   dayofmonth           int64  
 9   dayofweek            int64  
 10  dayofyear            int64  
 11  weekofyear           int64  
 12  quarter              int64  
 13  is_month_start       bool   
 14  is_month_end         bool   
 15  is_quarter_start     bool   
 16  is_quarter_end       bool   
 17  is_year_start        bool   
 18  is_year_end          bool   
 19  year_weekofyear      int64  
dtypes: bool(6), float64(2), int64(10), object(2)
memory usage: 270.2+ MB


In [255]:
train.head(2)

Unnamed: 0,date,store_id,category_id,target,onpromotion,nbr_of_transactions,year,month,dayofmonth,dayofweek,dayofyear,weekofyear,quarter,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,year_weekofyear
0,365,store_1,category_24,0.0,0,0.0,1,1,1,2,1,1,1,True,False,True,False,True,False,101
1,365,store_1,category_21,0.0,0,0.0,1,1,1,2,1,1,1,True,False,True,False,True,False,101


In [256]:
test.head(2)

Unnamed: 0,date,store_id,category_id,onpromotion,year,month,dayofmonth,dayofweek,dayofyear,weekofyear,quarter,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,year_weekofyear
0,1627,store_1,category_24,0,4,6,19,0,170,25,2,False,False,False,False,False,False,425
1,1627,store_1,category_21,0,4,6,19,0,170,25,2,False,False,False,False,False,False,425


In [257]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99792 entries, 0 to 99791
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   date              99792 non-null  int64 
 1   store_id          99792 non-null  object
 2   category_id       99792 non-null  object
 3   onpromotion       99792 non-null  int64 
 4   year              99792 non-null  int64 
 5   month             99792 non-null  int64 
 6   dayofmonth        99792 non-null  int64 
 7   dayofweek         99792 non-null  int64 
 8   dayofyear         99792 non-null  int64 
 9   weekofyear        99792 non-null  int64 
 10  quarter           99792 non-null  int64 
 11  is_month_start    99792 non-null  bool  
 12  is_month_end      99792 non-null  bool  
 13  is_quarter_start  99792 non-null  bool  
 14  is_quarter_end    99792 non-null  bool  
 15  is_year_start     99792 non-null  bool  
 16  is_year_end       99792 non-null  bool  
 17  year_weekofy

### Encoding categorical dataset

In [258]:
categ_cols = train.select_dtypes(include=['object', 'category', 'boolean']).columns
cate_cols = test.select_dtypes(include=['object', 'category', 'boolean']).columns

In [259]:
categ_cols
cate_cols

Index(['store_id', 'category_id', 'is_month_start', 'is_month_end',
       'is_quarter_start', 'is_quarter_end', 'is_year_start', 'is_year_end'],
      dtype='object')

In [260]:
le = LabelEncoder()
for i in categ_cols:
    train[i] = train[i].astype('category').cat.codes
    test[i] = test[i].astype('category').cat.codes

In [261]:
for i in cate_cols:
    test[i] = test[i].astype('category').cat.codes    

In [262]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99792 entries, 0 to 99791
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   date              99792 non-null  int64
 1   store_id          99792 non-null  int8 
 2   category_id       99792 non-null  int8 
 3   onpromotion       99792 non-null  int64
 4   year              99792 non-null  int64
 5   month             99792 non-null  int64
 6   dayofmonth        99792 non-null  int64
 7   dayofweek         99792 non-null  int64
 8   dayofyear         99792 non-null  int64
 9   weekofyear        99792 non-null  int64
 10  quarter           99792 non-null  int64
 11  is_month_start    99792 non-null  int8 
 12  is_month_end      99792 non-null  int8 
 13  is_quarter_start  99792 non-null  int8 
 14  is_quarter_end    99792 non-null  int8 
 15  is_year_start     99792 non-null  int8 
 16  is_year_end       99792 non-null  int8 
 17  year_weekofyear   99792 non-nul

In [238]:
train['store_id'] = train['store_id'].astype('category')
train['store_id'] = train['store_id'].cat.codes

In [188]:
train['category_id'] = train['category_id'].astype('category')
train['category_id'] = train['category_id'].cat.codes

In [189]:
test['store_id'] = test['store_id'].astype('category')
test['store_id'] = test['store_id'].cat.codes

In [190]:
test['category_id'] = test['category_id'].astype('category')
test['category_id'] = test['category_id'].cat.codes

### Checking for null values

In [263]:
date.head()

Unnamed: 0,date,year,month,dayofmonth,dayofweek,dayofyear,weekofyear,quarter,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,year_weekofyear
0,365,1,1,1,2,1,1,1,True,False,True,False,True,False,101
1,366,1,1,2,3,2,1,1,False,False,False,False,False,False,101
2,367,1,1,3,4,3,1,1,False,False,False,False,False,False,101
3,368,1,1,4,5,4,1,1,False,False,False,False,False,False,101
4,369,1,1,5,6,5,1,1,False,False,False,False,False,False,101


In [264]:
train.isnull().sum()

date                   0
store_id               0
category_id            0
target                 0
onpromotion            0
nbr_of_transactions    0
year                   0
month                  0
dayofmonth             0
dayofweek              0
dayofyear              0
weekofyear             0
quarter                0
is_month_start         0
is_month_end           0
is_quarter_start       0
is_quarter_end         0
is_year_start          0
is_year_end            0
year_weekofyear        0
dtype: int64

In [265]:
test.isnull().sum()

date                0
store_id            0
category_id         0
onpromotion         0
year                0
month               0
dayofmonth          0
dayofweek           0
dayofyear           0
weekofyear          0
quarter             0
is_month_start      0
is_month_end        0
is_quarter_start    0
is_quarter_end      0
is_year_start       0
is_year_end         0
year_weekofyear     0
dtype: int64

### Dropping some columns

In [266]:
train.shape

(2248884, 20)

In [267]:
test.shape

(99792, 18)

In [268]:
# Drop ID from train and test
train = train.drop('nbr_of_transactions', axis=1)
# test.drop('ID', axis=1, inplace= True)

In [270]:
train.shape

(2248884, 19)

## Checking correlation

In [271]:
corr = train.corr()

corr.style.background_gradient(cmap='coolwarm').set_precision(2)

Unnamed: 0,date,store_id,category_id,target,onpromotion,year,month,dayofmonth,dayofweek,dayofyear,weekofyear,quarter,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,year_weekofyear
date,1.0,-0.0,0.0,0.06,0.18,0.96,0.07,0.01,0.0,0.07,0.07,0.06,-0.0,0.0,-0.01,0.0,-0.01,0.01,0.99
store_id,-0.0,1.0,0.0,0.09,0.02,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0
category_id,0.0,0.0,1.0,0.07,0.03,0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0
target,0.06,0.09,0.07,1.0,0.44,0.05,0.02,-0.01,0.04,0.02,0.02,0.02,0.01,0.0,-0.0,0.0,-0.02,0.01,0.05
onpromotion,0.18,0.02,0.03,0.44,1.0,0.17,0.03,0.0,-0.0,0.03,0.03,0.03,-0.0,0.0,-0.01,0.0,-0.01,0.0,0.18
year,0.96,-0.0,0.0,0.05,0.17,1.0,-0.21,-0.02,0.0,-0.21,-0.21,-0.21,0.0,-0.0,0.0,-0.01,0.01,-0.01,0.99
month,0.07,-0.0,-0.0,0.02,0.03,-0.21,1.0,0.01,-0.01,1.0,0.97,0.97,-0.0,-0.0,-0.03,0.03,-0.08,0.08,-0.07
dayofmonth,0.01,-0.0,0.0,-0.01,0.0,-0.02,0.01,1.0,0.0,0.09,0.06,0.01,-0.31,0.31,-0.18,0.17,-0.09,0.09,-0.0
dayofweek,0.0,-0.0,0.0,0.04,-0.0,0.0,-0.01,0.0,1.0,-0.01,-0.01,-0.0,-0.0,-0.0,0.01,-0.04,0.02,0.01,-0.0
dayofyear,0.07,-0.0,-0.0,0.02,0.03,-0.21,1.0,0.09,-0.01,1.0,0.97,0.97,-0.03,0.03,-0.05,0.05,-0.09,0.09,-0.06


## Modelling

In [272]:
#Split into X and y
X = train.drop('target', axis=1)
y = train['target']

In [273]:
#Split into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2021)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1799107, 18)
(449777, 18)
(1799107,)
(449777,)


In [274]:
from lightgbm import LGBMRegressor
lgt = LGBMRegressor(boosting_type='gbdt', class_weight=None,
        colsample_bytree=0.9234, learning_rate=0.1, max_depth=-1,
        metric='None', min_child_samples=399, min_child_weight=0.1,
        min_split_gain=0.0, n_estimators=5000, n_jobs=4, num_leaves=13,
        objective=None, random_state=314, reg_alpha=2, reg_lambda=5,
        silent=True, subsample=0.855, subsample_for_bin=200000,
        subsample_freq=0)
lgt.fit(X_train, y_train)

LGBMRegressor(colsample_bytree=0.9234, metric='None', min_child_samples=399,
              min_child_weight=0.1, n_estimators=5000, n_jobs=4, num_leaves=13,
              random_state=314, reg_alpha=2, reg_lambda=5, subsample=0.855)

In [275]:
y_pred = lgt.predict(test)

In [277]:
y_pred = abs(y_pred)

In [278]:
y_pred_log = np.log1p(y_pred)

### Submission File Creation

In [88]:
new_test = pd.read_csv('Test.csv')

In [89]:
new_test = pd.merge(new_test, date, on = 'date', how = 'left')

In [92]:
new_test['yw'] = 'year_week'

In [93]:
new_ = new_test['yw'] + '_' + new_test['year_weekofyear'].astype('str') + '_' + new_test['store_id'].astype('str') + '_' + new_test['category_id'].astype('str')

In [279]:
new_

0        year_week_425_store_1_category_24
1        year_week_425_store_1_category_21
2        year_week_425_store_1_category_32
3        year_week_425_store_1_category_18
4        year_week_425_store_1_category_26
                       ...                
99787    year_week_432_store_9_category_23
99788    year_week_432_store_9_category_20
99789    year_week_432_store_9_category_15
99790    year_week_432_store_9_category_29
99791    year_week_432_store_9_category_10
Length: 99792, dtype: object

In [280]:
final = pd.DataFrame({'ID': new_, 'target': y_pred_log })

In [281]:
final.to_csv('maybe.csv', index = False)

## .....

In [121]:
from lightgbm import LGBMRegressor
lgt = LGBMRegressor()
lgt.fit(X_train, y_train)



LGBMRegressor(learning_rate=0.005, max_depth=-4, num_iterations=20,
              num_leaves=50)

In [122]:
y_pred = lgt.predict(test)

In [123]:
y_pred = abs(y_pred)

In [124]:
y_pred_log = np.log1p(y_pred)

In [125]:
final = pd.DataFrame({'ID': new_, 'target': y_pred_log })

In [126]:
final.to_csv('finale.csv', index= False)

## Expanding the model

In [127]:
parameters = { 'loss' : ['ls', 'lad', 'huber', 'quantile'],
              'learning_rate' : (0.05,0.25,0.50,1),
              'criterion' : ['friedman_mse', 'mse', 'mae'],
              'max_features' : ['auto', 'sqrt', 'log2']
             }

In [130]:
parameters

{'loss': ['ls', 'lad', 'huber', 'quantile'],
 'learning_rate': (0.05, 0.25, 0.5, 1),
 'criterion': ['friedman_mse', 'mse', 'mae'],
 'max_features': ['auto', 'sqrt', 'log2']}

In [131]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=10, stop=100, num=10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [2,5]
# Minimum number of samples requerid to split a node
min_samples_split = [2,4]
# Minimum number of samples requerid at each leaf node
min_samples_leaf = [1,2]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [282]:
#  Create a param grid
param_grid = {'n_estimators':n_estimators,
              'max_features':max_features,
              'max_depth':max_depth,
              'min_samples_split':min_samples_split,
              'min_samples_leaf':min_samples_leaf,
              'bootstrap':bootstrap}
param_grid

{'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
 'max_features': ['auto', 'sqrt', 'log2'],
 'max_depth': [2, 5],
 'min_samples_split': [2, 4],
 'min_samples_leaf': [1, 2],
 'bootstrap': [True, False]}

In [284]:
 rf_grid= GridSearchCV(estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None,
        colsample_bytree=0.9234, learning_rate=0.1, max_depth=-1,
        metric='None', min_child_samples=399, min_child_weight=0.1,
        min_split_gain=0.0, n_estimators=5000, n_jobs=4, num_leaves=13,
        objective=None, random_state=314, reg_alpha=2, reg_lambda=5,
        silent=True, subsample=0.855, subsample_for_bin=200000,
        subsample_freq=0), param_grid=param_grid, cv=3, verbose=0, n_jobs=4)

In [None]:
# Fit the model
rf_grid.fit(X_train, y_train)
rf_grid.best_params_

In [136]:
# Returning classification report on random forest model prediction
rf = LGBMRegressor(boosting_type='gbdt', class_weight=None,
        colsample_bytree=0.9234, learning_rate=0.1, max_depth=-1,
        metric='None', min_child_samples=399, min_child_weight=0.1,
        min_split_gain=0.0, n_estimators=5000, n_jobs=4, num_leaves=13,
        objective=None, random_state=314, reg_alpha=2, reg_lambda=5,
        silent=True, subsample=0.855, subsample_for_bin=200000,
        subsample_freq=0).set_params(**rf_grid.best_params_)
rf.fit(X_train, y_train)

LGBMRegressor(bootstrap=True, max_depth=5, max_features='auto',
              min_samples_leaf=1, min_samples_split=2)

In [149]:
predictions = rf.predict(X_test)

In [150]:
model_rf = rf.predict(test)

In [151]:
model_rf

array([   4.36155873,   19.01193319,   16.53329279, ..., 2751.70242716,
        242.38845713,   36.4009174 ])

In [None]:
final.groupby(['ID'])['target'].sum().reset_index()

In [143]:
model_rf = abs(model_rf)

In [144]:
model_rf_log = np.log1p(model_rf)

In [145]:
final = pd.DataFrame({'ID': new_, 'target': model_rf_log })

In [147]:
final.to_csv('model_rf.csv', index = False)

In [148]:
final.groupby(['ID'])['target'].sum().reset_index()

Unnamed: 0,ID,target
0,year_week_425_store_10_category_0,29.710238
1,year_week_425_store_10_category_1,24.700883
2,year_week_425_store_10_category_10,24.700883
3,year_week_425_store_10_category_11,24.700883
4,year_week_425_store_10_category_12,31.644321
...,...,...
14251,year_week_432_store_9_category_5,28.608878
14252,year_week_432_store_9_category_6,28.608878
14253,year_week_432_store_9_category_7,61.785235
14254,year_week_432_store_9_category_8,36.726636


In [161]:
LGBMRegressor(boosting_type='gbdt', class_weight=None,
        colsample_bytree=0.9234, learning_rate=0.1, max_depth=-1,
        metric='None', min_child_samples=399, min_child_weight=0.1,
        min_split_gain=0.0, n_estimators=5000, n_jobs=4, num_leaves=13,
        objective=None, random_state=314, reg_alpha=2, reg_lambda=5,
        silent=True, subsample=0.855, subsample_for_bin=200000,
        subsample_freq=0)

LGBMRegressor(colsample_bytree=0.9234, metric='None', min_child_samples=399,
              min_child_weight=0.1, n_estimators=5000, n_jobs=4, num_leaves=13,
              random_state=314, reg_alpha=2, reg_lambda=5, subsample=0.855)

In [162]:
reg = LGBMRegressor(boosting_type='gbdt', class_weight=None,
        colsample_bytree=0.9234, learning_rate=0.1, max_depth=-1,
        metric='None', min_child_samples=399, min_child_weight=0.1,
        min_split_gain=0.0, n_estimators=5000, n_jobs=4, num_leaves=13,
        objective=None, random_state=314, reg_alpha=2, reg_lambda=5,
        silent=True, subsample=0.855, subsample_for_bin=200000,
        subsample_freq=0)

In [163]:
reg.fit(X_train, y_train)

LGBMRegressor(colsample_bytree=0.9234, metric='None', min_child_samples=399,
              min_child_weight=0.1, n_estimators=5000, n_jobs=4, num_leaves=13,
              random_state=314, reg_alpha=2, reg_lambda=5, subsample=0.855)

In [164]:
y_pred = reg.predict(test)

array([ -11.4587582 ,    5.96342701,  -48.41554323, ..., 2120.32599176,
        134.84324932,    7.70816477])

In [165]:
y_pred = abs(y_pred)

In [166]:
y_pred = np.log1p(y_pred)

In [167]:
final = pd.DataFrame({'ID': new_, 'target': y_pred })

In [168]:
final.to_csv('new_gy.csv', index = False)