In [3]:
## Importing required libraries
import pandas as pd #for data preprocessing

#Data Visualization Libraries
import seaborn as sns
import matplotlib.pyplot as plt

#Linear Algebra
import numpy as np
 
#Import Datetime module
from datetime import datetime

from sklearn.model_selection import KFold,StratifiedKFold,GridSearchCV,RandomizedSearchCV, train_test_split #For splitting
from sklearn.linear_model import Ridge

#Evaluation Metrics
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, mean_squared_error, explained_variance_score, r2_score, mean_absolute_error

#To ignore unnecessary warnings
import warnings

from sklearn.preprocessing import LabelEncoder,OneHotEncoder # for encoding categorical variables
from sklearn.linear_model import LogisticRegression

In [4]:
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
ss = pd.read_csv('SampleSubmission.csv')
date = pd.read_csv('dates.csv')
store = pd.read_csv('stores.csv')

In [5]:
print(f'The Size of the train set is: {train.shape}, and the size of the test set is {test.shape}')

The Size of the train set is: (2248884, 6), and the size of the test set is (99792, 4)


In [6]:
print(train.shape)
print(test.shape)
print(ss.shape)
print(date.shape)

(2248884, 6)
(99792, 4)
(14256, 2)
(1320, 15)


In [7]:
train.columns

Index(['date', 'store_id', 'category_id', 'target', 'onpromotion',
       'nbr_of_transactions'],
      dtype='object')

In [8]:
test.columns

Index(['date', 'store_id', 'category_id', 'onpromotion'], dtype='object')

In [9]:
train.head()

Unnamed: 0,date,store_id,category_id,target,onpromotion,nbr_of_transactions
0,365,store_1,category_24,0.0,0,0.0
1,365,store_1,category_21,0.0,0,0.0
2,365,store_1,category_32,0.0,0,0.0
3,365,store_1,category_18,0.0,0,0.0
4,365,store_1,category_26,0.0,0,0.0


In [10]:
train['nbr_of_transactions'].unique()

array([   0.,  840.,  487., ..., 4855., 5246., 4459.])

In [11]:
test.head(2)

Unnamed: 0,date,store_id,category_id,onpromotion
0,1627,store_1,category_24,0
1,1627,store_1,category_21,0


In [12]:
ss.head()

Unnamed: 0,ID,target
0,year_week_425_store_1_category_0,0
1,year_week_426_store_1_category_0,0
2,year_week_427_store_1_category_0,1
3,year_week_428_store_1_category_0,3
4,year_week_429_store_1_category_0,2


### Data Processing

In [13]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2248884 entries, 0 to 2248883
Data columns (total 6 columns):
 #   Column               Dtype  
---  ------               -----  
 0   date                 int64  
 1   store_id             object 
 2   category_id          object 
 3   target               float64
 4   onpromotion          int64  
 5   nbr_of_transactions  float64
dtypes: float64(2), int64(2), object(2)
memory usage: 102.9+ MB


In [14]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99792 entries, 0 to 99791
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   date         99792 non-null  int64 
 1   store_id     99792 non-null  object
 2   category_id  99792 non-null  object
 3   onpromotion  99792 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 3.0+ MB


### Checking for null values

In [15]:
train.isnull().sum()

date                   0
store_id               0
category_id            0
target                 0
onpromotion            0
nbr_of_transactions    0
dtype: int64

In [16]:
test.isnull().sum()

date           0
store_id       0
category_id    0
onpromotion    0
dtype: int64

### Dropping some columns

In [17]:
# Drop ID from train and test
train = train.drop('nbr_of_transactions', axis=1)
# test.drop('ID', axis=1, inplace= True)

### Label Encoding

In [18]:
train['store_id'] = train['store_id'].astype('category')
train['store_id'] = train['store_id'].cat.codes

In [19]:
train['category_id'] = train['category_id'].astype('category')
train['category_id'] = train['category_id'].cat.codes

In [20]:
test['store_id'] = test['store_id'].astype('category')
test['store_id'] = test['store_id'].cat.codes

In [21]:
test['category_id'] = test['category_id'].astype('category')
test['category_id'] = test['category_id'].cat.codes

In [22]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2248884 entries, 0 to 2248883
Data columns (total 5 columns):
 #   Column       Dtype  
---  ------       -----  
 0   date         int64  
 1   store_id     int8   
 2   category_id  int8   
 3   target       float64
 4   onpromotion  int64  
dtypes: float64(1), int64(2), int8(2)
memory usage: 55.8 MB


In [23]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99792 entries, 0 to 99791
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   date         99792 non-null  int64
 1   store_id     99792 non-null  int8 
 2   category_id  99792 non-null  int8 
 3   onpromotion  99792 non-null  int64
dtypes: int64(2), int8(2)
memory usage: 1.7 MB


## Checking correlation

In [24]:
corr = train.corr()

corr.style.background_gradient(cmap='coolwarm').set_precision(2)

Unnamed: 0,date,store_id,category_id,target,onpromotion
date,1.0,-0.0,0.0,0.06,0.18
store_id,-0.0,1.0,0.0,0.09,0.02
category_id,0.0,0.0,1.0,0.07,0.03
target,0.06,0.09,0.07,1.0,0.44
onpromotion,0.18,0.02,0.03,0.44,1.0


## Modelling

In [25]:
#Split into X and y
X = train.drop('target', axis=1)
y = train['target']

In [26]:
#Split into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2021)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1799107, 4)
(449777, 4)
(1799107,)
(449777,)


## Model Building

### 1. Logistic Regression

In [57]:
from sklearn.linear_model import LogisticRegression
# instantiate
logreg = LogisticRegression()
# train/fit
logreg.fit(X_train, y_train)
# predict
log_reg_pred = logreg.predict(X_test)

ValueError: Unknown label type: 'continuous'

In [101]:
from xgboost import XGBRegressor
xgb_model = XGBRegressor(n_estimators=50, learning_rate=0.01,)
xgb_model.fit(X_train, y_train, verbose=200)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.01, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=50, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [102]:
# prediction
xgb_pred = xgb_model.predict(X_test)

In [62]:
# validation
y_pred = xgb_model.predict(test)

In [80]:
y_pred_log = np.log1p(y_pred)

In [81]:
y_pred_log

array([1.7425308, 2.627808 , 3.0354896, ..., 6.014101 , 4.8341427,
       1.682076 ], dtype=float32)

In [123]:
mean_squared_error(y_test, xgb_pred)

1570449.832827278

In [124]:
np.sqrt(mean_squared_error(y_test, xgb_pred))

1253.175898598149

### Submission file

In [75]:
test['yw'] = 'year_week'

In [78]:
new_ = test['yw'] + '_' + test_df['year_weekofyear'].astype('str') + '_' + test['store_id'].astype('str') + '_' + test['category_id'].astype('str')

In [82]:
final = pd.DataFrame({'ID': new_, 'target': y_pred_log })

In [83]:
#Save to Csv for submission
final.to_csv('baseline.csv', index=False)

## Expanding on this model

In [90]:
# import shap
from sklearn.model_selection import RepeatedStratifiedKFold

In [91]:
# define search
space = dict()

search = GridSearchCV(xgb_model, space)

In [92]:
# define evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define search
search = GridSearchCV(xgb_model, space,scoring='accuracy', n_jobs=-1, cv=cv)

In [93]:
# execute search
result = search.fit(X_train, y_train)

ValueError: Supported target types are: ('binary', 'multiclass'). Got 'continuous' instead.

# Doing Something Crazy

In [27]:
#Perform Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_sc = sc.fit_transform(X)

#Splitting the data into train and test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_sc, y, test_size=0.2, random_state=42)

In [28]:
from time import time

from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [29]:
regressors = [
    KNeighborsRegressor(),
    GradientBoostingRegressor(),
    KNeighborsRegressor(),
    ExtraTreesRegressor(),
    RandomForestRegressor(),
    DecisionTreeRegressor(),
    LinearRegression(),
    Lasso(),
    Ridge()
]

In [None]:
head = 10
for model in regressors[:head]:
    start = time()
    model.fit(X_train, y_train)
    train_time = time() - start
    start = time()
    y_pred = model.predict(X_test)
    predict_time = time()-start    
    print(model)
    print("\tTraining time: %0.3fs" % train_time)
    print("\tPrediction time: %0.3fs" % predict_time)
    print("\tExplained variance:", explained_variance_score(y_test, y_pred))
    print("\tMean absolute error:", mean_absolute_error(y_test, y_pred))
    print("\tR2 score:", r2_score(y_test, y_pred))
    print("\tMean squared error:", mean_squared_error(y_test, y_pred))
    print("\tRoot mean squared error:", np.sqrt(mean_squared_error(y_test, xgb_pred)))
    print()

## Grid Search the best model

In [None]:
parameters = { 'loss' : ['ls', 'lad', 'huber', 'quantile'],
              'learning_rate' : (0.05,0.25,0.50,1),
              'criterion' : ['friedman_mse', 'mse', 'mae'],
              'max_features' : ['auto', 'sqrt', 'log2']
             }

In [None]:
grid = GridSearchCV(GradientBoostingRegressor(),parameters)
model = grid.fit(X_sc,y)
print(model.best_params_,'\n')
print(model.best_estimator_,'\n')

In [None]:
{'criterion': 'friedman_mse', 'learning_rate': 0.25, 'loss': 'lad', 'max_features': 'sqrt'} 

GradientBoostingRegressor(learning_rate=0.25, loss='lad', max_features='sqrt') 

## 2. Catboost Regressor

In [None]:
from catboost import CatBoostRegressor
#Creating model 
cat_model=CatBoostRegressor()

#Training Catboost Model on train set
cat_model.fit(X_train,y_train)

#Predictiing on Test Set
y_pred_cat=cat_model.predict(X_test)

#Evaluating model using f1_score
# print("Catboost F1 score on validation set is : ",f1_score(y_test,y_pred_cat))

### Concatenating the original datasets with the date.csv

In [23]:
train_df = pd.merge(train, date, on = 'date', how = 'left')
test_df = pd.merge(test, date, on = 'date', how = 'left')

In [22]:
train_df.head(2)

Unnamed: 0,date,store_id,category_id,target,onpromotion,nbr_of_transactions,year,month,dayofmonth,dayofweek,dayofyear,weekofyear,quarter,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,year_weekofyear
0,365,store_1,category_24,0.0,0,0.0,1,1,1,2,1,1,1,True,False,True,False,True,False,101
1,365,store_1,category_21,0.0,0,0.0,1,1,1,2,1,1,1,True,False,True,False,True,False,101


In [66]:
test_df.head(2)

Unnamed: 0,date,store_id,category_id,onpromotion,year,month,dayofmonth,dayofweek,dayofyear,weekofyear,quarter,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,year_weekofyear
0,1627,store_1,category_24,0,4,6,19,0,170,25,2,False,False,False,False,False,False,425
1,1627,store_1,category_21,0,4,6,19,0,170,25,2,False,False,False,False,False,False,425


In [26]:
print(train_df.shape)
print(test_df.shape)

(2248884, 20)
(99792, 18)


In [None]:
from sklearn import metrics

In [None]:
metrics.mean_squared_error(actual, predicted, squared = False)

In [None]:
np.sqrt(mean_squared_error(actual, predicted))