# Imports

In [1]:
# Data Manipulation, Linear Algebra
import pandas as pd
import numpy as np

#ignore warning messages 
import warnings
warnings.filterwarnings('ignore')

# Data Preprocessing

## Getting the Data

In [2]:
base_path = "../input/tabular-playground-series-jan-2022/"

train_data = pd.read_csv(base_path + "train.csv")
test_data = pd.read_csv(base_path + "test.csv")
sample_submission_data = pd.read_csv(base_path + "sample_submission.csv")

In [3]:
display(train_data, test_data)

Unnamed: 0,row_id,date,country,store,product,num_sold
0,0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329
1,1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520
2,2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146
3,3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572
4,4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911
...,...,...,...,...,...,...
26293,26293,2018-12-31,Sweden,KaggleMart,Kaggle Hat,823
26294,26294,2018-12-31,Sweden,KaggleMart,Kaggle Sticker,250
26295,26295,2018-12-31,Sweden,KaggleRama,Kaggle Mug,1004
26296,26296,2018-12-31,Sweden,KaggleRama,Kaggle Hat,1441


Unnamed: 0,row_id,date,country,store,product
0,26298,2019-01-01,Finland,KaggleMart,Kaggle Mug
1,26299,2019-01-01,Finland,KaggleMart,Kaggle Hat
2,26300,2019-01-01,Finland,KaggleMart,Kaggle Sticker
3,26301,2019-01-01,Finland,KaggleRama,Kaggle Mug
4,26302,2019-01-01,Finland,KaggleRama,Kaggle Hat
...,...,...,...,...,...
6565,32863,2019-12-31,Sweden,KaggleMart,Kaggle Hat
6566,32864,2019-12-31,Sweden,KaggleMart,Kaggle Sticker
6567,32865,2019-12-31,Sweden,KaggleRama,Kaggle Mug
6568,32866,2019-12-31,Sweden,KaggleRama,Kaggle Hat


## Concatenating Training and Testing Data

In [4]:
full_data = pd.concat([train_data.iloc[:, :-1], test_data])
full_data.reset_index(drop=True, inplace=True)
full_data

Unnamed: 0,row_id,date,country,store,product
0,0,2015-01-01,Finland,KaggleMart,Kaggle Mug
1,1,2015-01-01,Finland,KaggleMart,Kaggle Hat
2,2,2015-01-01,Finland,KaggleMart,Kaggle Sticker
3,3,2015-01-01,Finland,KaggleRama,Kaggle Mug
4,4,2015-01-01,Finland,KaggleRama,Kaggle Hat
...,...,...,...,...,...
32863,32863,2019-12-31,Sweden,KaggleMart,Kaggle Hat
32864,32864,2019-12-31,Sweden,KaggleMart,Kaggle Sticker
32865,32865,2019-12-31,Sweden,KaggleRama,Kaggle Mug
32866,32866,2019-12-31,Sweden,KaggleRama,Kaggle Hat


## Feature Engineering

In [5]:
# Credit to https://www.kaggle.com/ranjeetshrivastav/tps-jan-21-base-xgb

full_data['date'] = pd.to_datetime(full_data['date'])

full_data['year'] = full_data['date'].dt.year
full_data['month'] = full_data['date'].dt.month
full_data['day'] = full_data['date'].dt.day
full_data['dayofweek'] = full_data['date'].dt.dayofweek
full_data['dayofmonth'] = full_data['date'].dt.days_in_month
full_data['dayofyear'] = full_data['date'].dt.dayofyear
full_data['weekday'] = full_data['date'].dt.weekday

full_data.drop(columns=['date', 'row_id'], inplace = True)

In [6]:
full_data

Unnamed: 0,country,store,product,year,month,day,dayofweek,dayofmonth,dayofyear,weekday
0,Finland,KaggleMart,Kaggle Mug,2015,1,1,3,31,1,3
1,Finland,KaggleMart,Kaggle Hat,2015,1,1,3,31,1,3
2,Finland,KaggleMart,Kaggle Sticker,2015,1,1,3,31,1,3
3,Finland,KaggleRama,Kaggle Mug,2015,1,1,3,31,1,3
4,Finland,KaggleRama,Kaggle Hat,2015,1,1,3,31,1,3
...,...,...,...,...,...,...,...,...,...,...
32863,Sweden,KaggleMart,Kaggle Hat,2019,12,31,1,31,365,1
32864,Sweden,KaggleMart,Kaggle Sticker,2019,12,31,1,31,365,1
32865,Sweden,KaggleRama,Kaggle Mug,2019,12,31,1,31,365,1
32866,Sweden,KaggleRama,Kaggle Hat,2019,12,31,1,31,365,1


## OneHotEncoding Categorical Variables

In [7]:
full_data = pd.concat([pd.get_dummies(full_data[["country", "store", "product"]]), full_data[["year", "month", "day"]]], axis=1)

## Seperating Training and Testing Data

In [8]:
train = full_data.iloc[:len(train_data), :]
test = full_data.iloc[len(train_data):, :]

# Machine Learning Model
 - Code Credits to - https://www.kaggle.com/junhyeok99/catboost-baseline
 - TimeSeriesSplit Docs Link - https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html

In [9]:
# TimeSeriesSplit for Splitting the Data for training and validation data
from sklearn.model_selection import TimeSeriesSplit

# Different Mathematical functions to calculate the Accuracy of Regression Model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Using CatBoostRegressor, XGBRegressor and ExtraTreesRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor

In [10]:
# Initializing the TimeSeriesSplit
kfold = TimeSeriesSplit(5) # n_splits = 10

# List to store the predictions
test_pred = []

## Using CatBoostRegressor

In [11]:
for fold, (train_id, test_id) in enumerate(kfold.split(train)):
    print('<------- fold', fold+1, '------->')

    # Getting the Training and Validation Data
    x_train, y_train = train.iloc[train_id], train_data["num_sold"].iloc[train_id]
    x_valid, y_valid = train.iloc[test_id], train_data["num_sold"].iloc[test_id]
    
    # Initializing the CatBoostRegressor
    cat = CatBoostRegressor(silent = True, n_estimators = 10000)

    # Training the CatBoostRegressor
    cat.fit(x_train, y_train, early_stopping_rounds = 1500)

    # Predict the Values for x_valid
    y_pred = cat.predict(x_valid)

    # Calculating R^2 Score
    cat_r2_score = r2_score(y_valid, y_pred)

    # Calculating Mean Squared Error
    cat_mse = mean_squared_error(y_valid, y_pred)

    # Calculating Root Mean Squared Error
    cat_rmse = np.sqrt(cat_mse)

    # Calculating Mean Absolute Error
    cat_mae = mean_absolute_error(y_valid, y_pred)

    print( f"""
    Regressor Name                    : {cat.__class__.__name__}

    Regressor R^2 Score               : {cat_r2_score}

    Regressor Mean Squared Error      : {cat_mse}

    Regressor Root Mean Squared Error : {cat_rmse}

    Regressor Mean Absolute Error     : {cat_mae}
    """)

    # Making predictions on the Testing Data
    test_pred.append(cat.predict(test))

<------- fold 1 ------->

    Regressor Name                    : CatBoostRegressor

    Regressor R^2 Score               : 0.8047636439709696

    Regressor Mean Squared Error      : 12416.246119942181

    Regressor Root Mean Squared Error : 111.42821061087798

    Regressor Mean Absolute Error     : 63.175178808170486
    
<------- fold 2 ------->

    Regressor Name                    : CatBoostRegressor

    Regressor R^2 Score               : 0.890391776904395

    Regressor Mean Squared Error      : 5465.820898663002

    Regressor Root Mean Squared Error : 73.93119029653859

    Regressor Mean Absolute Error     : 51.48867056847899
    
<------- fold 3 ------->

    Regressor Name                    : CatBoostRegressor

    Regressor R^2 Score               : 0.9079188197384185

    Regressor Mean Squared Error      : 7139.651994127223

    Regressor Root Mean Squared Error : 84.49646142961977

    Regressor Mean Absolute Error     : 55.01636523986748
    
<------- fold 4 ----

## XGBRegressor

In [12]:
for fold, (train_id, test_id) in enumerate(kfold.split(train)):
    print('<------- fold', fold+1, '------->')

    # Getting the Training and Validation Data
    x_train, y_train = train.iloc[train_id], train_data["num_sold"].iloc[train_id]
    x_valid, y_valid = train.iloc[test_id], train_data["num_sold"].iloc[test_id]
    
    # Initializing the XGBRegressor
    xgbr = XGBRegressor(silent = True, n_estimators = 10000)

    # Training the XGBRegressor
    xgbr.fit(x_train, y_train, verbose = 1000)

    # Predict the Values for x_valid
    y_pred = xgbr.predict(x_valid)

    # Calculating R^2 Score
    xgbr_r2_score = r2_score(y_valid, y_pred)

    # Calculating Mean Squared Error
    xgbr_mse = mean_squared_error(y_valid, y_pred)

    # Calculating Root Mean Squared Error
    xgbr_rmse = np.sqrt(xgbr_mse)

    # Calculating Mean Absolute Error
    xgbr_mae = mean_absolute_error(y_valid, y_pred)

    print( f"""
    Regressor Name                    : {xgbr.__class__.__name__}

    Regressor R^2 Score               : {xgbr_r2_score}

    Regressor Mean Squared Error      : {xgbr_mse}

    Regressor Root Mean Squared Error : {xgbr_rmse}

    Regressor Mean Absolute Error     : {xgbr_mae}
    """)

    # Making predictions on the Testing Data
    test_pred.append(xgbr.predict(test))

<------- fold 1 ------->
Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.



    Regressor Name                    : XGBRegressor

    Regressor R^2 Score               : 0.7679480880775704

    Regressor Mean Squared Error      : 14757.567236112583

    Regressor Root Mean Squared Error : 121.48072783825664

    Regressor Mean Absolute Error     : 69.71864294130019
    
<------- fold 2 ------->
Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.



    Regressor Name       

In [13]:
submission_num_sold = np.mean(test_pred, axis = 0)

# Submission File

In [14]:
sample_submission_data["num_sold"] = submission_num_sold
sample_submission_data.to_csv("submission.csv", index=False)