# DS-2.2 Baseline modeling pt.2

## **Description:**

Time to create wonders!! We are finally ready to produce our first ML model to meet customer requirements:
- Create your feature extraction step in accordance with our validation schema
- Integrate your extraction algorithm into your validation class to ensure there is no data leakage
- Create several ML models: 
    - Linear regression, SVR, Random Forest, XGBoost and play with them, find the best model and make your first submission

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import sys
sys.path.append('../')
from scripts.etl import transform_df_types
from scripts.baseline_modeling import TimeSeriesSplit, TrainModels # baseline_modeling.py module

# Load the train dataset with extracted features

In [2]:
# Read the training table that already contains the engineered features.
train_df = pd.read_csv('../data/result_train.csv')

# Split the column names by dtype so that each group can be cast explicitly.
float_columns = list(train_df.select_dtypes(include=np.number).columns)
object_columns = list(train_df.select_dtypes(include=object).columns)

# Every numeric column is handed over as a float column; no int columns are requested.
train_df = transform_df_types(
    train_df,
    int_columns=[],
    float_columns=float_columns,
    object_columns=object_columns,
)
train_df.head(3)

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_price,month,year,item_name,item_category_id,item_category_name,...,item_cnt_month_cat_shop__lag_2,item_cnt_month_cat_shop__lag_3,days,avg_item_price,avg_item_price_lag_1,avg_item_price_lag_2,avg_item_price_lag_3,avg_item_price_lag_4,avg_item_price_lag_5,avg_item_price_lag_6
0,0.0,2.0,27.0,1.0,2499.0,0.0,0.0,"007 Legends [PS3, русская версия]",19.0,Игры - PS3,...,0.0,0.0,31.0,2325.0,0.0,0.0,0.0,0.0,0.0,0.0
1,17.0,2.0,27.0,1.0,498.0,5.0,1.0,"007 Legends [PS3, русская версия]",19.0,Игры - PS3,...,1.742424,1.970149,30.0,498.0,498.0,0.0,998.0,0.0,998.0,1048.0
2,2.0,2.0,30.0,1.0,359.0,2.0,0.0,007: КООРДИНАТЫ «СКАЙФОЛЛ»,40.0,Кино - DVD,...,1.136364,0.0,31.0,388.716522,382.645966,0.0,0.0,0.0,0.0,0.0


# Clean train set from unneeded columns

In [3]:
# Only numeric columns are kept; text columns are left out of the model input.
train = train_df.select_dtypes(include=np.number)

# Target variable.
y = train['item_cnt_month']

# Remove the target together with the price/revenue columns.
# NOTE(review): presumably these are current-month values that would leak the
# target - confirm against the feature extraction step.
X = train.drop(
    columns=['item_cnt_month', 'item_price', 'revenue', 'avg_item_price'],
)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1608724 entries, 0 to 1608723
Data columns (total 30 columns):
 #   Column                          Non-Null Count    Dtype  
---  ------                          --------------    -----  
 0   date_block_num                  1608724 non-null  float32
 1   shop_id                         1608724 non-null  float32
 2   item_id                         1608724 non-null  float32
 3   month                           1608724 non-null  float32
 4   year                            1608724 non-null  float32
 5   item_category_id                1608724 non-null  float32
 6   months_since_last_sale          1608724 non-null  float32
 7   revenue_lag_1                   1608724 non-null  float32
 8   revenue_lag_2                   1608724 non-null  float32
 9   revenue_lag_3                   1608724 non-null  float32
 10  revenue_lag_6                   1608724 non-null  float32
 11  revenue_lag_12                  1608724 non-null  float32
 12  

# Dummy regressor

In [4]:
from sklearn.dummy import DummyRegressor

def train_dummy_regressor(X, y, dummy_strategy='median', cv_method="expanding", cv_n_splits=5, return_scores=False, return_model=False):
    """
    Cross-validate a ``DummyRegressor`` baseline over time-series splits.

    For every split produced by ``TimeSeriesSplit`` a fresh dummy model is
    fitted on the training part and scored with RMSE on the test part. The
    per-split RMSE and the average RMSE are printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Must contain a ``date_block_num`` column: the full
        frame is passed to the splitter, and the column is removed before
        fitting.
    y : pandas.Series
        Target values aligned with ``X``.
    dummy_strategy : str
        Forwarded to ``DummyRegressor(strategy=...)``.
    cv_method : str
        Forwarded to ``TimeSeriesSplit(method=...)``.
    cv_n_splits : int
        Forwarded to ``TimeSeriesSplit(n_splits=...)``.
    return_scores : bool
        Include the list of per-split RMSE values in the result.
    return_model : bool
        Include the model fitted on the last split in the result.

    Returns
    -------
    None or tuple
        - ``None`` when neither flag is set
        - ``(model,)`` when only ``return_model`` is set
        - ``(scores,)`` when only ``return_scores`` is set
        - ``(model, scores)`` when both are set

        Whenever something is returned it is wrapped in a tuple, so a single
        requested item still has to be unpacked (``model, = ...``).
    """

    scores = []
    tscv = TimeSeriesSplit(n_splits=cv_n_splits, method=cv_method)

    # Removing the split column does not depend on the split, so it is done
    # once here instead of copying the whole frame on every iteration.
    features = X.drop(columns="date_block_num")

    for train_idx, test_idx in tscv.split(X):
        # The indices are applied with plain [] on both the frame and the
        # series - presumably boolean row masks from the project's splitter;
        # TODO confirm.
        X_train, X_test = features[train_idx], features[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model = DummyRegressor(strategy=dummy_strategy)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        score = np.sqrt(mean_squared_error(y_test, y_pred))
        scores.append(score)

        print(f"{len(scores)} split RMSE: {score:.2f}\n")

    print(f"Average RMSE: {np.mean(scores):.2f}")

    if not (return_scores or return_model):
        return None

    # Order is fixed: model first, then scores.
    to_return = []
    if return_model:
        to_return.append(model)
    if return_scores:
        to_return.append(scores)
    return tuple(to_return)

In [8]:
# Naive baseline: predict the training mean on each of the 5 expanding splits.
# Neither return flag is set, so the call only prints the RMSE values.
train_dummy_regressor(X, y, dummy_strategy='mean', cv_method="expanding", cv_n_splits=5)

1 split RMSE: 2.23

2 split RMSE: 2.03

3 split RMSE: 2.10

4 split RMSE: 2.56

5 split RMSE: 2.54

Average RMSE: 2.29


# Linear regression

In [5]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error

In [6]:
def train_linear_regression(X, y, regularization=None, alpha=0.1, scaler=StandardScaler(), cv_method="expanding", cv_n_splits=5, return_scores=False, return_model=False):
    """
    Cross-validate a linear model (plain, Lasso or Ridge) over time-series splits.

    For every split produced by ``TimeSeriesSplit`` the features are optionally
    scaled (scaler fitted on the training part only), a fresh model is fitted
    and scored with RMSE on the test part. The per-split RMSE and the average
    RMSE are printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Must contain a ``date_block_num`` column: the full
        frame is passed to the splitter, and the column is removed before
        fitting.
    y : pandas.Series
        Target values aligned with ``X``.
    regularization : {None, 'lasso', 'ridge'}
        ``None`` fits ``LinearRegression``; ``'lasso'`` / ``'ridge'`` fit the
        corresponding regularized model.
    alpha : float
        Regularization strength for Lasso/Ridge; unused when
        ``regularization`` is ``None``.
    scaler : transformer or None
        Object with ``fit_transform`` / ``transform``. It is refitted in place
        on every split, so after the call it holds the fit of the last split.
        ``None`` disables scaling. The default instance is created once when
        the function is defined and is therefore shared between calls.
    cv_method : str
        Forwarded to ``TimeSeriesSplit(method=...)``.
    cv_n_splits : int
        Forwarded to ``TimeSeriesSplit(n_splits=...)``.
    return_scores : bool
        Include the list of per-split RMSE values in the result.
    return_model : bool
        Include the model fitted on the last split in the result.

    Returns
    -------
    None or tuple
        - ``None`` when neither flag is set
        - ``(model,)`` when only ``return_model`` is set
        - ``(scores,)`` when only ``return_scores`` is set
        - ``(model, scores)`` when both are set

        Whenever something is returned it is wrapped in a tuple, so a single
        requested item still has to be unpacked (``model, = ...``).

    Raises
    ------
    ValueError
        If ``regularization`` is not one of the supported values.
    """

    # Fail early with a clear message; previously an unsupported value left
    # ``model`` unassigned and surfaced as an UnboundLocalError inside the loop.
    if regularization is not None and regularization not in ('lasso', 'ridge'):
        raise ValueError(
            f"Unknown regularization {regularization!r}; expected None, 'lasso' or 'ridge'"
        )

    scores = []
    tscv = TimeSeriesSplit(n_splits=cv_n_splits, method=cv_method)

    # Removing the split column does not depend on the split, so it is done
    # once here instead of copying the whole frame on every iteration.
    features = X.drop(columns="date_block_num")

    for train_idx, test_idx in tscv.split(X):
        # The indices are applied with plain [] on both the frame and the
        # series - presumably boolean row masks from the project's splitter;
        # TODO confirm.
        X_train, X_test = features[train_idx], features[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        if scaler is not None:
            # Fit on the training part only so test statistics do not leak.
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

        if regularization is None:
            model = LinearRegression()
        elif regularization == 'lasso':
            model = Lasso(alpha=alpha)
        else:
            model = Ridge(alpha=alpha)

        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        score = np.sqrt(mean_squared_error(y_test, y_pred))
        scores.append(score)

        print(f"{len(scores)} split RMSE: {score:.2f}\n")

    print(f"Average RMSE: {np.mean(scores):.2f}")

    if not (return_scores or return_model):
        return None

    # Order is fixed: model first, then scores.
    to_return = []
    if return_model:
        to_return.append(model)
    if return_scores:
        to_return.append(scores)
    return tuple(to_return)

In [None]:
# Ordinary least squares on standardized features.
# With only return_model=True the function returns a 1-tuple, so `lr` is
# `(model,)` rather than the bare estimator.
lr = train_linear_regression(X, y, scaler=StandardScaler(), return_model=True)

1 split RMSE: 1.77

2 split RMSE: 1.53

3 split RMSE: 1.64

4 split RMSE: 2.14

5 split RMSE: 2.24

Average RMSE: 1.86


In [None]:
# Lasso (alpha=0.5) on standardized features.
# With only return_model=True the function returns a 1-tuple, so `lasso` is
# `(model,)` rather than the bare estimator.
lasso = train_linear_regression(X, y, regularization='lasso', scaler=StandardScaler(), alpha=0.5, return_model=True)

1 split RMSE: 1.87

2 split RMSE: 1.65

3 split RMSE: 1.77

4 split RMSE: 2.26

5 split RMSE: 2.30

Average RMSE: 1.97


In [None]:
# Ridge (alpha=0.5) on standardized features.
# With only return_model=True the function returns a 1-tuple, so `ridge` is
# `(model,)` rather than the bare estimator.
ridge = train_linear_regression(X, y, regularization='ridge', scaler=StandardScaler(), alpha=0.5, return_model=True)

1 split RMSE: 1.77

2 split RMSE: 1.53

3 split RMSE: 1.64

4 split RMSE: 2.14

5 split RMSE: 2.24

Average RMSE: 1.86


# Support Vector Regression

In [7]:
from sklearn.svm import LinearSVR

In [8]:
def train_svr(X, y, C=1, epsilon=0.1, scaler=StandardScaler(), cv_method="expanding", cv_n_splits=1, return_scores=False, return_model=False):
    """
    Cross-validate a ``LinearSVR`` model over time-series splits.

    For every split produced by ``TimeSeriesSplit`` the features are optionally
    scaled (scaler fitted on the training part only), a fresh model is fitted
    and scored with RMSE on the test part. The per-split RMSE and the average
    RMSE are printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Must contain a ``date_block_num`` column: the full
        frame is passed to the splitter, and the column is removed before
        fitting.
    y : pandas.Series
        Target values aligned with ``X``.
    C : float
        Forwarded to ``LinearSVR(C=...)``.
    epsilon : float
        Forwarded to ``LinearSVR(epsilon=...)``.
    scaler : transformer or None
        Object with ``fit_transform`` / ``transform``. It is refitted in place
        on every split, so after the call it holds the fit of the last split.
        ``None`` disables scaling. The default instance is created once when
        the function is defined and is therefore shared between calls.
    cv_method : str
        Forwarded to ``TimeSeriesSplit(method=...)``.
    cv_n_splits : int
        Forwarded to ``TimeSeriesSplit(n_splits=...)``; defaults to a single
        split.
    return_scores : bool
        Include the list of per-split RMSE values in the result.
    return_model : bool
        Include the model fitted on the last split in the result.

    Returns
    -------
    None or tuple
        - ``None`` when neither flag is set
        - ``(model,)`` when only ``return_model`` is set
        - ``(scores,)`` when only ``return_scores`` is set
        - ``(model, scores)`` when both are set

        Whenever something is returned it is wrapped in a tuple, so a single
        requested item still has to be unpacked (``model, = ...``).
    """

    scores = []
    tscv = TimeSeriesSplit(n_splits=cv_n_splits, method=cv_method)

    # Removing the split column does not depend on the split, so it is done
    # once here instead of copying the whole frame on every iteration.
    features = X.drop(columns="date_block_num")

    for train_idx, test_idx in tscv.split(X):
        # The indices are applied with plain [] on both the frame and the
        # series - presumably boolean row masks from the project's splitter;
        # TODO confirm.
        X_train, X_test = features[train_idx], features[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        if scaler is not None:
            # Fit on the training part only so test statistics do not leak.
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

        model = LinearSVR(C=C, epsilon=epsilon)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        score = np.sqrt(mean_squared_error(y_test, y_pred))
        scores.append(score)

        print(f"{len(scores)} split RMSE: {score:.2f}\n")

    print(f"Average RMSE: {np.mean(scores):.2f}")

    if not (return_scores or return_model):
        return None

    # Order is fixed: model first, then scores.
    to_return = []
    if return_model:
        to_return.append(model)
    if return_scores:
        to_return.append(scores)
    return tuple(to_return)

In [None]:
# Linear SVR on standardized features, evaluated on the default single split.
# Neither return flag is set, so the call only prints the RMSE values.
train_svr(X, y, C=1, epsilon=0.1, scaler=StandardScaler())



1 split RMSE: 2.32

Average RMSE: 2.32




SVR required 9 minutes of training with a linear kernel and a single validation split.

# Random Forest Regressor

In [10]:
from sklearn.ensemble import RandomForestRegressor

In [12]:
def train_rfr(X, y, max_depth=15, n_estimators=100, min_samples_leaf=5, random_state=42, scaler=StandardScaler(), cv_method="expanding", cv_n_splits=1, return_scores=False, return_model=False):
    """
    Cross-validate a ``RandomForestRegressor`` over time-series splits.

    For every split produced by ``TimeSeriesSplit`` the features are optionally
    scaled (scaler fitted on the training part only), a fresh forest is fitted
    using all available cores (``n_jobs=-1``) and scored with RMSE on the test
    part. The per-split RMSE and the average RMSE are printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Must contain a ``date_block_num`` column: the full
        frame is passed to the splitter, and the column is removed before
        fitting.
    y : pandas.Series
        Target values aligned with ``X``.
    max_depth, n_estimators, min_samples_leaf, random_state
        Forwarded unchanged to ``RandomForestRegressor``.
    scaler : transformer or None
        Object with ``fit_transform`` / ``transform``. It is refitted in place
        on every split, so after the call it holds the fit of the last split.
        ``None`` disables scaling. The default instance is created once when
        the function is defined and is therefore shared between calls.
    cv_method : str
        Forwarded to ``TimeSeriesSplit(method=...)``.
    cv_n_splits : int
        Forwarded to ``TimeSeriesSplit(n_splits=...)``; defaults to a single
        split.
    return_scores : bool
        Include the list of per-split RMSE values in the result.
    return_model : bool
        Include the model fitted on the last split in the result.

    Returns
    -------
    None or tuple
        - ``None`` when neither flag is set
        - ``(model,)`` when only ``return_model`` is set
        - ``(scores,)`` when only ``return_scores`` is set
        - ``(model, scores)`` when both are set

        Whenever something is returned it is wrapped in a tuple, so a single
        requested item still has to be unpacked (``model, = ...``).
    """

    scores = []
    tscv = TimeSeriesSplit(n_splits=cv_n_splits, method=cv_method)

    # Removing the split column does not depend on the split, so it is done
    # once here instead of copying the whole frame on every iteration.
    features = X.drop(columns="date_block_num")

    for train_idx, test_idx in tscv.split(X):
        # The indices are applied with plain [] on both the frame and the
        # series - presumably boolean row masks from the project's splitter;
        # TODO confirm.
        X_train, X_test = features[train_idx], features[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        if scaler is not None:
            # Fit on the training part only so test statistics do not leak.
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

        model = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_leaf=min_samples_leaf,
            random_state=random_state,
            n_jobs=-1,
        )
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        score = np.sqrt(mean_squared_error(y_test, y_pred))
        scores.append(score)

        print(f"{len(scores)} split RMSE: {score:.2f}\n")

    print(f"Average RMSE: {np.mean(scores):.2f}")

    if not (return_scores or return_model):
        return None

    # Order is fixed: model first, then scores.
    to_return = []
    if return_model:
        to_return.append(model)
    if return_scores:
        to_return.append(scores)
    return tuple(to_return)

In [None]:
# Random forest evaluated on the default single split. `scaler` is not passed,
# so the function's default StandardScaler instance is used.
# With only return_model=True the function returns a 1-tuple, so `rfr` is
# `(model,)` rather than the bare estimator.
rfr = train_rfr(X, y, max_depth=15, n_estimators=100, min_samples_leaf=5, random_state=42, return_model=True)

1 split RMSE: 1.97

Average RMSE: 1.97


# CatBoost

In [14]:
from catboost import CatBoostRegressor

In [15]:
def train_catboost(X, y, scaler=StandardScaler(), cv_method="expanding", cv_n_splits=5, return_scores=False, return_model=False):
    """
    Cross-validate a ``CatBoostRegressor`` (RMSE loss) over time-series splits.

    For every split produced by ``TimeSeriesSplit`` the features are optionally
    scaled (scaler fitted on the training part only), a fresh model is fitted
    and scored with RMSE on the test part. The per-split RMSE and the average
    RMSE are printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Must contain a ``date_block_num`` column: the full
        frame is passed to the splitter, and the column is removed before
        fitting.
    y : pandas.Series
        Target values aligned with ``X``.
    scaler : transformer or None
        Object with ``fit_transform`` / ``transform``. It is refitted in place
        on every split, so after the call it holds the fit of the last split.
        ``None`` disables scaling. The default instance is created once when
        the function is defined and is therefore shared between calls.
    cv_method : str
        Forwarded to ``TimeSeriesSplit(method=...)``.
    cv_n_splits : int
        Forwarded to ``TimeSeriesSplit(n_splits=...)``.
    return_scores : bool
        Include the list of per-split RMSE values in the result.
    return_model : bool
        Include the model fitted on the last split in the result.

    Returns
    -------
    None or tuple
        - ``None`` when neither flag is set
        - ``(model,)`` when only ``return_model`` is set
        - ``(scores,)`` when only ``return_scores`` is set
        - ``(model, scores)`` when both are set

        Whenever something is returned it is wrapped in a tuple, so a single
        requested item still has to be unpacked (``model, = ...``).
    """

    scores = []
    tscv = TimeSeriesSplit(n_splits=cv_n_splits, method=cv_method)

    # Removing the split column does not depend on the split, so it is done
    # once here instead of copying the whole frame on every iteration.
    features = X.drop(columns="date_block_num")

    for train_idx, test_idx in tscv.split(X):
        # The indices are applied with plain [] on both the frame and the
        # series - presumably boolean row masks from the project's splitter;
        # TODO confirm.
        X_train, X_test = features[train_idx], features[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        if scaler is not None:
            # Fit on the training part only so test statistics do not leak.
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

        model = CatBoostRegressor(loss_function="RMSE")
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        score = np.sqrt(mean_squared_error(y_test, y_pred))
        scores.append(score)

        print(f"{len(scores)} split RMSE: {score:.2f}\n")

    print(f"Average RMSE: {np.mean(scores):.2f}")

    if not (return_scores or return_model):
        return None

    # Order is fixed: model first, then scores.
    to_return = []
    if return_model:
        to_return.append(model)
    if return_scores:
        to_return.append(scores)
    return tuple(to_return)

In [16]:
# CatBoost on unscaled features (scaler=None), 5 expanding splits by default.
# With only return_model=True the function returns a 1-tuple, so
# `catboost_model` is `(model,)` rather than the bare estimator.
catboost_model = train_catboost(X, y, scaler=None, return_model=True)

Learning rate set to 0.129303
0:	learn: 2.4983178	total: 143ms	remaining: 2m 23s
1:	learn: 2.4100819	total: 233ms	remaining: 1m 56s
2:	learn: 2.3370619	total: 326ms	remaining: 1m 48s
3:	learn: 2.2790311	total: 416ms	remaining: 1m 43s
4:	learn: 2.2319430	total: 506ms	remaining: 1m 40s
5:	learn: 2.1917977	total: 597ms	remaining: 1m 38s
6:	learn: 2.1602691	total: 699ms	remaining: 1m 39s
7:	learn: 2.1341431	total: 781ms	remaining: 1m 36s
8:	learn: 2.1123324	total: 867ms	remaining: 1m 35s
9:	learn: 2.0947082	total: 956ms	remaining: 1m 34s
10:	learn: 2.0802136	total: 1.04s	remaining: 1m 33s
11:	learn: 2.0678869	total: 1.13s	remaining: 1m 33s
12:	learn: 2.0576614	total: 1.22s	remaining: 1m 32s
13:	learn: 2.0477718	total: 1.31s	remaining: 1m 32s
14:	learn: 2.0405832	total: 1.4s	remaining: 1m 31s
15:	learn: 2.0337211	total: 1.48s	remaining: 1m 31s
16:	learn: 2.0284081	total: 1.56s	remaining: 1m 30s
17:	learn: 2.0217775	total: 1.66s	remaining: 1m 30s
18:	learn: 2.0172553	total: 1.74s	remaining: 

# Function for creating submission

### Load the test set

In [17]:
# Read the test table with the engineered features and list its columns/dtypes.
test = pd.read_csv("../data/result_test.csv")
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214200 entries, 0 to 214199
Data columns (total 33 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   ID                              214200 non-null  int64  
 1   shop_id                         214200 non-null  int64  
 2   item_id                         214200 non-null  int64  
 3   item_name                       214200 non-null  object 
 4   item_category_id                214200 non-null  int64  
 5   item_category_name              214200 non-null  object 
 6   shop_name                       214200 non-null  object 
 7   months_since_last_sale          214200 non-null  int64  
 8   revenue_lag_1                   214200 non-null  float64
 9   revenue_lag_2                   214200 non-null  float64
 10  revenue_lag_3                   214200 non-null  float64
 11  revenue_lag_6                   214200 non-null  float64
 12  revenue_lag_12  

In [29]:
def get_submission(model, test, scaler=None, rounding=False, submission_tag="", clip_min=0, clip_max=20):
    """
    Predict on the test table and write the result as a submission CSV.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    test : pandas.DataFrame
        Test table. Must contain ``ID``, ``item_name``, ``shop_name`` and
        ``item_category_name``. These four columns are dropped and every
        remaining column is passed to the model, in the order it appears in
        ``test``.
        NOTE(review): the remaining columns have to match the features the
        model was trained on - this is not checked here.
    scaler : object, optional
        Fitted transformer exposing ``transform``. When given, it is applied
        to the features before prediction.
    rounding : bool
        Round predictions to the nearest integer before clipping.
    submission_tag : str
        Inserted into the output file name ``submission_<tag>.csv``.
    clip_min, clip_max : number
        Lower and upper bound applied to the predictions. The defaults keep
        the previously hard-coded range ``[0, 20]``.

    Returns
    -------
    None
        The submission (columns ``ID`` and ``item_cnt_month``) is written to
        ``submission_<submission_tag>.csv`` in the current working directory.
    """
    test_id = test["ID"]
    features = test.drop(columns=["ID", "item_name", "shop_name", "item_category_name"])

    if scaler is not None:
        features = scaler.transform(features)

    # Flatten so that an estimator returning an (n, 1) array still yields a
    # single prediction column.
    y_pred = np.asarray(model.predict(features)).ravel()

    if rounding:
        y_pred = y_pred.round()

    y_pred = y_pred.clip(clip_min, clip_max)

    submission = pd.DataFrame({"ID": test_id, "item_cnt_month": y_pred})
    submission.to_csv("submission_" + submission_tag + ".csv", index=False)