In [None]:
!pip install --q vectice[github]==22.3.5.1

In [1]:
!pip show vectice

Name: vectice
Version: 22.3.5.1
Summary: Vectice Python library
Home-page: https://www.vectice.com
Author: Vectice Inc.
Author-email: sdk@vectice.com
License: Apache License 2.0
Location: /opt/conda/lib/python3.7/site-packages
Requires: python-dotenv, requests, urllib3
Required-by: 


In [2]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


The training set has 125,497,040 (125 Million) rows and 6 columns: row id, date, store number, item number, unit sales (keep in mind that this can be an integer, float, or -1, which represents a returned item), and whether there was a promotion for a particular item.

Now here we will take some amount of the data for analysis purpose because we have large number of data.

In [3]:
items = pd.read_csv("items.csv")
holiday_events = pd.read_csv("holidays_events.csv", parse_dates=['date'])
stores = pd.read_csv("stores.csv")
oil = pd.read_csv("oil.csv", parse_dates=['date'])
transactions = pd.read_csv("transactions.csv", parse_dates=['date'])
# the full training data's output: "125,497,040 rows | 6 columns"
#Here, we only load approx 5% of the data just to get a rough idea of what is in store for us.
train = pd.read_csv("train_trimmed.csv", parse_dates=['date'])
train_large = pd.read_csv('train_large.csv', parse_dates = ['date'])

In [4]:
train.head()

Unnamed: 0.1,Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion
0,0,0,2013-01-01,25,103665,7.0,
1,1,1,2013-01-01,25,105574,1.0,
2,2,2,2013-01-01,25,105575,2.0,
3,3,3,2013-01-01,25,108079,1.0,
4,4,4,2013-01-01,25,108701,1.0,


In [5]:
print("Nulls in Oil columns: {0} => {1}".format(oil.columns.values,oil.isnull().any().values))
print("="*70)
print("Nulls in holiday_events columns: {0} => {1}".format(holiday_events.columns.values,holiday_events.isnull().any().values))
print("="*70)
print("Nulls in stores columns: {0} => {1}".format(stores.columns.values,stores.isnull().any().values))
print("="*70)
print("Nulls in transactions columns: {0} => {1}".format(transactions.columns.values,transactions.isnull().any().values))

Nulls in Oil columns: ['date' 'dcoilwtico'] => [False  True]
Nulls in holiday_events columns: ['date' 'type' 'locale' 'locale_name' 'description' 'transferred'] => [False False False False False False]
Nulls in stores columns: ['store_nbr' 'city' 'state' 'type' 'cluster'] => [False False False False False]
Nulls in transactions columns: ['date' 'store_nbr' 'transactions'] => [False False False]


 The only missing data occurs in the oil data file, which provides the historical daily price for oil.

# feature engineering

**Here we analyze the data and select the features for our model to be trained on.**

**Train**
id, date, store_nbr, item_nbr, unit_scale, on_promotion

**Items**
item_nbr, family, class, perishable

**Holidays_events**
date, type, locale, locale_name, description, transferred

**Stores**
store_nbr, city, state, type, cluster

**Oil**
date, dcoilwtico

**Transactions**
date, store_nbr, transactions

**Selected features as inputs to the model**

date, holiday.type, holidaye.locale, holiday.locale_name, holiday_transfered, store_nbr, store.city, store.state, store.type, store.cluster, transactions, item_nbr, item.family, item.class, on_promotion, perishable, dcoilwtico.

**Selected features as outputs of the model**

transactions per store, unit_sales per item

# DATA pipeline

Here, we'll merge the different dataframes into one dataframe

In [6]:
import datetime as dt
from sklearn.base import BaseEstimator, TransformerMixin

class prepare_data(BaseEstimator, TransformerMixin):
    def __init__(self):
        print("prepare_data -> init")
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        train_stores = X[0].merge(X[1], right_on = 'store_nbr', left_on='store_nbr')
        train_stores_oil = train_stores.merge(X[2], right_on='date', left_on='date')
        train_stores_oil_items = train_stores_oil.merge(X[3], right_on = 'item_nbr', left_on = 'item_nbr')
        train_stores_oil_items_transactions = train_stores_oil_items.merge(X[4], right_on = ['date', 'store_nbr'], left_on = ['date', 'store_nbr'])
        train_stores_oil_items_transactions_hol = train_stores_oil_items_transactions.merge(X[5], right_on = 'date', left_on = 'date')
        
        data_df = train_stores_oil_items_transactions_hol.copy(deep = True)
        
        # change the bool to int
        data_df['onpromotion'] = data_df['onpromotion'].astype(int)
        data_df['transferred'] = data_df['transferred'].astype(int)

        # change the names
        data_df.rename(columns={'type_x': 'st_type', 'type_y': 'hol_type'}, inplace=True)

        # drop the id
        data_df.drop(['id'], axis=1, inplace=True)
        
        print(data_df.head())
        
        # handle date
        data_df['date'] = pd.to_datetime(data_df['date'])
        data_df['date'] = data_df['date'].map(dt.datetime.toordinal)
                
        return data_df

### Custom transform for splitting the data

Here, we split dataframe into numerical values, categorical values and date

In [7]:
# split dataframe into numerical values, categorical values and date
class split_data(BaseEstimator, TransformerMixin):
    def __init__(self):
        print("split_data -> init")
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        # Get columns for each type         
        df_ = X.drop(['date'], axis = 1)
        cols = df_.columns
        num_cols = df_._get_numeric_data().columns
        cat_cols = list(set(cols) - set(num_cols))
        
        data_num_df = X[num_cols]
        data_cat_df = X[cat_cols]
        data_date_df = X['date']
        
        return data_num_df, data_cat_df, data_date_df

Herre, we handle the missing data, apply standard scaler to numerical attributes, and convert categorical data into numerical

In [8]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

class process_data(BaseEstimator, TransformerMixin):
    def __init__(self):
        print("process_data -> init")
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        ### numerical data
        # impute nulls in numerical attributes
        imputer = SimpleImputer(strategy="mean", copy="true")
        num_imp = imputer.fit_transform(X[0])
        data_num_df = pd.DataFrame(num_imp, columns=X[0].columns, index=X[0].index)
        
        # apply standard scaling
        scaler = StandardScaler()
        scaler.fit(data_num_df)
        num_scaled = scaler.transform(data_num_df)
        data_num_df = pd.DataFrame(num_scaled, columns=X[0].columns, index=X[0].index)
        
        ### categorical data
        # one hot encoder
        cat_encoder = OneHotEncoder(sparse=False)
        data_cat_1hot = cat_encoder.fit_transform(X[1])
        
        # convert it to datafram with n*99 where n number of rows and 99 is no. of categories
        data_cat_df = pd.DataFrame(data_cat_1hot, columns=cat_encoder.get_feature_names()) #, index=X[1].index)
                
        return data_num_df, data_cat_df, X[2]

In [9]:
class join_df(BaseEstimator, TransformerMixin):
    def __init__(self):
        print("join_df -> init")
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        ### numerical data
        data_df = X[0].join(X[1])
        data_df = data_df.join(X[2])
        
        return data_df

Applying the Pipeline to get our prepared data

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

pipe_processing = Pipeline([
        ('prepare_data', prepare_data()),
        ('split_data', split_data()),
        ('process_data', process_data()),
        ('join_data', join_df())
    ])

# our prepared data
data_df = pipe_processing.fit_transform([train_large, stores, oil, items, transactions, holiday_events])

# split it according to our feature engineering
X = data_df.drop(['unit_sales', 'transactions'], axis=1)
Y = data_df[['unit_sales', 'transactions']]

prepare_data -> init
split_data -> init
process_data -> init
join_df -> init
   Unnamed: 0       date  store_nbr  item_nbr  unit_sales  onpromotion  \
0      405070 2017-05-12         31    877514         2.0            0   
1      405071 2017-05-12         31    881701         6.0            0   
2      405074 2017-05-12         31    882624        12.0            0   
3      405076 2017-05-12         31    886396         1.0            0   
4      405077 2017-05-12         31    888763         4.0            1   

       city     state st_type  cluster  dcoilwtico     family  class  \
0  Babahoyo  Los Rios       B       10       47.83   CLEANING   3020   
1  Babahoyo  Los Rios       B       10       47.83  GROCERY I   1042   
2  Babahoyo  Los Rios       B       10       47.83      DAIRY   2116   
3  Babahoyo  Los Rios       B       10       47.83  GROCERY I   1042   
4  Babahoyo  Los Rios       B       10       47.83       DELI   2646   

   perishable  transactions hol_type locale l



### Generate test and training data


In [11]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

## Auhtenticate to Vectice

In [52]:
#Import the required packages
from vectice import Experiment
from vectice.api.json import ModelType
from vectice.api.json import JobType
from vectice.api.json import JobArtifactType
from vectice.api.json import ModelVersionStatus
from vectice.api.json import VersionStrategy
import logging
import os
logging.basicConfig(level=logging.INFO)

os.environ['VECTICE_API_ENDPOINT']= "app.vectice.com"

os.environ['VECTICE_API_TOKEN'] = "Token"

# Add you project id. The project id can be found in the project settings page in the Vectice UI
project_id = ID

Creating our modeling experiment

In [None]:
experiment = Experiment(job="Modeling", project=project_id, job_type=JobType.TRAINING, auto_code=True)

In [54]:
## Inputs
train_ds = experiment.add_dataset_version(dataset="cleaned_kc_house_data")

# Modelling and testing

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error,mean_absolute_error

### Linear regression

In [22]:
experiment.start()

model = LinearRegression()

model.fit(x_train.values, y_train.values)
    
pred = model.predict(x_test.values)
    
RMSE = np.sqrt(mean_squared_error(y_test.values, pred))
MAE = mean_absolute_error(y_test.values, pred)

print("root_mean_squared_error: ",RMSE) 
print("mean_absolute_error: ", MAE)

# Let's log the model we trained along with its metrics, as a new version 
# of the "Regressor" model in Vectice.

metrics = {"RMSE": RMSE, "MAE": MAE}
model_version = experiment.add_model_version(model="Regressor", algorithm="Linear Regression", metrics=metrics)

# We complete the current experiment's run 
## The created model version will be automatically attached as output of the run
experiment.complete()

root_mean_squared_error:  0.7599447094988461
mean_absolute_error:  0.31084975149783933


### Random Forest

In [17]:
experiment.start(inputs=[train_ds])

print("Random Forest")
model = RandomForestRegressor(random_state=42)

model.fit(x_train.values, y_train.values)
    
pred = model.predict(x_test.values)
    
RMSE = np.sqrt(mean_squared_error(y_test.values, pred))
MAE = mean_absolute_error(y_test.values, pred)

print("root_mean_squared_error: ",RMSE) 
print("mean_absolute_error: ", MAE)

# Let's log the model we trained along with its metrics, as a new version 
# of the "Regressor" model in Vectice.

metrics = {"RMSE": RMSE, "MAE": MAE}
model_version = experiment.add_model_version(model="Regressor", algorithm="Linear Regression", metrics=metrics)

# We complete the current experiment's run 
## The created model version will be automatically attached as output of the run
experiment.complete()

Random Forest
mean_squared_error:  0.6279706411305517
mean_absolute_error:  0.36668117625129293


Random forest model has the lowest error