# Welcome!

In [92]:
# we start by loading some things
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from vf_portalytics.model import PredictionModel
from vf_portalytics.dataset import DataSet
from vf_portalytics.tool import create_train_test_sets, score_model, plot_feature_importance
from sklearn import linear_model, ensemble, svm

from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bkcharts import Histogram, Bar, Line
from bkcharts.attributes import cat
output_notebook()

In [65]:
# Load the data set and take its DataFrame as the working frame `df` that
# the cells below plot, filter and train on.
# NOTE(review): 'x' is presumably the data set name — confirm against DataSet.
# NOTE(review): the path is an absolute, user-specific directory; consider
# moving it to a configurable constant so the notebook runs elsewhere.
dataset = DataSet('x', path='/home/carst/')
df = dataset.data_df

# Investigate data

In [112]:
# Distribution plots for a handful of columns.
# - numeric columns: histogram of the values between the 0.5th and the 99.5th
#   percentile (the bottom and top 0.5% are left out of the plot)
# - all other columns: bar chart of the 20 most frequent values
# The counts are promotion - product combinations (one per row of df).
check_list = ['discount_perc', 'account_banner', 'week_nr', 'baseline_units']

low_limit_def = 0.5    # lower percentile cut-off
high_limit_def = 99.5  # upper percentile cut-off
top_def = 20           # number of values shown for non-numeric columns

# now plot
for col in check_list:
    # dtype.kind covers every signed / unsigned integer and float width,
    # not only the 64 bit variants
    if df[col].dtype.kind in ('i', 'u', 'f'):
        # percentiles of a column containing NaN are NaN, which would make
        # the range mask below select nothing
        values = df[col].dropna()
        if values.empty:
            print('Skipping ' + col + ': no values to plot')
            continue
        low_limit, high_limit = np.percentile(values, [low_limit_def, high_limit_def])
        print('Showing ' + col + ' between ' + str(low_limit) + ' and ' + str(high_limit))
        # NaN compares False on both sides, so missing values stay excluded
        in_range = (df[col] >= low_limit) & (df[col] <= high_limit)
        p = Histogram(df[in_range], values=col, bins=12)
        p.axis[1].axis_label = 'count'
        show(p)
    else:
        print('Showing ' + col + ' top ' + str(top_def))
        # count the rows per value without adding a helper column to df
        group = df.groupby(col).size().nlargest(top_def).reset_index(name='count')
        label = cat(columns=col, sort=False)
        p = Bar(group, label=label, values='count', legend=None)
        p.axis[1].axis_label = 'count'
        show(p)
    

Showing discount_perc between 5.72013821 and 60.2089078296


Showing account_banner top 20


Showing week_nr between 1.0 and 53.0


Showing baseline_units between 7.0 and 20818.9533333


# Filter Dataframe

In [67]:
# Row filters, applied one after another to df. Every entry pairs the
# progress message printed after the filter with a function that builds the
# boolean row mask from the frame as it stands at that point.
# (A status filter, df['field_1'] >= 110, is currently disabled.)
row_filters = [
    # NOTE(review): field_21 looks like a year-week number — confirm
    ("\nAfter week filter: %d rows with %d features.",
     lambda frame: frame['field_21'] > 201500),
    # drop rows without a positive baseline
    ("\nAfter 0 no baseline filter: %d rows with %d features.",
     lambda frame: frame['baseline_units'] > 0.0),
    # drop rows with a baseline of 100,000 units or more
    ("\nAfter huge baseline filter: %d rows with %d features.",
     lambda frame: frame['baseline_units'] < 10.0**5),
    # keep lifts strictly between 1.2 and 40
    ("\nAfter lift filter: %d rows with %d features.",
     lambda frame: (frame['lift'] > 1.2) & (frame['lift'] < 40)),
    # keep discounts from 5% up to (not including) 80%
    ("\nAfter discount filter: %d rows with %d features.",
     lambda frame: (frame['discount_perc'] >= 5.0) & (frame['discount_perc'] < 80.0)),
    # a mechanism must be filled in
    ("\nAfter mechanism filter: %d rows with %d features.",
     lambda frame: frame['mechanism'].notnull()),
]

for message, build_mask in row_filters:
    df = df[build_mask(df)]
    # df.shape is the (rows, columns) pair the message expects
    print(message % df.shape)


After week filter: 138131 rows with 229 features.

After 0 no baseline filter: 138131 rows with 229 features.

After huge baseline filter: 138109 rows with 229 features.

After lift filter: 138109 rows with 229 features.

After discount filter: 138109 rows with 229 features.

After mechanism filter: 138109 rows with 229 features.


# Create a model and select the features

In [18]:
# Set up the prediction model that the trained regressor is attached to later on
prediction_model = PredictionModel('carst_example', path='/home/carst/')

# Feature definition: column name -> list of flags.
# ['C'] flags a categoric value (dimensional feature), [] means no flags.
prediction_model.features = {
    # sizes: the product itself and the complete promotion
    'baseline_units': [],
    'total_baseline_units': [],  # total over the complete promotion
    'total_nr_products': [],  # number of products in the complete promotion
    # price and discount
    'base_price': [],
    'discount_perc': [],
    'discount_amt': [],
    # dimensions
    'account_id': ['C'],  # account
    'product_brandkey': ['C'],  # brand
    'product_6_bc': ['C'],  # segment
    'product_3_cat': ['C'],  # category
    'week_nr': ['C'],
    'mechanism': ['C'],
    'multi_buy_x': [],
    'multi_buy_y': [],
    # disabled: 'field_102401': [],
    'promotion_dimension_136': ['C'],
    'promotion_dimension_137': ['C'],
    # disabled: 'promotion_dimension_138': ['C']
}

# Target definition; the 'log' flag asks for a logarithmic prediction of the lift
prediction_model.target = {'lift': ['log']}

# True when the lift is trained on (and predicted as) its logarithm
log = 'log' in prediction_model.target['lift']

In [19]:
# only use the needed columns: every feature plus the target, without duplicates.
# The key views are turned into lists before concatenating: on Python 3
# dict.keys() returns a view that does not support `+`, on Python 2 the
# result is unchanged.
used_column_list = list(set(list(prediction_model.features.keys()) +
                            list(prediction_model.target.keys())))
df = df[used_column_list]

# Create train and test sets

In [20]:
# create a mask based on random selections or on a period
# Each row gets a uniform draw; draws below 0.8 are flagged True, so roughly
# 80% of the rows carry the flag. A dedicated, seeded generator keeps the
# split - and with it every score further down - identical between runs.
split_rng = np.random.RandomState(10)
mask = split_rng.rand(len(df)) < 0.8

# create train sets
train_df, train_lift, test_df, test_lift = create_train_test_sets(df, mask, prediction_model, prediction_target='lift')

# are we doing logarithmic predictions
if log:
    # we need to train everything based on the log value; test_lift is left
    # untransformed because the predictions are mapped back with exp() before scoring
    train_lift = np.log(train_lift)


Train set: 113398
Test set: 28030


# Select a regressor

In [21]:
# Hyper-parameters of the extra-trees ensemble, kept together so they are
# easy to tune; the fixed random_state keeps the fitted model repeatable.
regressor_params = dict(
    n_estimators=79,
    random_state=10,
    min_samples_split=4,
    n_jobs=-1
)

# build the regressor and fit it on the training rows
regressor = ensemble.ExtraTreesRegressor(**regressor_params)
regressor.fit(train_df, train_lift)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=1,
          min_samples_split=4, min_weight_fraction_leaf=0.0,
          n_estimators=79, n_jobs=-1, oob_score=False, random_state=10,
          verbose=0, warm_start=False)

# Predict and score the model

In [22]:
# Predict the lift for the test rows. When the model was trained on the
# logarithm of the lift, the raw predictions are expanded again with exp().
raw_prediction = regressor.predict(test_df)
predict_lift = np.exp(raw_prediction) if log else raw_prediction

# score the predictions against the actual lift of the test rows
score_model(predict_lift, test_lift, baseline=test_df['baseline_units'])

R²:0.526000156358
MAE:1.38380570379
MAPE:26.179970984
Forecast error:21.9979622385


# Check Feature Importance

In [110]:
# Retrieve the feature importance, scaled so the most important feature is 100
feature_importance = regressor.feature_importances_
feature_importance = 100.0 * (feature_importance / feature_importance.max())

# keep only the features scoring above 0.5 on that scale
important = feature_importance > 0.5

# The selection is applied to the importances AND to the column names, so
# every value stays paired with the feature it belongs to (the regressor was
# fitted on train_df, so both have one entry per column in the same order).
importance_df = pd.DataFrame({
    'feature': train_df.columns[important],
    'importance': feature_importance[important],
})
# sort_values replaces the deprecated DataFrame.sort
importance_df = importance_df.sort_values('importance', ascending=False)

# now plot a chart, most important feature first
label = cat(columns='feature', sort=False)
p = Bar(importance_df, label=label, values='importance', legend=None)
p.axis[1].axis_label = 'Importance'
show(p)

# Investigate Results

In [None]:
# TODO: check where we are off (placeholder cell, nothing is implemented yet)
pass

# Save the Model

In [12]:
# save the model: attach the trained regressor to the prediction model and
# store it
# NOTE(review): save() presumably writes under the path given when the
# PredictionModel was created — confirm
prediction_model.model = regressor
prediction_model.save()

In [None]:
# TODO:
# - explain the available columns
# - plot charts: baseline vs total, promotion percentage vs lift
# - line chart: promotion percentage vs average lift
#
# - result: prediction vs actual -> plot + line?