## Model

In [None]:
# Render the name of the model in bold as this cell's rich output.
Markdown('Model used: **{name}**'.format(name=model_name))

In [None]:
# Model names that the cells below treat as R OLS linear models: they get
# the R model summary and the diagnostic plots.
ets_ols_models = [
    'empWt',
    'empWtBalanced',
    'empWtDropNeg',
    'empWtStep',
    'empWtLasso',
    'empWtNNLS',
    'empWtDropNegLasso',
    'empWtLassoBest',
]

# Model names whose coefficients were estimated with LASSO regression: they
# get the extra LASSO note instead of the R summary and diagnostics.
ets_lasso_models = [
    'lassoWtLasso',
    'lassoWtLassoBest',
]

In [None]:
# Heading for the model summary; only shown for the OLS-type models.
is_ols_model = model_name in ets_ols_models
if is_ols_model:
    display(HTML('<h3>Weights assigned to each feature</h3>'))

In [None]:
# If we used an R OLS linear model, first show a summary of that model.
#
# The %R line magic below:
#   * pushes `experiment_id`, `output_dir` and `model_name` into R (-i),
#   * loads the `etsmodels` R library,
#   * loads the saved model from "<output_dir>/<experiment_id>_<model_name>.Rmodel",
#   * computes `summary(fit)`; the result of the magic is bound to `summ` in Python.
# NOTE(review): `fit` is never assigned in the R code here, so it presumably
# comes from the loaded .Rmodel file -- confirm against the code that saves it.
if model_name in ets_ols_models:
    summ = %R -i experiment_id,output_dir,model_name library(etsmodels); modelfile <- paste0(output_dir, "/", experiment_id, '_', model_name, ".Rmodel"); load(modelfile); summ <- summary(fit)
    # print (rather than rich-display) the R summary object
    print(summ)

In [None]:
# Introductory text for the table of standardized and relative coefficients.
# It is shown for every model type listed in ets_ols_models or ets_lasso_models.
markdown_str = """### Standardized and Relative Regression Coefficients (Betas)

The relative coefficients are intended to show the relative contribution of different features and their primary purpose is to identify whether one of the features has a disproportionate effect on the final score. They are computed as standardized/(sum of absolute values of standardized coefficients). 

**Note**: if the model contains negative coefficients, relative values will not sum up to one and their interpretation is generally questionable. """

if model_name in ets_ols_models + ets_lasso_models:
    display(Markdown(markdown_str))

In [None]:
# Caveat displayed only for the LASSO-estimated models: the coefficient
# estimates may not be exactly reproducible across systems.
markdown_str = (
    "\n"
    "**Note**: The coefficients were estimated using LASSO regression. "
    "Unlike OLS (standard) linear regression, lasso estimation is based on "
    "an optimization routine and therefore the exact estimates may differ "
    "across different systems. "
)

is_lasso_model = model_name in ets_lasso_models
if is_lasso_model:
    display(Markdown(markdown_str))

In [None]:
# Build and display the table of standardized and relative coefficients (betas).
# `df_betas` is left in the namespace for the plotting cells further down.
if model_name in ets_ols_models + ets_lasso_models:
    coefficients_file = join(output_dir, '{}_coefficients.csv'.format(experiment_id))
    df_weights = pd.read_csv(coefficients_file, index_col=0)
    # the intercept row is not a feature, so it gets no beta
    df_weights.drop('Intercept', inplace=True)

    # standardized = raw coefficient * std of the (preprocessed) training
    # feature / std of the `sc1` column of the training data
    feature_stds = df_train_preproc[features_used].std()
    df_betas = df_weights.multiply(feature_stds, axis='index') / df_train['sc1'].std()
    df_betas.columns = ['standardized']

    # relative = standardized / sum(|standardized|). Divide the Series
    # explicitly instead of assigning a whole (one-column) DataFrame to a
    # single column, which only works by accident of the frame's shape.
    total_abs_beta = sum(abs(df_betas['standardized']))
    df_betas['relative'] = df_betas['standardized'] / total_abs_beta

    df_betas.reset_index(inplace=True)
    # NOTE(review): DataFrame.sort is a legacy pandas API (replaced by
    # sort_values in later releases); kept because the notebook targets the
    # older stack -- confirm the pinned pandas version before porting.
    df_betas.sort('feature', inplace=True)
    display(HTML(df_betas.to_html(index=False, float_format=float_format_func)))

In [None]:
# Lead-in sentence for the coefficient plots; shown for every supported model type.
models_with_betas = ets_ols_models + ets_lasso_models
if model_name in models_with_betas:
    display(Markdown('Here are the same values, shown graphically.'))

In [None]:
# This cell handles models with at most 15 features: a bar chart of the
# standardized coefficients next to a pie chart of the relative ones.
# The pie chart is now drawn for every case this cell handles. Previously an
# inner `< 15` check disagreed with the outer `<= 15` guard, so a model with
# exactly 15 features got no pie chart and a half-width bar chart.
# NOTE(review): DataFrame.sort and seaborn's `x_order` are legacy APIs; kept
# because the notebook targets the older stack -- confirm pinned versions.
if model_name in ets_ols_models + ets_lasso_models and len(features_used) <= 15:
    df_betas_sorted = df_betas.sort('standardized', ascending=False)
    df_betas_sorted.reset_index(drop=True, inplace=True)
    feature_labels = df_betas_sorted['feature'].values

    fig = plt.figure()
    fig.set_size_inches(8, 2.5)
    # one grey shade per feature, reversed relative to the palette's default order
    grey_colors = sns.color_palette('Greys', len(features_used))[::-1]
    with sns.axes_style('whitegrid'):
        # left half of the figure: bar chart of standardized coefficients
        ax1 = fig.add_subplot(121)
        sns.barplot("feature", "standardized", data=df_betas_sorted,
                    x_order=feature_labels,
                    palette=sns.color_palette("Greys", 1), ax=ax1)
        ax1.set_xticklabels(feature_labels, rotation=90)
        ax1.set_title('Values of standardized coefficients')
        ax1.set_xlabel('')
        ax1.set_ylabel('')

        # right third of the figure: pie chart of |relative| coefficients
        ax2 = fig.add_subplot(133, aspect=True)
        ax2.pie(abs(df_betas_sorted['relative'].values), colors=grey_colors,
                labels=feature_labels)
        ax2.set_title('Proportional contribution of each feature')
    fig.savefig(join(figure_dir, '{}_betas.svg'.format(experiment_id)))

In [None]:
# This cell handles models with more than 15 features: bar chart only, no pie
# chart. The figure width grows with the number of features and the bar chart
# uses the whole figure, so the rotated feature labels stay legible.
# NOTE(review): DataFrame.sort and seaborn's `x_order` are legacy APIs; kept
# because the notebook targets the older stack -- confirm pinned versions.
if model_name in ets_ols_models + ets_lasso_models and len(features_used) > 15:
    df_betas_sorted = df_betas.sort('standardized', ascending=False)
    df_betas_sorted.reset_index(drop=True, inplace=True)
    feature_labels = df_betas_sorted['feature'].values

    fig = plt.figure()
    # 0.35 inches of width per feature
    fig.set_size_inches(0.35*len(features_used), 2.5)
    with sns.axes_style('whitegrid'):
        ax1 = fig.add_subplot(111)
        sns.barplot("feature", "standardized", data=df_betas_sorted,
                    x_order=feature_labels,
                    palette=sns.color_palette("Greys", 1), ax=ax1)
        ax1.set_xticklabels(feature_labels, rotation=90)
        ax1.set_title('Values of standardized coefficients')
        ax1.set_xlabel('')
        ax1.set_ylabel('')
    fig.savefig(join(figure_dir, '{}_betas.svg'.format(experiment_id)))

In [None]:
# Heading and one-sentence explanation for the diagnostics section; only the
# OLS-type models have diagnostic plots.
if model_name in ets_ols_models:
    diagnostics_intro = (
        '<h2>Model diagnostics</h2>',
        "These are standard plots for model diagnostics for the main model. All information is computed based on the training set.",
    )
    for text in diagnostics_intro:
        display(Markdown(text))

In [None]:
# For the R OLS linear models, draw R's diagnostic plots for the fitted model
# and embed the resulting SVG in the report.
if model_name in ets_ols_models:
    # saved R model: "<output_dir>/<experiment_id>_<model_name>.Rmodel"
    modelfile = join(output_dir, '{}_{}.Rmodel'.format(experiment_id, model_name))
    # output image: "<figure_dir>/<experiment_id>_<model_name>_diagnostics.svg"
    imgfile = join(figure_dir, '{}_{}_diagnostics.svg'.format(experiment_id, model_name))
    # The %R line magic pushes both paths into R (-i), loads the model file,
    # opens an SVG device on `imgfile`, lays the plots out on a 2x2 grid,
    # calls plot(fit) and closes the device so the file is written.
    # NOTE(review): `fit` is presumably provided by the loaded .Rmodel file -- confirm.
    %R -i modelfile,imgfile library(etsmodels); load(modelfile); svg(imgfile); par(mfrow=c(2, 2)); plot(fit); dev.off();
    display(SVG(imgfile))