In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import math

In [None]:
# Rows with a 'time' earlier than this date are dropped below
# (presumably the day Google Analytics tracking started -- confirm).
GOOGLE_ANALYTICS_START_DATE = datetime.date(2019, 9, 13)

# The export is tab-separated. 'time' is converted explicitly instead of through
# read_csv's `date_parser`, which is deprecated since pandas 2.0; pd.to_datetime
# performs the same conversion and still raises on unparseable values.
df = pd.read_csv('../../data/final/futurice_blog_data.csv', sep='\t')
df['time'] = pd.to_datetime(df['time'])

# Keep only the rows on or after the cut-off date.
df = df[df['time'] >= pd.Timestamp(GOOGLE_ANALYTICS_START_DATE)]
df.info()

# Correlation matrix

In [None]:
import seaborn as sns
# Target metrics whose relationship to the predictors is analysed.
performance_features = ['pageviews', 'avg_time', 'bounce_rate', 'exit%']

# Predictors: every float64/int64 column that is neither a target nor one of the
# explicitly excluded columns ('index', 'unique_pageviews').
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
excluded_columns = set(performance_features) | {'index', 'unique_pageviews'}
predictors = sorted(set(numeric_columns) - excluded_columns)

# Correlate only the columns that are needed. Calling df.corr() on the whole
# frame raises on pandas >= 2.0 if any non-numeric column is present, because
# `numeric_only` now defaults to False. Pairwise correlations do not depend on
# the other columns, so the selected values are unchanged.
corr_matrix = (
    df[predictors + performance_features]
    .corr()
    .loc[predictors, performance_features]
)
corr_matrix.style.background_gradient(cmap='RdBu_r')

# Feature importance with linear regression

## Lasso coefficients

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, LassoCV
from sklearn.metrics import mean_squared_error
import warnings
# Suppress all warnings from here on.
warnings.filterwarnings("ignore")

# (low, high) base-10 exponents of the alpha grid, one pair per performance feature.
alpha_log_limits = [(-2,3),(-3,2),(-4,-1),(-4,-1)]
lasso = Lasso(max_iter=10000)
opt_models = []  # cross-validated Lasso model per performance feature, in order
fig = plt.figure(figsize=(20,4))
for i, target_feature in enumerate(performance_features):
    low_exp, high_exp = alpha_log_limits[i]
    alphas = np.logspace(low_exp, high_exp, 1000)
    X, y = df[predictors], df[target_feature]

    # 70/30 split with a fixed seed, then standardise using training statistics only.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=10
    )
    scaler = StandardScaler().fit(X_train)
    X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

    # Coefficient path: refit the shared Lasso estimator once per alpha.
    coefs = [lasso.set_params(alpha=a).fit(X_train, y_train).coef_ for a in alphas]

    # Let 5-fold cross-validation choose alpha and keep the fitted model.
    model = LassoCV(cv=5, random_state=0, max_iter=10000, eps=1e-4).fit(X_train, y_train)
    opt_models.append(model)
    opt_alpha = model.alpha_

    # One panel per target: coefficients against alpha, CV-selected alpha marked.
    ax = fig.add_subplot(1, 4, i + 1)
    ax.plot(alphas, coefs, label=predictors)
    ax.axvline(x=opt_alpha, color='r', ls=':', lw=0.5, label='alpha: CV estimation')
    ax.set_xscale('log')
    ax.axis('tight')
    ax.set(
        xlabel='alpha',
        ylabel='Standardized Coefficients',
        title=f'Lasso coefficients for {target_feature}',
    )

# The legend is attached to the last panel and placed outside its top-right corner.
ax.legend(bbox_to_anchor=(1,1), loc="upper left")
plt.tight_layout()
plt.show()

## Model performance

In [None]:
plt.figure(figsize=(16,10))
for i, target_feature in enumerate(performance_features):
    X = df[predictors]
    y = df[target_feature]
    # Same split parameters (test_size, random_state) and scaling as in the
    # fitting cell, so the test set here is the one the models never saw.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    model = opt_models[i]

    # Rank features by absolute coefficient, largest first, and keep at most
    # three. Coefficients that Lasso shrank to exactly zero are skipped so an
    # eliminated feature is never reported as "top". Slicing an argsort also
    # works when there are fewer than three predictors.
    coef_magnitudes = np.abs(model.coef_)
    ranked_indices = np.argsort(coef_magnitudes)[::-1][:3]
    top_features = [predictors[j] for j in ranked_indices if coef_magnitudes[j] > 0]

    ax = plt.subplot(2,2,i+1)
    # Top features get a visible outline (width 3); the others get none.
    edge_widths = [3 if feat in top_features else 0 for feat in predictors]
    ax.axvline(0, color='black', lw=0.7)
    ax.barh(predictors, model.coef_, fc='royalblue', ec='firebrick', lw=edge_widths)
    ax.set_title(f'Importance scores for predicting {target_feature}')

    print(f'-- Evaluating optimal Lasso model for predicting [{target_feature}]')
    print(' Top features: ', ", ".join(top_features) or '(none)')
    print(' RMSE: ', math.sqrt(mean_squared_error(y_test, model.predict(X_test))))
    print()
plt.tight_layout()
plt.show()
    

The linear models do not predict the blog's performance well, which suggests that their importance scores may also be unreliable. Let's try another approach: instead of linear regression, we can use random forest regressors to assess the importance of each feature based on how much it reduces the criterion used to select split points (for regression trees this is the squared error, i.e. the variance, rather than Gini or entropy, which are used for classification).

# Random Forest Regressor feature selection

In [None]:
from sklearn.ensemble import RandomForestRegressor

plt.figure(figsize=(16,10))
for i, target_feature in enumerate(performance_features):
    X = df[predictors]
    y = df[target_feature]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
    # Seed the forest so importances and RMSE are reproducible across runs;
    # without a random_state every execution gives different results.
    model = RandomForestRegressor(random_state=10)
    model.fit(X_train, y_train)

    # Up to three most important features, most important first. Slicing an
    # argsort also works when there are fewer than three predictors.
    importances = model.feature_importances_
    ranked_indices = np.argsort(importances)[::-1][:3]
    top_features = [predictors[j] for j in ranked_indices]

    # Top features get a visible outline (width 3); the others get none.
    edge_widths = [3 if feat in top_features else 0 for feat in predictors]
    ax = plt.subplot(2,2,i+1)
    # Error bars show the spread of each feature's importance across the trees.
    std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
    ax.barh(predictors, importances, xerr=std, fc='royalblue', ec='firebrick', lw=edge_widths)
    ax.set_title(f'Importance scores for predicting {target_feature}')
    ax.set_xlim(left=0)

    print(f'-- Evaluating RFR model for predicting [{target_feature}]')
    print(' Top features: ', ", ".join(top_features))
    print(' RMSE: ', math.sqrt(mean_squared_error(y_test, model.predict(X_test))))
    print()
plt.tight_layout()
plt.show()