In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import sys
sys.path.append('../')
from scripts.model_utils import (get_bayes_ratio_by_client, add_time_since_last_referral,
                                 add_referral_order_index, add_a_first_referral_dummy_variable)
import seaborn as sns
%matplotlib inline
from sklearn.linear_model import Ridge, Lasso, SGDRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.metrics import make_scorer
from scipy.stats import spearmanr
from sklearn.model_selection import GridSearchCV

In [2]:
# Load up referrals dataframe
referrals = pd.read_csv('../../Welcome-Centre-DataCorps-Data/referrals.csv', low_memory=False)
clients = pd.read_csv('../../Welcome-Centre-DataCorps-Data/clients.csv', index_col=0)
referrals['ReferralTakenDate'] = pd.to_datetime(referrals['ReferralTakenDate'])

# Remove any double referrals in one day
referrals = referrals.reset_index().groupby(['ClientId', 'ReferralTakenDate']).first().reset_index()\
    .set_index('ReferralInstanceId')

# Calculate Empirical Bayes and append it to referrals
scores = get_bayes_ratio_by_client(referrals)
referrals = referrals.merge(scores, left_on='ClientId', right_index=True)

# Add extra features
referrals = (referrals.pipe(add_referral_order_index)
                        .pipe(add_time_since_last_referral)
                         .pipe(add_a_first_referral_dummy_variable)
            )


  sk = 2*(b-a)*sqrt(a + b + 1) / (a + b + 2) / sqrt(a*b)
  improvement from the last ten iterations.


In [3]:
# Prepare X and y
y = referrals['Empirical Bayes Ratio'].apply(np.log)

# Include referral metadata as features
referral_reasons = referrals.filter(like='ReferralDomestic')
referral_document = referrals.filter(like='ReferralDocument')
referral_benefit = referrals.filter(like='ReferralBenefit')
referral_issue = referrals.filter(like='ReferralIssue')
referral_reason = referrals.filter(like='ReferralReason')
referral_agency = pd.get_dummies(referrals['ReferralAgencyId'])
general = referrals[['DependantNumber', 'LivingWithPartner']]

X = pd.concat([
    referral_reasons,
    referral_document,
    referral_benefit,
    referral_issue,
    referral_reason,
    referral_agency,
    general
], axis=1).fillna(0)

X = pd.concat([referrals[['time_since', 'first', 'referral_no']],
                  X], axis=1)

In [6]:
# Define scoring function
scoring_function = make_scorer(lambda a,b: spearmanr(a,b)[0])

In [7]:
# Create Model Pipeline
pipe = Pipeline([
    ('imp', Imputer()),
    ('scale', StandardScaler()),
    ('cls', SGDRegressor(penalty='l1', n_iter=20))
])

In [8]:
# Fit Logistic Regression
param_grid = {
    'cls__alpha': np.logspace(-5,5,5)
}
gs = GridSearchCV(pipe, param_grid, cv=10, scoring=scoring_function, verbose=3)
gs.fit(X, y)

Fitting 10 folds for each of 5 candidates, totalling 50 fits
[CV] cls__alpha=1e-05 ................................................
[CV] ....................... cls__alpha=1e-05, score=0.098400 -   0.0s
[CV] cls__alpha=1e-05 ................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s remaining:    0.0s


[CV] ....................... cls__alpha=1e-05, score=0.119676 -   0.0s
[CV] cls__alpha=1e-05 ................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.8s remaining:    0.0s


[CV] ....................... cls__alpha=1e-05, score=0.114833 -   0.0s
[CV] cls__alpha=1e-05 ................................................
[CV] ...................... cls__alpha=1e-05, score=-0.111229 -   0.0s
[CV] cls__alpha=1e-05 ................................................
[CV] ...................... cls__alpha=1e-05, score=-0.053813 -   0.0s
[CV] cls__alpha=1e-05 ................................................
[CV] ....................... cls__alpha=1e-05, score=0.031471 -   0.0s
[CV] cls__alpha=1e-05 ................................................
[CV] ...................... cls__alpha=1e-05, score=-0.048589 -   0.0s
[CV] cls__alpha=1e-05 ................................................
[CV] ....................... cls__alpha=1e-05, score=0.109419 -   0.0s
[CV] cls__alpha=1e-05 ................................................
[CV] ...................... cls__alpha=1e-05, score=-0.043524 -   0.0s
[CV] cls__alpha=1e-05 ................................................
[CV] .

  c /= stddev[:, None]
  c /= stddev[None, :]
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


[CV] .............................. cls__alpha=1.0, score=nan -   0.0s
[CV] cls__alpha=1.0 ..................................................


  c /= stddev[:, None]
  c /= stddev[None, :]
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


[CV] .............................. cls__alpha=1.0, score=nan -   0.0s
[CV] cls__alpha=1.0 ..................................................


  c /= stddev[:, None]
  c /= stddev[None, :]
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


[CV] .............................. cls__alpha=1.0, score=nan -   0.0s
[CV] cls__alpha=1.0 ..................................................


  c /= stddev[:, None]
  c /= stddev[None, :]
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


[CV] .............................. cls__alpha=1.0, score=nan -   0.0s
[CV] cls__alpha=1.0 ..................................................


  c /= stddev[:, None]
  c /= stddev[None, :]
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


[CV] .............................. cls__alpha=1.0, score=nan -   0.0s
[CV] cls__alpha=1.0 ..................................................


  c /= stddev[:, None]
  c /= stddev[None, :]
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


[CV] .............................. cls__alpha=1.0, score=nan -   0.0s
[CV] cls__alpha=1.0 ..................................................


  c /= stddev[:, None]
  c /= stddev[None, :]
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


[CV] .............................. cls__alpha=1.0, score=nan -   0.0s
[CV] cls__alpha=1.0 ..................................................


  c /= stddev[:, None]
  c /= stddev[None, :]
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


[CV] .............................. cls__alpha=1.0, score=nan -   0.0s
[CV] cls__alpha=1.0 ..................................................


  c /= stddev[:, None]
  c /= stddev[None, :]
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


[CV] .............................. cls__alpha=1.0, score=nan -   0.0s
[CV] cls__alpha=1.0 ..................................................


  c /= stddev[:, None]
  c /= stddev[None, :]
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


[CV] .............................. cls__alpha=1.0, score=nan -   0.0s
[CV] cls__alpha=316.227766017 ........................................


  c /= stddev[:, None]
  c /= stddev[None, :]
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


[CV] .................... cls__alpha=316.227766017, score=nan -   0.0s
[CV] cls__alpha=316.227766017 ........................................


  c /= stddev[:, None]
  c /= stddev[None, :]
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


[CV] .................... cls__alpha=316.227766017, score=nan -   0.0s
[CV] cls__alpha=316.227766017 ........................................


  c /= stddev[:, None]
  c /= stddev[None, :]
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


[CV] .................... cls__alpha=316.227766017, score=nan -   0.0s
[CV] cls__alpha=316.227766017 ........................................


  c /= stddev[:, None]
  c /= stddev[None, :]
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


[CV] .................... cls__alpha=316.227766017, score=nan -   0.0s
[CV] cls__alpha=316.227766017 ........................................


  c /= stddev[:, None]
  c /= stddev[None, :]
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


[CV] .................... cls__alpha=316.227766017, score=nan -   0.0s
[CV] cls__alpha=316.227766017 ........................................


  c /= stddev[:, None]
  c /= stddev[None, :]
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


[CV] .................... cls__alpha=316.227766017, score=nan -   0.0s
[CV] cls__alpha=316.227766017 ........................................


  c /= stddev[:, None]
  c /= stddev[None, :]
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


[CV] .................... cls__alpha=316.227766017, score=nan -   0.0s
[CV] cls__alpha=316.227766017 ........................................


  c /= stddev[:, None]
  c /= stddev[None, :]
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


[CV] .................... cls__alpha=316.227766017, score=nan -   0.0s
[CV] cls__alpha=316.227766017 ........................................


  c /= stddev[:, None]
  c /= stddev[None, :]
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


[CV] .................... cls__alpha=316.227766017, score=nan -   0.0s
[CV] cls__alpha=316.227766017 ........................................


  c /= stddev[:, None]
  c /= stddev[None, :]
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


[CV] .................... cls__alpha=316.227766017, score=nan -   0.0s
[CV] cls__alpha=100000.0 .............................................


  c /= stddev[:, None]
  c /= stddev[None, :]
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


[CV] ......................... cls__alpha=100000.0, score=nan -   0.0s
[CV] cls__alpha=100000.0 .............................................


  c /= stddev[:, None]
  c /= stddev[None, :]
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


[CV] ......................... cls__alpha=100000.0, score=nan -   0.0s
[CV] cls__alpha=100000.0 .............................................


  c /= stddev[:, None]
  c /= stddev[None, :]
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


[CV] ......................... cls__alpha=100000.0, score=nan -   0.0s
[CV] cls__alpha=100000.0 .............................................


  c /= stddev[:, None]
  c /= stddev[None, :]
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


[CV] ......................... cls__alpha=100000.0, score=nan -   0.0s
[CV] cls__alpha=100000.0 .............................................


  c /= stddev[:, None]
  c /= stddev[None, :]
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


[CV] ......................... cls__alpha=100000.0, score=nan -   0.0s
[CV] cls__alpha=100000.0 .............................................


  c /= stddev[:, None]
  c /= stddev[None, :]
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


[CV] ......................... cls__alpha=100000.0, score=nan -   0.0s
[CV] cls__alpha=100000.0 .............................................


  c /= stddev[:, None]
  c /= stddev[None, :]
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


[CV] ......................... cls__alpha=100000.0, score=nan -   0.0s
[CV] cls__alpha=100000.0 .............................................


  c /= stddev[:, None]
  c /= stddev[None, :]
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


[CV] ......................... cls__alpha=100000.0, score=nan -   0.0s
[CV] cls__alpha=100000.0 .............................................


  c /= stddev[:, None]
  c /= stddev[None, :]
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


[CV] ......................... cls__alpha=100000.0, score=nan -   0.0s
[CV] cls__alpha=100000.0 .............................................


  c /= stddev[:, None]
  c /= stddev[None, :]
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond1 = (scale > 0) & (x > self.a) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   39.4s finished


[CV] ......................... cls__alpha=100000.0, score=nan -   0.0s


GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(steps=[('imp', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('cls', SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', n_iter=20, penalty='l1', power_t=0.25,
       random_state=None, shuffle=True, verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'cls__alpha': array([  1.00000e-05,   3.16228e-03,   1.00000e+00,   3.16228e+02,
         1.00000e+05])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(<lambda>), verbose=3)

In [9]:
gs.best_params_, gs.best_score_

({'cls__alpha': 0.0031622776601683794}, 0.037960803772615584)