In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import sys
sys.path.append('../')
import seaborn as sns
%matplotlib inline
from sklearn.linear_model import Ridge, Lasso, SGDRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.metrics import make_scorer
from scipy.stats import spearmanr
from sklearn.model_selection import GridSearchCV

In [3]:
# Read the referral and client tables from the shared data directory
referrals = pd.read_csv('../../Welcome-Centre-DataCorps-Data/referrals.csv', low_memory=False)
clients = pd.read_csv('../../Welcome-Centre-DataCorps-Data/clients.csv', index_col=0)
referrals['ReferralTakenDate'] = pd.to_datetime(referrals['ReferralTakenDate'])

# Collapse several referrals for the same client on the same date into a single
# row (groupby(...).first()), then index the result by ReferralInstanceId
referrals = (
    referrals
    .reset_index()
    .groupby(['ClientId', 'ReferralTakenDate'])
    .first()
    .reset_index()
    .set_index('ReferralInstanceId')
)

In [181]:
# Two-column scratch copy of the referrals (client id and referral date)
test_df = referrals.loc[:, ['ClientId', 'ReferralTakenDate']].copy()
# Referrals of client 2890 in date order; not referenced again in the visible cells
test_group = (
    test_df
    .loc[test_df['ClientId'] == 2890]
    .copy()
    .sort_values('ReferralTakenDate')
)

def calculate_burst_number(referrals_df, break_length=28):
    """Label every referral with the "burst" of referrals it belongs to.

    Within each client, referrals are taken in date order and a new burst is
    opened whenever the gap since that client's previous referral is more than
    ``break_length`` days.

    Parameters
    ----------
    referrals_df : pandas.DataFrame
        Must contain a ``ClientId`` column and a datetime ``ReferralTakenDate``
        column. The frame passed in is not modified; ``sort_values`` returns a
        new frame and all columns are added to that one.
    break_length : int, default 28
        Number of days the gap must exceed for a referral to start a new burst.

    Returns
    -------
    pandas.DataFrame
        The referrals sorted by ``ReferralTakenDate`` with these added columns:

        * ``day_diff`` - whole days since the client's previous referral
          (NaN for a client's first referral).
        * ``start_of_burst`` - True when ``day_diff > break_length``.
        * ``burst_number`` - 1-based running count of bursts per client.
        * ``burst_length`` - number of referrals in the row's burst.
        * ``index_in_burst`` - rank of the row's date within its burst
          (float; rows with equal dates share an averaged rank).
        * ``has_had_previous_burst`` - 1 if ``burst_number > 1``, else 0.
    """
    referrals_df = referrals_df.sort_values('ReferralTakenDate')

    referrals_df['day_diff'] = (
        referrals_df.groupby('ClientId')['ReferralTakenDate'].diff().dt.days
    )
    # NaN > break_length is False, so a client's first referral never counts as
    # a *new* burst; it simply belongs to burst 1 via the "+ 1" below.
    referrals_df['start_of_burst'] = referrals_df['day_diff'] > break_length
    referrals_df['burst_number'] = (
        referrals_df.groupby('ClientId')['start_of_burst'].cumsum() + 1
    )

    # Compute both per-burst columns from one grouping before assigning either,
    # so the grouped frame is not mutated between the two calculations.
    bursts = referrals_df.groupby(['ClientId', 'burst_number'])
    # Built-in 'count' gives the same result as a per-group Python lambda.
    burst_length = bursts['burst_number'].transform('count')
    index_in_burst = bursts['ReferralTakenDate'].rank()
    referrals_df['burst_length'] = burst_length
    referrals_df['index_in_burst'] = index_in_burst

    referrals_df['has_had_previous_burst'] = 1 * (referrals_df['burst_number'] > 1)
    return referrals_df

    
# Annotate every referral with its burst columns, then keep only the rows
# whose index_in_burst equals 3
data = calculate_burst_number(referrals)
data = data.loc[data['index_in_burst'] == 3].copy()

In [183]:
# Build the feature matrix X and target y for a first model.
# Referral metadata: columns are picked by substring match on their names.
referral_reasons = data.filter(like='ReferralDomestic')
referral_document = data.filter(like='ReferralDocument')
referral_benefit = data.filter(like='ReferralBenefit')
referral_issue = data.filter(like='ReferralIssue')
referral_reason = data.filter(like='ReferralReason')
# One dummy column per distinct ReferralAgencyId value (column labels are the ids)
referral_agency = pd.get_dummies(data['ReferralAgencyId'])
general = data.loc[:, ['DependantNumber', 'LivingWithPartner']]
# Columns produced by calculate_burst_number
burst_params = data.loc[:, ['has_had_previous_burst', 'index_in_burst', 'day_diff']]

# Place all feature groups side by side; missing values become 0
X = pd.concat(
    [
        referral_reasons,
        referral_document,
        referral_benefit,
        referral_issue,
        referral_reason,
        referral_agency,
        general,
        burst_params,
    ],
    axis=1,
).fillna(value=0)

# Target: number of referrals in the burst the row belongs to
y = data['burst_length']

In [185]:
# Pearson correlation of each feature column with the target, sorted ascending
corr_list = [np.corrcoef(X[col], y)[0, 1] for col in X.columns]
correlations = (
    pd.DataFrame({'columns': X.columns, 'corr': corr_list})
    .sort_values(by='corr')
)

  c /= stddev[:, None]
  c /= stddev[None, :]


In [186]:
# Ten features with the most negative correlation to the target
# (`correlations` is sorted ascending by 'corr', so these are its first rows).
# The former bare `correlations.head(10)` in the middle of this cell was never
# displayed - a notebook only renders the last expression - so it was removed.
best_neg_cols = correlations.head(10)['columns'].values
best_neg_cols

array(['day_diff', 86,
       'ReferralDomesticCircumstances_Cooking - All Facilities',
       'ReferralReason_Budgeting Issues',
       'ReferralBenefit_JSA (Job Seekers Allowance)',
       'ReferralBenefit_ESA (Employment Support Allowance)',
       'ReferralReason_Benefits - Change of Benefit/Circumstance', 29,
       'ReferralReason_Cold weather Issues',
       'ReferralReason_Bills - Utilities (Gas, Electricity, Water)'], dtype=object)

In [188]:
# Correlation row for the has_had_previous_burst feature
correlations.loc[correlations['columns'] == 'has_had_previous_burst']

Unnamed: 0,columns,corr
171,has_had_previous_burst,0.000402


In [187]:
# Look at the largest positive correlations: drop features whose correlation
# is undefined (NaN), then take the last ten rows of the ascending-sorted table
best_pos_cols = (
    correlations
    .dropna(subset=['corr'])
    .tail(10)['columns']
    .values
)
best_pos_cols

array([117, 'ReferralDomesticCircumstances_Cooking - Microwave',
       'ReferralDomesticCircumstances_Non-Cook - Non-cook at home',
       'ReferralIssue_Asylum Seeker',
       'ReferralDomesticCircumstances_Cooking - Kettle', 89,
       'ReferralReason_Loss of Job', 23,
       'ReferralReason_no income entitlement', 91], dtype=object)

In [149]:
# Model pipeline: impute missing values -> standardise features -> Lasso regression.
# The final step keeps the name 'cls' because the grid-search parameter
# 'cls__alpha' refers to it.
# NOTE(review): `Imputer` was removed from sklearn.preprocessing in
# scikit-learn 0.22 (replaced by sklearn.impute.SimpleImputer) - confirm the
# pinned scikit-learn version before re-running.
pipe = Pipeline(steps=[
    ('imp', Imputer()),
    ('scale', StandardScaler()),
    ('cls', Lasso()),
])

In [169]:
# Preview the combined list of selected feature labels (negative first, then positive)
np.concatenate((best_neg_cols, best_pos_cols))

array(['day_diff', 86,
       'ReferralDomesticCircumstances_Cooking - All Facilities',
       'ReferralReason_Budgeting Issues',
       'ReferralBenefit_JSA (Job Seekers Allowance)',
       'ReferralBenefit_ESA (Employment Support Allowance)',
       'ReferralReason_Benefits - Change of Benefit/Circumstance', 29,
       'ReferralReason_Cold weather Issues',
       'ReferralReason_Bills - Utilities (Gas, Electricity, Water)', 117,
       'ReferralDomesticCircumstances_Cooking - Microwave',
       'ReferralDomesticCircumstances_Non-Cook - Non-cook at home',
       'ReferralIssue_Asylum Seeker',
       'ReferralDomesticCircumstances_Cooking - Kettle', 89,
       'ReferralReason_Loss of Job', 23,
       'ReferralReason_no income entitlement', 91], dtype=object)

In [179]:
# Grid-search the Lasso penalty (alpha) using only the selected
# most-negatively and most-positively correlated features
use_cols = np.concatenate((best_neg_cols, best_pos_cols))
X_min = X[use_cols]

# Five alpha values spaced logarithmically from 1e-5 to 1e5
param_grid = {'cls__alpha': np.logspace(-5, 5, num=5)}
gs = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring='r2', cv=3, verbose=3)

gs.fit(X_min, y)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] cls__alpha=1e-05 ................................................
[CV] ...................... cls__alpha=1e-05, score=-0.002792 -   0.0s
[CV] cls__alpha=1e-05 ................................................
[CV] ....................... cls__alpha=1e-05, score=0.085069 -   0.0s
[CV] cls__alpha=1e-05 ................................................
[CV] ...................... cls__alpha=1e-05, score=-0.060013 -   0.0s
[CV] cls__alpha=0.00316227766017 .....................................
[CV] ........... cls__alpha=0.00316227766017, score=-0.001789 -   0.0s
[CV] cls__alpha=0.00316227766017 .....................................
[CV] ............ cls__alpha=0.00316227766017, score=0.085256 -   0.0s
[CV] cls__alpha=0.00316227766017 .....................................
[CV] ........... cls__alpha=0.00316227766017, score=-0.056512 -   0.0s
[CV] cls__alpha=1.0 ..................................................
[CV] ............

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    0.2s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(steps=[('imp', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('cls', Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'cls__alpha': array([  1.00000e-05,   3.16228e-03,   1.00000e+00,   3.16228e+02,
         1.00000e+05])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='r2', verbose=3)

In [180]:
# Best mean cross-validated score found by the grid search
# (R^2, since the search was configured with scoring='r2')
gs.best_score_

0.0089848921148035412

In [91]:
# Summary statistics of burst_length for each index_in_burst value.
# NOTE(review): the saved output was produced at execution count 91, earlier
# than the cell (count 181) that restricts `data` to index_in_burst == 3, so a
# fresh top-to-bottom run will presumably show only that single group - confirm.
data.groupby('index_in_burst')['burst_length'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
index_in_burst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1.0,11036.0,1.62867,1.879763,1.0,1.0,1.0,2.0,69.0
2.0,3009.0,3.305749,3.015734,2.0,2.0,2.0,3.0,69.0
3.0,1386.0,4.834776,3.926138,3.0,3.0,4.0,5.0,69.0


In [189]:
# Preview the first rows of the reduced feature matrix instead of dumping the
# whole frame into the notebook output
X_min.head(10)

Unnamed: 0_level_0,day_diff,86,ReferralDomesticCircumstances_Cooking - All Facilities,ReferralReason_Budgeting Issues,ReferralBenefit_JSA (Job Seekers Allowance),ReferralBenefit_ESA (Employment Support Allowance),ReferralReason_Benefits - Change of Benefit/Circumstance,29,ReferralReason_Cold weather Issues,"ReferralReason_Bills - Utilities (Gas, Electricity, Water)",117,ReferralDomesticCircumstances_Cooking - Microwave,ReferralDomesticCircumstances_Non-Cook - Non-cook at home,ReferralIssue_Asylum Seeker,ReferralDomesticCircumstances_Cooking - Kettle,89,ReferralReason_Loss of Job,23,ReferralReason_no income entitlement,91
ReferralInstanceId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
172,3.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0
1333,3.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0
409,2.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0
663,7.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0
1456,6.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0
74,12.0,1,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0
988,2.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0
136,15.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0
524,8.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0
1574,11.0,1,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0


In [200]:
# Summary statistics of burst_length split by has_had_previous_burst,
# transposed so that each flag value (0 / 1) becomes a column
data.groupby('has_had_previous_burst')['burst_length'].describe().T

has_had_previous_burst,0,1
count,641.0,745.0
mean,4.833073,4.836242
std,4.064545,3.805772
min,3.0,3.0
25%,3.0,3.0
50%,4.0,4.0
75%,5.0,5.0
max,54.0,69.0


In [207]:
# Counts of burst_length using integer bin edges 3, 4, ..., 69
np.histogram(data['burst_length'], bins=list(range(3, 70)))

(array([641, 281, 164,  97,  54,  35,  30,  22,  11,   7,  10,   4,   5,
          5,   2,   3,   3,   0,   1,   1,   1,   0,   1,   1,   0,   0,
          1,   0,   0,   0,   1,   2,   0,   0,   0,   0,   0,   0,   0,
          0,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   1,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   1]),
 array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
        20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
        37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
        54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69]))