# Initial Model - Score Regression

For the initial model, we'll use a machine learning algorithm to predict the severity score that we derived previously. This severity score function increases as:
- The number of referrals increases
- The density of referrals increases

and decreases with each `break` defined as a referral free period longer than 28 days

It is also scaled so that different time scales can be considered together.

First: Some imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_predict, train_test_split
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
%matplotlib inline
import seaborn as sns

from scipy.stats import beta, spearmanr
from datetime import datetime

# Problem set-up
Load the data

In [2]:
referrals = pd.read_csv('../../Welcome-Centre-DataCorps-Data/referrals.csv')


clients = pd.read_csv('../../Welcome-Centre-DataCorps-Data/clients.csv', index_col=0)

referrals['ReferralTakenDate'] = pd.to_datetime(referrals['ReferralTakenDate'])
    
referrals = referrals.sort_values('ReferralTakenDate')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
def get_score(referrals, break_length=28, break_coefficient=1, min_beta_fit_days=365):
    gaps = (referrals.sort_values('ReferralTakenDate').groupby('ClientId')['ReferralTakenDate'].diff().dt.days > break_length).groupby(referrals['ClientId']).sum()
    days_active = (referrals['ReferralTakenDate'].max() - referrals.groupby('ClientId')['ReferralTakenDate'].min()).dt.days + 7
    weeks_active = days_active / 7
    referrals = (referrals.groupby('ClientId').size())
    simple_ratio = (referrals - gaps) / weeks_active
    a, b, loc, scale = beta.fit((simple_ratio[days_active > min_beta_fit_days]).values)
    adjusted_ratio = ((referrals - gaps * break_coefficient + a) / (weeks_active + a + b)).sort_values()
    score_df = pd.concat([referrals, gaps, weeks_active, simple_ratio, adjusted_ratio], axis=1)
    score_df.columns=['Referrals', 'Gaps', 'Weeks Active', 'Simple Ratio', 'Empirical Bayes Ratio']
    return score_df.sort_values('Empirical Bayes Ratio')

In [4]:
scores = get_score(referrals)

  sk = 2*(b-a)*np.sqrt(a + b + 1) / (a + b + 2) / np.sqrt(a*b)
  improvement from the last ten iterations.


In [5]:
#best clients
scores.head(5)

Unnamed: 0_level_0,Referrals,Gaps,Weeks Active,Simple Ratio,Empirical Bayes Ratio
ClientId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
419,3,2.0,217.857143,0.00459,0.0053
138,2,1.0,217.857143,0.00459,0.0053
143,2,1.0,217.857143,0.00459,0.0053
336,2,1.0,217.857143,0.00459,0.0053
1056,5,4.0,217.857143,0.00459,0.0053


In [6]:
#worst clients
scores.tail(5)

Unnamed: 0_level_0,Referrals,Gaps,Weeks Active,Simple Ratio,Empirical Bayes Ratio
ClientId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5201,33,0.0,31.142857,1.059633,0.261161
4606,43,0.0,59.571429,0.721823,0.277528
3711,56,2.0,94.0,0.574468,0.285063
2890,91,4.0,137.0,0.635036,0.373406
287,108,8.0,171.142857,0.584307,0.374338


# Some Exploration

We want to be able to predict the eventual severity of a client's usage from there initial few referrals.

How do various features correlate with severity, say, for the first few referrals

In [7]:
referral_no = referrals.assign(count=1).groupby('ClientId').expanding()['count'].sum()
referral_no = referral_no.reset_index().set_index('level_1')['count']

In [8]:
subset = referrals[referral_no < 10]

  """Entry point for launching an IPython kernel.


In [9]:
def get_feature_matrix(referrals, clients):
    general = referrals[['DependantNumber', 'LivingWithPartner']]
    current_issues = get_current_referral_issues(referrals)
    any_issue = current_issues.groupby(subset['ClientId'], as_index=False, sort=False).expanding().sum() > 0
    any_issue.index = any_issue.index.droplevel(0)
    any_issue = any_issue.loc[referrals.index]
    referral_issues = pd.concat([general, current_issues.add_prefix('current_'), 
                      any_issue.add_prefix('ever_')], axis=1)
    client_issues = get_client_features(clients).loc[referrals['ClientId']]
    client_issues.index = referrals.index
    return pd.concat([referral_issues, client_issues], axis=1)

In [10]:
def get_current_referral_issues(referrals):
    referral_reasons = referrals.filter(like='ReferralDomestic').add_prefix('reasons_')
    referral_document = referrals.filter(like='ReferralDocument').add_prefix('documents_')
    referral_benefit = referrals.filter(like='ReferralBenefit').add_prefix('benefit_')
    referral_issue = referrals.filter(like='ReferralIssue').add_prefix('r_issue_')
    referral_reason = referrals.filter(like='ReferralReason').add_prefix('reason_')
    client_issue = referrals.filter(like='ClientIssue').add_prefix('c_issue_')
    referral_agency = pd.get_dummies(referrals['ReferralAgencyId']).add_prefix('agency_')

    X = pd.concat([
        referral_reasons,
        referral_document,
        referral_benefit,
        referral_issue,
        referral_reason,
        referral_agency,
        client_issue
    ], axis=1).fillna(False).astype(bool)
    
    return X

In [11]:
def get_client_features(clients):
    clients['ClientDateOfBirth'] = pd.to_datetime(clients['ClientDateOfBirth'])
    clients['AddressSinceDate'] = pd.to_datetime(clients['AddressSinceDate'])
    clients['Age'] = datetime.now() - clients['ClientDateOfBirth']
    clients['Age'] = clients['Age'].dt.days / 365
    clients.loc[clients['Age'] < 0, 'Age'] += 100
    clients['AddressLength'] = (datetime.now() - clients['AddressSinceDate']).dt.days / 365
    categories = pd.get_dummies(clients[['EthnicityDescription', 'ClientCountryID', 'ClientAddressTypeDescription', 'AddressPostCode', 
        'LocalityDescription', 'ResidencyDescription']].astype(str))
    clients['known_partner'] = clients['PartnerId'].notnull()

    client_features = pd.concat([clients[['Age', 'AddressLength', 'ClientIsMale', 'known_partner']], 
              categories], axis=1)
    client_features = client_features.fillna(client_features.median())
    return client_features

# Correlations

To sense check our system, let's consider looking at correlations.

If a correlation is positive, it means that when the values move in the same direction. For example, if Age is positively correlated to dependency, it would mean that older people are more likely to be dependent.

Similarly, even if the variable can take only two values, for example `is_male`, if that is positive, then it means males are more likely to be dependent.

Negative correlation implies the opposite effect

## Client Features

We look at the correlation between aspects of the client and the severity, creating features for aspects stored in the client table.

Do they line up to expectation?

In [12]:
client_features = get_client_features(clients)

client_features, bayes_score = client_features.align(scores['Empirical Bayes Ratio'], 'inner', 0)

client_features.apply(lambda k: spearmanr(bayes_score, k)[0]).sort_values().dropna()

  c /= stddev[:, None]
  c /= stddev[None, :]
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


AddressLength                                       -0.321583
ResidencyDescription_nan                            -0.183675
ClientAddressTypeDescription_Permanent              -0.134244
EthnicityDescription_White - British                -0.101761
ClientCountryID_nan                                 -0.090892
LocalityDescription_Dewsbury                        -0.080411
AddressPostCode_WF14                                -0.063139
LocalityDescription_Mirfield                        -0.061274
AddressPostCode_WF17                                -0.058797
AddressPostCode_WF12                                -0.058725
LocalityDescription_Batley                          -0.057303
AddressPostCode_WF15                                -0.056681
LocalityDescription_Liversedge                      -0.054246
AddressPostCode_WF13                                -0.044602
AddressPostCode_WF16                                -0.042508
LocalityDescription_Rawthorpe                       -0.039356
Locality

## Adding in simple referral features

We create features for referrals, using only the categorical datapoints.

We consider both:
- The current referral reasons/issues/etc.
- All given referral reasons up until that point

In [13]:
current_issues = get_current_referral_issues(subset)

In [14]:
aligned_score = scores['Empirical Bayes Ratio'].loc[subset['ClientId']].to_frame().set_index(subset.index)

In [15]:
current_issues.apply(lambda k: spearmanr(aligned_score.iloc[:,0], k)[0]).sort_values()

agency_86                                                           -0.123583
agency_45                                                           -0.061805
c_issue_ClientIssue_Estranged                                       -0.061243
agency_13                                                           -0.050352
agency_20                                                           -0.050011
agency_77                                                           -0.045763
reason_ReferralReason_Housing Issues - New Tenancy                  -0.044678
reason_ReferralReason_Bills - Utilities (Gas, Electricity, Water)   -0.042703
agency_103                                                          -0.041373
agency_36                                                           -0.038724
agency_29                                                           -0.037523
agency_1                                                            -0.037358
reason_ReferralReason_Domestic Issues                           

In [16]:
any_issue = X.groupby(subset['ClientId'], as_index=False, sort=False).expanding().sum() > 0
any_issue.index = any_issue.index.droplevel(0)
any_issue = any_issue.loc[subset.index]
any_issue.apply(lambda k: spearmanr(aligned_score.iloc[:,0], k)[0]).sort_values()

NameError: name 'X' is not defined

Domestic Circumstances - Cooking - All Facilities seems a little strange. Other than that, benefit changes are lead to some level of dependence

## Referral No.

As referral no. is a component of our score, it should have a very high correlation with score. 

In [None]:
referral_no.loc[subset.index].to_frame().corrwith(aligned_score.iloc[:,0])

# A simple model

In [48]:
from sklearn.linear_model import Ridge, Lasso, SGDRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.metrics import make_scorer
from scipy.stats import spearmanr
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

In [49]:
# Define scoring function
scoring_function = make_scorer(lambda a,b: spearmanr(a,b)[0])

In [50]:
# Create Model Pipeline
pipe = Pipeline([
    ('imp', Imputer()),
    ('scale', StandardScaler()),
    ('reg', Ridge())
])

In [None]:
y = aligned_score.loc[subset.index].iloc[:,0]

## Baseline Model

Referral No. only

In [None]:
# Fit Logistic Regression
param_grid = {
    'reg__alpha': np.logspace(-5,5,11)
}
gs = GridSearchCV(pipe, param_grid, cv=10, scoring=scoring_function, verbose=3)
gs.fit(referral_no.loc[subset.index].values.reshape(-1,1), aligned_score.iloc[:,0])

As expected, referral number is very predictive:

Achieving a spearman correlation of:

In [None]:
gs.best_score_

In [None]:
res = pd.concat([np.log(bayes_score), pd.Series(pred, index=bayes_score.index, name='prediction')], axis=1)

# Client Features Only

In [None]:
param_grid = {
    'reg__alpha': np.logspace(-5,5,11)
}
gs = GridSearchCV(pipe, param_grid, cv=10, scoring=scoring_function, verbose=3)
gs.fit(client_features, np.log(bayes_score))

In [None]:
gs.best_score_

In [None]:
from sklearn.model_selection import cross_val_predict

pred = cross_val_predict(gs.best_estimator_, client_features, np.log(bayes_score), cv=10)

In [None]:
import seaborn as sns
%matplotlib inline

In [None]:
res = pd.concat([np.log(bayes_score), pd.Series(pred, index=bayes_score.index, name='prediction')], axis=1)

In [None]:
sns.lmplot('Empirical Bayes Ratio', 'prediction', res)

In [None]:
all_features = get_feature_matrix(subset, clients)

In [None]:
all_features.apply(lambda k: spearmanr(k, aligned_score.iloc[:,0])[0]).sort_values()

In [None]:
perc_scoring_function = make_scorer(top_n_match)

In [None]:
param_grid = {
    'reg__alpha': np.logspace(-5,5,11)
}
gs = GridSearchCV(pipe, param_grid, cv=3, scoring=perc_scoring_function, verbose=3)
gs.fit(all_features.assign(ref_no=referral_no), aligned_score.iloc[:,0])

In [None]:
gs.best_params_, gs.best_score_

In [None]:
gs.best_params_, gs.best_score_

In [None]:
pd.Series(gs.best_estimator_.steps[-1][1].coef_.ravel(), all_features.assign(ref_no=referral_no).columns).sort_values()

# Cutoff Score

In [18]:
referrals = referrals.sort_values('ReferralTakenDate')

In [20]:
referrals

Unnamed: 0,ReferralInstanceId,StatusId,ReferralOnHold,ReferralTakenDate,ReferralReadyDate,ReferralCollectedDate,ReferralWorkerID,ReferralPreparedWorkerId,ReferralHandedWorkerId,ClientId,...,ReferralDietaryRequirements_Diabetic,ReferralDietaryRequirements_Gluten Intolerant/coeliac,ReferralDietaryRequirements_Halal,ReferralDietaryRequirements_No Pork,ReferralDietaryRequirements_Nut Allergy,ReferralDietaryRequirements_Other Dietary Requirement,ReferralDietaryRequirements_Vegan,ReferralDietaryRequirements_Vegetarian,ReferralDietaryRequirements_lactose intolerent,ReferralDietaryRequirements_pregnancy
499,500,3,0,2013-01-06 00:00:00,01/06/13 00:00:00,01/06/13 00:00:00,4,,,303,...,,,,,,,,,,
328,329,3,0,2013-02-13 00:00:00,02/13/13 00:00:00,02/13/13 00:00:00,4,,,206,...,,,,,,,,,,
671,672,3,0,2013-02-13 00:00:00,02/13/13 00:00:00,02/13/13 00:00:00,4,,,410,...,,,,,,,,,,
1745,1556,3,0,2013-02-13 00:00:00,02/13/13 00:00:00,02/13/13 00:00:00,4,,,1425,...,,,,,,,,,,
548,549,3,0,2013-02-13 00:00:00,02/13/13 00:00:00,02/13/13 00:00:00,4,,,336,...,,,,,,,,,,
1258,1063,3,0,2013-02-13 00:00:00,02/13/13 00:00:00,02/13/13 00:00:00,4,,,678,...,,,,,,,,,,
159,160,3,0,2013-02-13 00:00:00,02/13/13 00:00:00,02/13/13 00:00:00,4,,,98,...,,,,,,,,,,
1492,1297,3,0,2013-02-13 00:00:00,02/13/13 00:00:00,02/13/13 00:00:00,4,,,827,...,,,,,,,,,,
520,521,3,0,2013-02-13 00:00:00,02/13/13 00:00:00,02/13/13 00:00:00,4,,,320,...,,,,,,,,,,
649,650,3,0,2013-02-13 00:00:00,02/13/13 00:00:00,02/13/13 00:00:00,4,,,400,...,,,,,,,,,,


# Each Point Calculation

In [21]:
from tqdm import tqdm_notebook
from sklearn.model_selection import train_test_split

In [22]:
def look_ahead_ratios(referrals, window=365, break_length=28, break_coefficient=1, min_beta_fit_days=90):
    all_ratios = []
    for i in tqdm_notebook(range(1, int(referral_no.max()))):
        segment = referrals.loc[referral_no==i,:]
        reference_date = segment.set_index('ClientId')['ReferralTakenDate']
        referrals = referrals.assign(reference_date=reference_date.loc[referrals.ClientId].values)
        date_diff = (referrals['ReferralTakenDate']-referrals['reference_date']).dt.days
        year_range = referrals[(date_diff > 0) & (date_diff <= window)]

        gaps = (year_range.sort_values('ReferralTakenDate').groupby('ClientId')['ReferralTakenDate'].diff().dt.days > break_length).groupby(year_range['ClientId']).sum()
        days_active = ((year_range['ReferralTakenDate'].max() - year_range.groupby('ClientId')['ReferralTakenDate'].min()).dt.days + 7).clip(0, window)
        weeks_active = days_active / 7
        counts = (year_range.groupby('ClientId').size())
        simple_ratio = (counts - gaps * break_coefficient) / weeks_active
        segment_ratios = pd.concat([counts, simple_ratio, days_active, gaps, weeks_active], axis=1).loc[segment.ClientId]
        segment_ratios.columns = ['counts', 'simple', 'days', 'gaps', 'weeks']
        segment_ratios.index = segment.index
        all_ratios.append(segment_ratios)

    all_ratios_df = pd.concat(all_ratios)

    a, b, loc, scale = beta.fit(all_ratios_df[all_ratios_df['days'] > min_beta_fit_days]['simple'].values)

    adjusted_ratio = ((all_ratios_df['counts'] - all_ratios_df['gaps'] * break_coefficient + a) / (all_ratios_df['weeks'] + a + b)).sort_values()
    score_df = pd.concat([all_ratios_df, adjusted_ratio], axis=1)
    return score_df.sort_values(0).dropna()

In [23]:
look_ahead_score = look_ahead_ratios(referrals.sort_values('ReferralTakenDate'))

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  from ipykernel import kernelapp as app
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  





  sk = 2*(b-a)*np.sqrt(a + b + 1) / (a + b + 2) / np.sqrt(a*b)
  improvement from the last ten iterations.


In [26]:
def calc_look_ahead_stats(referrals, window=365, break_length=28, break_coefficient=1, min_beta_fit_days=90):
    all_ratios = []
    referral_no = referrals.assign(count=1).groupby('ClientId').expanding()['count'].sum()
    referral_no = referral_no.reset_index().set_index('level_1')['count']
    referrals['referral_no'] = referral_no.loc[referrals.index]
    for i in tqdm_notebook(range(1, int(referral_no.max()))):
        # Grab the segment for each no of referrals
        segment = referrals.loc[referral_no==i,:]
        reference_date = segment.set_index('ClientId')['ReferralTakenDate']
        referrals = referrals.assign(reference_date=reference_date.loc[referrals.ClientId].values)
        date_diff = (referrals['ReferralTakenDate']-referrals['reference_date']).dt.days
        year_range = referrals[(date_diff > 0) & (date_diff <= window)]
        
        gaps = (year_range.sort_values('ReferralTakenDate').groupby('ClientId')['ReferralTakenDate'].diff().dt.days > break_length).groupby(year_range['ClientId']).sum()
        days_active = ((year_range['ReferralTakenDate'].max() - year_range.groupby('ClientId')['ReferralTakenDate'].min()).dt.days + 7).clip(0, window)
        weeks_active = days_active / 7
        counts = (year_range.groupby('ClientId').size())
        simple_ratio = (counts - gaps * break_coefficient) / weeks_active
        segment_ratios = pd.concat([counts, simple_ratio, days_active, gaps, weeks_active], axis=1).loc[segment.ClientId]
        segment_ratios.columns = ['counts', 'simple', 'days', 'gaps', 'weeks']
        segment_ratios.index = segment.index
        # Fill in details for last referral per client
        segment_ratios['counts'] = segment_ratios['counts'].fillna(0)
        segment_ratios['gaps'] = segment_ratios['gaps'].fillna(0)
        segment_ratios['days'] = segment_ratios['days'].fillna(((segment['ReferralTakenDate'].max() 
                                   - segment['ReferralTakenDate']).dt.days + 7).clip(0, window))
        segment_ratios['weeks'] = segment_ratios['weeks'].fillna(segment_ratios['days']/7)
        segment_ratios['simple'] = segment_ratios['simple'].fillna(0)
        all_ratios.append(segment_ratios)
    all_ratios_df = pd.concat(all_ratios).loc[referrals.index]
    
    referrals[all_ratios_df.columns] = all_ratios_df
    return referrals

In [27]:
referrals2 = referrals.pipe(calc_look_ahead_stats)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  # Remove the CWD from sys.path while we load stuff.





Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


In [30]:
referrals2 = referrals2[referrals2['weeks']>52]

In [36]:
referrals2 = referrals2.sort_values('ReferralTakenDate')

In [37]:
referrals2.to_csv('cutoff_referral_score.csv')

# Results Exploration

In [32]:
from sklearn.model_selection import TimeSeriesSplit

In [38]:
all_features = get_feature_matrix(referrals2, clients)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  


In [39]:
all_features

Unnamed: 0,DependantNumber,LivingWithPartner,current_reasons_ReferralDomesticCircumstances_Cooking - All Facilities,current_reasons_ReferralDomesticCircumstances_Cooking - Hob,current_reasons_ReferralDomesticCircumstances_Cooking - Kettle,current_reasons_ReferralDomesticCircumstances_Cooking - Microwave,current_reasons_ReferralDomesticCircumstances_Non-Cook - NFA,current_reasons_ReferralDomesticCircumstances_Non-Cook - Non-cook at home,current_documents_ReferralDocument_Benefit Issue letter,current_documents_ReferralDocument_Crime Number,...,LocalityDescription_Waterloo; Huddersfield,LocalityDescription_nan,ResidencyDescription_Asylum Seeker,ResidencyDescription_Destitute,ResidencyDescription_Migrant,ResidencyDescription_New to area,ResidencyDescription_North Kirklees,ResidencyDescription_Refugee with leave to stay,ResidencyDescription_South Kirklees Resident,ResidencyDescription_nan
499,0,0,False,False,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,1
196,4,0,False,False,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,1
627,0,0,False,False,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,1
423,1,0,False,False,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,1
247,1,0,False,False,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,1
439,1,0,False,False,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,1
1143,0,0,False,False,False,False,False,False,False,False,...,0,1,0,0,0,0,0,0,0,1
1209,0,0,False,False,False,False,False,False,False,False,...,0,1,0,0,0,0,0,1,0,0
745,0,0,False,False,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,1
950,4,0,False,False,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,1


In [494]:
X = referral_no.to_frame()

level_1
0        1.0
1        2.0
2        3.0
1860     4.0
1986     5.0
2269     6.0
2396     7.0
3        1.0
4        2.0
5        1.0
6        1.0
5653     2.0
8        1.0
9        1.0
6897     2.0
7395     3.0
10       1.0
11       1.0
12       2.0
2739     3.0
2841     4.0
2981     5.0
15005    6.0
15342    7.0
15763    8.0
13       1.0
14       2.0
15       3.0
17       1.0
9189     2.0
        ... 
17847    1.0
17851    1.0
17979    2.0
17853    1.0
17862    1.0
17860    1.0
17864    1.0
17871    1.0
17874    1.0
17947    2.0
17881    1.0
17882    1.0
17883    1.0
17892    1.0
17893    1.0
17896    1.0
17903    1.0
17905    1.0
17907    1.0
17912    1.0
17916    1.0
17918    1.0
17924    1.0
17932    1.0
17937    1.0
17948    1.0
17952    1.0
17959    1.0
17968    1.0
17969    1.0
Name: count, Length: 17980, dtype: float64

In [43]:
referrals2.columns

Index(['ReferralInstanceId', 'StatusId', 'ReferralOnHold', 'ReferralTakenDate',
       'ReferralReadyDate', 'ReferralCollectedDate', 'ReferralWorkerID',
       'ReferralPreparedWorkerId', 'ReferralHandedWorkerId', 'ClientId',
       ...
       'ReferralDietaryRequirements_Vegetarian',
       'ReferralDietaryRequirements_lactose intolerent',
       'ReferralDietaryRequirements_pregnancy', 'referral_no',
       'reference_date', 'counts', 'simple', 'days', 'gaps', 'weeks'],
      dtype='object', length=181)

In [79]:
X, y = all_features.assign(referral_no=referrals2['referral_no']).align(referrals2['simple'], 'inner', axis=0)
# X = X[['referral_no']]
sort_index = referrals.loc[X.index].sort_values('ReferralTakenDate').index
X = X.loc[sort_index]
y = y.loc[sort_index]
# X, y = all_features.assign(ref_no=referral_no).align(look_ahead_score[0], 'inner', axis=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [80]:
def top_n_match(x, y, top_n_percent=10):
    p = 100 - top_n_percent
    return ((x > np.percentile(x, p)) == (y > np.percentile(y, p))).mean()

In [81]:
param_grid = {
    'reg__alpha': np.logspace(-5,5,11)
}
gs = GridSearchCV(pipe, param_grid, cv=3, scoring=scoring_function, verbose=3)
gs.fit(X_train, y_train)
print(gs.best_score_)

Fitting 3 folds for each of 11 candidates, totalling 33 fits
[CV] reg__alpha=1e-05 ................................................
[CV] ....... reg__alpha=1e-05, score=0.3238596019792812, total=   1.0s
[CV] reg__alpha=1e-05 ................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s


[CV] ....... reg__alpha=1e-05, score=0.3566711599157679, total=   1.0s
[CV] reg__alpha=1e-05 ................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.7s remaining:    0.0s


[CV] ...... reg__alpha=1e-05, score=0.36639053587340564, total=   1.0s
[CV] reg__alpha=0.0001 ...............................................
[CV] ...... reg__alpha=0.0001, score=0.3238596019792812, total=   1.0s
[CV] reg__alpha=0.0001 ...............................................
[CV] ....... reg__alpha=0.0001, score=0.356668294070451, total=   1.0s
[CV] reg__alpha=0.0001 ...............................................
[CV] ..... reg__alpha=0.0001, score=0.36639055211461075, total=   1.0s
[CV] reg__alpha=0.001 ................................................
[CV] ....... reg__alpha=0.001, score=0.3238596019792812, total=   1.0s
[CV] reg__alpha=0.001 ................................................
[CV] ....... reg__alpha=0.001, score=0.3566683706259015, total=   0.9s
[CV] reg__alpha=0.001 ................................................
[CV] ...... reg__alpha=0.001, score=0.36639281657434114, total=   1.0s
[CV] reg__alpha=0.01 .................................................
[CV] .

[Parallel(n_jobs=1)]: Done  33 out of  33 | elapsed:   44.0s finished


0.439880726648


In [82]:
y_test.to_frame().assign(pred=gs.predict(X_test)).corr(method='spearman')

Unnamed: 0,simple,pred
simple,1.0,0.440957
pred,0.440957,1.0


In [83]:
top_10 = y_test.to_frame().assign(pred=gs.predict(X_test)).sort_values('simple').rank(ascending=False) < 100

In [84]:
y_test.to_frame().assign(pred=gs.predict(X_test)).sort_values('simple').rank(ascending=False) < 100

Unnamed: 0,simple,pred
12560,False,False
10842,False,False
10833,False,False
10830,False,False
10825,False,False
10819,False,False
10814,False,False
10847,False,False
10808,False,False
10803,False,False


In [85]:
top_10[top_10['simple']]['pred'].mean()

0.61224489795918369

In [103]:
test = referrals2[referrals2['ClientId']==2]

In [107]:
test.set_index('ReferralTakenDate').resample('1W').asfreq()

Unnamed: 0_level_0,ReferralInstanceId,StatusId,ReferralOnHold,ReferralReadyDate,ReferralCollectedDate,ReferralWorkerID,ReferralPreparedWorkerId,ReferralHandedWorkerId,ClientId,PartnerName,...,ReferralDietaryRequirements_Vegetarian,ReferralDietaryRequirements_lactose intolerent,ReferralDietaryRequirements_pregnancy,referral_no,reference_date,counts,simple,days,gaps,weeks
ReferralTakenDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-02-02,,,,,,,,,,,...,,,,,NaT,,,,,
2014-02-09,,,,,,,,,,,...,,,,,NaT,,,,,
2014-02-16,,,,,,,,,,,...,,,,,NaT,,,,,
2014-02-23,,,,,,,,,,,...,,,,,NaT,,,,,
2014-03-02,,,,,,,,,,,...,,,,,NaT,,,,,
2014-03-09,,,,,,,,,,,...,,,,,NaT,,,,,
2014-03-16,,,,,,,,,,,...,,,,,NaT,,,,,
2014-03-23,,,,,,,,,,,...,,,,,NaT,,,,,
2014-03-30,,,,,,,,,,,...,,,,,NaT,,,,,
2014-04-06,,,,,,,,,,,...,,,,,NaT,,,,,


In [493]:
y_test.to_frame().assign(pred=gs.predict(X_test)).sort_values(0).rank(ascending=False)

Unnamed: 0,0,pred
14043,2398.0,1856.0
14033,2397.0,266.0
14093,2396.0,1912.0
14073,2394.5,1940.0
14151,2394.5,1957.0
14162,2393.0,1741.0
14122,2390.5,534.0
14148,2390.5,2320.0
14111,2390.5,2275.0
14030,2390.5,385.0
